[PATCH v6 5/5] x86/paravirt: remove no longer needed paravirt patching code

2023-12-09 Thread Juergen Gross
Now that paravirt is using the alternatives patching infrastructure,
remove the paravirt patching code.

Signed-off-by: Juergen Gross 
Acked-by: Peter Zijlstra (Intel) 
---
 arch/x86/include/asm/paravirt.h   | 13 --
 arch/x86/include/asm/paravirt_types.h | 38 ---
 arch/x86/include/asm/text-patching.h  | 12 -
 arch/x86/kernel/alternative.c | 67 +--
 arch/x86/kernel/paravirt.c| 30 
 arch/x86/kernel/vmlinux.lds.S | 13 --
 arch/x86/tools/relocs.c   |  2 +-
 7 files changed, 3 insertions(+), 172 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 973c2ac2d25c..8bcf7584e7dd 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -725,23 +725,10 @@ void native_pv_lock_init(void) __init;
 
 #else  /* __ASSEMBLY__ */
 
-#define _PVSITE(ptype, ops)\
-771:;  \
-   ops;\
-772:;  \
-   .pushsection .parainstructions,"a"; \
-.long 771b-.;  \
-.byte ptype;   \
-.byte 772b-771b;   \
-   .popsection
-
-
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_PARAVIRT_XXL
 #ifdef CONFIG_DEBUG_ENTRY
 
-#define PARA_PATCH(off)		((off) / 8)
-#define PARA_SITE(ptype, ops)	_PVSITE(ptype, ops)
 #define PARA_INDIRECT(addr)	*addr(%rip)
 
 .macro PARA_IRQ_save_fl
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 9cad536fc08d..d8e85d2cf8d5 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -2,15 +2,6 @@
 #ifndef _ASM_X86_PARAVIRT_TYPES_H
 #define _ASM_X86_PARAVIRT_TYPES_H
 
-#ifndef __ASSEMBLY__
-/* These all sit in the .parainstructions section to tell us what to patch. */
-struct paravirt_patch_site {
-   s32 instr_offset;   /* original instructions */
-   u8 type;/* type of this instruction */
-   u8 len; /* length of original instruction */
-} __packed;
-#endif
-
 #ifdef CONFIG_PARAVIRT
 
 #ifndef __ASSEMBLY__
@@ -250,32 +241,6 @@ struct paravirt_patch_template {
 extern struct pv_info pv_info;
 extern struct paravirt_patch_template pv_ops;
 
-#define PARAVIRT_PATCH(x)  \
-   (offsetof(struct paravirt_patch_template, x) / sizeof(void *))
-
-#define paravirt_type(op)  \
-   [paravirt_typenum] "i" (PARAVIRT_PATCH(op)),\
-   [paravirt_opptr] "m" (pv_ops.op)
-/*
- * Generate some code, and mark it as patchable by the
- * apply_paravirt() alternate instruction patcher.
- */
-#define _paravirt_alt(insn_string, type)   \
-   "771:\n\t" insn_string "\n" "772:\n"\
-   ".pushsection .parainstructions,\"a\"\n"\
-   "  .long 771b-.\n"  \
-   "  .byte " type "\n"\
-   "  .byte 772b-771b\n"   \
-   ".popsection\n"
-
-/* Generate patchable code, with the default asm parameters. */
-#define paravirt_alt(insn_string)  \
-   _paravirt_alt(insn_string, "%c[paravirt_typenum]")
-
-/* Simple instruction patching code. */
-#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t"
-
-unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned int len);
 #define paravirt_ptr(op)   [paravirt_opptr] "m" (pv_ops.op)
 
 int paravirt_disable_iospace(void);
@@ -555,9 +520,6 @@ unsigned long pv_native_read_cr2(void);
 
 #define paravirt_nop   ((void *)nop_func)
 
-extern struct paravirt_patch_site __parainstructions[],
-   __parainstructions_end[];
-
 #endif /* __ASSEMBLY__ */
 
 #define ALT_NOT_XEN	ALT_NOT(X86_FEATURE_XENPV)
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 29832c338cdc..0b70653a98c1 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -6,18 +6,6 @@
 #include 
 #include 
 
-struct paravirt_patch_site;
-#ifdef CONFIG_PARAVIRT
-void apply_paravirt(struct paravirt_patch_site *start,
-   struct paravirt_patch_site *end);
-#else
-static inline void apply_paravirt(struct paravirt_patch_site *start,
- struct paravirt_patch_site *end)
-{}
-#define __parainstructions NULL
-#define __parainstructions_end NULL
-#endif
-
 /*
  * Currently, the max observed size in the kernel code is
  * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5.
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index ab829b3d028d..2cad05ff7656 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1472,48 +1472,6 @@ int 

[PATCH v6 4/5] x86/paravirt: switch mixed paravirt/alternative calls to alternative_2

2023-12-09 Thread Juergen Gross
Instead of stacking alternative and paravirt patching, use the new
ALT_FLAG_CALL flag to switch those mixed calls to pure alternative
handling.

This eliminates the need to be careful regarding the sequence of
alternative and paravirt patching.
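
As an illustration of the resulting semantics, here is a minimal
user-space model of how an ALTERNATIVE_2 call site resolves at patch
time (a sketch, not the kernel's patching code; the two booleans stand
in for ALT_CALL_ALWAYS and ALT_NOT_XEN):

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy model: an ALTERNATIVE_2 site carries the original instructions
     * plus two replacements, applied in order, so a later matching entry
     * overrides an earlier one. */
    enum variant { INDIRECT_PV_CALL, DIRECT_CALL, NATIVE_INSNS };

    static enum variant resolve(bool alt_call_always, bool not_xenpv)
    {
            enum variant v = INDIRECT_PV_CALL;  /* orig: call *pv_ops.op */

            if (alt_call_always)    /* ALT_CALL_ALWAYS: promote to direct call */
                    v = DIRECT_CALL;
            if (not_xenpv)          /* ALT_NOT_XEN: native insns on bare metal */
                    v = NATIVE_INSNS;
            return v;
    }

    int main(void)
    {
            printf("Xen PV:     %d\n", resolve(true, false)); /* direct call */
            printf("bare metal: %d\n", resolve(true, true));  /* native insns */
            return 0;
    }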

Signed-off-by: Juergen Gross 
Acked-by: Peter Zijlstra (Intel) 
---
V5:
- remove no longer needed extern declarations from alternative.c
  (Boris Petkov)
- add comment about ALTERNATIVE[_2]() macro usage (Boris Petkov)
- rebase to tip/master (Boris Petkov)
V6:
- fix SAVE_FLAGS macro (Boris Petkov)
---
 arch/x86/include/asm/alternative.h|  5 ++--
 arch/x86/include/asm/paravirt.h   | 12 
 arch/x86/include/asm/paravirt_types.h | 40 +++
 arch/x86/kernel/alternative.c |  1 -
 arch/x86/kernel/callthunks.c  | 17 ++--
 arch/x86/kernel/module.c  | 20 --
 6 files changed, 44 insertions(+), 51 deletions(-)

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 472334eed6f3..fcd20c6dc7f9 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -89,6 +89,8 @@ struct alt_instr {
u8  replacementlen; /* length of new instruction */
 } __packed;
 
+extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
+
 /*
  * Debug flag that can be tested to see whether alternative
  * instructions were patched in already:
@@ -104,11 +106,10 @@ extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine,
  s32 *start_cfi, s32 *end_cfi);
 
 struct module;
-struct paravirt_patch_site;
 
 struct callthunk_sites {
s32 *call_start, *call_end;
-   struct paravirt_patch_site  *pv_start, *pv_end;
+   struct alt_instr*alt_start, *alt_end;
 };
 
 #ifdef CONFIG_CALL_THUNKS
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index f18bfa7f3070..973c2ac2d25c 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -738,20 +738,20 @@ void native_pv_lock_init(void) __init;
 
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_PARAVIRT_XXL
+#ifdef CONFIG_DEBUG_ENTRY
 
 #define PARA_PATCH(off)		((off) / 8)
 #define PARA_SITE(ptype, ops)	_PVSITE(ptype, ops)
 #define PARA_INDIRECT(addr)	*addr(%rip)
 
-#ifdef CONFIG_DEBUG_ENTRY
 .macro PARA_IRQ_save_fl
-   PARA_SITE(PARA_PATCH(PV_IRQ_save_fl),
- ANNOTATE_RETPOLINE_SAFE;
- call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);)
+   ANNOTATE_RETPOLINE_SAFE;
+   call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);
 .endm
 
-#define SAVE_FLAGS ALTERNATIVE "PARA_IRQ_save_fl;", "pushf; pop %rax;", \
-   ALT_NOT_XEN
+#define SAVE_FLAGS ALTERNATIVE_2 "PARA_IRQ_save_fl;",  \
+"ALT_CALL_INSTR;", ALT_CALL_ALWAYS,\
+"pushf; pop %rax;", ALT_NOT_XEN
 #endif
 #endif /* CONFIG_PARAVIRT_XXL */
 #endif /* CONFIG_X86_64 */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 166e9618158f..9cad536fc08d 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -276,15 +276,11 @@ extern struct paravirt_patch_template pv_ops;
 #define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t"
 
 unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned int len);
+#define paravirt_ptr(op)   [paravirt_opptr] "m" (pv_ops.op)
 
 int paravirt_disable_iospace(void);
 
-/*
- * This generates an indirect call based on the operation type number.
- * The type number, computed in PARAVIRT_PATCH, is derived from the
- * offset into the paravirt_patch_template structure, and can therefore be
- * freely converted back into a structure offset.
- */
+/* This generates an indirect call based on the operation type number. */
 #define PARAVIRT_CALL  \
ANNOTATE_RETPOLINE_SAFE \
"call *%[paravirt_opptr];"
@@ -317,12 +313,6 @@ int paravirt_disable_iospace(void);
  * However, x86_64 also has to clobber all caller saved registers, which
  * unfortunately, are quite a bit (r8 - r11)
  *
- * The call instruction itself is marked by placing its start address
- * and size into the .parainstructions section, so that
- * apply_paravirt() in arch/i386/kernel/alternative.c can do the
- * appropriate patching under the control of the backend pv_init_ops
- * implementation.
- *
  * Unfortunately there's no way to get gcc to generate the args setup
  * for the call, and then allow the call itself to be generated by an
  * inline asm.  Because of this, we must do the complete arg setup and
@@ -421,14 +411,27 @@ int paravirt_disable_iospace(void);
__mask & __eax; \
})
 
-
+/*
+ * Use 

[PATCH v6 2/5] x86/paravirt: move some functions and defines to alternative

2023-12-09 Thread Juergen Gross
As a preparation for replacing paravirt patching completely by
alternative patching, move some backend functions and #defines to
alternative code and header.
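
As a reminder of what moves, the macro emits a bare assembler function
that the compiler cannot inline or reorder. A minimal usage sketch with
the new name (kernel context assumed; "my_func" and the int3 body are
made up for illustration; the real users are nop_func and the qspinlock
thunk shown in the diffs below):

    #include <asm/alternative.h>

    /* Emit "my_func" into .entry.text; the body is plain assembler. */
    DEFINE_ASM_FUNC(my_func, "int3", .entry.text);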

Signed-off-by: Juergen Gross 
---
V4:
- rename x86_nop() to nop_func() and x86_BUG() to BUG_func() (Boris
  Petkov)
---
 arch/x86/include/asm/alternative.h| 16 
 arch/x86/include/asm/paravirt.h   | 12 -
 arch/x86/include/asm/paravirt_types.h |  4 +--
 arch/x86/include/asm/qspinlock_paravirt.h |  4 +--
 arch/x86/kernel/alternative.c | 10 
 arch/x86/kernel/kvm.c |  4 +--
 arch/x86/kernel/paravirt.c| 30 +++
 arch/x86/xen/irq.c|  2 +-
 8 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 65f79092c9d9..ce788ab4e77c 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -330,6 +330,22 @@ static inline int alternatives_text_reserved(void *start, void *end)
  */
 #define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr
 
+/* Macro for creating assembler functions avoiding any C magic. */
+#define DEFINE_ASM_FUNC(func, instr, sec)  \
+   asm (".pushsection " #sec ", \"ax\"\n"  \
+".global " #func "\n\t"\
+".type " #func ", @function\n\t"   \
+ASM_FUNC_ALIGN "\n"\
+#func ":\n\t"  \
+ASM_ENDBR  \
+instr "\n\t"   \
+ASM_RET\
+".size " #func ", . - " #func "\n\t"   \
+".popsection")
+
+void BUG_func(void);
+void nop_func(void);
+
 #else /* __ASSEMBLY__ */
 
 #ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index aa76ac7c806c..f18bfa7f3070 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -720,18 +720,6 @@ static __always_inline unsigned long arch_local_irq_save(void)
 #undef PVOP_VCALL4
 #undef PVOP_CALL4
 
-#define DEFINE_PARAVIRT_ASM(func, instr, sec)  \
-   asm (".pushsection " #sec ", \"ax\"\n"  \
-".global " #func "\n\t"\
-".type " #func ", @function\n\t"   \
-ASM_FUNC_ALIGN "\n"\
-#func ":\n\t"  \
-ASM_ENDBR  \
-instr "\n\t"   \
-ASM_RET\
-".size " #func ", . - " #func "\n\t"   \
-".popsection")
-
 extern void default_banner(void);
 void native_pv_lock_init(void) __init;
 
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 483e19e5ca7a..166e9618158f 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -540,8 +540,6 @@ int paravirt_disable_iospace(void);
__PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2),\
 PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
 
-void _paravirt_nop(void);
-void paravirt_BUG(void);
 unsigned long paravirt_ret0(void);
 #ifdef CONFIG_PARAVIRT_XXL
 u64 _paravirt_ident_64(u64);
@@ -551,7 +549,7 @@ void pv_native_irq_enable(void);
 unsigned long pv_native_read_cr2(void);
 #endif
 
-#define paravirt_nop   ((void *)_paravirt_nop)
+#define paravirt_nop   ((void *)nop_func)
 
 extern struct paravirt_patch_site __parainstructions[],
__parainstructions_end[];
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index 85b6e3609cb9..ef9697f20129 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -56,8 +56,8 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text");
"pop%rdx\n\t"   \
FRAME_END
 
-DEFINE_PARAVIRT_ASM(__raw_callee_save___pv_queued_spin_unlock,
-   PV_UNLOCK_ASM, .spinlock.text);
+DEFINE_ASM_FUNC(__raw_callee_save___pv_queued_spin_unlock,
+   PV_UNLOCK_ASM, .spinlock.text);
 
 #else /* CONFIG_64BIT */
 
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index be35c8ccf826..ca25dd280b8c 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -385,6 +385,16 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
}
 }
 
+/* Low-level backend functions usable from alternative code replacements. */
+DEFINE_ASM_FUNC(nop_func, "", .entry.text);
+EXPORT_SYMBOL_GPL(nop_func);
+
+noinstr void BUG_func(void)
+{
+   BUG();
+}
+EXPORT_SYMBOL_GPL(BUG_func);
+
 

[PATCH v6 1/5] x86/paravirt: introduce ALT_NOT_XEN

2023-12-09 Thread Juergen Gross
Introduce the macro ALT_NOT_XEN as a short form of
ALT_NOT(X86_FEATURE_XENPV).
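
The macro itself is a one-liner (its definition is visible as context in
patch 5/5 further down), and a call site shrinks accordingly; this pair
is lifted from the diff below:

    #define ALT_NOT_XEN	ALT_NOT(X86_FEATURE_XENPV)

    /* before */
    PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV));
    /* after */
    PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT_XEN);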

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Juergen Gross 
---
V3:
- split off from next patch
V5:
- move patch to the start of the series (Boris Petkov)
---
 arch/x86/include/asm/paravirt.h   | 42 ---
 arch/x86/include/asm/paravirt_types.h |  3 ++
 2 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 693c61dbdd9c..aa76ac7c806c 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -142,8 +142,7 @@ static inline void write_cr0(unsigned long x)
 static __always_inline unsigned long read_cr2(void)
 {
return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2,
-   "mov %%cr2, %%rax;",
-   ALT_NOT(X86_FEATURE_XENPV));
+   "mov %%cr2, %%rax;", ALT_NOT_XEN);
 }
 
 static __always_inline void write_cr2(unsigned long x)
@@ -154,13 +153,12 @@ static __always_inline void write_cr2(unsigned long x)
 static inline unsigned long __read_cr3(void)
 {
return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3,
- "mov %%cr3, %%rax;", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%cr3, %%rax;", ALT_NOT_XEN);
 }
 
 static inline void write_cr3(unsigned long x)
 {
-   PVOP_ALT_VCALL1(mmu.write_cr3, x,
-   "mov %%rdi, %%cr3", ALT_NOT(X86_FEATURE_XENPV));
+   PVOP_ALT_VCALL1(mmu.write_cr3, x, "mov %%rdi, %%cr3", ALT_NOT_XEN);
 }
 
 static inline void __write_cr4(unsigned long x)
@@ -182,7 +180,7 @@ extern noinstr void pv_native_wbinvd(void);
 
 static __always_inline void wbinvd(void)
 {
-   PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV));
+   PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT_XEN);
 }
 
 static inline u64 paravirt_read_msr(unsigned msr)
@@ -390,27 +388,25 @@ static inline void paravirt_release_p4d(unsigned long pfn)
 static inline pte_t __pte(pteval_t val)
 {
return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV)) };
+ "mov %%rdi, %%rax", ALT_NOT_XEN) };
 }
 
 static inline pteval_t pte_val(pte_t pte)
 {
return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte,
-   "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+   "mov %%rdi, %%rax", ALT_NOT_XEN);
 }
 
 static inline pgd_t __pgd(pgdval_t val)
 {
return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV)) };
+ "mov %%rdi, %%rax", ALT_NOT_XEN) };
 }
 
 static inline pgdval_t pgd_val(pgd_t pgd)
 {
return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd,
-   "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+   "mov %%rdi, %%rax", ALT_NOT_XEN);
 }
 
 #define  __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
@@ -444,14 +440,13 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 static inline pmd_t __pmd(pmdval_t val)
 {
return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV)) };
+ "mov %%rdi, %%rax", ALT_NOT_XEN) };
 }
 
 static inline pmdval_t pmd_val(pmd_t pmd)
 {
return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd,
-   "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+   "mov %%rdi, %%rax", ALT_NOT_XEN);
 }
 
 static inline void set_pud(pud_t *pudp, pud_t pud)
@@ -464,7 +459,7 @@ static inline pud_t __pud(pudval_t val)
pudval_t ret;
 
ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val,
-  "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+  "mov %%rdi, %%rax", ALT_NOT_XEN);
 
return (pud_t) { ret };
 }
@@ -472,7 +467,7 @@ static inline pud_t __pud(pudval_t val)
 static inline pudval_t pud_val(pud_t pud)
 {
return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud,
-   "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+   "mov %%rdi, %%rax", ALT_NOT_XEN);
 }
 
 static inline void pud_clear(pud_t *pudp)
@@ -492,8 +487,7 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
 static inline p4d_t __p4d(p4dval_t val)
 {
p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val,
-   "mov %%rdi, %%rax",
-   

[PATCH v6 0/5] x86/paravirt: Get rid of paravirt patching

2023-12-09 Thread Juergen Gross
This is a small series getting rid of paravirt patching by switching
completely to alternative patching for the same functionality.

The basic idea is to add the capability to switch from indirect to
direct calls via a special alternative patching option.

This removes _some_ of the paravirt macro maze, but most of it needs
to stay, as the call instructions must be hidden from the compiler in
order to avoid needless register saving/restoring.

What is going away is the nasty stacking of alternative and paravirt
patching and (of course) the special .parainstructions linker section.

I have tested the series on bare metal and as a Xen PV domain to
verify it still works.

Note that objtool might need some changes to cope with the new
indirect call patching mechanism. Additionally some paravirt handling
can probably be removed from it.

Changes in V6:
- addressed Boris' comments

Changes in V5:
- addressed Boris' comments
- rebased on top of the tip/master branch

Changes in V4:
- addressed Boris' comments in patch 1
- fixed bugs found by kernel test robot (patch 2)

Changes in V3:
- split v2 patch 3 into 2 patches as requested by Peter and Ingo

Changes in V2:
- split last patch into 2
- rebase of patch 2 as suggested by Peter
- addressed Peter's comments for patch 3

Juergen Gross (5):
  x86/paravirt: introduce ALT_NOT_XEN
  x86/paravirt: move some functions and defines to alternative
  x86/alternative: add indirect call patching
  x86/paravirt: switch mixed paravirt/alternative calls to alternative_2
  x86/paravirt: remove no longer needed paravirt patching code

 arch/x86/include/asm/alternative.h|  30 -
 arch/x86/include/asm/paravirt.h   |  77 
 arch/x86/include/asm/paravirt_types.h |  85 +-
 arch/x86/include/asm/qspinlock_paravirt.h |   4 +-
 arch/x86/include/asm/text-patching.h  |  12 --
 arch/x86/kernel/alternative.c | 136 +++---
 arch/x86/kernel/callthunks.c  |  17 ++-
 arch/x86/kernel/kvm.c |   4 +-
 arch/x86/kernel/module.c  |  20 +---
 arch/x86/kernel/paravirt.c|  54 +
 arch/x86/kernel/vmlinux.lds.S |  13 ---
 arch/x86/tools/relocs.c   |   2 +-
 arch/x86/xen/irq.c|   2 +-
 13 files changed, 169 insertions(+), 287 deletions(-)

-- 
2.35.3




[PATCH 10/14] tracing: Stop the tracing while changing the ring buffer subbuf size

2023-12-09 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Some tracers require the main buffer and the snapshot buffer to have the
same sub buffer size; otherwise the swap will fail and disable all
tracing. The tracers therefore need to be stopped while updating the sub
buffer size, so that they always see the main and snapshot buffers with
the same sub buffer size.
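
Condensed from the diff below, buffer_order_write() now brackets the
whole order change (a sketch with error paths trimmed):

    tracing_stop_tr(tr);    /* no tracer runs during the switch */

    old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
    if (old_order != val) {
            ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, val);
            /* ... update the snapshot buffer order to match ... */
    }

    tracing_start_tr(tr);   /* tracers again see consistent sizes */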

Fixes: TBD ("ring-buffer: Add interface for configuring trace sub buffer size")
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/trace.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7faaaf29b504..9e5f2392d521 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9397,13 +9397,16 @@ buffer_order_write(struct file *filp, const char __user *ubuf,
if (val < 0 || val > 7)
return -EINVAL;
 
+   /* Do not allow tracing while changing the order of the ring buffer */
+   tracing_stop_tr(tr);
+
old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
if (old_order == val)
-   return 0;
+   goto out;
 
ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, val);
if (ret)
-   return 0;
+   goto out;
 
 #ifdef CONFIG_TRACER_MAX_TRACE
 
@@ -9430,11 +9433,15 @@ buffer_order_write(struct file *filp, const char __user *ubuf,
 */
tracing_disabled = 1;
}
-   return ret;
+   goto out;
}
  out_max:
 #endif
(*ppos)++;
+ out:
+   if (ret)
+   cnt = ret;
+   tracing_start_tr(tr);
return cnt;
 }
 
-- 
2.42.0





[PATCH 05/14] ring-buffer: Read and write to ring buffers with custom sub buffer size

2023-12-09 Thread Steven Rostedt
From: "Tzvetomir Stoyanov (VMware)" 

As the size of the ring sub buffer page can be changed dynamically,
the logic that reads and writes to the buffer should be fixed to take
that into account. Some internal ring buffer APIs are changed:
 ring_buffer_alloc_read_page()
 ring_buffer_free_read_page()
 ring_buffer_read_page()
A new API is introduced:
 ring_buffer_read_page_data()
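
A sketch of the updated reader flow (kernel context assumed, error
handling abbreviated, variable names made up): the read page becomes an
opaque handle and the payload is reached through the new accessor:

    struct buffer_data_read_page *spare;
    void *data;

    spare = ring_buffer_alloc_read_page(buffer, cpu);
    if (IS_ERR(spare))
            return PTR_ERR(spare);

    if (ring_buffer_read_page(buffer, spare, len, cpu, 0) >= 0)
            data = ring_buffer_read_page_data(spare); /* sub buffer bytes */

    ring_buffer_free_read_page(buffer, cpu, spare);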

Link: 
https://lore.kernel.org/linux-trace-devel/20211213094825.61876-6-tz.stoya...@gmail.com

Signed-off-by: Tzvetomir Stoyanov (VMware) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/ring_buffer.h  | 11 ++--
 kernel/trace/ring_buffer.c   | 75 
 kernel/trace/ring_buffer_benchmark.c | 10 ++--
 kernel/trace/trace.c | 34 +++--
 4 files changed, 89 insertions(+), 41 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 12573306b889..fa802db216f9 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -192,10 +192,15 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer);
 size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu);
 size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu);
 
-void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu);
-void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data);
-int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page,
+struct buffer_data_read_page;
+struct buffer_data_read_page *
+ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu);
+void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu,
+   struct buffer_data_read_page *page);
+int ring_buffer_read_page(struct trace_buffer *buffer,
+ struct buffer_data_read_page *data_page,
  size_t len, int cpu, int full);
+void *ring_buffer_read_page_data(struct buffer_data_read_page *page);
 
 struct trace_seq;
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7f18b2fd2514..4cb03b3b99e6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -317,6 +317,11 @@ struct buffer_data_page {
unsigned chardata[] RB_ALIGN_DATA;  /* data of buffer page */
 };
 
+struct buffer_data_read_page {
+   unsignedorder;  /* order of the page */
+   struct buffer_data_page *data;  /* actual data, stored in this page */
+};
+
 /*
  * Note, the buffer_page list must be first. The buffer pages
  * are allocated in cache lines, which means that each buffer
@@ -5616,40 +5621,48 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
  * Returns:
  *  The page allocated, or ERR_PTR
  */
-void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
+struct buffer_data_read_page *
+ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
 {
struct ring_buffer_per_cpu *cpu_buffer;
-   struct buffer_data_page *bpage = NULL;
+   struct buffer_data_read_page *bpage = NULL;
unsigned long flags;
struct page *page;
 
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return ERR_PTR(-ENODEV);
 
+   bpage = kzalloc(sizeof(*bpage), GFP_KERNEL);
+   if (!bpage)
+   return ERR_PTR(-ENOMEM);
+
+   bpage->order = buffer->subbuf_order;
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
	arch_spin_lock(&cpu_buffer->lock);
 
if (cpu_buffer->free_page) {
-   bpage = cpu_buffer->free_page;
+   bpage->data = cpu_buffer->free_page;
cpu_buffer->free_page = NULL;
}
 
	arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
 
-   if (bpage)
+   if (bpage->data)
goto out;
 
page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY,
cpu_buffer->buffer->subbuf_order);
-   if (!page)
+   if (!page) {
+   kfree(bpage);
return ERR_PTR(-ENOMEM);
+   }
 
-   bpage = page_address(page);
+   bpage->data = page_address(page);
 
  out:
-   rb_init_page(bpage);
+   rb_init_page(bpage->data);
 
return bpage;
 }
@@ -5659,14 +5672,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
  * ring_buffer_free_read_page - free an allocated read page
  * @buffer: the buffer the page was allocate for
  * @cpu: the cpu buffer the page came from
- * @data: the page to free
+ * @page: the page to free
  *
  * Free a page allocated from ring_buffer_alloc_read_page.
  */
-void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data)
+void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu,
+   struct buffer_data_read_page *data_page)
 {
struct ring_buffer_per_cpu *cpu_buffer;
-   struct buffer_data_page *bpage = data;
+   struct 

[PATCH 09/14] tracing: Update snapshot order along with main buffer order

2023-12-09 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

When updating the order of the sub buffers for the main buffer, make sure
that if the snapshot buffer exists, that it gets its order updated as
well.

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/trace.c | 45 ++--
 1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b3b36d7f1201..7faaaf29b504 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1263,10 +1263,17 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val);
 
 int tracing_alloc_snapshot_instance(struct trace_array *tr)
 {
+   int order;
int ret;
 
if (!tr->allocated_snapshot) {
 
+		/* Make the snapshot buffer have the same order as main buffer */
+   order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+		ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
+   if (ret < 0)
+   return ret;
+
/* allocate spare buffer */
		ret = resize_buffer_duplicate_size(&tr->max_buffer,
					   &tr->array_buffer, RING_BUFFER_ALL_CPUS);
@@ -1286,6 +1293,7 @@ static void free_snapshot(struct trace_array *tr)
 * The max_tr ring buffer has some state (e.g. ring->clock) and
 * we want preserve it.
 */
+   ring_buffer_subbuf_order_set(tr->max_buffer.buffer, 0);
ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
	set_buffer_entries(&tr->max_buffer, 1);
	tracing_reset_online_cpus(&tr->max_buffer);
@@ -9378,6 +9386,7 @@ buffer_order_write(struct file *filp, const char __user *ubuf,
 {
struct trace_array *tr = filp->private_data;
unsigned long val;
+   int old_order;
int ret;
 
	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
@@ -9388,12 +9397,44 @@ buffer_order_write(struct file *filp, const char __user *ubuf,
if (val < 0 || val > 7)
return -EINVAL;
 
+   old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+   if (old_order == val)
+   return 0;
+
ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, val);
if (ret)
-   return ret;
+   return 0;
 
-   (*ppos)++;
+#ifdef CONFIG_TRACER_MAX_TRACE
 
+   if (!tr->allocated_snapshot)
+   goto out_max;
+
+   ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, val);
+   if (ret) {
+   /* Put back the old order */
+		cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order);
+   if (WARN_ON_ONCE(cnt)) {
+   /*
+* AARGH! We are left with different orders!
+* The max buffer is our "snapshot" buffer.
+* When a tracer needs a snapshot (one of the
+* latency tracers), it swaps the max buffer
+* with the saved snap shot. We succeeded to
+* update the order of the main buffer, but failed to
+* update the order of the max buffer. But when we tried
+* to reset the main buffer to the original size, we
+* failed there too. This is very unlikely to
+* happen, but if it does, warn and kill all
+* tracing.
+*/
+   tracing_disabled = 1;
+   }
+   return ret;
+   }
+ out_max:
+#endif
+   (*ppos)++;
return cnt;
 }
 
-- 
2.42.0





[PATCH 03/14] ring-buffer: Add interface for configuring trace sub buffer size

2023-12-09 Thread Steven Rostedt
From: "Tzvetomir Stoyanov (VMware)" 

The trace ring buffer sub page size can be configured, per trace
instance. A new ftrace file "buffer_subbuf_order" is added to get and
set the size of the ring buffer sub page for the current trace instance.
The size must be a power-of-two multiple of the system page size, which
is why the new interface works with the system page order instead of an
absolute page size: 0 means the ring buffer sub page equals 1 system
page, and so forth:
0 - 1 system page
1 - 2 system pages
2 - 4 system pages
...
The ring buffer sub page size is limited between 1 and 128 system
pages. The default value is 1 system page.
New ring buffer APIs are introduced:
 ring_buffer_subbuf_order_set()
 ring_buffer_subbuf_order_get()
 ring_buffer_subbuf_size_get()
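
A minimal kernel-side usage sketch (the buffer size and order are
arbitrary; the order should be set while the buffer is still unused):

    struct trace_buffer *buffer;
    int order, size;

    buffer = ring_buffer_alloc(1 << 20, RB_FL_OVERWRITE);
    if (ring_buffer_subbuf_order_set(buffer, 2)) /* 4 system pages per sub buffer */
            return;                              /* keeps the default order 0 */

    order = ring_buffer_subbuf_order_get(buffer); /* -> 2 */
    size  = ring_buffer_subbuf_size_get(buffer);  /* sub buffer size in bytes */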

Link: 
https://lore.kernel.org/linux-trace-devel/20211213094825.61876-4-tz.stoya...@gmail.com

Signed-off-by: Tzvetomir Stoyanov (VMware) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/ring_buffer.h |  4 ++
 kernel/trace/ring_buffer.c  | 73 +
 kernel/trace/trace.c| 48 
 3 files changed, 125 insertions(+)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index ce46218ce46d..12573306b889 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -202,6 +202,10 @@ struct trace_seq;
 int ring_buffer_print_entry_header(struct trace_seq *s);
 int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s);
 
+int ring_buffer_subbuf_order_get(struct trace_buffer *buffer);
+int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order);
+int ring_buffer_subbuf_size_get(struct trace_buffer *buffer);
+
 enum ring_buffer_flags {
RB_FL_OVERWRITE = 1 << 0,
 };
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a4354287dec3..7c925ce95c96 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -524,6 +524,7 @@ struct trace_buffer {
booltime_stamp_abs;
 
unsigned intsubbuf_size;
+   unsigned intsubbuf_order;
unsigned intmax_data_size;
 };
 
@@ -5894,6 +5895,78 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_page);
 
+/**
+ * ring_buffer_subbuf_size_get - get size of the sub buffer.
+ * @buffer: the buffer to get the sub buffer size from
+ *
+ * Returns size of the sub buffer, in bytes.
+ */
+int ring_buffer_subbuf_size_get(struct trace_buffer *buffer)
+{
+   return buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_size_get);
+
+/**
+ * ring_buffer_subbuf_order_get - get order of system sub pages in one buffer page.
+ * @buffer: The ring_buffer to get the system sub page order from
+ *
+ * By default, one ring buffer sub page equals one system page. This parameter
+ * is configurable, per ring buffer. The size of the ring buffer sub page can be
+ * extended, but must be an order of the system page size.
+ *
+ * Returns the order of buffer sub page size, in system pages:
+ * 0 means the sub buffer size is 1 system page and so forth.
+ * In case of an error < 0 is returned.
+ */
+int ring_buffer_subbuf_order_get(struct trace_buffer *buffer)
+{
+   if (!buffer)
+   return -EINVAL;
+
+   return buffer->subbuf_order;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get);
+
+/**
+ * ring_buffer_subbuf_order_set - set the size of ring buffer sub page.
+ * @buffer: The ring_buffer to set the new page size.
+ * @order: Order of the system pages in one sub buffer page
+ *
+ * By default, one ring buffer page equals one system page. This API can be
+ * used to set a new size of the ring buffer page. The size must be an order
+ * of the system page size, that's why the input parameter @order is the order
+ * of system pages that are allocated for one ring buffer page:
+ *  0 - 1 system page
+ *  1 - 2 system pages
+ *  2 - 4 system pages
+ *  ...
+ *
+ * Returns 0 on success or < 0 in case of an error.
+ */
+int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
+{
+   int psize;
+
+   if (!buffer || order < 0)
+   return -EINVAL;
+
+   if (buffer->subbuf_order == order)
+   return 0;
+
+   psize = (1 << order) * PAGE_SIZE;
+   if (psize <= BUF_PAGE_HDR_SIZE)
+   return -EINVAL;
+
+   buffer->subbuf_order = order;
+   buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE;
+
+   /* Todo: reset the buffer with the new page size */
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set);
+
 /*
  * We only allocate new buffers, never free them if the CPU goes down.
  * If we were to free the buffer, then the user would lose any trace that was 
in
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 90e0ea91521f..503fd7a57177 100644
--- a/kernel/trace/trace.c
+++ 

[PATCH 12/14] ring-buffer: Just update the subbuffers when changing their allocation order

2023-12-09 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

The ring_buffer_subbuf_order_set() function created new ring_buffer_per_cpu
cpu_buffers containing sub buffers of the updated order, and only if they
were all created successfully would the ring_buffer's per_cpu buffers be
freed and replaced by them.

The problem is that the freed per_cpu buffers contains state that would be
lost. Running the following commands:

1. # echo 3 > /sys/kernel/tracing/buffer_subbuf_order
2. # echo 0 > /sys/kernel/tracing/tracing_cpumask
3. # echo 1 > /sys/kernel/tracing/snapshot
4. # echo ff > /sys/kernel/tracing/tracing_cpumask
5. # echo test > /sys/kernel/tracing/trace_marker

Would result in:

 -bash: echo: write error: Bad file descriptor

That's because the state of the per_cpu buffers of the snapshot buffer is
lost when the order is changed (the order of a freed snapshot buffer goes
to 0 to save memory, and when the snapshot buffer is allocated again, it
goes back to what the main buffer is).

In operation 2, the snapshot buffers were set to "disable" (as all the
ring buffers CPUs were disabled).

In operation 3, the snapshot is allocated and a call to
ring_buffer_subbuf_order_set() replaced the per_cpu buffers losing the
"record_disable" count.

When it was enabled again, the atomic_dec(&cpu_buffer->record_disable) was
decrementing a zero, setting it to -1. Writing 1 into the snapshot would
swap the snapshot buffer with the main buffer, so now the main buffer is
"disabled", and nothing can write to the ring buffer anymore.

Instead of creating new per_cpu buffers and losing the state of the old
buffers, basically do what the resize does: just allocate new subbuf
pages into the new_pages linked list of the per_cpu buffer, and if they
all succeed, replace the old sub buffers with the new ones. This keeps
the per_cpu buffer descriptor intact and, by doing so, keeps its state.
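
A self-contained toy model of the lost state (plain C, not kernel code;
the struct stands in for ring_buffer_per_cpu):

    #include <stdio.h>

    struct per_cpu { int record_disable; };

    int main(void)
    {
            struct per_cpu buf = { 0 };

            buf.record_disable++;         /* step 2: CPU masked out */
            buf = (struct per_cpu){ 0 };  /* step 3: order change replaced
                                           * the buffer, count is lost */
            buf.record_disable--;         /* step 4: CPU unmasked again */
            printf("%d\n", buf.record_disable); /* -1: stays disabled */
            return 0;
    }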

Fixes: TBD ("ring-buffer: Set new size of the ring buffer sub page")
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/ring_buffer.c | 82 +++---
 1 file changed, 68 insertions(+), 14 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 4726deccd997..bbefb5838e64 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5989,7 +5989,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get);
  */
 int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
 {
-   struct ring_buffer_per_cpu **cpu_buffers;
+   struct ring_buffer_per_cpu *cpu_buffer;
+   struct buffer_page *bpage, *tmp;
int old_order, old_size;
int nr_pages;
int psize;
@@ -6008,9 +6009,6 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
return -EINVAL;
 
bsize = sizeof(void *) * buffer->cpus;
-   cpu_buffers = kzalloc(bsize, GFP_KERNEL);
-   if (!cpu_buffers)
-   return -ENOMEM;
 
old_order = buffer->subbuf_order;
old_size = buffer->subbuf_size;
@@ -6027,33 +6025,85 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
 
	/* Make sure all new buffers are allocated, before deleting the old ones */
for_each_buffer_cpu(buffer, cpu) {
+
if (!cpumask_test_cpu(cpu, buffer->cpumask))
continue;
 
+   cpu_buffer = buffer->buffers[cpu];
+
/* Update the number of pages to match the new size */
nr_pages = old_size * buffer->buffers[cpu]->nr_pages;
nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size);
 
-		cpu_buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
-   if (!cpu_buffers[cpu]) {
+   /* we need a minimum of two pages */
+   if (nr_pages < 2)
+   nr_pages = 2;
+
+   cpu_buffer->nr_pages_to_update = nr_pages;
+
+   /* Include the reader page */
+   nr_pages++;
+
+   /* Allocate the new size buffer */
+		INIT_LIST_HEAD(&cpu_buffer->new_pages);
+   if (__rb_allocate_pages(cpu_buffer, nr_pages,
+					&cpu_buffer->new_pages)) {
+   /* not enough memory for new pages */
err = -ENOMEM;
goto error;
}
}
 
for_each_buffer_cpu(buffer, cpu) {
+
if (!cpumask_test_cpu(cpu, buffer->cpumask))
continue;
 
-   rb_free_cpu_buffer(buffer->buffers[cpu]);
-   buffer->buffers[cpu] = cpu_buffers[cpu];
+   cpu_buffer = buffer->buffers[cpu];
+
+   /* Clear the head bit to make the link list normal to read */
+   rb_head_page_deactivate(cpu_buffer);
+
+   /* Now walk the list and free all the old sub buffers */
+   list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) {

[PATCH 08/14] ring-buffer: Make sure the spare sub buffer used for reads has same size

2023-12-09 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Now that the ring buffer specifies the size of its sub buffers, they all
need to be the same size. When doing a read, a swap is done with a spare
page. Make sure they are the same size before doing the swap, otherwise
the read will fail.

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/trace.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8e9853d38c8d..b3b36d7f1201 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7567,6 +7567,7 @@ struct ftrace_buffer_info {
struct trace_iterator   iter;
void*spare;
unsigned intspare_cpu;
+   unsigned intspare_size;
unsigned intread;
 };
 
@@ -8286,6 +8287,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 
page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
 
+   /* Make sure the spare matches the current sub buffer size */
+   if (info->spare) {
+   if (page_size != info->spare_size) {
+			ring_buffer_free_read_page(iter->array_buffer->buffer,
+						   info->spare_cpu, info->spare);
+   info->spare = NULL;
+   }
+   }
+
if (!info->spare) {
		info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer,
							  iter->cpu_file);
@@ -8294,6 +8304,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
info->spare = NULL;
} else {
info->spare_cpu = iter->cpu_file;
+   info->spare_size = page_size;
}
}
if (!info->spare)
-- 
2.42.0





[PATCH 02/14] ring-buffer: Page size per ring buffer

2023-12-09 Thread Steven Rostedt
From: "Tzvetomir Stoyanov (VMware)" 

Currently the size of one sub buffer page is global for all buffers and
it is hard coded to one system page. In order to introduce configurable
ring buffer sub page size, the internal logic should be refactored to
work with sub page size per ring buffer.
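
With the defaults from the diff below, a 4 KB system page and a 16-byte
sub buffer header (the 64-bit layout of struct buffer_data_page; an
assumption for this arithmetic), the new per-buffer fields work out to:

    subbuf_size   = PAGE_SIZE - BUF_PAGE_HDR_SIZE   = 4096 - 16 = 4080
    max_data_size = subbuf_size - (sizeof(u32) * 2) = 4080 -  8 = 4072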

Link: 
https://lore.kernel.org/linux-trace-devel/20211213094825.61876-3-tz.stoya...@gmail.com

Signed-off-by: Tzvetomir Stoyanov (VMware) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/ring_buffer.h |  2 +-
 kernel/trace/ring_buffer.c  | 65 +++--
 kernel/trace/trace.c|  2 +-
 kernel/trace/trace.h|  1 +
 kernel/trace/trace_events.c | 59 +
 5 files changed, 82 insertions(+), 47 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index b1b03b2c0f08..ce46218ce46d 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -200,7 +200,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page,
 struct trace_seq;
 
 int ring_buffer_print_entry_header(struct trace_seq *s);
-int ring_buffer_print_page_header(struct trace_seq *s);
+int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s);
 
 enum ring_buffer_flags {
RB_FL_OVERWRITE = 1 << 0,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d59d14e67da1..a4354287dec3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -373,14 +373,6 @@ static inline bool test_time_stamp(u64 delta)
return !!(delta & TS_DELTA_TEST);
 }
 
-#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
-
-/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
-#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
-
-/* Events may have a timestamp attached to them */
-#define BUF_MAX_EVENT_SIZE (BUF_MAX_DATA_SIZE - RB_LEN_TIME_EXTEND)
-
 struct rb_irq_work {
struct irq_work work;
wait_queue_head_t   waiters;
@@ -530,6 +522,9 @@ struct trace_buffer {
 
struct rb_irq_work  irq_work;
booltime_stamp_abs;
+
+   unsigned intsubbuf_size;
+   unsigned intmax_data_size;
 };
 
 struct ring_buffer_iter {
@@ -546,7 +541,7 @@ struct ring_buffer_iter {
int missed_events;
 };
 
-int ring_buffer_print_page_header(struct trace_seq *s)
+int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s)
 {
struct buffer_data_page field;
 
@@ -570,7 +565,7 @@ int ring_buffer_print_page_header(struct trace_seq *s)
trace_seq_printf(s, "\tfield: char data;\t"
 "offset:%u;\tsize:%u;\tsigned:%u;\n",
 (unsigned int)offsetof(typeof(field), data),
-(unsigned int)BUF_PAGE_SIZE,
+(unsigned int)buffer->subbuf_size,
 (unsigned int)is_signed_type(char));
 
return !trace_seq_has_overflowed(s);
@@ -1822,7 +1817,13 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
	if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
goto fail_free_buffer;
 
-   nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+   /* Default buffer page size - one system page */
+   buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
+
+   /* Max payload is buffer page size - header (8bytes) */
+   buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2);
+
+   nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
buffer->flags = flags;
buffer->clock = trace_clock_local;
buffer->reader_lock_key = key;
@@ -2141,7 +2142,7 @@ static void update_pages_handler(struct work_struct *work)
  * @size: the new size.
  * @cpu_id: the cpu buffer to resize
  *
- * Minimum size is 2 * BUF_PAGE_SIZE.
+ * Minimum size is 2 * buffer->subbuf_size.
  *
  * Returns 0 on success and < 0 on failure.
  */
@@ -2163,7 +2164,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
!cpumask_test_cpu(cpu_id, buffer->cpumask))
return 0;
 
-   nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+   nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
 
/* we need a minimum of two pages */
if (nr_pages < 2)
@@ -2410,7 +2411,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
 */
barrier();
 
-   if ((iter->head + length) > commit || length > BUF_MAX_DATA_SIZE)
+	if ((iter->head + length) > commit || length > iter->cpu_buffer->buffer->max_data_size)
/* Writer corrupted the read? */
goto reset;
 
@@ -2643,6 +2644,7 @@ static inline void
 rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
  unsigned long tail, struct rb_event_info *info)
 {
+   unsigned long bsize 

[PATCH 14/14] ringbuffer/selftest: Add basic selftest to test changing subbuf order

2023-12-09 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Add a self test that will write into the trace buffer with different
trace sub buffer order sizes.

Signed-off-by: Steven Rostedt (Google) 
---
 .../ftrace/test.d/00basic/ringbuffer_order.tc | 46 +++
 1 file changed, 46 insertions(+)
 create mode 100644 tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc

diff --git a/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc
new file mode 100644
index ..c0d76dc724d3
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc
@@ -0,0 +1,46 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Change the ringbuffer size
+# requires: buffer_subbuf_order
+# flags: instance
+
+get_buffer_data_size() {
+   sed -ne 's/^.*data.*size:\([0-9][0-9]*\).*/\1/p' events/header_page
+}
+
+a="1234567890"
+
+make_str() {
+cnt=$1
+s=""
+while [ $cnt -gt 10 ]; do
+s="${s}${a}"
+cnt=$((cnt-10))
+done
+while [ $cnt -gt 0 ]; do
+s="${s}X"
+cnt=$((cnt-1))
+done
+echo -n $s
+}
+
+test_buffer() {
+
+   size=`get_buffer_data_size`
+
+   str=`make_str $size`
+
+   echo $str > trace_marker
+
+   grep -q $a trace
+}
+
+ORIG=`cat buffer_subbuf_order`
+
+for a in `seq 0 4`; do
+	echo $a > buffer_subbuf_order
+   test_buffer
+done
+
+echo $ORIG > buffer_subbuf_order
+
-- 
2.42.0





[PATCH 00/14] ring-buffer/tracing: Allow ring buffer to have bigger sub buffers

2023-12-09 Thread Steven Rostedt
Note, this has been on my todo list since the ring buffer was created back
in 2008.

Tzvetomir last worked on this in 2020 and I need to finally get it in.

His last series was:

  
https://lore.kernel.org/linux-trace-devel/20211213094825.61876-1-tz.stoya...@gmail.com/

With the description of:

   Currently the size of one sub buffer page is global for all buffers and
   it is hard coded to one system page. The patch set introduces configurable
   ring buffer sub page size, per ring buffer. A new user space interface is
   introduced, which allows to change the sub page size of the ftrace buffer,
   per ftrace instance.

I'm pulling in his patches mostly untouched, except that I had to tweak
a few things to forward port them.

The issues I found I added as the last 7 patches to the series, and then
I added documentation and a selftest.

Basically, events to the tracing subsystem are limited to just under a
PAGE_SIZE, as the ring buffer is split into "sub buffers" of one page
size, and an event can not be bigger than a sub buffer. This series allows
users to change the size of a sub buffer by setting its order:

  echo 3 > /sys/kernel/tracing/buffer_subbuf_order

Will make each sub buffer a size of 8 pages, allowing events to be almost
as big as 8 pages in size (sub buffers do have meta data on them as
well, keeping an event from reaching the same size as a sub buffer).



Steven Rostedt (Google) (9):
  ring-buffer: Clear pages on error in ring_buffer_subbuf_order_set() failure
  ring-buffer: Do no swap cpu buffers if order is different
  ring-buffer: Make sure the spare sub buffer used for reads has same size
  tracing: Update snapshot order along with main buffer order
  tracing: Stop the tracing while changing the ring buffer subbuf size
  ring-buffer: Keep the same size when updating the order
  ring-buffer: Just update the subbuffers when changing their allocation order
  ring-buffer: Add documentation on the buffer_subbuf_order file
  ringbuffer/selftest: Add basic selftest to test changing subbuf order

Tzvetomir Stoyanov (VMware) (5):
  ring-buffer: Refactor ring buffer implementation
  ring-buffer: Page size per ring buffer
  ring-buffer: Add interface for configuring trace sub buffer size
  ring-buffer: Set new size of the ring buffer sub page
  ring-buffer: Read and write to ring buffers with custom sub buffer size


 Documentation/trace/ftrace.rst |  27 ++
 include/linux/ring_buffer.h|  17 +-
 kernel/trace/ring_buffer.c | 406 -
 kernel/trace/ring_buffer_benchmark.c   |  10 +-
 kernel/trace/trace.c   | 143 +++-
 kernel/trace/trace.h   |   1 +
 kernel/trace/trace_events.c|  59 ++-
 .../ftrace/test.d/00basic/ringbuffer_order.tc  |  46 +++
 8 files changed, 588 insertions(+), 121 deletions(-)
 create mode 100644 tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc



[PATCH 07/14] ring-buffer: Do not swap cpu buffers if order is different

2023-12-09 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

As the subbuffer order (subbuffer size) must be the same throughout
the ring buffer, check the order of the buffers that are doing a CPU
buffer swap in ring_buffer_swap_cpu() to make sure they are the same.

If they are not the same, then fail to do the swap, otherwise the ring
buffer will think the CPU buffer has a specific subbuffer size when it
does not.

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/ring_buffer.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 4e7eb41695f5..9725aab1b5eb 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5550,6 +5550,9 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
goto out;
 
+   if (buffer_a->subbuf_order != buffer_b->subbuf_order)
+   goto out;
+
ret = -EAGAIN;
 
	if (atomic_read(&cpu_buffer_a->record_disabled))
-- 
2.42.0





[PATCH 01/14] ring-buffer: Refactor ring buffer implementation

2023-12-09 Thread Steven Rostedt
From: "Tzvetomir Stoyanov (VMware)" 

In order to introduce sub-buffer size per ring buffer, some internal
refactoring is needed. As ring_buffer_print_page_header() will depend on
the trace_buffer structure, it is moved after the structure definition.

Link: 
https://lore.kernel.org/linux-trace-devel/20211213094825.61876-2-tz.stoya...@gmail.com

Signed-off-by: Tzvetomir Stoyanov (VMware) 
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/ring_buffer.c | 60 +++---
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3a5026702179..d59d14e67da1 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -381,36 +381,6 @@ static inline bool test_time_stamp(u64 delta)
 /* Events may have a timestamp attached to them */
 #define BUF_MAX_EVENT_SIZE (BUF_MAX_DATA_SIZE - RB_LEN_TIME_EXTEND)
 
-int ring_buffer_print_page_header(struct trace_seq *s)
-{
-   struct buffer_data_page field;
-
-   trace_seq_printf(s, "\tfield: u64 timestamp;\t"
-"offset:0;\tsize:%u;\tsigned:%u;\n",
-(unsigned int)sizeof(field.time_stamp),
-(unsigned int)is_signed_type(u64));
-
-   trace_seq_printf(s, "\tfield: local_t commit;\t"
-"offset:%u;\tsize:%u;\tsigned:%u;\n",
-(unsigned int)offsetof(typeof(field), commit),
-(unsigned int)sizeof(field.commit),
-(unsigned int)is_signed_type(long));
-
-   trace_seq_printf(s, "\tfield: int overwrite;\t"
-"offset:%u;\tsize:%u;\tsigned:%u;\n",
-(unsigned int)offsetof(typeof(field), commit),
-1,
-(unsigned int)is_signed_type(long));
-
-   trace_seq_printf(s, "\tfield: char data;\t"
-"offset:%u;\tsize:%u;\tsigned:%u;\n",
-(unsigned int)offsetof(typeof(field), data),
-(unsigned int)BUF_PAGE_SIZE,
-(unsigned int)is_signed_type(char));
-
-   return !trace_seq_has_overflowed(s);
-}
-
 struct rb_irq_work {
struct irq_work work;
wait_queue_head_t   waiters;
@@ -576,6 +546,36 @@ struct ring_buffer_iter {
int missed_events;
 };
 
+int ring_buffer_print_page_header(struct trace_seq *s)
+{
+   struct buffer_data_page field;
+
+   trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+"offset:0;\tsize:%u;\tsigned:%u;\n",
+(unsigned int)sizeof(field.time_stamp),
+(unsigned int)is_signed_type(u64));
+
+   trace_seq_printf(s, "\tfield: local_t commit;\t"
+"offset:%u;\tsize:%u;\tsigned:%u;\n",
+(unsigned int)offsetof(typeof(field), commit),
+(unsigned int)sizeof(field.commit),
+(unsigned int)is_signed_type(long));
+
+   trace_seq_printf(s, "\tfield: int overwrite;\t"
+"offset:%u;\tsize:%u;\tsigned:%u;\n",
+(unsigned int)offsetof(typeof(field), commit),
+1,
+(unsigned int)is_signed_type(long));
+
+   trace_seq_printf(s, "\tfield: char data;\t"
+"offset:%u;\tsize:%u;\tsigned:%u;\n",
+(unsigned int)offsetof(typeof(field), data),
+(unsigned int)BUF_PAGE_SIZE,
+(unsigned int)is_signed_type(char));
+
+   return !trace_seq_has_overflowed(s);
+}
+
 #ifdef RB_TIME_32
 
 /*
-- 
2.42.0





[PATCH 11/14] ring-buffer: Keep the same size when updating the order

2023-12-09 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

The function ring_buffer_subbuf_order_set() just updated the sub-buffers
to the new size, but in doing so it also changed the size of the buffer,
as the total size is determined by nr_pages * subbuf_size. If the
subbuf_size is increased without decreasing nr_pages, the total size of
the buffer increases.

This broke the latency tracers as the snapshot needs to be the same size
as the main buffer. The size of the snapshot buffer is only expanded when
needed, and because the order is still the same, the size becomes out of
sync with the main buffer, as the main buffer increased in size without
the tracing system knowing.

Calculate the nr_pages to allocate with the new subbuf_size to be
buffer_size / new_subbuf_size.
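
Illustrative arithmetic for the recalculation (assuming a 4 KB page and
a 16-byte sub buffer header): going from order 0 to order 2 with 32 sub
buffers per CPU,

    /* order 0: subbuf_size = 4096 - 16 = 4080, nr_pages = 32 (~128 KB) */
    /* order 2: subbuf_size = 16384 - 16 = 16368 */
    nr_pages = DIV_ROUND_UP(4080 * 32, 16368);  /* = 8, size preserved */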

Fixes: TBD ("ring-buffer: Set new size of the ring buffer sub page")
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/ring_buffer.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9725aab1b5eb..4726deccd997 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -6030,7 +6030,10 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
if (!cpumask_test_cpu(cpu, buffer->cpumask))
continue;
 
-   nr_pages = buffer->buffers[cpu]->nr_pages;
+   /* Update the number of pages to match the new size */
+   nr_pages = old_size * buffer->buffers[cpu]->nr_pages;
+   nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size);
+
 		cpu_buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
if (!cpu_buffers[cpu]) {
err = -ENOMEM;
-- 
2.42.0





[PATCH 13/14] ring-buffer: Add documentation on the buffer_subbuf_order file

2023-12-09 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Add to the documentation how to use the buffer_subbuf_order file to change
the size and how it affects what events can be added to the ring buffer.

Signed-off-by: Steven Rostedt (Google) 
---
 Documentation/trace/ftrace.rst | 27 +++
 1 file changed, 27 insertions(+)

diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index 23572f6697c0..231d26ceedb0 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -203,6 +203,33 @@ of ftrace. Here is a list of some of the key files:
 
This displays the total combined size of all the trace buffers.
 
+  buffer_subbuf_order:
+
+   This sets or displays the sub buffer page size order. The ring buffer
+   is broken up into several same size "sub buffers". An event can not be
+   bigger than the size of the sub buffer. Normally, the sub buffer is
+   the size of the architecture's page (4K on x86). The sub buffer also
+   contains meta data at the start which also limits the size of an event.
+   That means when the sub buffer is a page size, no event can be larger
+   than the page size minus the sub buffer meta data.
+
+   The buffer_subbuf_order file allows the user to change the size of the sub
+   buffer. As the sub buffer is a power-of-two number of pages, the total
+   size of a sub buffer is defined by the order:
+
+	order	size
+	-----	----
+   0   PAGE_SIZE
+   1   PAGE_SIZE * 2
+   2   PAGE_SIZE * 4
+   3   PAGE_SIZE * 8
+
+   Changing the order will change the sub buffer size allowing for events
+   to be larger than the page size.
+
+   Note: When changing the order, tracing is stopped and any data in the
+   ring buffer and the snapshot buffer will be discarded.
+
   free_buffer:
 
If a process is performing tracing, and the ring buffer should be
-- 
2.42.0





[PATCH 06/14] ring-buffer: Clear pages on error in ring_buffer_subbuf_order_set() failure

2023-12-09 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

On failure to allocate ring buffer pages, the pointer to the CPU buffer
pages is freed, but the pages that were allocated previously were not.
Make sure they are freed too.

Fixes: TBD ("tracing: Set new size of the ring buffer sub page")
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/ring_buffer.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 4cb03b3b99e6..4e7eb41695f5 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -6060,6 +6060,7 @@ int ring_buffer_subbuf_order_set(struct trace_buffer 
*buffer, int order)
for_each_buffer_cpu(buffer, cpu) {
if (!cpu_buffers[cpu])
continue;
+   rb_free_cpu_buffer(cpu_buffers[cpu]);
kfree(cpu_buffers[cpu]);
}
kfree(cpu_buffers);
-- 
2.42.0





[PATCH 04/14] ring-buffer: Set new size of the ring buffer sub page

2023-12-09 Thread Steven Rostedt
From: "Tzvetomir Stoyanov (VMware)" 

There are two approaches when changing the size of the ring buffer
sub page:
 1. Destroying all pages and allocating new pages with the new size.
 2. Allocating new pages, copying the content of the old pages before
destroying them.
The first approach is easier; it is selected in the proposed
implementation. Changing the ring buffer sub page size is not expected
to happen frequently. Usually, that size should be set only once, when
the buffer is not yet in use and is expected to be empty.
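
For orientation, approach 1 boils down to this sequence in the diff below
(a condensed sketch; allocation, error handling and the per-CPU loop are
elided):

	mutex_lock(&buffer->mutex);		/* serialize resizes */
	atomic_inc(&buffer->record_disabled);	/* stop writers */
	synchronize_rcu();			/* wait for in-flight commits */
	buffer->subbuf_order = order;		/* adopt the new sub buffer size */
	/* ...allocate fresh per-CPU buffers for the new order and free the
	 * old ones; previously recorded data is intentionally discarded. */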

Link: 
https://lore.kernel.org/linux-trace-devel/20211213094825.61876-5-tz.stoya...@gmail.com

Signed-off-by: Tzvetomir Stoyanov (VMware) 
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/ring_buffer.c | 80 ++
 1 file changed, 73 insertions(+), 7 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7c925ce95c96..7f18b2fd2514 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -331,6 +331,7 @@ struct buffer_page {
unsigned read;  /* index for next read */
local_t  entries;   /* entries on this page */
unsigned longreal_end;  /* real end of data */
+   unsigned order; /* order of the page */
struct buffer_data_page *page;  /* Actual data page */
 };
 
@@ -361,7 +362,7 @@ static __always_inline unsigned int rb_page_commit(struct 
buffer_page *bpage)
 
 static void free_buffer_page(struct buffer_page *bpage)
 {
-   free_page((unsigned long)bpage->page);
+   free_pages((unsigned long)bpage->page, bpage->order);
kfree(bpage);
 }
 
@@ -1655,10 +1656,12 @@ static int __rb_allocate_pages(struct 
ring_buffer_per_cpu *cpu_buffer,
 
list_add(&bpage->list, pages);
 
-   page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 
0);
+   page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags,
+   cpu_buffer->buffer->subbuf_order);
if (!page)
goto free_pages;
bpage->page = page_address(page);
+   bpage->order = cpu_buffer->buffer->subbuf_order;
rb_init_page(bpage->page);
 
if (user_thread && fatal_signal_pending(current))
@@ -1737,7 +1740,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long 
nr_pages, int cpu)
rb_check_bpage(cpu_buffer, bpage);
 
cpu_buffer->reader_page = bpage;
-   page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
+
+   page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 
cpu_buffer->buffer->subbuf_order);
if (!page)
goto fail_free_reader;
bpage->page = page_address(page);
@@ -1819,6 +1823,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long 
size, unsigned flags,
goto fail_free_buffer;
 
/* Default buffer page size - one system page */
+   buffer->subbuf_order = 0;
buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
 
/* Max payload is buffer page size - header (8bytes) */
@@ -5636,8 +5641,8 @@ void *ring_buffer_alloc_read_page(struct trace_buffer 
*buffer, int cpu)
if (bpage)
goto out;
 
-   page = alloc_pages_node(cpu_to_node(cpu),
-   GFP_KERNEL | __GFP_NORETRY, 0);
+   page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY,
+   cpu_buffer->buffer->subbuf_order);
if (!page)
return ERR_PTR(-ENOMEM);
 
@@ -5686,7 +5691,7 @@ void ring_buffer_free_read_page(struct trace_buffer 
*buffer, int cpu, void *data
local_irq_restore(flags);
 
  out:
-   free_page((unsigned long)bpage);
+   free_pages((unsigned long)bpage, buffer->subbuf_order);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
 
@@ -5946,7 +5951,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get);
  */
 int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
 {
+   struct ring_buffer_per_cpu **cpu_buffers;
+   int old_order, old_size;
+   int nr_pages;
int psize;
+   int bsize;
+   int err;
+   int cpu;
 
if (!buffer || order < 0)
return -EINVAL;
@@ -5958,12 +5969,67 @@ int ring_buffer_subbuf_order_set(struct trace_buffer 
*buffer, int order)
if (psize <= BUF_PAGE_HDR_SIZE)
return -EINVAL;
 
+   bsize = sizeof(void *) * buffer->cpus;
+   cpu_buffers = kzalloc(bsize, GFP_KERNEL);
+   if (!cpu_buffers)
+   return -ENOMEM;
+
+   old_order = buffer->subbuf_order;
+   old_size = buffer->subbuf_size;
+
+   /* prevent another thread from changing buffer sizes */
+   mutex_lock(&buffer->mutex);
+   atomic_inc(&buffer->record_disabled);
+
+   /* Make sure all commits have finished */
+   synchronize_rcu();
+
buffer->subbuf_order = order;
 

[PATCH] tracing: Increase size of trace_marker_raw to max ring buffer entry

2023-12-09 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

There's no reason to impose an arbitrary limit on the size of a raw trace
marker. Just let it be as big as the ring buffer itself allows.

There's also no reason to artificially cap the write at TRACE_BUF_SIZE,
as that limit isn't even used here.

Signed-off-by: Steven Rostedt (Google) 
---
[
  Depends on: 
https://lore.kernel.org/linux-trace-kernel/20231209175003.63db4...@gandalf.local.home/
]
 kernel/trace/trace.c | 13 -
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index da837119a446..077b20e83e7c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7351,9 +7351,6 @@ tracing_mark_write(struct file *filp, const char __user 
*ubuf,
return written;
 }
 
-/* Limit it for now to 3K (including tag) */
-#define RAW_DATA_MAX_SIZE (1024*3)
-
 static ssize_t
 tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
@@ -7375,18 +7372,16 @@ tracing_mark_raw_write(struct file *filp, const char 
__user *ubuf,
return -EINVAL;
 
/* The marker must at least have a tag id */
-   if (cnt < sizeof(unsigned int) || cnt > RAW_DATA_MAX_SIZE)
+   if (cnt < sizeof(unsigned int))
return -EINVAL;
 
-   if (cnt > TRACE_BUF_SIZE)
-   cnt = TRACE_BUF_SIZE;
-
-   BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
-
size = sizeof(*entry) + cnt;
if (cnt < FAULT_SIZE_ID)
size += FAULT_SIZE_ID - cnt;
 
+   if (size > ring_buffer_max_event_size(buffer))
+   return -EINVAL;
+
buffer = tr->array_buffer.buffer;
event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
tracing_gen_ctx());
-- 
2.42.0




[PATCH] trace_seq: Increase the buffer size to almost two pages

2023-12-09 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Now that trace_marker can accept more than a 1KB string, and can write as
much as the ring buffer can hold, the trace_seq is no longer big enough to
print such writes:

 ~# a="1234567890"
 ~# cnt=4080
 ~# s=""
 ~# while [ $cnt -gt 10 ]; do
 ~# s="${s}${a}"
 ~# cnt=$((cnt-10))
 ~# done
 ~# echo $s > trace_marker
 ~# cat trace
 # tracer: nop
 #
 # entries-in-buffer/entries-written: 2/2   #P:8
 #
 #                          _-----=> irqs-off/BH-disabled
 #                         / _----=> need-resched
 #                        | / _---=> hardirq/softirq
 #                        || / _--=> preempt-depth
 #                        ||| / _-=> migrate-disable
 #                        |||| /     delay
 #           TASK-PID     CPU#  |||||  TIMESTAMP  FUNCTION
 #              | |         |   |||||     |         |
<...>-860 [002] .....   105.543465: tracing_mark_write[LINE TOO BIG]
<...>-860 [002] .....   105.543496: tracing_mark_write: 789012345678901234567890

By increasing the trace_seq buffer to almost two pages, it can now print
out the first line.

This also subtracts the rest of the trace_seq fields from the buffer, so
that the entire trace_seq is now PAGE_SIZE aligned.
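
To sanity-check the arithmetic, a small userspace sketch (the seq_buf
stand-in and the 64-bit sizes are assumptions for illustration):

#include <stdio.h>
#include <stddef.h>

/* Userspace stand-in for the kernel's struct seq_buf (24 bytes on 64-bit). */
struct seq_buf { char *buffer; size_t size; size_t len; };

#define PAGE_SIZE 4096UL
#define TRACE_SEQ_BUFFER_SIZE	(PAGE_SIZE * 2 - \
	(sizeof(struct seq_buf) + sizeof(size_t) + sizeof(int)))

struct trace_seq {
	char		buffer[TRACE_SEQ_BUFFER_SIZE];
	struct seq_buf	seq;
	size_t		readpos;
	int		full;
};

int main(void)
{
	/* On 64-bit this prints an 8156-byte buffer and a total of roughly
	 * two pages (trailing alignment padding may add a few bytes). */
	printf("buf %zu total %zu\n",
	       (size_t)TRACE_SEQ_BUFFER_SIZE, sizeof(struct trace_seq));
	return 0;
}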

Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/trace_seq.h | 9 ++---
 kernel/trace/trace.c  | 6 +++---
 kernel/trace/trace_seq.c  | 3 ---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index 3691e0e76a1a..9ec229dfddaa 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -8,11 +8,14 @@
 
 /*
  * Trace sequences are used to allow a function to call several other functions
- * to create a string of data to use (up to a max of PAGE_SIZE).
+ * to create a string of data to use.
  */
 
+#define TRACE_SEQ_BUFFER_SIZE  (PAGE_SIZE * 2 - \
+   (sizeof(struct seq_buf) + sizeof(size_t) + sizeof(int)))
+
 struct trace_seq {
-   charbuffer[PAGE_SIZE];
+   charbuffer[TRACE_SEQ_BUFFER_SIZE];
struct seq_buf  seq;
size_t  readpos;
int full;
@@ -21,7 +24,7 @@ struct trace_seq {
 static inline void
 trace_seq_init(struct trace_seq *s)
 {
-   seq_buf_init(&s->seq, s->buffer, PAGE_SIZE);
+   seq_buf_init(&s->seq, s->buffer, TRACE_SEQ_BUFFER_SIZE);
s->full = 0;
s->readpos = 0;
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 83393c65ec71..da837119a446 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3753,7 +3753,7 @@ static bool trace_safe_str(struct trace_iterator *iter, 
const char *str,
 
/* OK if part of the temp seq buffer */
if ((addr >= (unsigned long)iter->tmp_seq.buffer) &&
-   (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE))
+   (addr < (unsigned long)iter->tmp_seq.buffer + 
TRACE_SEQ_BUFFER_SIZE))
return true;
 
/* Core rodata can not be freed */
@@ -6926,8 +6926,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
goto out;
}
 
-   if (cnt >= PAGE_SIZE)
-   cnt = PAGE_SIZE - 1;
+   if (cnt >= TRACE_SEQ_BUFFER_SIZE)
+   cnt = TRACE_SEQ_BUFFER_SIZE - 1;
 
/* reset all but tr, trace, and overruns */
trace_iterator_reset(iter);
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 7be97229ddf8..c158d65a8a88 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -13,9 +13,6 @@
  * trace_seq_init() more than once to reset the trace_seq to start
  * from scratch.
  * 
- * The buffer size is currently PAGE_SIZE, although it may become dynamic
- * in the future.
- *
  * A write to the buffer will either succeed or fail. That is, unlike
  * sprintf() there will not be a partial write (well it may write into
  * the buffer but it wont update the pointers). This allows users to
-- 
2.42.0




[PATCH] tracing: Allow for max buffer data size trace_marker writes

2023-12-09 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Allow a trace_marker write to be as big as the ring buffer will allow.
Currently, writes are capped at 1KB, but there's no reason they cannot be
as large as the ring buffer can hold.
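
In a minimal sketch, the retry flow of the patch below looks like this
(identifiers as in the diff; the WARN_ON_ONCE sanity checks are elided):

 again:
	size = cnt + meta_size;
	event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
					    tracing_gen_ctx());
	if (!event) {
		if (size > ring_buffer_max_event_size(buffer)) {
			/* clamp to the largest event that fits, retry once */
			cnt = ring_buffer_max_event_size(buffer) - meta_size;
			goto again;
		}
		return -EBADF;	/* ring buffer disabled */
	}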

Signed-off-by: Steven Rostedt (Google) 
---
[
   Depends on: 
https://lore.kernel.org/linux-trace-kernel/20231209170139.33c1b...@gandalf.local.home/
]
 include/linux/ring_buffer.h |  1 +
 kernel/trace/ring_buffer.c  | 12 
 kernel/trace/trace.c| 28 +---
 3 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 782e14f62201..b1b03b2c0f08 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -141,6 +141,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter);
 bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter);
 
 unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu);
+unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer);
 
 void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu);
 void ring_buffer_reset_online_cpus(struct trace_buffer *buffer);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8da1320df51d..3a5026702179 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5254,6 +5254,18 @@ unsigned long ring_buffer_size(struct trace_buffer 
*buffer, int cpu)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_size);
 
+/**
+ * ring_buffer_max_event_size - return the max data size of an event
+ * @buffer: The ring buffer.
+ *
+ * Returns the maximum size an event can be.
+ */
+unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer)
+{
+   return BUF_MAX_EVENT_SIZE;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_max_event_size);
+
 static void rb_clear_buffer_page(struct buffer_page *page)
 {
local_set(&page->write, 0);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 49f388e4fa1d..83393c65ec71 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7272,6 +7272,7 @@ tracing_mark_write(struct file *filp, const char __user 
*ubuf,
enum event_trigger_type tt = ETT_NONE;
struct trace_buffer *buffer;
struct print_entry *entry;
+   int meta_size;
ssize_t written;
int size;
int len;
@@ -7286,12 +7287,9 @@ tracing_mark_write(struct file *filp, const char __user 
*ubuf,
if (!(tr->trace_flags & TRACE_ITER_MARKERS))
return -EINVAL;
 
-   if (cnt > TRACE_BUF_SIZE)
-   cnt = TRACE_BUF_SIZE;
-
-   BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
-
-   size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
+   meta_size = sizeof(*entry) + 2;  /* add '\0' and possible '\n' */
+ again:
+   size = cnt + meta_size;
 
/* If less than "<faulted>", then make sure we can still add that */
if (cnt < FAULTED_SIZE)
@@ -7300,9 +7298,25 @@ tracing_mark_write(struct file *filp, const char __user 
*ubuf,
buffer = tr->array_buffer.buffer;
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
tracing_gen_ctx());
-   if (unlikely(!event))
+   if (unlikely(!event)) {
+   /*
+* If the size was greater than what was allowed, then
+* make it smaller and try again.
+*/
+   if (size > ring_buffer_max_event_size(buffer)) {
+   /* cnt < FAULTED size should never be bigger than max */
+   if (WARN_ON_ONCE(cnt < FAULTED_SIZE))
+   return -EBADF;
+   cnt = ring_buffer_max_event_size(buffer) - meta_size;
+   /* The above should only happen once */
+   if (WARN_ON_ONCE(cnt + meta_size == size))
+   return -EBADF;
+   goto again;
+   }
+
/* Ring buffer disabled, return as if not open for write */
return -EBADF;
+   }
 
entry = ring_buffer_event_data(event);
entry->ip = _THIS_IP_;
-- 
2.42.0




[PATCH] tracing: Have large events show up as '[LINE TOO BIG]' instead of nothing

2023-12-09 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

If a large event was added to the ring buffer that is larger than what the
trace_seq can handle, it just drops the output:

 ~# cat /sys/kernel/tracing/trace
 # tracer: nop
 #
 # entries-in-buffer/entries-written: 2/2   #P:8
 #
 #                          _-----=> irqs-off/BH-disabled
 #                         / _----=> need-resched
 #                        | / _---=> hardirq/softirq
 #                        || / _--=> preempt-depth
 #                        ||| / _-=> migrate-disable
 #                        |||| /     delay
 #           TASK-PID     CPU#  |||||  TIMESTAMP  FUNCTION
 #              | |         |   |||||     |         |
<...>-859 [001] .....   141.118951: tracing_mark_write
 <...>-859 [001] .....   141.148201: tracing_mark_write: 78901234

Instead, catch this case and add some context:

 ~# cat /sys/kernel/tracing/trace
 # tracer: nop
 #
 # entries-in-buffer/entries-written: 2/2   #P:8
 #
 #                          _-----=> irqs-off/BH-disabled
 #                         / _----=> need-resched
 #                        | / _---=> hardirq/softirq
 #                        || / _--=> preempt-depth
 #                        ||| / _-=> migrate-disable
 #                        |||| /     delay
 #           TASK-PID     CPU#  |||||  TIMESTAMP  FUNCTION
 #              | |         |   |||||     |         |
<...>-852 [001] .....   121.550551: tracing_mark_write[LINE TOO BIG]
<...>-852 [001] .....   121.550581: tracing_mark_write: 78901234

This now emulates the same output as trace_pipe.

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/trace.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fbcd3bafb93e..aa8f99f3e5de 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4722,7 +4722,11 @@ static int s_show(struct seq_file *m, void *v)
iter->leftover = ret;
 
} else {
-   print_trace_line(iter);
+   ret = print_trace_line(iter);
+   if (ret == TRACE_TYPE_PARTIAL_LINE) {
+   iter->seq.full = 0;
+   trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n");
+   }
ret = trace_print_seq(m, >seq);
/*
 * If we overflow the seq_file buffer, then it will
-- 
2.42.0




Re: [PATCH] ring-buffer: Fix buffer max_data_size with max_event_size

2023-12-09 Thread Steven Rostedt
On Sat, 9 Dec 2023 17:01:39 -0500
Steven Rostedt  wrote:

> From: "Steven Rostedt (Google)" 
> 
> The maximum ring buffer data size is the maximum size of data that can be
> recorded on the ring buffer. Events must be smaller than the sub buffer
> data size minus any meta data. This size is checked before trying to
> allocate from the ring buffer because the allocation assumes that the size
> will fit on the sub buffer.
> 
> The maximum size was calculated as the size of a sub buffer page (which is
> currently PAGE_SIZE minus the sub buffer header) minus the size of the
> meta data of an individual event. But it missed the possible adding of a
> time stamp for events that are added long enough apart that the event meta
> data can't hold the time delta.
> 
> When an event is added that is greater than the current BUF_MAX_DATA_SIZE
> minus the size of a time stamp, but still less than or equal to
> BUF_MAX_DATA_SIZE, the ring buffer would go into an infinite loop, looking
> for a page that can hold the event. Luckily, there's a check for this loop:
> after 1000 iterations, a warning is emitted and the ring buffer is
> disabled. But this should never happen.
> 
> This can happen when a large event is added first, or after a long period
> where an absolute timestamp is prefixed to the event, increasing its size
> by 8 bytes. This passes the check and then goes into the algorithm that
> causes the infinite loop.
> 
> Fix this by creating a BUF_MAX_EVENT_SIZE to be used to determine if the
> passed in event is too big for the buffer.
> 

Forgot to add:

Cc: sta...@vger.kernel.org
Fixes: a4543a2fa9ef3 ("ring-buffer: Get timestamp after event is allocated")

-- Steve


> Reported-by: Kent Overstreet  # (on IRC)
> Signed-off-by: Steven Rostedt (Google) 
> ---
>  kernel/trace/ring_buffer.c | 7 +--
>  1 file changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> index 8d2a4f00eca9..a38e5a3c6803 100644
> --- a/kernel/trace/ring_buffer.c
> +++ b/kernel/trace/ring_buffer.c
> @@ -378,6 +378,9 @@ static inline bool test_time_stamp(u64 delta)
>  /* Max payload is BUF_PAGE_SIZE - header (8bytes) */
>  #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
>  
> +/* Events may have a time stamp attached to them */
> +#define BUF_MAX_EVENT_SIZE (BUF_MAX_DATA_SIZE - RB_LEN_TIME_EXTEND)
> +
>  int ring_buffer_print_page_header(struct trace_seq *s)
>  {
>   struct buffer_data_page field;
> @@ -3810,7 +3813,7 @@ ring_buffer_lock_reserve(struct trace_buffer *buffer, 
> unsigned long length)
>   if (unlikely(atomic_read(&cpu_buffer->record_disabled)))
>   goto out;
>  
> - if (unlikely(length > BUF_MAX_DATA_SIZE))
> + if (unlikely(length > BUF_MAX_EVENT_SIZE))
>   goto out;
>  
>   if (unlikely(trace_recursive_lock(cpu_buffer)))
> @@ -3960,7 +3963,7 @@ int ring_buffer_write(struct trace_buffer *buffer,
>   if (atomic_read(&cpu_buffer->record_disabled))
>   goto out;
>  
> - if (length > BUF_MAX_DATA_SIZE)
> + if (length > BUF_MAX_EVENT_SIZE)
>   goto out;
>  
>   if (unlikely(trace_recursive_lock(cpu_buffer)))




[PATCH] ring-buffer: Fix buffer max_data_size with max_event_size

2023-12-09 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

The maximum ring buffer data size is the maximum size of data that can be
recorded on the ring buffer. Events must be smaller than the sub buffer
data size minus any meta data. This size is checked before trying to
allocate from the ring buffer because the allocation assumes that the size
will fit on the sub buffer.

The maximum size was calculated as the size of a sub buffer page (which is
currently PAGE_SIZE minus the sub buffer header) minus the size of the
meta data of an individual event. But it missed the possible adding of a
time stamp for events that are added long enough apart that the event meta
data can't hold the time delta.

When an event is added that is greater than the current BUF_MAX_DATA_SIZE
minus the size of a time stamp, but still less than or equal to
BUF_MAX_DATA_SIZE, the ring buffer would go into an infinite loop, looking
for a page that can hold the event. Luckily, there's a check for this loop:
after 1000 iterations, a warning is emitted and the ring buffer is
disabled. But this should never happen.

This can happen when a large event is added first, or after a long period
where an absolute timestamp is prefixed to the event, increasing its size
by 8 bytes. This passes the check and then goes into the algorithm that
causes the infinite loop.

Fix this by creating a BUF_MAX_EVENT_SIZE to be used to determine if the
passed in event is too big for the buffer.
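
Plugging in hypothetical 4K-page numbers shows the 8-byte window the old
check missed (the 16-byte BUF_PAGE_HDR_SIZE is an assumption here):

#include <stdio.h>

#define PAGE_SIZE		4096
#define BUF_PAGE_HDR_SIZE	16	/* assumed buffer_data_page header */
#define BUF_PAGE_SIZE		(PAGE_SIZE - BUF_PAGE_HDR_SIZE)
#define BUF_MAX_DATA_SIZE	(BUF_PAGE_SIZE - (sizeof(unsigned) * 2))
#define RB_LEN_TIME_EXTEND	8	/* extended/absolute time stamp */
#define BUF_MAX_EVENT_SIZE	(BUF_MAX_DATA_SIZE - RB_LEN_TIME_EXTEND)

int main(void)
{
	/* Prints 4072 and 4064: an event between the two passed the old
	 * check, yet could not fit once a time extend was prepended. */
	printf("max data %zu, max event %zu\n",
	       BUF_MAX_DATA_SIZE, BUF_MAX_EVENT_SIZE);
	return 0;
}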

Reported-by: Kent Overstreet  # (on IRC)
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/ring_buffer.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8d2a4f00eca9..a38e5a3c6803 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -378,6 +378,9 @@ static inline bool test_time_stamp(u64 delta)
 /* Max payload is BUF_PAGE_SIZE - header (8bytes) */
 #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
 
+/* Events may have a time stamp attached to them */
+#define BUF_MAX_EVENT_SIZE (BUF_MAX_DATA_SIZE - RB_LEN_TIME_EXTEND)
+
 int ring_buffer_print_page_header(struct trace_seq *s)
 {
struct buffer_data_page field;
@@ -3810,7 +3813,7 @@ ring_buffer_lock_reserve(struct trace_buffer *buffer, 
unsigned long length)
if (unlikely(atomic_read(&cpu_buffer->record_disabled)))
goto out;
 
-   if (unlikely(length > BUF_MAX_DATA_SIZE))
+   if (unlikely(length > BUF_MAX_EVENT_SIZE))
goto out;
 
if (unlikely(trace_recursive_lock(cpu_buffer)))
@@ -3960,7 +3963,7 @@ int ring_buffer_write(struct trace_buffer *buffer,
if (atomic_read(&cpu_buffer->record_disabled))
goto out;
 
-   if (length > BUF_MAX_DATA_SIZE)
+   if (length > BUF_MAX_EVENT_SIZE)
goto out;
 
if (unlikely(trace_recursive_lock(cpu_buffer)))
-- 
2.42.0




Re: [PATCH v3 5/5] input/touchscreen: imagis: add support for IST3032C

2023-12-09 Thread Markuss Broks

Hi Karel,

On 12/8/23 23:59, Karel Balej wrote:

Markuss,

thank you for the review.


diff --git a/drivers/input/touchscreen/imagis.c 
b/drivers/input/touchscreen/imagis.c
index 84a02672ac47..41f28e6e9cb1 100644
--- a/drivers/input/touchscreen/imagis.c
+++ b/drivers/input/touchscreen/imagis.c
@@ -35,6 +35,8 @@
   #define IST3038B_REG_CHIPID  0x30
   #define IST3038B_WHOAMI  0x30380b
   
+#define IST3032C_WHOAMI			0x32c
+

Perhaps it should be ordered in alphabetic/alphanumeric order,
alternatively, the chip ID values could be grouped.

Here I followed suit and just started a new section for the new chip,
except there is only one entry. I do agree that it would be better to
sort the chips alphanumerically and I am actually surprised that I
didn't do that - but now I see that the chips that you added are not
sorted either, so it might be because of that.

I propose to definitely swap the order of the sections, putting 32C
first, then 38B and 38C at the end (from top to bottom). The chip ID
values could then still be grouped in a new section, but I think I would
actually prefer to keep them as parts of the respective sections as it
is now, although it is in no way a strong preference.
We could do that, yeah. It is not a problem right now since there are only
3 models supported, but it would make sense and set some order for when
we'd have more supported devices.


Please let me know whether you agree with this or have a different
preference. And if the former, please confirm that I can add your
Reviewed-by trailer to the patch modified in such a way.

Yeah, it's fine.


Best regards,
K. B.


- Markuss




Re: [PATCH] init: move THIS_MODULE from <linux/export.h> to <linux/init.h>

2023-12-09 Thread Masahiro Yamada
On Tue, Dec 5, 2023 at 2:54 PM Luis Chamberlain  wrote:
>
> On Sun, Nov 26, 2023 at 04:19:14PM +0900, Masahiro Yamada wrote:
> > Commit f50169324df4 ("module.h: split out the EXPORT_SYMBOL into
> > export.h") appropriately separated EXPORT_SYMBOL into 
> > because modules and EXPORT_SYMBOL are orthogonal; modules are symbol
> > consumers, while EXPORT_SYMBOL are used by symbol providers, which
> > may not be necessarily a module.
> >
> > However, that commit also relocated THIS_MODULE. As explained in the
> > commit description, the intention was to define THIS_MODULE in a
> > lightweight header, but I do not believe <linux/export.h> was the
> > suitable location because EXPORT_SYMBOL and THIS_MODULE are unrelated.
> >
> > Move it to another lightweight header, <linux/init.h>. The reason for
> > choosing  is to make  self-contained
> > without relying on  incorrectly including
> > .
> >
> > With this adjustment, the role of  becomes clearer as
> > it only defines EXPORT_SYMBOL.
> >
> > Signed-off-by: Masahiro Yamada 
>
> Reviewed-by: Luis Chamberlain 


I will fold your reviewed-by tag.

Thanks.


>
> Do you want this to go through modules-next or your tree? I'm fine
> it goes either way.
>
>   Luis



-- 
Best Regards
Masahiro Yamada



Re: Building signed debs

2023-12-09 Thread Masahiro Yamada
On Fri, Dec 8, 2023 at 8:14 PM Tom Cook  wrote:
>
> I'm trying to build a signed .deb kernel package of
> https://github.com/torvalds/linux/tree/v6.6.  I've copied
> certs/default_x509.genkey to certs/x509.genkey.  The .config is the
> one from Ubuntu 23.10's default kernel with all new options accepted
> at their default and CONFIG_SYSTEM_TRUSTED_KEYS="" and
> CONFIG_SYSTEM_REVOCATION_KEYS="".
>
> This builds the kernel and modules, signs the modules, compresses the
> modules and then attempts to sign the modules again.  That fails,
> because the .ko module files are now .ko.zst files and the file it's
> trying to sign isn't there.  Full failure is pasted below.



Modules are signed before compression.

Once you compress modules, you cannot re-sign them unless you
decompress them, sign them, and compress them again.


Now, the kernel can directly load compressed modules, so
perhaps allowing users to sign compressed modules might
be beneficial.
(that is, reverse the order of signing and compression)

I am not sure whether it is possible or not, though.


I added linux-modules ML.

>
> Unsetting CONFIG_MODULE_COMPRESS_ZSTD is a workaround (ie disable
> module compression).
>
> Is there a way to build a .deb of a signed kernel with compressed modules?
>
> Thanks for any help,
> Tom
>
>   INSTALL debian/linux-libc-dev/usr/include
>   SIGN
> debian/linux-image/lib/modules/6.6.0-local/kernel/arch/x86/events/amd/amd-uncore.ko
>   SIGN
> debian/linux-image/lib/modules/6.6.0-local/kernel/arch/x86/events/intel/intel-cstate.ko
> At main.c:298:
> - SSL error:8002:system library::No such file or
> directory: ../crypto/bio/bss_file.c:67
> - SSL error:1080:BIO routines::no such file: ../crypto/bio/bss_file.c:75
> sign-file: 
> debian/linux-image/lib/modules/6.6.0-local/kernel/arch/x86/events/amd/amd-uncore.ko
>   SIGN
> debian/linux-image/lib/modules/6.6.0-local/kernel/arch/x86/events/rapl.ko
> At main.c:298:
> - SSL error:8002:system library::No such file or
> directory: ../crypto/bio/bss_file.c:67
> - SSL error:1080:BIO routines::no such file: ../crypto/bio/bss_file.c:75
> sign-file: 
> debian/linux-image/lib/modules/6.6.0-local/kernel/arch/x86/events/intel/intel-cstate.ko
>   SIGN
> debian/linux-image/lib/modules/6.6.0-local/kernel/arch/x86/kernel/cpu/mce/mce-inject.ko
> make[6]: *** [scripts/Makefile.modinst:137:
> debian/linux-image/lib/modules/6.6.0-local/kernel/arch/x86/events/amd/amd-uncore.ko]
> Error 1
> make[6]: *** Waiting for unfinished jobs
> make[6]: *** [scripts/Makefile.modinst:137:
> debian/linux-image/lib/modules/6.6.0-local/kernel/arch/x86/events/intel/intel-cstate.ko]
> Error 1
>   SIGN
> debian/linux-image/lib/modules/6.6.0-local/kernel/arch/x86/kernel/msr.ko
> At main.c:298:
> - SSL error:8002:system library::No such file or
> directory: ../crypto/bio/bss_file.c:67
> - SSL error:1080:BIO routines::no such file: ../crypto/bio/bss_file.c:75
> sign-file: 
> debian/linux-image/lib/modules/6.6.0-local/kernel/arch/x86/events/rapl.ko
> make[6]: *** [scripts/Makefile.modinst:137:
> debian/linux-image/lib/modules/6.6.0-local/kernel/arch/x86/events/rapl.ko]
> Error 1
> At main.c:298:
> - SSL error:8002:system library::No such file or
> directory: ../crypto/bio/bss_file.c:67
> - SSL error:1080:BIO routines::no such file: ../crypto/bio/bss_file.c:75
> sign-file: 
> debian/linux-image/lib/modules/6.6.0-local/kernel/arch/x86/kernel/cpu/mce/mce-inject.ko
> make[6]: *** [scripts/Makefile.modinst:137:
> debian/linux-image/lib/modules/6.6.0-local/kernel/arch/x86/kernel/cpu/mce/mce-inject.ko]
> Error 1
> At main.c:298:
> - SSL error:8002:system library::No such file or
> directory: ../crypto/bio/bss_file.c:67
> - SSL error:1080:BIO routines::no such file: ../crypto/bio/bss_file.c:75
> sign-file: 
> debian/linux-image/lib/modules/6.6.0-local/kernel/arch/x86/kernel/msr.ko
> make[6]: *** [scripts/Makefile.modinst:137:
> debian/linux-image/lib/modules/6.6.0-local/kernel/arch/x86/kernel/msr.ko]
> Error 1
> make[5]: *** [Makefile:1821: modules_install] Error 2
> make[4]: *** [Makefile:2036: run-command] Error 2
> make[3]: *** [debian/rules:17: binary-arch] Error 2
> dpkg-buildpackage: error: make -f debian/rules binary subprocess
> returned exit status 2
> make[2]: *** [scripts/Makefile.package:146: bindeb-pkg] Error 2
> make[1]: *** [/home/tkcook/git/linux/v6.6/Makefile:1538: bindeb-pkg] Error 2
> make: *** [Makefile:234: __sub-make] Error 2



-- 
Best Regards
Masahiro Yamada



Re: [PATCH v4 2/3] remoteproc: qcom: pas: make region assign more generic

2023-12-09 Thread Konrad Dybcio
On 8.12.2023 16:04, Neil Armstrong wrote:
> The current memory region assign only supports a single
> memory region.
> 
> But new platforms introduce more regions to make the
> memory requirements more flexible for various use cases.
> Those new platforms also share the memory region between the
> DSP and HLOS.
> 
> To handle this, make the region assign more generic in order
> to support more than a single memory region and also permit
> setting the regions permissions as shared.
> 
> Reviewed-by: Mukesh Ojha 
> Signed-off-by: Neil Armstrong 
> ---
[...]

> + for (offset = 0; offset < adsp->region_assign_count; ++offset) {
> + struct reserved_mem *rmem = NULL;
> +
> + node = of_parse_phandle(adsp->dev->of_node, "memory-region",
> + adsp->region_assign_idx + offset);
> + if (node)
> + rmem = of_reserved_mem_lookup(node);
> + of_node_put(node);
Shouldn't this only be called when parse_phandle succeeds? (separate
patch with a fix + cc stable if so?)
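
(For what it's worth, of_node_put() does tolerate a NULL argument, so this
reads as a robustness/intent point; a sketch of the shape being suggested,
untested, reusing the names from the hunk above:

	node = of_parse_phandle(adsp->dev->of_node, "memory-region",
				adsp->region_assign_idx + offset);
	if (!node)
		return -EINVAL;		/* or dev_err() and bail */
	rmem = of_reserved_mem_lookup(node);
	of_node_put(node);
)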

> + if (!rmem) {
> + dev_err(adsp->dev, "unable to resolve shareable 
> memory-region index %d\n",
> + offset);
> + return -EINVAL;
> + }
>  
> - perm.vmid = QCOM_SCM_VMID_MSS_MSA;
> - perm.perm = QCOM_SCM_PERM_RW;
> + if (adsp->region_assign_shared)  {
> + perm[0].vmid = QCOM_SCM_VMID_HLOS;
> + perm[0].perm = QCOM_SCM_PERM_RW;
> + perm[1].vmid = adsp->region_assign_vmid;
> + perm[1].perm = QCOM_SCM_PERM_RW;
> + perm_size = 2;
> + } else {
> + perm[0].vmid = adsp->region_assign_vmid;
> + perm[0].perm = QCOM_SCM_PERM_RW;
> + perm_size = 1;
> + }
>  
> - adsp->region_assign_phys = rmem->base;
> - adsp->region_assign_size = rmem->size;
> - adsp->region_assign_perms = BIT(QCOM_SCM_VMID_HLOS);
> + adsp->region_assign_phys[offset] = rmem->base;
> + adsp->region_assign_size[offset] = rmem->size;
> + adsp->region_assign_perms[offset] = BIT(QCOM_SCM_VMID_HLOS);
>  
> - ret = qcom_scm_assign_mem(adsp->region_assign_phys,
> -   adsp->region_assign_size,
> -   &adsp->region_assign_perms,
I think this should be renamed to region_assign_owner(s)

> -   &perm, 1);
> - if (ret < 0) {
> - dev_err(adsp->dev, "assign memory failed\n");
> - return ret;
> + ret = qcom_scm_assign_mem(adsp->region_assign_phys[offset],
> +   adsp->region_assign_size[offset],
> +   &adsp->region_assign_perms[offset],
> +   perm, perm_size);
> + if (ret < 0) {
> + dev_err(adsp->dev, "assign memory %d failed\n", offset);
> + return ret;
> + }
>   }
>  
>   return 0;
> @@ -629,20 +653,23 @@ static int adsp_assign_memory_region(struct qcom_adsp 
> *adsp)
>  static void adsp_unassign_memory_region(struct qcom_adsp *adsp)
>  {
>   struct qcom_scm_vmperm perm;
> + int offset;
>   int ret;
>  
> - if (!adsp->region_assign_idx)
> + if (!adsp->region_assign_idx || adsp->region_assign_shared)
So when it's *shared*, we don't want to un-assign it?

Konrad



Re: [PATCH] nvdimm-btt: fix a potential memleak in btt_freelist_init

2023-12-09 Thread dinghao . liu
> dinghao.liu@ wrote:
> > > Dave Jiang wrote:
> 
> [snip]
> 
> > > That said, this patch does not completely fix freelist from leaking in the
> > > following error path.
> > > 
> > >   discover_arenas()
> > >   btt_freelist_init() -> ok (memory allocated)
> > >   btt_rtt_init() -> fail
> > >   goto out;
> > >   (leak because arena is not yet on btt->arena_list)
> > >   OR
> > >   btt_maplocks_init() -> fail
> > >   goto out;
> > >   (leak because arena is not yet on btt->arena_list)
> > > 
> > 
> > Thanks for pointing out this issue! I rechecked discover_arenas() and found
> > that btt_rtt_init() may also trigger a memleak for the same reason as
> > btt_freelist_init(). Also, I checked another call trace:
> > 
> > btt_init() -> btt_meta_init() -> btt_maplocks_init()
> > 
> > I think there is a memleak if btt_maplocks_init() succeeds but an error
> > happens in btt_init() after btt_meta_init() (e.g., when btt_blk_init()
> > returns an error). Therefore, we may need to fix three functions.
> 
> Yea I think we need to trace this code better.  This is why devm_ is nice for
> memory allocated for the life of the device.
> 
> > 
> > > This error could be fixed by adding to arena_list earlier but devm_*()
> > > also takes care of this without having to worry about that logic.
> > > 
> > > On normal operation all of this memory can be free'ed with the
> > > corresponding devm_kfree() and/or devm_add_action_*() calls if arenas come
> > > and go.  I'm not sure off the top of my head.
> > > 
> > > In addition, looking at this code.  discover_arenas() could make use of
> > > the scoped based management for struct btt_sb *super!
> > > 
> > > Dinghao would you be willing to submit a series of 2 or 3 patches to fix
> > > the above issues?
> > > 
> > 
> > Sure. Currently I plan to send 2 patches as follows:
> > 1. Using devm_kcalloc() to replace kcalloc() in btt_freelist_init(), 
> >btt_rtt_init(), and btt_maplocks_init(), and removing the corresponding
> >kfree in free_arenas(). I checked some uses of devm_kcalloc() and it
> >    seems that we need not call devm_kfree(). The memory is automatically
> >freed on driver detach, right?
> 
> On device put yes.  So if these allocations are scoped to the life of the
> device there would be no reason to call devm_kfree() on them at all.  I was 
> not
> sure if they got reallocated at some point or not.
> 
> > 2. Using the scoped based management for struct btt_sb *super (not a bug,
> >but it could improve the code).
> 
> Good!
> 
> > 
> > I'm not quite sure whether my understanding or bug fixing plan is correct.
> > If there are any issues, please correct me, thanks!
> 
> The above sounds right.
> Ira

Thanks for the review! I will send the patches soon.

Regards,
Dinghao
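
A minimal sketch of plan (1) for btt_freelist_init(), assuming btt.c's
to_dev() helper reaches the right struct device (untested):

	arena->freelist = devm_kcalloc(to_dev(arena), arena->nfree,
				       sizeof(struct free_entry), GFP_KERNEL);
	if (!arena->freelist)
		return -ENOMEM;
	/* ...and the matching kfree() can then be dropped from free_arenas():
	 * the memory is released automatically on driver detach. */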


Re: trace_event names with more than NAME_MAX chars

2023-12-09 Thread Steven Rostedt
On Fri, 8 Dec 2023 18:36:01 +
Beau Belgrave  wrote:

> While developing some unrelated features I happened to create a
> trace_event that was more than NAME_MAX (255) characters. When this
> happened the creation worked, but tracefs would hang any task that tried
> to list the directory of the trace_event or remove it.
> 
> I followed the code down to the reason being eventfs would call
> simple_lookup(), and if it failed, it would still try to create the
> dentry. In this case DCACHE_PAR_LOOKUP would get set and never cleared.
> This caused d_wait_lookup() to loop forever, since that flag is used in
> d_in_lookup().
> 
> Both tracefs and eventfs use simple_lookup() and it fails for
> dentries that exceed NAME_MAX. Should we even allow trace_events to
> be created that exceed this limit? Or should tracefs/eventfs allow
> this but somehow represent these differently?

Yeah I think it's easiest to just prevent dentries from exceeding
NAME_MAX, and yes the patch below should be added. Care to send a real
patch?

Thanks,

-- Steve
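
A minimal sketch of that guard, wherever event names are first accepted
(placement and error code are guesses; untested):

	/* Reject names that tracefs/eventfs dentries cannot represent. */
	if (strlen(name) > NAME_MAX)
		return -ENAMETOOLONG;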


> 
> I have a fix that appears to work for myself, but unsure if there are
> other locations (attached at the end of this mail).
> 
> Thanks,
> -Beau
> 
> diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
> index f8a594a50ae6..d2c06ba26db4 100644
> --- a/fs/tracefs/event_inode.c
> +++ b/fs/tracefs/event_inode.c
> @@ -561,6 +561,8 @@ static struct dentry *eventfs_root_lookup(struct inode 
> *dir,
> if (strcmp(ei_child->name, name) != 0)
> continue;
> ret = simple_lookup(dir, dentry, flags);
> +   if (IS_ERR(ret))
> +   goto out;
> create_dir_dentry(ei, ei_child, ei_dentry, true);
> created = true;
> break;
> @@ -583,6 +585,8 @@ static struct dentry *eventfs_root_lookup(struct inode 
> *dir,
> if (r <= 0)
> continue;
> ret = simple_lookup(dir, dentry, flags);
> +   if (IS_ERR(ret))
> +   goto out;
> create_file_dentry(ei, i, ei_dentry, name, mode, 
> cdata,
>fops, true);
> break;




Re: [PATCH v3 2/5] dt-bindings: input/touchscreen: Add compatible for IST3038B

2023-12-09 Thread Conor Dooley
On Sat, Dec 09, 2023 at 10:05:27AM +0100, Karel Balej wrote:
> On Mon Dec 4, 2023 at 1:52 PM CET, Conor Dooley wrote:
> > On Mon, Dec 04, 2023 at 02:40:44PM +0200, Markuss Broks wrote:
> > > On 12/3/23 13:20, Conor Dooley wrote:
> > > > On Sat, Dec 02, 2023 at 01:48:33PM +0100, Karel Balej wrote:
> > > > > From: Markuss Broks 
> > > > > 
> > > > > Imagis IST3038B is a variant (firmware?) of Imagis IST3038 IC,
> > > > > add the compatible for it to the IST3038C bindings.
> > > > This one is better, but would be well served by mentioning what
> > > > specifically is different (register addresses or firmware commands?)
> > > 
> > > I don't think anyone knows this other than Imagis itself. I would guess 
> > > it's
> > > different hardware, since register addresses are indeed different, but on
> > > the other hand, there is a possibility that firmware on the MCU could be
> > > responding to those commands. I suppose "... IST3038B is a hardware 
> > > variant
> > > of ... IST3038" would be more correct.
> >
> > Only Imagis might know the specifics, but you (plural) have made driver
> > changes so you know what is different in terms of the programming model.
> > I'm just asking for you to mention how the programming model varies in
> > the commit message. Otherwise I can't know whether you should have added
> > a fallback compatible, without going and reading your driver change. The
> > commit message for the bindings should stand on its own merit in that
> > regard.
> > "Variant" alone does not suffice, as many variants of devices have a
> > compatible programming model, be that for a subset of features or
> > complete compatibility.
> >
> > > The reason why I think it could be firmware-defined is because we have a 
> > > lot
> > > of variants (30xxA, 30xxB, 30xxC, plain 30xx), and the numbers usually 
> > > mean
> > > feature level/completeness, e.g. some don't support the touch pressure or
> > > touchkeys, and we don't know what A/B/C/none means.
> >
> > Ultimately whether it is due to firmware or the hardware isn't
> > particular important, just mention what is incompatibly different.
> 
> I propose to update the commit description as such:
> 
>   Imagis IST3038B is a variant (firmware?) of Imagis IST3038 IC
>   differing from IST3038C in its register interface. Add the
>   compatible for it to the IST3038C bindings.


SGTM. You can add
Acked-by: Conor Dooley 
with that commit message update.

Thanks,
Conor.




Re: Re: Re: EEVDF/vhost regression (bisected to 86bfbb7ce4f6 sched/fair: Add lag based placement)

2023-12-09 Thread Michael S. Tsirkin
On Fri, Dec 08, 2023 at 12:41:38PM +0100, Tobias Huschle wrote:
> On Fri, Dec 08, 2023 at 05:31:18AM -0500, Michael S. Tsirkin wrote:
> > On Fri, Dec 08, 2023 at 10:24:16AM +0100, Tobias Huschle wrote:
> > > On Thu, Dec 07, 2023 at 01:48:40AM -0500, Michael S. Tsirkin wrote:
> > > > On Thu, Dec 07, 2023 at 07:22:12AM +0100, Tobias Huschle wrote:
> > > > > 3. vhost looping endlessly, waiting for kworker to be scheduled
> > > > > 
> > > > > I dug a little deeper on what the vhost is doing. I'm not an expert on
> > > > > virtio whatsoever, so these are just educated guesses that maybe
> > > > > someone can verify/correct. Please bear with me probably messing up 
> > > > > the terminology.
> > > > > 
> > > > > - vhost is looping through available queues.
> > > > > - vhost wants to wake up a kworker to process a found queue.
> > > > > - kworker does something with that queue and terminates quickly.
> > > > > 
> > > > > What I found by throwing in some very noisy trace statements was that,
> > > > > if the kworker is not woken up, the vhost just keeps looping accross
> > > > > all available queues (and seems to repeat itself). So it essentially
> > > > > relies on the scheduler to schedule the kworker fast enough. Otherwise
> > > > > it will just keep on looping until it is migrated off the CPU.
> > > > 
> > > > 
> > > > Normally it takes the buffers off the queue and is done with it.
> > > > I am guessing that at the same time guest is running on some other
> > > > CPU and keeps adding available buffers?
> > > > 
> > > 
> > > It seems to do just that, there are multiple other vhost instances
> > > involved which might keep filling up those queues.
> > > 
> > 
> > No vhost is ever only draining queues. Guest is filling them.
> > 
> > > Unfortunately, this makes the problematic vhost instance to stay on
> > > the CPU and prevents said kworker to get scheduled. The kworker is
> > > explicitly woken up by vhost, so it wants it to do something.
> > > 
> > > At this point it seems that there is an assumption about the scheduler
> > > in place which is no longer fulfilled by EEVDF. From the discussion so
> > > far, it seems like EEVDF does what is intended to do.
> > > 
> > > Shouldn't there be a more explicit mechanism in use that allows the
> > > kworker to be scheduled in favor of the vhost?
> > > 
> > > It is also concerning that the vhost seemingly cannot be preempted by the
> > > scheduler while executing that loop.
> > 
> > 
> > Which loop is that, exactly?
> 
> The loop continuously passes translate_desc in drivers/vhost/vhost.c
> That's where I put the trace statements.
> 
> The overall sequence seems to be (top to bottom):
> 
> handle_rx
> get_rx_bufs
> vhost_get_vq_desc
> vhost_get_avail_head
> vhost_get_avail
> __vhost_get_user_slow
> translate_desc   << trace statement in here
> vhost_iotlb_itree_first

I wonder why you keep missing the cache and re-translating.
Is pr_debug enabled for you? If not, could you check if it
outputs anything?
Or you can tweak:

#define vq_err(vq, fmt, ...) do {  \
pr_debug(pr_fmt(fmt), ##__VA_ARGS__);   \
if ((vq)->error_ctx)   \
eventfd_signal((vq)->error_ctx, 1);\
} while (0)

to do pr_err if you prefer.

> These functions show up as having increased overhead in perf.
> 
> There are multiple loops going on in there.
> Again the disclaimer though, I'm not familiar with that code at all.


So there's a limit there: vhost_exceeds_weight should requeue work:

} while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len)));

then we invoke the scheduler each time before re-executing it:


static bool vhost_worker(void *data)
{
struct vhost_worker *worker = data;
struct vhost_work *work, *work_next;
struct llist_node *node;

node = llist_del_all(&worker->work_list);
if (node) {
__set_current_state(TASK_RUNNING);

node = llist_reverse_order(node);
/* make sure flag is seen after deletion */
smp_wmb();
llist_for_each_entry_safe(work, work_next, node, node) {
clear_bit(VHOST_WORK_QUEUED, &work->flags);
kcov_remote_start_common(worker->kcov_handle);
work->fn(work);
kcov_remote_stop();
cond_resched();
}
}

return !!node;
}   

These are the byte and packet limits:

/* Max number of bytes transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others. */
#define VHOST_NET_WEIGHT 0x80000

/* Max number of packets transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others with small
 * pkts.
 */
#define VHOST_NET_PKT_WEIGHT 256


Try reducing the VHOST_NET_WEIGHT limit and see if that improves things any?
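
Concretely, the experiment would be something like (drivers/vhost/net.c;
the halved value is arbitrary, just for the test):

-#define VHOST_NET_WEIGHT 0x80000
+#define VHOST_NET_WEIGHT 0x40000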

-- 
MST




Re: [PATCH v3 2/5] dt-bindings: input/touchscreen: Add compatible for IST3038B

2023-12-09 Thread Karel Balej
On Mon Dec 4, 2023 at 1:52 PM CET, Conor Dooley wrote:
> On Mon, Dec 04, 2023 at 02:40:44PM +0200, Markuss Broks wrote:
> > On 12/3/23 13:20, Conor Dooley wrote:
> > > On Sat, Dec 02, 2023 at 01:48:33PM +0100, Karel Balej wrote:
> > > > From: Markuss Broks 
> > > > 
> > > > Imagis IST3038B is a variant (firmware?) of Imagis IST3038 IC,
> > > > add the compatible for it to the IST3038C bindings.
> > > This one is better, but would be well served by mentioning what
> > > specifically is different (register addresses or firmware commands?)
> > 
> > I don't think anyone knows this other than Imagis itself. I would guess it's
> > different hardware, since register addresses are indeed different, but on
> > the other hand, there is a possibility that firmware on the MCU could be
> > responding to those commands. I suppose "... IST3038B is a hardware variant
> > of ... IST3038" would be more correct.
>
> Only Imagis might know the specifics, but you (plural) have made driver
> changes so you know what is different in terms of the programming model.
> I'm just asking for you to mention how the programming model varies in
> the commit message. Otherwise I can't know whether you should have added
> a fallback compatible, without going and reading your driver change. The
> commit message for the bindings should stand on its own merit in that
> regard.
> "Variant" alone does not suffice, as many variants of devices have a
> compatible programming model, be that for a subset of features or
> complete compatibility.
>
> > The reason why I think it could be firmware-defined is because we have a lot
> > of variants (30xxA, 30xxB, 30xxC, plain 30xx), and the numbers usually mean
> > feature level/completeness, e.g. some don't support the touch pressure or
> > touchkeys, and we don't know what A/B/C/none means.
>
> Ultimately whether it is due to firmware or the hardware isn't
> particular important, just mention what is incompatibly different.

I propose to update the commit description as such:

Imagis IST3038B is a variant (firmware?) of Imagis IST3038 IC
differing from IST3038C in its register interface. Add the
compatible for it to the IST3038C bindings.

>
> Cheers,
> Conor.
>
>
> > > > Signed-off-by: Markuss Broks 
> > > > Signed-off-by: Karel Balej 
> > > > ---
> > > >   .../devicetree/bindings/input/touchscreen/imagis,ist3038c.yaml   | 1 +
> > > >   1 file changed, 1 insertion(+)
> > > > 
> > > > diff --git 
> > > > a/Documentation/devicetree/bindings/input/touchscreen/imagis,ist3038c.yaml
> > > >  
> > > > b/Documentation/devicetree/bindings/input/touchscreen/imagis,ist3038c.yaml
> > > > index 0d6b033fd5fb..b5372c4eae56 100644
> > > > --- 
> > > > a/Documentation/devicetree/bindings/input/touchscreen/imagis,ist3038c.yaml
> > > > +++ 
> > > > b/Documentation/devicetree/bindings/input/touchscreen/imagis,ist3038c.yaml
> > > > @@ -18,6 +18,7 @@ properties:
> > > > compatible:
> > > >   enum:
> > > > +  - imagis,ist3038b
> > > > - imagis,ist3038c
> > > > reg:
> > > > -- 
> > > > 2.43.0
> > > > 
> > - Markuss

Kind regards,
K. B.