[PATCH v6 2/7] LoongArch: KVM: Add hypercall instruction emulation support

2024-03-02 Thread Bibo Mao
On LoongArch systems there is a hypercall instruction dedicated to
virtualization. When this instruction is executed on the host side,
an illegal instruction exception is reported; when it is executed in
VM mode, it traps into the host.

When the hypercall is emulated, the A0 register is set to
KVM_HCALL_INVALID_CODE rather than injecting an EXCCODE_INE invalid
instruction exception, so the VM can continue executing the next code.

Signed-off-by: Bibo Mao 
---
 arch/loongarch/include/asm/Kbuild  |  1 -
 arch/loongarch/include/asm/kvm_para.h  | 26 ++
 arch/loongarch/include/uapi/asm/Kbuild |  2 --
 arch/loongarch/kvm/exit.c  | 10 ++
 4 files changed, 36 insertions(+), 3 deletions(-)
 create mode 100644 arch/loongarch/include/asm/kvm_para.h
 delete mode 100644 arch/loongarch/include/uapi/asm/Kbuild

diff --git a/arch/loongarch/include/asm/Kbuild 
b/arch/loongarch/include/asm/Kbuild
index 93783fa24f6e..22991a6f0e2b 100644
--- a/arch/loongarch/include/asm/Kbuild
+++ b/arch/loongarch/include/asm/Kbuild
@@ -23,4 +23,3 @@ generic-y += poll.h
 generic-y += param.h
 generic-y += posix_types.h
 generic-y += resource.h
-generic-y += kvm_para.h
diff --git a/arch/loongarch/include/asm/kvm_para.h 
b/arch/loongarch/include/asm/kvm_para.h
new file mode 100644
index ..d48f993ae206
--- /dev/null
+++ b/arch/loongarch/include/asm/kvm_para.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_LOONGARCH_KVM_PARA_H
+#define _ASM_LOONGARCH_KVM_PARA_H
+
+/*
+ * LoongArch hypercall return code
+ */
+#define KVM_HCALL_STATUS_SUCCESS   0
+#define KVM_HCALL_INVALID_CODE -1UL
+#define KVM_HCALL_INVALID_PARAMETER-2UL
+
+static inline unsigned int kvm_arch_para_features(void)
+{
+   return 0;
+}
+
+static inline unsigned int kvm_arch_para_hints(void)
+{
+   return 0;
+}
+
+static inline bool kvm_check_and_clear_guest_paused(void)
+{
+   return false;
+}
+#endif /* _ASM_LOONGARCH_KVM_PARA_H */
diff --git a/arch/loongarch/include/uapi/asm/Kbuild 
b/arch/loongarch/include/uapi/asm/Kbuild
deleted file mode 100644
index 4aa680ca2e5f..
--- a/arch/loongarch/include/uapi/asm/Kbuild
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-generic-y += kvm_para.h
diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c
index ed1d89d53e2e..923bbca9bd22 100644
--- a/arch/loongarch/kvm/exit.c
+++ b/arch/loongarch/kvm/exit.c
@@ -685,6 +685,15 @@ static int kvm_handle_lasx_disabled(struct kvm_vcpu *vcpu)
return RESUME_GUEST;
 }
 
+static int kvm_handle_hypercall(struct kvm_vcpu *vcpu)
+{
+   update_pc(&vcpu->arch);
+
+   /* Treat it as a no-op instruction, only set the return value */
+   vcpu->arch.gprs[LOONGARCH_GPR_A0] = KVM_HCALL_INVALID_CODE;
+   return RESUME_GUEST;
+}
+
 /*
  * LoongArch KVM callback handling for unimplemented guest exiting
  */
@@ -716,6 +725,7 @@ static exit_handle_fn kvm_fault_tables[EXCCODE_INT_START] = 
{
[EXCCODE_LSXDIS]= kvm_handle_lsx_disabled,
[EXCCODE_LASXDIS]   = kvm_handle_lasx_disabled,
[EXCCODE_GSPR]  = kvm_handle_gspr,
+   [EXCCODE_HVC]   = kvm_handle_hypercall,
 };
 
 int kvm_handle_fault(struct kvm_vcpu *vcpu, int fault)
-- 
2.39.3




[PATCH v6 3/7] LoongArch: KVM: Add cpucfg area for kvm hypervisor

2024-03-02 Thread Bibo Mao
The cpucfg instruction can be used to get processor features. It traps
when executed in VM mode, which also lets the hypervisor provide cpu
features to the VM. On real hardware, cpucfg area 0 - 20 is used. Here
one dedicated area 0x4000 -- 0x40ff is used by the KVM hypervisor to
provide PV features, and the area can be extended for other hypervisors
in future. This area will never be used by real HW; it is only used by
software.

Signed-off-by: Bibo Mao 
---
 arch/loongarch/include/asm/inst.h  |  1 +
 arch/loongarch/include/asm/loongarch.h | 10 +
 arch/loongarch/kvm/exit.c  | 59 +++---
 3 files changed, 54 insertions(+), 16 deletions(-)

diff --git a/arch/loongarch/include/asm/inst.h 
b/arch/loongarch/include/asm/inst.h
index d8f637f9e400..ad120f924905 100644
--- a/arch/loongarch/include/asm/inst.h
+++ b/arch/loongarch/include/asm/inst.h
@@ -67,6 +67,7 @@ enum reg2_op {
revhd_op= 0x11,
extwh_op= 0x16,
extwb_op= 0x17,
+   cpucfg_op   = 0x1b,
iocsrrdb_op = 0x19200,
iocsrrdh_op = 0x19201,
iocsrrdw_op = 0x19202,
diff --git a/arch/loongarch/include/asm/loongarch.h 
b/arch/loongarch/include/asm/loongarch.h
index 46366e783c84..a1d22e8b6f94 100644
--- a/arch/loongarch/include/asm/loongarch.h
+++ b/arch/loongarch/include/asm/loongarch.h
@@ -158,6 +158,16 @@
 #define  CPUCFG48_VFPU_CG  BIT(2)
 #define  CPUCFG48_RAM_CG   BIT(3)
 
+/*
+ * cpucfg index area: 0x4000 -- 0x40ff
+ * SW emulation for KVM hypervisor
+ */
+#define CPUCFG_KVM_BASE0x4000UL
+#define CPUCFG_KVM_SIZE0x100
+#define CPUCFG_KVM_SIG CPUCFG_KVM_BASE
+#define  KVM_SIGNATURE "KVM\0"
+#define CPUCFG_KVM_FEATURE (CPUCFG_KVM_BASE + 4)
+
 #ifndef __ASSEMBLY__
 
 /* CSR */
diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c
index 923bbca9bd22..a8d3b652d3ea 100644
--- a/arch/loongarch/kvm/exit.c
+++ b/arch/loongarch/kvm/exit.c
@@ -206,10 +206,50 @@ int kvm_emu_idle(struct kvm_vcpu *vcpu)
return EMULATE_DONE;
 }
 
-static int kvm_trap_handle_gspr(struct kvm_vcpu *vcpu)
+static int kvm_emu_cpucfg(struct kvm_vcpu *vcpu, larch_inst inst)
 {
int rd, rj;
unsigned int index;
+   unsigned long plv;
+
+   rd = inst.reg2_format.rd;
+   rj = inst.reg2_format.rj;
+   ++vcpu->stat.cpucfg_exits;
+   index = vcpu->arch.gprs[rj];
+
+   /*
+* By LoongArch Reference Manual 2.2.10.5
+* Return value is 0 for undefined cpucfg index
+*
+* Disable preemption since hw gcsr is accessed
+*/
+   preempt_disable();
+   plv = kvm_read_hw_gcsr(LOONGARCH_CSR_CRMD) >> CSR_CRMD_PLV_SHIFT;
+   switch (index) {
+   case 0 ... (KVM_MAX_CPUCFG_REGS - 1):
+   vcpu->arch.gprs[rd] = vcpu->arch.cpucfg[index];
+   break;
+   case CPUCFG_KVM_SIG:
+   /*
+* Cpucfg emulation between 0x4000 -- 0x40ff
+* Return value with 0 if executed in user mode
+*/
+   if ((plv & CSR_CRMD_PLV) == PLV_KERN)
+   vcpu->arch.gprs[rd] = *(unsigned int *)KVM_SIGNATURE;
+   else
+   vcpu->arch.gprs[rd] = 0;
+   break;
+   default:
+   vcpu->arch.gprs[rd] = 0;
+   break;
+   }
+
+   preempt_enable();
+   return EMULATE_DONE;
+}
+
+static int kvm_trap_handle_gspr(struct kvm_vcpu *vcpu)
+{
unsigned long curr_pc;
larch_inst inst;
enum emulation_result er = EMULATE_DONE;
@@ -224,21 +264,8 @@ static int kvm_trap_handle_gspr(struct kvm_vcpu *vcpu)
er = EMULATE_FAIL;
switch (((inst.word >> 24) & 0xff)) {
case 0x0: /* CPUCFG GSPR */
-   if (inst.reg2_format.opcode == 0x1B) {
-   rd = inst.reg2_format.rd;
-   rj = inst.reg2_format.rj;
-   ++vcpu->stat.cpucfg_exits;
-   index = vcpu->arch.gprs[rj];
-   er = EMULATE_DONE;
-   /*
-* By LoongArch Reference Manual 2.2.10.5
-* return value is 0 for undefined cpucfg index
-*/
-   if (index < KVM_MAX_CPUCFG_REGS)
-   vcpu->arch.gprs[rd] = vcpu->arch.cpucfg[index];
-   else
-   vcpu->arch.gprs[rd] = 0;
-   }
+   if (inst.reg2_format.opcode == cpucfg_op)
+   er = kvm_emu_cpucfg(vcpu, inst);
break;
case 0x4: /* CSR{RD,WR,XCHG} GSPR */
er = kvm_handle_csr(vcpu, inst);
-- 
2.39.3




[PATCH v6 0/7] LoongArch: Add pv ipi support on LoongArch VM

2024-03-02 Thread Bibo Mao
On a physical machine, the ipi HW uses IOCSR registers; however, there is
a trap into the hypervisor when a vcpu accesses IOCSR registers if the
system is in VM mode. SWI is an interrupt mechanism like SGI on ARM:
software can send an interrupt to a CPU, except that on LoongArch SWI can
only be sent to the local CPU now. So SWI cannot be used for IPI on a real
HW system; however, it can be used in a VM when combined with the hypercall
method. An IPI can be sent with the hypercall method and an SWI interrupt
is injected into the vcpu, so the vcpu can treat the SWI interrupt as an IPI.

With PV IPI supported, there is one trap on IPI sending and no trap on IPI
receiving. With the IOCSR HW ipi method, there is one trap on IPI sending
and two traps on IPI receiving.

IPI multicast support is also added for VMs; the idea comes from x86 PV ipi.
An IPI can be sent to 128 vcpus at one time. With IPI multicast support, the
number of traps is reduced greatly.

Here is the microbenchmark data for the "perf bench futex wake" testcase on
a 3C5000 single-way machine. There are 16 cpus on the 3C5000 single-way
machine, and the VM also has 16 vcpus. The benchmark data is the time in ms
to wake up 16 threads; smaller is better.

physical machine 0.0176 ms
VM original  0.1140 ms
VM with pv ipi patch 0.0481 ms

A VM boots successfully with 128/256 vcpus, and the runltp command from
package ltp-20230516 runs successfully with 16 cores.

---
Change in V6:
  1. Add privilege checking when emulating cpucfg at index 0x4000 --
0x40FF, return 0 if not executed at kernel mode.
  2. Add document about LoongArch pv ipi in the newly created directory
Documentation/virt/kvm/loongarch/
  3. Fix pv ipi handling in kvm backend function kvm_pv_send_ipi(),
where min should be increased by BITS_PER_LONG for the second bitmap,
otherwise a VM with more than 64 vcpus fails to boot.
  4. Adjust patch order and refine code per review comments.
 
Change in V5:
  1. Refresh function/macro name from review comments.

Change in V4:
  1. Rename pv ipi hook functions call_func_ipi() and
call_func_single_ipi() to send_ipi_mask()/send_ipi_single(), since pv
ipi is used for both remote function call and reschedule notification.
  2. Refresh changelog.

Change in V3:
  1. Add 128 vcpu ipi multicast support like x86
  2. Change cpucfg base address from 0x1000 to 0x4000, in order
to avoid conflict with future hw usage
  3. Adjust patch order in this patchset, move patch
Refine-ipi-ops-on-LoongArch-platform to the first one.

Change in V2:
  1. Add hw cpuid map support since ipi routing uses hw cpuid
  2. Refine changelog description
  3. Add hypercall statistic support for vcpu
  4. Set percpu pv ipi message buffer aligned with cacheline
  5. Refine pv ipi send logic, do not send ipi message if there is
a pending ipi message.
---
Bibo Mao (7):
  LoongArch/smp: Refine some ipi functions on LoongArch platform
  LoongArch: KVM: Add hypercall instruction emulation support
  LoongArch: KVM: Add cpucfg area for kvm hypervisor
  LoongArch: KVM: Add vcpu search support from physical cpuid
  LoongArch: KVM: Add pv ipi support on kvm side
  LoongArch: Add pv ipi support on guest kernel side
  Documentation: KVM: Add hypercall for LoongArch

 Documentation/virt/kvm/index.rst  |   1 +
 .../virt/kvm/loongarch/hypercalls.rst |  79 +
 Documentation/virt/kvm/loongarch/index.rst|  10 ++
 arch/loongarch/Kconfig|   9 +
 arch/loongarch/include/asm/Kbuild |   1 -
 arch/loongarch/include/asm/hardirq.h  |   5 +
 arch/loongarch/include/asm/inst.h |   1 +
 arch/loongarch/include/asm/irq.h  |  10 +-
 arch/loongarch/include/asm/kvm_host.h |  27 +++
 arch/loongarch/include/asm/kvm_para.h | 156 ++
 arch/loongarch/include/asm/kvm_vcpu.h |   1 +
 arch/loongarch/include/asm/loongarch.h|  11 ++
 arch/loongarch/include/asm/paravirt.h |  27 +++
 .../include/asm/paravirt_api_clock.h  |   1 +
 arch/loongarch/include/asm/smp.h  |  31 ++--
 arch/loongarch/include/uapi/asm/Kbuild|   2 -
 arch/loongarch/kernel/Makefile|   1 +
 arch/loongarch/kernel/irq.c   |  24 +--
 arch/loongarch/kernel/paravirt.c  | 151 +
 arch/loongarch/kernel/perf_event.c|  14 +-
 arch/loongarch/kernel/smp.c   |  62 ---
 arch/loongarch/kernel/time.c  |  12 +-
 arch/loongarch/kvm/exit.c | 141 ++--
 arch/loongarch/kvm/vcpu.c |  94 ++-
 arch/loongarch/kvm/vm.c   |  11 ++
 25 files changed, 780 insertions(+), 102 deletions(-)
 create mode 100644 Documentation/virt/kvm/loongarch/hypercalls.rst
 create mode 100644 Documentation/virt/kvm/loongarch/index.rst
 create mode 100644 arch/loongarch/include/asm/kvm_para.h
 create mode 100644 arch/loongarch/include/asm/paravirt.h
 create

[PATCH v6 1/7] LoongArch/smp: Refine some ipi functions on LoongArch platform

2024-03-02 Thread Bibo Mao
This is a code refinement of ipi handling on the LoongArch platform; there
are three modifications.
1. Add generic function get_percpu_irq(), replacing some percpu irq
functions such as get_ipi_irq()/get_pmc_irq()/get_timer_irq() with
get_percpu_irq().

2. Change the definition of the parameter action passed to
loongson_send_ipi_single() and loongson_send_ipi_mask(); it is now
defined in decimal encoding format on the ipi sender side. Normal decimal
encoding is used rather than binary bitmap encoding for the ipi action: the
ipi hw sender uses the decimal encoding, and the ipi receiver gets the
binary bitmap encoding, since the ipi hw converts it into a bitmap in the
ipi message buffer.

3. Add structure smp_ops on LoongArch platform so that pv ipi can be used
later.

Signed-off-by: Bibo Mao 
---
 arch/loongarch/include/asm/hardirq.h |  4 ++
 arch/loongarch/include/asm/irq.h | 10 -
 arch/loongarch/include/asm/smp.h | 31 +++
 arch/loongarch/kernel/irq.c  | 22 +--
 arch/loongarch/kernel/perf_event.c   | 14 +--
 arch/loongarch/kernel/smp.c  | 58 +++-
 arch/loongarch/kernel/time.c | 12 +-
 7 files changed, 71 insertions(+), 80 deletions(-)

diff --git a/arch/loongarch/include/asm/hardirq.h 
b/arch/loongarch/include/asm/hardirq.h
index 0ef3b18f8980..9f0038e19c7f 100644
--- a/arch/loongarch/include/asm/hardirq.h
+++ b/arch/loongarch/include/asm/hardirq.h
@@ -12,6 +12,10 @@
 extern void ack_bad_irq(unsigned int irq);
 #define ack_bad_irq ack_bad_irq
 
+enum ipi_msg_type {
+   IPI_RESCHEDULE,
+   IPI_CALL_FUNCTION,
+};
 #define NR_IPI 2
 
 typedef struct {
diff --git a/arch/loongarch/include/asm/irq.h b/arch/loongarch/include/asm/irq.h
index 218b4da0ea90..00101b6d601e 100644
--- a/arch/loongarch/include/asm/irq.h
+++ b/arch/loongarch/include/asm/irq.h
@@ -117,8 +117,16 @@ extern struct fwnode_handle *liointc_handle;
 extern struct fwnode_handle *pch_lpc_handle;
 extern struct fwnode_handle *pch_pic_handle[MAX_IO_PICS];
 
-extern irqreturn_t loongson_ipi_interrupt(int irq, void *dev);
+static inline int get_percpu_irq(int vector)
+{
+   struct irq_domain *d;
+
+   d = irq_find_matching_fwnode(cpuintc_handle, DOMAIN_BUS_ANY);
+   if (d)
+   return irq_create_mapping(d, vector);
 
+   return -EINVAL;
+}
 #include 
 
 #endif /* _ASM_IRQ_H */
diff --git a/arch/loongarch/include/asm/smp.h b/arch/loongarch/include/asm/smp.h
index f81e5f01d619..75d30529748c 100644
--- a/arch/loongarch/include/asm/smp.h
+++ b/arch/loongarch/include/asm/smp.h
@@ -12,6 +12,13 @@
 #include 
 #include 
 
+struct smp_ops {
+   void (*init_ipi)(void);
+   void (*send_ipi_mask)(const struct cpumask *mask, unsigned int action);
+   void (*send_ipi_single)(int cpu, unsigned int action);
+};
+
+extern struct smp_ops smp_ops;
 extern int smp_num_siblings;
 extern int num_processors;
 extern int disabled_cpus;
@@ -24,8 +31,6 @@ void loongson_prepare_cpus(unsigned int max_cpus);
 void loongson_boot_secondary(int cpu, struct task_struct *idle);
 void loongson_init_secondary(void);
 void loongson_smp_finish(void);
-void loongson_send_ipi_single(int cpu, unsigned int action);
-void loongson_send_ipi_mask(const struct cpumask *mask, unsigned int action);
 #ifdef CONFIG_HOTPLUG_CPU
 int loongson_cpu_disable(void);
 void loongson_cpu_die(unsigned int cpu);
@@ -59,9 +64,12 @@ extern int __cpu_logical_map[NR_CPUS];
 
 #define cpu_physical_id(cpu)   cpu_logical_map(cpu)
 
-#define SMP_BOOT_CPU   0x1
-#define SMP_RESCHEDULE 0x2
-#define SMP_CALL_FUNCTION  0x4
+#define ACTION_BOOT_CPU0
+#define ACTION_RESCHEDULE  1
+#define ACTION_CALL_FUNCTION   2
+#define SMP_BOOT_CPU   BIT(ACTION_BOOT_CPU)
+#define SMP_RESCHEDULE BIT(ACTION_RESCHEDULE)
+#define SMP_CALL_FUNCTION  BIT(ACTION_CALL_FUNCTION)
 
 struct secondary_data {
unsigned long stack;
@@ -71,7 +79,8 @@ extern struct secondary_data cpuboot_data;
 
 extern asmlinkage void smpboot_entry(void);
 extern asmlinkage void start_secondary(void);
-
+extern void arch_send_call_function_single_ipi(int cpu);
+extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
 extern void calculate_cpu_foreign_map(void);
 
 /*
@@ -79,16 +88,6 @@ extern void calculate_cpu_foreign_map(void);
  */
 extern void show_ipi_list(struct seq_file *p, int prec);
 
-static inline void arch_send_call_function_single_ipi(int cpu)
-{
-   loongson_send_ipi_single(cpu, SMP_CALL_FUNCTION);
-}
-
-static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
-{
-   loongson_send_ipi_mask(mask, SMP_CALL_FUNCTION);
-}
-
 #ifdef CONFIG_HOTPLUG_CPU
 static inline int __cpu_disable(void)
 {
diff --git a/arch/loongarch/kernel/irq.c b/arch/loongarch/kernel/irq.c
index 883e5066ae44..ce36897d1e5a 100644
--- a/arch/loongarch/kernel/irq.c
+++ b/arch/loongarch/kernel/irq.c
@@ -87,23 +87,9 @@ static void __init init_vec_parent_group(void)
acpi_table_p

[PATCH v6 4/7] LoongArch: KVM: Add vcpu search support from physical cpuid

2024-03-02 Thread Bibo Mao
Physical cpuid is used for interrupt routing for irqchips such as
ipi/msi/extioi interrupt controller. And physical cpuid is stored
at CSR register LOONGARCH_CSR_CPUID, it can not be changed once vcpu
is created and physical cpuid of two vcpus cannot be the same.

Different irqchips have different size declaration about physical cpuid,
max cpuid value for CSR LOONGARCH_CSR_CPUID on 3A5000 is 512, max cpuid
supported by IPI hardware is 1024, 256 for extioi irqchip, and 65536
for MSI irqchip.

The smallest value from all interrupt controllers is selected now,
and the max cpuid size is defined as 256 by KVM, which comes from
the extioi irqchip.

Signed-off-by: Bibo Mao 
---
 arch/loongarch/include/asm/kvm_host.h | 26 
 arch/loongarch/include/asm/kvm_vcpu.h |  1 +
 arch/loongarch/kvm/vcpu.c | 93 ++-
 arch/loongarch/kvm/vm.c   | 11 
 4 files changed, 130 insertions(+), 1 deletion(-)

diff --git a/arch/loongarch/include/asm/kvm_host.h 
b/arch/loongarch/include/asm/kvm_host.h
index 2d62f7b0d377..3ba16ef1fe69 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -64,6 +64,30 @@ struct kvm_world_switch {
 
 #define MAX_PGTABLE_LEVELS 4
 
+/*
+ * Physical cpu id is used for interrupt routing, there are different
+ * definitions about physical cpuid on different hardwares.
+ *  For LOONGARCH_CSR_CPUID register, max cpuid size is 512
+ *  For IPI HW, max dest CPUID size 1024
+ *  For extioi interrupt controller, max dest CPUID size is 256
+ *  For MSI interrupt controller, max supported CPUID size is 65536
+ *
+ * Currently max CPUID is defined as 256 for KVM hypervisor, in future
+ * it will be expanded to 4096, including 16 packages at most. And every
+ * package supports at most 256 vcpus
+ */
+#define KVM_MAX_PHYID  256
+
+struct kvm_phyid_info {
+   struct kvm_vcpu *vcpu;
+   boolenabled;
+};
+
+struct kvm_phyid_map {
+   int max_phyid;
+   struct kvm_phyid_info phys_map[KVM_MAX_PHYID];
+};
+
 struct kvm_arch {
/* Guest physical mm */
kvm_pte_t *pgd;
@@ -71,6 +95,8 @@ struct kvm_arch {
unsigned long invalid_ptes[MAX_PGTABLE_LEVELS];
unsigned int  pte_shifts[MAX_PGTABLE_LEVELS];
unsigned int  root_level;
+   spinlock_tphyid_map_lock;
+   struct kvm_phyid_map  *phyid_map;
 
s64 time_offset;
struct kvm_context __percpu *vmcs;
diff --git a/arch/loongarch/include/asm/kvm_vcpu.h 
b/arch/loongarch/include/asm/kvm_vcpu.h
index 0cb4fdb8a9b5..9f53950959da 100644
--- a/arch/loongarch/include/asm/kvm_vcpu.h
+++ b/arch/loongarch/include/asm/kvm_vcpu.h
@@ -81,6 +81,7 @@ void kvm_save_timer(struct kvm_vcpu *vcpu);
 void kvm_restore_timer(struct kvm_vcpu *vcpu);
 
 int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq);
+struct kvm_vcpu *kvm_get_vcpu_by_cpuid(struct kvm *kvm, int cpuid);
 
 /*
  * Loongarch KVM guest interrupt handling
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 36106922b5d7..a1a1dc4a3cf2 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -274,6 +274,95 @@ static int _kvm_getcsr(struct kvm_vcpu *vcpu, unsigned int 
id, u64 *val)
return 0;
 }
 
+static inline int kvm_set_cpuid(struct kvm_vcpu *vcpu, u64 val)
+{
+   int cpuid;
+   struct loongarch_csrs *csr = vcpu->arch.csr;
+   struct kvm_phyid_map  *map;
+
+   if (val >= KVM_MAX_PHYID)
+   return -EINVAL;
+
+   cpuid = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_ESTAT);
+   map = vcpu->kvm->arch.phyid_map;
+   spin_lock(&vcpu->kvm->arch.phyid_map_lock);
+   if (map->phys_map[cpuid].enabled) {
+   /*
+* Cpuid is already set before
+* Forbid changing different cpuid at runtime
+*/
+   if (cpuid != val) {
+   /*
+* Cpuid 0 is initial value for vcpu, maybe invalid
+* unset value for vcpu
+*/
+   if (cpuid) {
+   spin_unlock(&vcpu->kvm->arch.phyid_map_lock);
+   return -EINVAL;
+   }
+   } else {
+/* Discard duplicated cpuid set */
+   spin_unlock(&vcpu->kvm->arch.phyid_map_lock);
+   return 0;
+   }
+   }
+
+   if (map->phys_map[val].enabled) {
+   /*
+* New cpuid is already set with other vcpu
+* Forbid sharing the same cpuid between different vcpus
+*/
+   if (map->phys_map[val].vcpu != vcpu) {
+   spin_unlock(&vcpu->kvm->arch.phyid_map_lock);
+   return -EINVAL;
+   }
+
+   /* Discard duplicated cpuid set operation*/
+   spin_unlock(&vcpu->kvm->arc

[PATCH v6 6/7] LoongArch: Add pv ipi support on guest kernel side

2024-03-02 Thread Bibo Mao
The PARAVIRT option and pv ipi are added on the guest kernel side; function
pv_ipi_init() adds the ipi sending and ipi receiving hooks. This function
first checks whether the system runs in VM mode. If the kernel runs in VM
mode, it calls kvm_para_available() to detect the current hypervisor
type. Currently only KVM type detection is supported; the paravirt functions
work only if the current hypervisor type is KVM, since KVM is the only
hypervisor supported on LoongArch now.

PV IPI uses a virtual IPI sender and a virtual IPI receiver. With the
virtual IPI sender, the ipi message is stored in DDR memory rather than
emulated HW. IPI multicast is supported, and 128 vcpus can receive IPIs
at the same time, like the x86 KVM method. The hypercall method is used
for IPI sending.

With virtual IPI receiver, HW SW0 is used rather than real IPI HW. Since
VCPU has separate HW SW0 like HW timer, there is no trap in IPI interrupt
acknowledge. And IPI message is stored in DDR, no trap in get IPI message.

Signed-off-by: Bibo Mao 
---
 arch/loongarch/Kconfig|   9 ++
 arch/loongarch/include/asm/hardirq.h  |   1 +
 arch/loongarch/include/asm/paravirt.h |  27 
 .../include/asm/paravirt_api_clock.h  |   1 +
 arch/loongarch/kernel/Makefile|   1 +
 arch/loongarch/kernel/irq.c   |   2 +-
 arch/loongarch/kernel/paravirt.c  | 151 ++
 arch/loongarch/kernel/smp.c   |   4 +-
 8 files changed, 194 insertions(+), 2 deletions(-)
 create mode 100644 arch/loongarch/include/asm/paravirt.h
 create mode 100644 arch/loongarch/include/asm/paravirt_api_clock.h
 create mode 100644 arch/loongarch/kernel/paravirt.c

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 929f68926b34..fdaae9a0435c 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -587,6 +587,15 @@ config CPU_HAS_PREFETCH
bool
default y
 
+config PARAVIRT
+   bool "Enable paravirtualization code"
+   depends on AS_HAS_LVZ_EXTENSION
+   help
+  This changes the kernel so it can modify itself when it is run
+ under a hypervisor, potentially improving performance significantly
+ over full virtualization.  However, when run without a hypervisor
+ the kernel is theoretically slower and slightly larger.
+
 config ARCH_SUPPORTS_KEXEC
def_bool y
 
diff --git a/arch/loongarch/include/asm/hardirq.h 
b/arch/loongarch/include/asm/hardirq.h
index 9f0038e19c7f..b26d596a73aa 100644
--- a/arch/loongarch/include/asm/hardirq.h
+++ b/arch/loongarch/include/asm/hardirq.h
@@ -21,6 +21,7 @@ enum ipi_msg_type {
 typedef struct {
unsigned int ipi_irqs[NR_IPI];
unsigned int __softirq_pending;
+   atomic_t message cacheline_aligned_in_smp;
 } cacheline_aligned irq_cpustat_t;
 
 DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
diff --git a/arch/loongarch/include/asm/paravirt.h 
b/arch/loongarch/include/asm/paravirt.h
new file mode 100644
index ..58f7b7b89f2c
--- /dev/null
+++ b/arch/loongarch/include/asm/paravirt.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_LOONGARCH_PARAVIRT_H
+#define _ASM_LOONGARCH_PARAVIRT_H
+
+#ifdef CONFIG_PARAVIRT
+#include 
+struct static_key;
+extern struct static_key paravirt_steal_enabled;
+extern struct static_key paravirt_steal_rq_enabled;
+
+u64 dummy_steal_clock(int cpu);
+DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock);
+
+static inline u64 paravirt_steal_clock(int cpu)
+{
+   return static_call(pv_steal_clock)(cpu);
+}
+
+int pv_ipi_init(void);
+#else
+static inline int pv_ipi_init(void)
+{
+   return 0;
+}
+
+#endif // CONFIG_PARAVIRT
+#endif
diff --git a/arch/loongarch/include/asm/paravirt_api_clock.h 
b/arch/loongarch/include/asm/paravirt_api_clock.h
new file mode 100644
index ..65ac7cee0dad
--- /dev/null
+++ b/arch/loongarch/include/asm/paravirt_api_clock.h
@@ -0,0 +1 @@
+#include 
diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile
index 3c808c680370..662e6e9de12d 100644
--- a/arch/loongarch/kernel/Makefile
+++ b/arch/loongarch/kernel/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_MODULES) += module.o module-sections.o
 obj-$(CONFIG_STACKTRACE)   += stacktrace.o
 
 obj-$(CONFIG_PROC_FS)  += proc.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o
 
 obj-$(CONFIG_SMP)  += smp.o
 
diff --git a/arch/loongarch/kernel/irq.c b/arch/loongarch/kernel/irq.c
index ce36897d1e5a..4863e6c1b739 100644
--- a/arch/loongarch/kernel/irq.c
+++ b/arch/loongarch/kernel/irq.c
@@ -113,5 +113,5 @@ void __init init_IRQ(void)
per_cpu(irq_stack, i), per_cpu(irq_stack, i) + 
IRQ_STACK_SIZE);
}
 
-   set_csr_ecfg(ECFGF_IP0 | ECFGF_IP1 | ECFGF_IP2 | ECFGF_IPI | ECFGF_PMC);
+   set_csr_ecfg(ECFGF_SIP0 | ECFGF_IP0 | ECFGF_IP1 | ECFGF_IP2 | ECFGF_IPI 
| ECFGF_PMC);
 }
diff --git a/arch/loongarch/kernel/paravirt.c b/arch/loon

[PATCH v6 5/7] LoongArch: KVM: Add pv ipi support on kvm side

2024-03-02 Thread Bibo Mao
On LoongArch system, ipi hw uses iocsr registers, there is one iocsr
register access on ipi sending, and two iocsr access on ipi receiving
which is ipi interrupt handler. On VM mode all iocsr accessing will
cause VM to trap into hypervisor. So with one ipi hw notification
there will be three times of trap.

PV ipi is added for VM, hypercall instruction is used for ipi sender,
and hypervisor will inject SWI to destination vcpu. During SWI interrupt
handler, only estat CSR register is written to clear irq. Estat CSR
register access will not trap into hypervisor. So with pv ipi supported,
there is one trap with pv ipi sender, and no trap with ipi receiver,
there is only one trap with ipi notification.

Also this patch adds ipi multicast support, the method is similar with
x86. With ipi multicast support, ipi notification can be sent to at most
128 vcpus at one time. It reduces trap times into hypervisor greatly.

Signed-off-by: Bibo Mao 
---
 arch/loongarch/include/asm/kvm_host.h  |   1 +
 arch/loongarch/include/asm/kvm_para.h  | 130 +
 arch/loongarch/include/asm/loongarch.h |   1 +
 arch/loongarch/kvm/exit.c  |  76 ++-
 arch/loongarch/kvm/vcpu.c  |   1 +
 5 files changed, 207 insertions(+), 2 deletions(-)

diff --git a/arch/loongarch/include/asm/kvm_host.h 
b/arch/loongarch/include/asm/kvm_host.h
index 3ba16ef1fe69..0b96c6303cf7 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -43,6 +43,7 @@ struct kvm_vcpu_stat {
u64 idle_exits;
u64 cpucfg_exits;
u64 signal_exits;
+   u64 hypercall_exits;
 };
 
 #define KVM_MEM_HUGEPAGE_CAPABLE   (1UL << 0)
diff --git a/arch/loongarch/include/asm/kvm_para.h 
b/arch/loongarch/include/asm/kvm_para.h
index d48f993ae206..a82bffbbf8a1 100644
--- a/arch/loongarch/include/asm/kvm_para.h
+++ b/arch/loongarch/include/asm/kvm_para.h
@@ -2,6 +2,16 @@
 #ifndef _ASM_LOONGARCH_KVM_PARA_H
 #define _ASM_LOONGARCH_KVM_PARA_H
 
+/*
+ * Hypercall code field
+ */
+#define HYPERVISOR_KVM 1
+#define HYPERVISOR_VENDOR_SHIFT8
+#define HYPERCALL_CODE(vendor, code)   ((vendor << HYPERVISOR_VENDOR_SHIFT) + 
code)
+#define KVM_HCALL_CODE_PV_SERVICE  0
+#define KVM_HCALL_PV_SERVICE   HYPERCALL_CODE(HYPERVISOR_KVM, 
KVM_HCALL_CODE_PV_SERVICE)
+#define  KVM_HCALL_FUNC_PV_IPI 1
+
 /*
  * LoongArch hypercall return code
  */
@@ -9,6 +19,126 @@
 #define KVM_HCALL_INVALID_CODE -1UL
 #define KVM_HCALL_INVALID_PARAMETER-2UL
 
+/*
+ * Hypercall interface for KVM hypervisor
+ *
+ * a0: function identifier
+ * a1-a6: args
+ * Return value will be placed in v0.
+ * Up to 6 arguments are passed in a1, a2, a3, a4, a5, a6.
+ */
+static __always_inline long kvm_hypercall(u64 fid)
+{
+   register long ret asm("v0");
+   register unsigned long fun asm("a0") = fid;
+
+   __asm__ __volatile__(
+   "hvcl "__stringify(KVM_HCALL_PV_SERVICE)
+   : "=r" (ret)
+   : "r" (fun)
+   : "memory"
+   );
+
+   return ret;
+}
+
+static __always_inline long kvm_hypercall1(u64 fid, unsigned long arg0)
+{
+   register long ret asm("v0");
+   register unsigned long fun asm("a0") = fid;
+   register unsigned long a1  asm("a1") = arg0;
+
+   __asm__ __volatile__(
+   "hvcl "__stringify(KVM_HCALL_PV_SERVICE)
+   : "=r" (ret)
+   : "r" (fun), "r" (a1)
+   : "memory"
+   );
+
+   return ret;
+}
+
+static __always_inline long kvm_hypercall2(u64 fid,
+   unsigned long arg0, unsigned long arg1)
+{
+   register long ret asm("v0");
+   register unsigned long fun asm("a0") = fid;
+   register unsigned long a1  asm("a1") = arg0;
+   register unsigned long a2  asm("a2") = arg1;
+
+   __asm__ __volatile__(
+   "hvcl "__stringify(KVM_HCALL_PV_SERVICE)
+   : "=r" (ret)
+   : "r" (fun), "r" (a1), "r" (a2)
+   : "memory"
+   );
+
+   return ret;
+}
+
+static __always_inline long kvm_hypercall3(u64 fid,
+   unsigned long arg0, unsigned long arg1, unsigned long arg2)
+{
+   register long ret asm("v0");
+   register unsigned long fun asm("a0") = fid;
+   register unsigned long a1  asm("a1") = arg0;
+   register unsigned long a2  asm("a2") = arg1;
+   register unsigned long a3  asm("a3") = arg2;
+
+   __asm__ __volatile__(
+   "hvcl "__stringify(KVM_HCALL_PV_SERVICE)
+   : "=r" (ret)
+   : "r" (fun), "r" (a1), "r" (a2), "r" (a3)
+   : "memory"
+   );
+
+   return ret;
+}
+
+static __always_inline long kvm_hypercall4(u64 fid,
+   unsigned long arg0, unsigned long arg1, unsigned long arg2,
+   unsigned long arg3)
+{
+   register long ret asm("v0");
+   r

[PATCH v6 7/7] Documentation: KVM: Add hypercall for LoongArch

2024-03-02 Thread Bibo Mao
Add documentation topic for using pv_virt when running as a guest
on KVM hypervisor.

Signed-off-by: Bibo Mao 
---
 Documentation/virt/kvm/index.rst  |  1 +
 .../virt/kvm/loongarch/hypercalls.rst | 79 +++
 Documentation/virt/kvm/loongarch/index.rst| 10 +++
 3 files changed, 90 insertions(+)
 create mode 100644 Documentation/virt/kvm/loongarch/hypercalls.rst
 create mode 100644 Documentation/virt/kvm/loongarch/index.rst

diff --git a/Documentation/virt/kvm/index.rst b/Documentation/virt/kvm/index.rst
index ad13ec55ddfe..9ca5a45c2140 100644
--- a/Documentation/virt/kvm/index.rst
+++ b/Documentation/virt/kvm/index.rst
@@ -14,6 +14,7 @@ KVM
s390/index
ppc-pv
x86/index
+   loongarch/index
 
locking
vcpu-requests
diff --git a/Documentation/virt/kvm/loongarch/hypercalls.rst 
b/Documentation/virt/kvm/loongarch/hypercalls.rst
new file mode 100644
index ..1679e48d67d2
--- /dev/null
+++ b/Documentation/virt/kvm/loongarch/hypercalls.rst
@@ -0,0 +1,79 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===================================
+The LoongArch paravirtual interface
+===================================
+
+KVM hypercalls use the HVCL instruction with code 0x100. The hypercall
+number is put in a0, up to five arguments may be placed in a1-a5, and the
+return value is placed in a0 (v0 is a deprecated alias of a0).
+
+The code for that interface can be found in arch/loongarch/kvm/*
+
+Querying for existence
+======================
+
+To find out if we're running on KVM or not, cpucfg can be used with index
+CPUCFG_KVM_BASE (0x4000), cpucfg range between 0x4000 - 0x40FF
+is marked as a specially reserved range. All existing and future processors
+will not implement any features in this range.
+
+When Linux is running on KVM, cpucfg with index CPUCFG_KVM_BASE (0x4000)
+returns magic string "KVM\0"
+
+Once you have determined that you are running under a PV-capable KVM, you
+can use hypercalls as described below.
+
+KVM hypercall ABI
+=================
+
+The hypercall ABI on KVM is simple: only one scratch register, a0 (v0), and at
+most five general-purpose registers are used as input parameters. FP and vector
+registers are not used as input registers and should not be modified during a
+hypercall. The hypercall function can be inlined since there is only one
+scratch register.
+
+The parameters are as follows:
+
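+   ========    ================    ================
+   Register    IN                  OUT
+   ========    ================    ================
+   a0          function number     Return code
+   a1          1st parameter       -
+   a2          2nd parameter       -
+   a3          3rd parameter       -
+   a4          4th parameter       -
+   a5          5th parameter       -
+   ========    ================    ================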
+
+Return codes can be as follows:
+
+   ====    =========================
+   Code    Meaning
+   ====    =========================
+   0       Success
+   -1      Hypercall not implemented
+   -2      Hypercall parameter error
+   ====    =========================
+
+KVM Hypercalls Documentation
+============================
+
+The template for each hypercall is:
+1. Hypercall name
+2. Purpose
+
+1. KVM_HCALL_FUNC_PV_IPI
+------------------------
+
+:Purpose: Send IPIs to multiple vCPUs.
+
+- a0: KVM_HCALL_FUNC_PV_IPI
+- a1: lower part of the bitmap of destination physical CPUIDs
+- a2: higher part of the bitmap of destination physical CPUIDs
+- a3: the lowest physical CPUID in bitmap
+
+The hypercall lets a guest send multicast IPIs, with at most 128
+destinations per hypercall.  The destinations are represented by a bitmap
+contained in the first two arguments (a1 and a2). Bit 0 of a1 corresponds
+to the physical CPUID in the third argument (a3), bit 1 corresponds to the
+physical ID a3+1, and so on.
diff --git a/Documentation/virt/kvm/loongarch/index.rst 
b/Documentation/virt/kvm/loongarch/index.rst
new file mode 100644
index ..83387b4c5345
--- /dev/null
+++ b/Documentation/virt/kvm/loongarch/index.rst
@@ -0,0 +1,10 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=========================
+KVM for LoongArch systems
+=========================
+
+.. toctree::
+   :maxdepth: 2
+
+   hypercalls.rst
-- 
2.39.3




Re: [PATCH v6 7/7] Documentation: KVM: Add hypercall for LoongArch

2024-03-02 Thread WANG Xuerui

On 3/2/24 16:47, Bibo Mao wrote:

Add documentation topic for using pv_virt when running as a guest
on KVM hypervisor.

Signed-off-by: Bibo Mao 
---
  Documentation/virt/kvm/index.rst  |  1 +
  .../virt/kvm/loongarch/hypercalls.rst | 79 +++
  Documentation/virt/kvm/loongarch/index.rst| 10 +++
  3 files changed, 90 insertions(+)
  create mode 100644 Documentation/virt/kvm/loongarch/hypercalls.rst
  create mode 100644 Documentation/virt/kvm/loongarch/index.rst

diff --git a/Documentation/virt/kvm/index.rst b/Documentation/virt/kvm/index.rst
index ad13ec55ddfe..9ca5a45c2140 100644
--- a/Documentation/virt/kvm/index.rst
+++ b/Documentation/virt/kvm/index.rst
@@ -14,6 +14,7 @@ KVM
 s390/index
 ppc-pv
 x86/index
+   loongarch/index
  
 locking

 vcpu-requests
diff --git a/Documentation/virt/kvm/loongarch/hypercalls.rst 
b/Documentation/virt/kvm/loongarch/hypercalls.rst
new file mode 100644
index ..1679e48d67d2
--- /dev/null
+++ b/Documentation/virt/kvm/loongarch/hypercalls.rst
@@ -0,0 +1,79 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===
+The LoongArch paravirtual interface
+===
+
+KVM hypercalls use the HVCL instruction with code 0x100, and the hypercall
+number is put in a0 and up to five arguments may be placed in a1-a5, the
+return value is placed in v0 (alias with a0).


Just say a0: the name v0 is long deprecated (has been the case ever 
since LoongArch got mainlined).



+
+The code for that interface can be found in arch/loongarch/kvm/*
+
+Querying for existence
+==
+
+To find out if we're running on KVM or not, cpucfg can be used with index
+CPUCFG_KVM_BASE (0x4000), cpucfg range between 0x4000 - 0x40FF
+is marked as a specially reserved range. All existing and future processors
+will not implement any features in this range.
+
+When Linux is running on KVM, cpucfg with index CPUCFG_KVM_BASE (0x4000)
+returns magic string "KVM\0"
+
+Once you determined you're running under a PV capable KVM, you can now use
+hypercalls as described below.


So this is still the approach similar to the x86 CPUID-based 
implementation. But here the non-privileged behavior isn't specified -- 
I see there is PLV checking in Patch 3 but it's safer to have the 
requirement spelled out here too.


But I still think this approach touches more places than strictly 
needed. As it is currently the case in 
arch/loongarch/kernel/cpu-probe.c, the FEATURES IOCSR is checked for a 
bit IOCSRF_VM that already signifies presence of a hypervisor; if this 
information can be interpreted as availability of the HVCL instruction 
(which I suppose is the case -- a hypervisor can always trap-and-emulate 
in case HVCL isn't provided by hardware), here we can already start 
making calls with HVCL.


We can and should define a uniform interface for probing the hypervisor 
kind, similar to the centrally-managed RISC-V SBI implementation ID 
registry [1]: otherwise future non-KVM hypervisors would have to


1. somehow pretend they are KVM and eventually fail to do so, leading to 
subtle incompatibilities,

2. invent another way of probing for their existence,
3. piggy-back on the current KVM definition, which is inelegant (reading 
the LoongArch-KVM-defined CPUCFG leaf only to find it's not KVM) and 
utterly makes the definition here *not* KVM-specific.


[1]: 
https://github.com/riscv-non-isa/riscv-sbi-doc/blob/v2.0/src/ext-base.adoc


My take on this:

To check if we are running on Linux KVM or not, first check IOCSR 0x8 
(``LOONGARCH_IOCSR_FEATURES``) for bit 11 (``IOCSRF_VM``); we are 
running under a hypervisor if the bit is set. Then invoke ``HVCL 0`` to 
find out the hypervisor implementation ID; a return value in ``$a0`` of 
0x004d564b (``KVM\0``) means Linux KVM, in which case the rest of the 
convention applies.



+
+KVM hypercall ABI
+=
+
+Hypercall ABI on KVM is simple, only one scratch register a0 (v0) and at most
+five generic registers used as input parameter. FP register and vector register
+is not used for input register and should not be modified during hypercall.
+Hypercall function can be inlined since there is only one scratch register.


It should be pointed out explicitly that on hypercall return all 
architectural state except ``$a0`` is preserved. Or is the whole ``$a0 - 
$t8`` range clobbered, just like with Linux syscalls?



+
+The parameters are as follows:
+
+   
+   RegisterIN  OUT
+   
+   a0  function number Return code
+   a1  1st parameter   -
+   a2  2nd parameter   -
+   a3  3rd parameter   -
+   a4  4th parameter   -
+   a5  5th parameter   

Re: Re: [PATCH] net/ipv4: add tracepoint for icmp_send

2024-03-02 Thread He Peilin
> >  include/trace/events/icmp.h | 57 
> > +
> >  net/ipv4/icmp.c |  4 
> >  2 files changed, 61 insertions(+)
> >  create mode 100644 include/trace/events/icmp.h
> >
> > diff --git a/include/trace/events/icmp.h b/include/trace/events/icmp.h
> > new file mode 100644
> > index ..3d9af5769bc3
> > --- /dev/null
> > +++ b/include/trace/events/icmp.h
> > @@ -0,0 +1,57 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#undef TRACE_SYSTEM
> > +#define TRACE_SYSTEM icmp
> > +
> > +#if !defined(_TRACE_ICMP_H) || defined(TRACE_HEADER_MULTI_READ)
> > +#define _TRACE_ICMP_H
> > +
> > +#include 
> > +#include 
> > +
> > +TRACE_EVENT(icmp_send,
> > +
> > +   TP_PROTO(const struct sk_buff *skb, int type, int code),
> > +
> > +   TP_ARGS(skb, type, code),
> > +
> > +   TP_STRUCT__entry(
> > +   __field(__u16, sport)
> > +   __field(__u16, dport)
> > +   __field(unsigned short, ulen)
> > +   __field(const void *, skbaddr)
> > +   __field(int, type)
> > +   __field(int, code)
> > +   __array(__u8, saddr, 4)
> > +   __array(__u8, daddr, 4)
> > +   ),
> > +
> > +   TP_fast_assign(
> > +   // Get UDP header
> > +   struct udphdr *uh = udp_hdr(skb);
> > +   struct iphdr *iph = ip_hdr(skb);
> > +   __be32 *p32;
> > +
> > +   __entry->sport = ntohs(uh->source);
> > +   __entry->dport = ntohs(uh->dest);
> > +   __entry->ulen = ntohs(uh->len);
> > +   __entry->skbaddr = skb;
> > +   __entry->type = type;
> > +   __entry->code = code;
> > +
> > +   p32 = (__be32 *) __entry->saddr;
> > +   *p32 = iph->saddr;
> > +
> > +   p32 = (__be32 *) __entry->daddr;
> > +   *p32 =  iph->daddr;
> > +   ),
> > +
> 
> FYI, ICMP can be generated for many other protocols than UDP.

We have noted this issue. Therefore, a UDP check has been added in
TP_fast_assign. With this condition, the UDP details of the abnormal message
are traced only when icmp_send is called for the UDP protocol; otherwise, only
the abnormal type and code are printed.

As follows:
/* Only a UDP packet has ports and a UDP length to record; zero them otherwise. */
if (proto_4 == IPPROTO_UDP) {
	struct udphdr *uh = udp_hdr(skb);

	__entry->sport = ntohs(uh->source);
	__entry->dport = ntohs(uh->dest);
	__entry->ulen = ntohs(uh->len);
} else {
	__entry->sport = 0;
	__entry->dport = 0;
	__entry->ulen = 0;
}

> 
> > +   TP_printk("icmp_send: type=%d, code=%d. From %pI4:%u to 
> > %pI4:%u ulen=%d skbaddr=%p",
> > +   __entry->type, __entry->code,
> > +   __entry->saddr, __entry->sport, __entry->daddr,
> > +   __entry->dport, __entry->ulen, __entry->skbaddr)
> > +);
> > +
> > +#endif /* _TRACE_ICMP_H */
> > +
> > +/* This part must be outside protection */
> > +#include 
> > diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
> > index e63a3bf99617..437bdb7e2650 100644
> > --- a/net/ipv4/icmp.c
> > +++ b/net/ipv4/icmp.c
> > @@ -92,6 +92,8 @@
> >  #include 
> >  #include 
> >  #include 
> > +#define CREATE_TRACE_POINTS
> > +#include 
> >
> >  /*
> >   * Build xmit assembly blocks
> > @@ -599,6 +601,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int 
> > code, __be32 info,
> > struct net *net;
> > struct sock *sk;
> >
> > +   trace_icmp_send(skb_in, type, code);
> 
> I think you missed many sanity checks between lines 622 and 676
Thank you for the reminder. The next step is to move the trace point to line 
676.

> Honestly, a kprobe BPF based solution would be less risky, and less
> maintenance for us.
Yes, but tracepoints have the advantage of convenience, especially on
embedded Linux systems that do not support eBPF.




Re: Re: [PATCH] net/ipv4: add tracepoint for icmp_send

2024-03-02 Thread He Peilin
> >  include/trace/events/icmp.h | 57 
> > +
> >  net/ipv4/icmp.c |  4 
> >  2 files changed, 61 insertions(+)
> >  create mode 100644 include/trace/events/icmp.h
> > 
> > diff --git a/include/trace/events/icmp.h b/include/trace/events/icmp.h
> > new file mode 100644
> > index ..3d9af5769bc3
> > --- /dev/null
> > +++ b/include/trace/events/icmp.h
> > @@ -0,0 +1,57 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#undef TRACE_SYSTEM
> > +#define TRACE_SYSTEM icmp
> > +
> > +#if !defined(_TRACE_ICMP_H) || defined(TRACE_HEADER_MULTI_READ)
> > +#define _TRACE_ICMP_H
> > +
> > +#include 
> > +#include 
> > +
> > +TRACE_EVENT(icmp_send,
> > +
> > +   TP_PROTO(const struct sk_buff *skb, int type, int code),
> > +
> > +   TP_ARGS(skb, type, code),
> > +
> > +   TP_STRUCT__entry(
> > +   __field(__u16, sport)
> 
> 2 bytes
> 
> > +   __field(__u16, dport)
> 
> 2 bytes
> 
> > +   __field(unsigned short, ulen)
> 
> 2 bytes
> 
> [ 2 byte hole for alignment ]
> 
> > +   __field(const void *, skbaddr)
> 
> 4/8 bytes
> 
> It's best to keep the holes at the end of the TP_STRUCT__entry().
> 
> That is, I would move ulen to the end of the structure. It doesn't affect
> anything else.
> 
> -- Steve
Thank you for pointing that out. The next step is to move __field(unsigned 
short, ulen) to the end of TP_STRUCT__entry().

> 
> > +   __field(int, type)
> > +   __field(int, code)
> > +   __array(__u8, saddr, 4)
> > +   __array(__u8, daddr, 4)
> > +   ),
> > +
> > +   TP_fast_assign(
> > +   // Get UDP header
> > +   struct udphdr *uh = udp_hdr(skb);
> > +   struct iphdr *iph = ip_hdr(skb);
> > +   __be32 *p32;
> > +
> > +   __entry->sport = ntohs(uh->source);
> > +   __entry->dport = ntohs(uh->dest);
> > +   __entry->ulen = ntohs(uh->len);
> > +   __entry->skbaddr = skb;
> > +   __entry->type = type;
> > +   __entry->code = code;
> > +
> > +   p32 = (__be32 *) __entry->saddr;
> > +   *p32 = iph->saddr;
> > +
> > +   p32 = (__be32 *) __entry->daddr;
> > +   *p32 =  iph->daddr;
> > +   ),
> > +
> > +   TP_printk("icmp_send: type=%d, code=%d. From %pI4:%u to %pI4:%u 
> > ulen=%d skbaddr=%p",
> > +   __entry->type, __entry->code,
> > +   __entry->saddr, __entry->sport, __entry->daddr,
> > +   __entry->dport, __entry->ulen, __entry->skbaddr)
> > +);
> > +
> > +#endif /* _TRACE_ICMP_H */
> > +
> > +/* This part must be outside protection */
> > +#include 
> > diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
> > index e63a3bf99617..437bdb7e2650 100644
> > --- a/net/ipv4/icmp.c
> > +++ b/net/ipv4/icmp.c
> > @@ -92,6 +92,8 @@
> >  #include 
> >  #include 
> >  #include 
> > +#define CREATE_TRACE_POINTS
> > +#include 
> > 
> >  /*
> >   * Build xmit assembly blocks
> > @@ -599,6 +601,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int 
> > code, __be32 info,
> > struct net *net;
> > struct sock *sk;
> > 
> > +   trace_icmp_send(skb_in, type, code);
> > +
> > if (!rt)
> > goto out;
> >