[PATCH v5 04/15] sparc: simplify module_alloc()

2024-04-22 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

Define MODULES_VADDR and MODULES_END as VMALLOC_START and VMALLOC_END
for 32-bit and reduce module_alloc() to

__vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, ...)

as with the new defines the allocations become identical for both 32
and 64 bits.

While at it, drop unused include of <linux/jump_label.h>

Suggested-by: Sam Ravnborg 
Signed-off-by: Mike Rapoport (IBM) 
---
 arch/sparc/include/asm/pgtable_32.h |  2 ++
 arch/sparc/kernel/module.c  | 25 +
 2 files changed, 3 insertions(+), 24 deletions(-)

diff --git a/arch/sparc/include/asm/pgtable_32.h 
b/arch/sparc/include/asm/pgtable_32.h
index 9e85d57ac3f2..62bcafe38b1f 100644
--- a/arch/sparc/include/asm/pgtable_32.h
+++ b/arch/sparc/include/asm/pgtable_32.h
@@ -432,6 +432,8 @@ static inline int io_remap_pfn_range(struct vm_area_struct 
*vma,
 
 #define VMALLOC_START   _AC(0xfe60,UL)
 #define VMALLOC_END _AC(0xffc0,UL)
+#define MODULES_VADDR   VMALLOC_START
+#define MODULES_END VMALLOC_END
 
 /* We provide our own get_unmapped_area to cope with VA holes for userland */
 #define HAVE_ARCH_UNMAPPED_AREA
diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c
index 66c45a2764bc..d37adb2a0b54 100644
--- a/arch/sparc/kernel/module.c
+++ b/arch/sparc/kernel/module.c
@@ -21,35 +21,12 @@
 
 #include "entry.h"
 
-#ifdef CONFIG_SPARC64
-
-#include 
-
-static void *module_map(unsigned long size)
+void *module_alloc(unsigned long size)
 {
-   if (PAGE_ALIGN(size) > MODULES_LEN)
-   return NULL;
return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
__builtin_return_address(0));
 }
-#else
-static void *module_map(unsigned long size)
-{
-   return vmalloc(size);
-}
-#endif /* CONFIG_SPARC64 */
-
-void *module_alloc(unsigned long size)
-{
-   void *ret;
-
-   ret = module_map(size);
-   if (ret)
-   memset(ret, 0, size);
-
-   return ret;
-}
 
 /* Make generic code ignore STT_REGISTER dummy undefined symbols.  */
 int module_frob_arch_sections(Elf_Ehdr *hdr,
-- 
2.43.0




[PATCH v5 03/15] nios2: define virtual address space for modules

2024-04-22 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

nios2 uses kmalloc() to implement module_alloc() because CALL26/PCREL26
cannot reach all of vmalloc address space.

Define module space as 32MiB below the kernel base and switch nios2 to
use vmalloc for module allocations.

Suggested-by: Thomas Gleixner 
Acked-by: Dinh Nguyen 
Acked-by: Song Liu 
Signed-off-by: Mike Rapoport (IBM) 
---
 arch/nios2/include/asm/pgtable.h |  5 -
 arch/nios2/kernel/module.c   | 19 ---
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h
index d052dfcbe8d3..eab87c6beacb 100644
--- a/arch/nios2/include/asm/pgtable.h
+++ b/arch/nios2/include/asm/pgtable.h
@@ -25,7 +25,10 @@
 #include 
 
 #define VMALLOC_START  CONFIG_NIOS2_KERNEL_MMU_REGION_BASE
-#define VMALLOC_END(CONFIG_NIOS2_KERNEL_REGION_BASE - 1)
+#define VMALLOC_END(CONFIG_NIOS2_KERNEL_REGION_BASE - SZ_32M - 1)
+
+#define MODULES_VADDR  (CONFIG_NIOS2_KERNEL_REGION_BASE - SZ_32M)
+#define MODULES_END(CONFIG_NIOS2_KERNEL_REGION_BASE - 1)
 
 struct mm_struct;
 
diff --git a/arch/nios2/kernel/module.c b/arch/nios2/kernel/module.c
index 76e0a42d6e36..9c97b7513853 100644
--- a/arch/nios2/kernel/module.c
+++ b/arch/nios2/kernel/module.c
@@ -21,23 +21,12 @@
 
 #include 
 
-/*
- * Modules should NOT be allocated with kmalloc for (obvious) reasons.
- * But we do it for now to avoid relocation issues. CALL26/PCREL26 cannot reach
- * from 0x8000 (vmalloc area) to 0xc (kernel) (kmalloc returns
- * addresses in 0xc000)
- */
 void *module_alloc(unsigned long size)
 {
-   if (size == 0)
-   return NULL;
-   return kmalloc(size, GFP_KERNEL);
-}
-
-/* Free memory returned from module_alloc */
-void module_memfree(void *module_region)
-{
-   kfree(module_region);
+   return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+   GFP_KERNEL, PAGE_KERNEL_EXEC,
+   VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
+   __builtin_return_address(0));
 }
 
 int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab,
-- 
2.43.0




[PATCH v5 02/15] mips: module: rename MODULE_START to MODULES_VADDR

2024-04-22 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

and MODULE_END to MODULES_END to match other architectures that define
custom address space for modules.

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/mips/include/asm/pgtable-64.h | 4 ++--
 arch/mips/kernel/module.c  | 4 ++--
 arch/mips/mm/fault.c   | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/mips/include/asm/pgtable-64.h 
b/arch/mips/include/asm/pgtable-64.h
index 20ca48c1b606..c0109aff223b 100644
--- a/arch/mips/include/asm/pgtable-64.h
+++ b/arch/mips/include/asm/pgtable-64.h
@@ -147,8 +147,8 @@
 #if defined(CONFIG_MODULES) && defined(KBUILD_64BIT_SYM32) && \
VMALLOC_START != CKSSEG
 /* Load modules into 32bit-compatible segment. */
-#define MODULE_START   CKSSEG
-#define MODULE_END (FIXADDR_START-2*PAGE_SIZE)
+#define MODULES_VADDR  CKSSEG
+#define MODULES_END(FIXADDR_START-2*PAGE_SIZE)
 #endif
 
 #define pte_ERROR(e) \
diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c
index 7b2fbaa9cac5..9a6c96014904 100644
--- a/arch/mips/kernel/module.c
+++ b/arch/mips/kernel/module.c
@@ -31,10 +31,10 @@ struct mips_hi16 {
 static LIST_HEAD(dbe_list);
 static DEFINE_SPINLOCK(dbe_lock);
 
-#ifdef MODULE_START
+#ifdef MODULES_VADDR
 void *module_alloc(unsigned long size)
 {
-   return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
+   return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
__builtin_return_address(0));
 }
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index aaa9a242ebba..37fedeaca2e9 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -83,8 +83,8 @@ static void __do_page_fault(struct pt_regs *regs, unsigned 
long write,
 
if (unlikely(address >= VMALLOC_START && address <= VMALLOC_END))
goto VMALLOC_FAULT_TARGET;
-#ifdef MODULE_START
-   if (unlikely(address >= MODULE_START && address < MODULE_END))
+#ifdef MODULES_VADDR
+   if (unlikely(address >= MODULES_VADDR && address < MODULES_END))
goto VMALLOC_FAULT_TARGET;
 #endif
 
-- 
2.43.0




[PATCH v5 01/15] arm64: module: remove unneeded call to kasan_alloc_module_shadow()

2024-04-22 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

Since commit f6f37d9320a1 ("arm64: select KASAN_VMALLOC for SW/HW_TAGS
modes") KASAN_VMALLOC is always enabled when KASAN is on. This means
that allocations in module_alloc() will be tracked by KASAN protection
for vmalloc() and that kasan_alloc_module_shadow() will be always an
empty inline and there is no point in calling it.

Drop meaningless call to kasan_alloc_module_shadow() from
module_alloc().

Signed-off-by: Mike Rapoport (IBM) 
---
 arch/arm64/kernel/module.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index 47e0be610bb6..e92da4da1b2a 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -141,11 +141,6 @@ void *module_alloc(unsigned long size)
__func__);
}
 
-   if (p && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) {
-   vfree(p);
-   return NULL;
-   }
-
/* Memory is intended to be executable, reset the pointer tag. */
return kasan_reset_tag(p);
 }
-- 
2.43.0




Re: [PATCH net-next v7 1/7] net: introduce rstreason to detect why the RST is sent

2024-04-22 Thread Matthieu Baerts
Hi Jason,

On 22/04/2024 05:01, Jason Xing wrote:
> From: Jason Xing 
> 
> Add a new standalone file for the easy future extension to support
> both active reset and passive reset in the TCP/DCCP/MPTCP protocols.

Thank you for looking at that!

(...)

> diff --git a/include/net/rstreason.h b/include/net/rstreason.h
> new file mode 100644
> index ..c57bc5413c17
> --- /dev/null
> +++ b/include/net/rstreason.h
> @@ -0,0 +1,144 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +
> +#ifndef _LINUX_RSTREASON_H
> +#define _LINUX_RSTREASON_H
> +#include 
> +#include 
> +
> +#define DEFINE_RST_REASON(FN, FNe)   \
> + FN(MPTCP_RST_EUNSPEC)   \
> + FN(MPTCP_RST_EMPTCP)\
> + FN(MPTCP_RST_ERESOURCE) \
> + FN(MPTCP_RST_EPROHIBIT) \
> + FN(MPTCP_RST_EWQ2BIG)   \
> + FN(MPTCP_RST_EBADPERF)  \
> + FN(MPTCP_RST_EMIDDLEBOX)\

Small detail: would it not make more sense to put the ones linked to
MPTCP at the end? I mean, I guess MPTCP should be treated as second
priority: CONFIG_MPTCP might not be set, and the ones linked to TCP
should be more frequent, etc.

> + FN(NOT_SPECIFIED)   \
> + FN(NO_SOCKET)   \
> + FNe(MAX)

(...)

> +/* Convert reset reasons in MPTCP to our own enum type */
> +static inline enum sk_rst_reason convert_mptcpreason(u32 reason)
> +{
> + switch (reason) {
> + case MPTCP_RST_EUNSPEC:
> + return SK_RST_REASON_MPTCP_RST_EUNSPEC;
> + case MPTCP_RST_EMPTCP:
> + return SK_RST_REASON_MPTCP_RST_EMPTCP;
> + case MPTCP_RST_ERESOURCE:
> + return SK_RST_REASON_MPTCP_RST_ERESOURCE;
> + case MPTCP_RST_EPROHIBIT:
> + return SK_RST_REASON_MPTCP_RST_EPROHIBIT;
> + case MPTCP_RST_EWQ2BIG:
> + return SK_RST_REASON_MPTCP_RST_EWQ2BIG;
> + case MPTCP_RST_EBADPERF:
> + return SK_RST_REASON_MPTCP_RST_EBADPERF;
> + case MPTCP_RST_EMIDDLEBOX:
> + return SK_RST_REASON_MPTCP_RST_EMIDDLEBOX;
> + default:
> + /**
> +  * It should not happen, or else errors may occur
> +  * in MPTCP layer
> +  */
> + return SK_RST_REASON_ERROR;
> + }
> +}

If this helper is only used on MPTCP, maybe better to move it to
net/mptcp/protocol.h (and to patch 5/7?)? We tried to isolate MPTCP code.

Also, maybe it is just me, but I'm not a big fan of the helper name:
convert_mptcpreason() (same for the "drop" one). I think it should at
least mention its "origin" (rst reason): e.g. something like
(sk_)rst_reason_convert_mptcp or (sk_)rst_convert_mptcp_reason() (or
mptcp_to_rst_reason())?

And (sk_)rst_reason_convert_(skb_)drop() (or skb_drop_to_rst_reason())?

> +/* Convert reset reasons in MPTCP to our own enum type */

I don't think this part is linked to MPTCP, right?

> +static inline enum sk_rst_reason convert_dropreason(enum skb_drop_reason 
> reason)
> +{
> + switch (reason) {
> + case SKB_DROP_REASON_NOT_SPECIFIED:
> + return SK_RST_REASON_NOT_SPECIFIED;
> + case SKB_DROP_REASON_NO_SOCKET:
> + return SK_RST_REASON_NO_SOCKET;
> + default:
> + /* If we don't have our own corresponding reason */
> + return SK_RST_REASON_NOT_SPECIFIED;
> + }
> +}

(This helper could be introduced in patch 4/7 because it is not used
before, but I'm fine either way.)

> +#endif

Cheers,
Matt
-- 
Sponsored by the NGI0 Core fund.




Re: [PATCH v2] ipvs: Fix checksumming on GSO of SCTP packets

2024-04-22 Thread Julian Anastasov

Hello,

On Sun, 21 Apr 2024, Ismael Luceno wrote:

> It was observed in the wild that pairs of consecutive packets would leave
> the IPVS with the same wrong checksum, and the issue only went away when
> disabling GSO.
> 
> IPVS needs to avoid computing the SCTP checksum when using GSO.
> 
> Fixes: 90017accff61 ("sctp: Add GSO support", 2016-06-02)
> Co-developed-by: Firo Yang 
> Signed-off-by: Ismael Luceno 
> Tested-by: Andreas Taschner 
> CC: Michal Kubeček 
> CC: Simon Horman 
> CC: Julian Anastasov 
> CC: lvs-de...@vger.kernel.org
> CC: netfilter-de...@vger.kernel.org
> CC: net...@vger.kernel.org
> CC: coret...@netfilter.org

Looks good to me, thanks!

Acked-by: Julian Anastasov 

As scripts/checkpatch.pl --strict /tmp/file.patch complains
about Co-developed-by and Signed-off-by lines you may want to
send v3...

> ---
> 
> Notes:
> Changes since v1:
> * Added skb_is_gso before skb_is_gso_sctp.
> * Added "Fixes" tag.
> 
>  net/netfilter/ipvs/ip_vs_proto_sctp.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c 
> b/net/netfilter/ipvs/ip_vs_proto_sctp.c
> index a0921adc31a9..1e689c714127 100644
> --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
> +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
> @@ -126,7 +126,8 @@ sctp_snat_handler(struct sk_buff *skb, struct 
> ip_vs_protocol *pp,
>   if (sctph->source != cp->vport || payload_csum ||
>   skb->ip_summed == CHECKSUM_PARTIAL) {
>   sctph->source = cp->vport;
> - sctp_nat_csum(skb, sctph, sctphoff);
> + if (!skb_is_gso(skb) || !skb_is_gso_sctp(skb))
> + sctp_nat_csum(skb, sctph, sctphoff);
>   } else {
>   skb->ip_summed = CHECKSUM_UNNECESSARY;
>   }
> @@ -174,7 +175,8 @@ sctp_dnat_handler(struct sk_buff *skb, struct 
> ip_vs_protocol *pp,
>   (skb->ip_summed == CHECKSUM_PARTIAL &&
>!(skb_dst(skb)->dev->features & NETIF_F_SCTP_CRC))) {
>   sctph->dest = cp->dport;
> - sctp_nat_csum(skb, sctph, sctphoff);
> + if (!skb_is_gso(skb) || !skb_is_gso_sctp(skb))
> + sctp_nat_csum(skb, sctph, sctphoff);
>   } else if (skb->ip_summed != CHECKSUM_PARTIAL) {
>   skb->ip_summed = CHECKSUM_UNNECESSARY;
>   }
> -- 
> 2.43.0

Regards

--
Julian Anastasov 

[PATCH v5 00/15] mm: jit/text allocator

2024-04-22 Thread Mike Rapoport
From: "Mike Rapoport (IBM)" 

Hi,

Since v3 I looked into making execmem more of a utility toolbox, as we
discussed at LPC with Mark Rutland, but it was getting hairier than
having a struct describing architecture constraints and a type identifying
the consumer of execmem.

And I do think that having the description of architecture constraints for
allocations of executable memory in a single place is better than having it
spread all over the place.

The patches available via git:
https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=execmem/v5

v5 changes:
* rebase on v6.9-rc4 to avoid a conflict in kprobes
* add copyrights to mm/execmem.c (Luis)
* fix spelling (Ingo)
* define MODULES_VADDR for sparc (Sam)
* consistently initialize struct execmem_info (Peter)
* reduce #ifdefs in function bodies in kprobes (Masami) 

v4: https://lore.kernel.org/all/20240411160051.2093261-1-r...@kernel.org
* rebase on v6.9-rc2
* rename execmem_params to execmem_info and execmem_arch_params() to
  execmem_arch_setup()
* use single execmem_alloc() API instead of execmem_{text,data}_alloc() (Song)
* avoid extra copy of execmem parameters (Rick)
* run execmem_init() as core_initcall() except for the architectures that
  may allocate text really early (currently only x86) (Will)
* add acks for some of arm64 and riscv changes, thanks Will and Alexandre
* new commits:
  - drop call to kasan_alloc_module_shadow() on arm64 because it's not
needed anymore
  - rename MODULE_START to MODULES_VADDR on MIPS
  - use CONFIG_EXECMEM instead of CONFIG_MODULES on powerpc as per Christophe:
https://lore.kernel.org/all/79062fa3-3402-47b3-8920-9231ad05e...@csgroup.eu/

v3: https://lore.kernel.org/all/20230918072955.2507221-1-r...@kernel.org
* add type parameter to execmem allocation APIs
* remove BPF dependency on modules

v2: https://lore.kernel.org/all/20230616085038.4121892-1-r...@kernel.org
* Separate "module" and "others" allocations with execmem_text_alloc()
and jit_text_alloc()
* Drop ROX entailment on x86
* Add ack for nios2 changes, thanks Dinh Nguyen

v1: https://lore.kernel.org/all/20230601101257.530867-1-r...@kernel.org

= Cover letter from v1 (slightly updated) =

module_alloc() is used everywhere as a means to allocate memory for code.

Besides being semantically wrong, this unnecessarily ties all subsystems
that need to allocate code, such as ftrace, kprobes and BPF to modules and
puts the burden of code allocation to the modules code.

Several architectures override module_alloc() because of various
constraints where the executable memory can be located and this causes
additional obstacles for improvements of code allocation.

A centralized infrastructure for code allocation allows allocations of
executable memory as ROX, and future optimizations such as caching large
pages for better iTLB performance and providing sub-page allocations for
users that only need small jit code snippets.

Rick Edgecombe proposed perm_alloc extension to vmalloc [1] and Song Liu
proposed execmem_alloc [2], but both these approaches were targeting BPF
allocations and lacked the ground work to abstract executable allocations
and split them from the modules core.

Thomas Gleixner suggested to express module allocation restrictions and
requirements as struct mod_alloc_type_params [3] that would define ranges,
protections and other parameters for different types of allocations used by
modules and following that suggestion Song separated allocations of
different types in modules (commit ac3b43283923 ("module: replace
module_layout with module_memory")) and posted "Type aware module
allocator" set [4].

I liked the idea of parametrising code allocation requirements as a
structure, but I believe the original proposal and Song's module allocator
was too module centric, so I came up with these patches.

This set splits code allocation from modules by introducing execmem_alloc()
and execmem_free() APIs, replaces call sites of module_alloc() and
module_memfree() with the new APIs and implements core text and related
allocations in a central place.

Instead of architecture specific overrides for module_alloc(), the
architectures that require non-default behaviour for text allocation must
fill execmem_info structure and implement execmem_arch_setup() that returns
a pointer to that structure. If an architecture does not implement
execmem_arch_setup(), the defaults compatible with the current
modules::module_alloc() are used.

Since architectures define different restrictions on placement,
permissions, alignment and other parameters for memory that can be used by
different subsystems that allocate executable memory, execmem APIs
take a type argument, that will be used to identify the calling subsystem
and to allow architectures to define parameters for ranges suitable for that
subsystem.

The new infrastructure allows decoupling of BPF, kprobes and ftrace from
modules, and most importantly it paves the way for ROX allocations for

Re: Re: [PATCH v2 1/4] virtio_balloon: separate vm events into a function

2024-04-22 Thread zhenwei pi




On 4/22/24 15:47, David Hildenbrand wrote:

On 22.04.24 09:42, zhenwei pi wrote:

All the VM events related statistics have dependence on
'CONFIG_VM_EVENT_COUNTERS', once any stack variable is required by any
VM events in future, we would have codes like:
  #ifdef CONFIG_VM_EVENT_COUNTERS
   unsigned long foo;
  #endif
   ...
  #ifdef CONFIG_VM_EVENT_COUNTERS
   foo = events[XXX] + events[YYY];
   update_stat(vb, idx++, VIRTIO_BALLOON_S_XXX, foo);
  #endif

Separate vm events into a single function, also remove


Why not simply use __maybe_unused for that variable?



1>
static unsigned int update_balloon_stats()
{
unsigned __maybe_unused long foo;

...
#ifdef CONFIG_VM_EVENT_COUNTERS
foo = events[XXX] + events[YYY];
update_stat(vb, idx++, VIRTIO_BALLOON_S_XXX, foo);
#endif
}

2>
static inline unsigned int update_balloon_vm_stats()
{
#ifdef CONFIG_VM_EVENT_COUNTERS
unsigned long foo;

foo = events[XXX] + events[YYY];
update_stat(vb, idx++, VIRTIO_BALLOON_S_XXX, foo);
#endif
}

From my point of view, I don't need to compile code in my brain
when reading the code for case 2. :)


--
zhenwei pi



Re: [PATCH v2 3/4] virtio_balloon: introduce memory allocation stall counter

2024-04-22 Thread David Hildenbrand

On 22.04.24 09:42, zhenwei pi wrote:

Memory allocation stall counter represents the performance/latency of
memory allocation, expose this counter to the host side by virtio
balloon device via out-of-bound way.

Signed-off-by: zhenwei pi 
---
  drivers/virtio/virtio_balloon.c | 8 
  include/uapi/linux/virtio_balloon.h | 6 --
  2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 87a1d6fa77fb..ab039e83bc6f 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -323,6 +323,8 @@ static inline unsigned int update_balloon_vm_stats(struct 
virtio_balloon *vb,
  #ifdef CONFIG_VM_EVENT_COUNTERS
unsigned long events[NR_VM_EVENT_ITEMS];
unsigned int idx = start;
+   unsigned int zid;
+   unsigned long stall = 0;
  
  	all_vm_events(events);

update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN,
@@ -333,6 +335,12 @@ static inline unsigned int update_balloon_vm_stats(struct 
virtio_balloon *vb,
update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]);
update_stat(vb, idx++, VIRTIO_BALLOON_S_OOM_KILL, events[OOM_KILL]);
  
+	/* sum all the stall events */

+   for (zid = 0; zid < MAX_NR_ZONES; zid++)
+   stall += events[ALLOCSTALL_NORMAL - ZONE_NORMAL + zid];
+
+   update_stat(vb, idx++, VIRTIO_BALLOON_S_ALLOC_STALL, stall);
+
  #ifdef CONFIG_HUGETLB_PAGE
update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC,
events[HTLB_BUDDY_PGALLOC]);
diff --git a/include/uapi/linux/virtio_balloon.h 
b/include/uapi/linux/virtio_balloon.h
index b17bbe033697..487b893a160e 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -72,7 +72,8 @@ struct virtio_balloon_config {
  #define VIRTIO_BALLOON_S_HTLB_PGALLOC  8  /* Hugetlb page allocations */
  #define VIRTIO_BALLOON_S_HTLB_PGFAIL   9  /* Hugetlb page allocation failures 
*/
  #define VIRTIO_BALLOON_S_OOM_KILL  10 /* OOM killer invocations */
-#define VIRTIO_BALLOON_S_NR   11
+#define VIRTIO_BALLOON_S_ALLOC_STALL   11 /* Stall count of memory allocatoin 
*/
+#define VIRTIO_BALLOON_S_NR   12
  
  #define VIRTIO_BALLOON_S_NAMES_WITH_PREFIX(VIRTIO_BALLOON_S_NAMES_prefix) { \

VIRTIO_BALLOON_S_NAMES_prefix "swap-in", \
@@ -85,7 +86,8 @@ struct virtio_balloon_config {
VIRTIO_BALLOON_S_NAMES_prefix "disk-caches", \
VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-allocations", \
VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures", \
-   VIRTIO_BALLOON_S_NAMES_prefix "oom-kills" \
+   VIRTIO_BALLOON_S_NAMES_prefix "oom-kills", \
+   VIRTIO_BALLOON_S_NAMES_prefix "alloc-stalls" \
  }
  
  #define VIRTIO_BALLOON_S_NAMES VIRTIO_BALLOON_S_NAMES_WITH_PREFIX("")


Acked-by: David Hildenbrand 

--
Cheers,

David / dhildenb




Re: [PATCH v2 1/4] virtio_balloon: separate vm events into a function

2024-04-22 Thread David Hildenbrand

On 22.04.24 09:42, zhenwei pi wrote:

All the VM events related statistics have dependence on
'CONFIG_VM_EVENT_COUNTERS', once any stack variable is required by any
VM events in future, we would have codes like:
  #ifdef CONFIG_VM_EVENT_COUNTERS
   unsigned long foo;
  #endif
   ...
  #ifdef CONFIG_VM_EVENT_COUNTERS
   foo = events[XXX] + events[YYY];
   update_stat(vb, idx++, VIRTIO_BALLOON_S_XXX, foo);
  #endif

Separate vm events into a single function, also remove


Why not simply use __maybe_unused for that variable?

--
Cheers,

David / dhildenb




[PATCH v2 4/4] virtio_balloon: introduce memory scan/reclaim info

2024-04-22 Thread zhenwei pi
Expose memory scan/reclaim information to the host side via virtio
balloon device.

Now we have a metric to analyze the memory performance:

y: counter increases
n: counter does not change
h: the rate of counter change is high
l: the rate of counter change is low

OOM: VIRTIO_BALLOON_S_OOM_KILL
STALL: VIRTIO_BALLOON_S_ALLOC_STALL
ASCAN: VIRTIO_BALLOON_S_ASYNC_SCAN
DSCAN: VIRTIO_BALLOON_S_DIRECT_SCAN
ARCLM: VIRTIO_BALLOON_S_ASYNC_RECLAIM
DRCLM: VIRTIO_BALLOON_S_DIRECT_RECLAIM

- OOM[y], STALL[*], ASCAN[*], DSCAN[*], ARCLM[*], DRCLM[*]:
  the guest runs under really critical memory pressure

- OOM[n], STALL[h], ASCAN[*], DSCAN[l], ARCLM[*], DRCLM[l]:
  the memory allocation stalls due to cgroup, not the global memory
  pressure.

- OOM[n], STALL[h], ASCAN[*], DSCAN[h], ARCLM[*], DRCLM[h]:
  the memory allocation stalls due to global memory pressure. The
  performance gets hurt a lot. A high ratio between DRCLM/DSCAN shows
  quite effective memory reclaiming.

- OOM[n], STALL[h], ASCAN[*], DSCAN[h], ARCLM[*], DRCLM[l]:
  the memory allocation stalls due to global memory pressure.
  The ratio between DRCLM/DSCAN gets low and the guest OS is thrashing
  heavily; serious cases lead to poor performance and difficult
  troubleshooting. E.g., sshd may block on memory allocation when
  accepting new connections, so a user can't log in to the VM via ssh.

- OOM[n], STALL[n], ASCAN[h], DSCAN[n], ARCLM[l], DRCLM[n]:
  the low ratio between ARCLM/ASCAN shows that the guest tries to
  reclaim more memory, but it can't. Once more memory is required in
  future, it will struggle to reclaim memory.

Acked-by: David Hildenbrand 
Signed-off-by: zhenwei pi 
---
 drivers/virtio/virtio_balloon.c |  9 +
 include/uapi/linux/virtio_balloon.h | 12 ++--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index ab039e83bc6f..e45146fde164 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -341,6 +341,15 @@ static inline unsigned int update_balloon_vm_stats(struct 
virtio_balloon *vb,
 
update_stat(vb, idx++, VIRTIO_BALLOON_S_ALLOC_STALL, stall);
 
+   update_stat(vb, idx++, VIRTIO_BALLOON_S_ASYNC_SCAN,
+   pages_to_bytes(events[PGSCAN_KSWAPD]));
+   update_stat(vb, idx++, VIRTIO_BALLOON_S_DIRECT_SCAN,
+   pages_to_bytes(events[PGSCAN_DIRECT]));
+   update_stat(vb, idx++, VIRTIO_BALLOON_S_ASYNC_RECLAIM,
+   pages_to_bytes(events[PGSTEAL_KSWAPD]));
+   update_stat(vb, idx++, VIRTIO_BALLOON_S_DIRECT_RECLAIM,
+   pages_to_bytes(events[PGSTEAL_DIRECT]));
+
 #ifdef CONFIG_HUGETLB_PAGE
update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC,
events[HTLB_BUDDY_PGALLOC]);
diff --git a/include/uapi/linux/virtio_balloon.h 
b/include/uapi/linux/virtio_balloon.h
index 487b893a160e..ee35a372805d 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -73,7 +73,11 @@ struct virtio_balloon_config {
 #define VIRTIO_BALLOON_S_HTLB_PGFAIL   9  /* Hugetlb page allocation failures 
*/
 #define VIRTIO_BALLOON_S_OOM_KILL  10 /* OOM killer invocations */
 #define VIRTIO_BALLOON_S_ALLOC_STALL   11 /* Stall count of memory allocatoin 
*/
-#define VIRTIO_BALLOON_S_NR   12
+#define VIRTIO_BALLOON_S_ASYNC_SCAN12 /* Amount of memory scanned 
asynchronously */
+#define VIRTIO_BALLOON_S_DIRECT_SCAN   13 /* Amount of memory scanned directly 
*/
+#define VIRTIO_BALLOON_S_ASYNC_RECLAIM 14 /* Amount of memory reclaimed 
asynchronously */
+#define VIRTIO_BALLOON_S_DIRECT_RECLAIM 15 /* Amount of memory reclaimed 
directly */
+#define VIRTIO_BALLOON_S_NR   16
 
 #define VIRTIO_BALLOON_S_NAMES_WITH_PREFIX(VIRTIO_BALLOON_S_NAMES_prefix) { \
VIRTIO_BALLOON_S_NAMES_prefix "swap-in", \
@@ -87,7 +91,11 @@ struct virtio_balloon_config {
VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-allocations", \
VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures", \
VIRTIO_BALLOON_S_NAMES_prefix "oom-kills", \
-   VIRTIO_BALLOON_S_NAMES_prefix "alloc-stalls" \
+   VIRTIO_BALLOON_S_NAMES_prefix "alloc-stalls", \
+   VIRTIO_BALLOON_S_NAMES_prefix "async-scans", \
+   VIRTIO_BALLOON_S_NAMES_prefix "direct-scans", \
+   VIRTIO_BALLOON_S_NAMES_prefix "async-reclaims", \
+   VIRTIO_BALLOON_S_NAMES_prefix "direct-reclaims" \
 }
 
 #define VIRTIO_BALLOON_S_NAMES VIRTIO_BALLOON_S_NAMES_WITH_PREFIX("")
-- 
2.34.1




[PATCH v2 3/4] virtio_balloon: introduce memory allocation stall counter

2024-04-22 Thread zhenwei pi
The memory allocation stall counter represents the performance/latency of
memory allocation. Expose this counter to the host side through the virtio
balloon device in an out-of-band way.

Signed-off-by: zhenwei pi 
---
 drivers/virtio/virtio_balloon.c | 8 
 include/uapi/linux/virtio_balloon.h | 6 --
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 87a1d6fa77fb..ab039e83bc6f 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -323,6 +323,8 @@ static inline unsigned int update_balloon_vm_stats(struct 
virtio_balloon *vb,
 #ifdef CONFIG_VM_EVENT_COUNTERS
unsigned long events[NR_VM_EVENT_ITEMS];
unsigned int idx = start;
+   unsigned int zid;
+   unsigned long stall = 0;
 
all_vm_events(events);
update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN,
@@ -333,6 +335,12 @@ static inline unsigned int update_balloon_vm_stats(struct 
virtio_balloon *vb,
update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]);
update_stat(vb, idx++, VIRTIO_BALLOON_S_OOM_KILL, events[OOM_KILL]);
 
+   /* sum all the stall events */
+   for (zid = 0; zid < MAX_NR_ZONES; zid++)
+   stall += events[ALLOCSTALL_NORMAL - ZONE_NORMAL + zid];
+
+   update_stat(vb, idx++, VIRTIO_BALLOON_S_ALLOC_STALL, stall);
+
 #ifdef CONFIG_HUGETLB_PAGE
update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC,
events[HTLB_BUDDY_PGALLOC]);
diff --git a/include/uapi/linux/virtio_balloon.h 
b/include/uapi/linux/virtio_balloon.h
index b17bbe033697..487b893a160e 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -72,7 +72,8 @@ struct virtio_balloon_config {
 #define VIRTIO_BALLOON_S_HTLB_PGALLOC  8  /* Hugetlb page allocations */
 #define VIRTIO_BALLOON_S_HTLB_PGFAIL   9  /* Hugetlb page allocation failures 
*/
 #define VIRTIO_BALLOON_S_OOM_KILL  10 /* OOM killer invocations */
-#define VIRTIO_BALLOON_S_NR   11
+#define VIRTIO_BALLOON_S_ALLOC_STALL   11 /* Stall count of memory allocatoin 
*/
+#define VIRTIO_BALLOON_S_NR   12
 
 #define VIRTIO_BALLOON_S_NAMES_WITH_PREFIX(VIRTIO_BALLOON_S_NAMES_prefix) { \
VIRTIO_BALLOON_S_NAMES_prefix "swap-in", \
@@ -85,7 +86,8 @@ struct virtio_balloon_config {
VIRTIO_BALLOON_S_NAMES_prefix "disk-caches", \
VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-allocations", \
VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures", \
-   VIRTIO_BALLOON_S_NAMES_prefix "oom-kills" \
+   VIRTIO_BALLOON_S_NAMES_prefix "oom-kills", \
+   VIRTIO_BALLOON_S_NAMES_prefix "alloc-stalls" \
 }
 
 #define VIRTIO_BALLOON_S_NAMES VIRTIO_BALLOON_S_NAMES_WITH_PREFIX("")
-- 
2.34.1




[PATCH v2 2/4] virtio_balloon: introduce oom-kill invocations

2024-04-22 Thread zhenwei pi
When the guest OS runs under critical memory pressure, the guest
starts to kill processes. A guest monitor agent may scan 'oom_kill'
from /proc/vmstat, and reports the OOM KILL event. However, the agent
may be killed and we will lose this critical event (and the later
events).

For now we can also grep for magic words in guest kernel log from host
side. Rather than this unstable way, virtio balloon reports OOM-KILL
invocations instead.

Acked-by: David Hildenbrand 
Signed-off-by: zhenwei pi 
---
 drivers/virtio/virtio_balloon.c | 1 +
 include/uapi/linux/virtio_balloon.h | 6 --
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 59fe157e5722..87a1d6fa77fb 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -331,6 +331,7 @@ static inline unsigned int update_balloon_vm_stats(struct 
virtio_balloon *vb,
pages_to_bytes(events[PSWPOUT]));
update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]);
update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]);
+   update_stat(vb, idx++, VIRTIO_BALLOON_S_OOM_KILL, events[OOM_KILL]);
 
 #ifdef CONFIG_HUGETLB_PAGE
update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC,
diff --git a/include/uapi/linux/virtio_balloon.h 
b/include/uapi/linux/virtio_balloon.h
index ddaa45e723c4..b17bbe033697 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -71,7 +71,8 @@ struct virtio_balloon_config {
 #define VIRTIO_BALLOON_S_CACHES   7   /* Disk caches */
 #define VIRTIO_BALLOON_S_HTLB_PGALLOC  8  /* Hugetlb page allocations */
 #define VIRTIO_BALLOON_S_HTLB_PGFAIL   9  /* Hugetlb page allocation failures 
*/
-#define VIRTIO_BALLOON_S_NR   10
+#define VIRTIO_BALLOON_S_OOM_KILL  10 /* OOM killer invocations */
+#define VIRTIO_BALLOON_S_NR   11
 
 #define VIRTIO_BALLOON_S_NAMES_WITH_PREFIX(VIRTIO_BALLOON_S_NAMES_prefix) { \
VIRTIO_BALLOON_S_NAMES_prefix "swap-in", \
@@ -83,7 +84,8 @@ struct virtio_balloon_config {
VIRTIO_BALLOON_S_NAMES_prefix "available-memory", \
VIRTIO_BALLOON_S_NAMES_prefix "disk-caches", \
VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-allocations", \
-   VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures" \
+   VIRTIO_BALLOON_S_NAMES_prefix "hugetlb-failures", \
+   VIRTIO_BALLOON_S_NAMES_prefix "oom-kills" \
 }
 
 #define VIRTIO_BALLOON_S_NAMES VIRTIO_BALLOON_S_NAMES_WITH_PREFIX("")
-- 
2.34.1




[PATCH v2 1/4] virtio_balloon: separate vm events into a function

2024-04-22 Thread zhenwei pi
All the VM-event-related statistics depend on
'CONFIG_VM_EVENT_COUNTERS'. Once any stack variable is required by a
VM event in the future, we would have code like:
 #ifdef CONFIG_VM_EVENT_COUNTERS
  unsigned long foo;
 #endif
  ...
 #ifdef CONFIG_VM_EVENT_COUNTERS
  foo = events[XXX] + events[YYY];
  update_stat(vb, idx++, VIRTIO_BALLOON_S_XXX, foo);
 #endif

Separate vm events into a single function, also remove
'CONFIG_VM_EVENT_COUNTERS' from 'update_balloon_stats'.

Signed-off-by: zhenwei pi 
---
 drivers/virtio/virtio_balloon.c | 44 ++---
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 1f5b3dd31fcf..59fe157e5722 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -316,34 +316,48 @@ static inline void update_stat(struct virtio_balloon *vb, 
int idx,
 
 #define pages_to_bytes(x) ((u64)(x) << PAGE_SHIFT)
 
-static unsigned int update_balloon_stats(struct virtio_balloon *vb)
+/* Return the number of entries filled by vm events */
+static inline unsigned int update_balloon_vm_stats(struct virtio_balloon *vb,
+  unsigned int start)
 {
+#ifdef CONFIG_VM_EVENT_COUNTERS
unsigned long events[NR_VM_EVENT_ITEMS];
-   struct sysinfo i;
-   unsigned int idx = 0;
-   long available;
-   unsigned long caches;
+   unsigned int idx = start;
 
all_vm_events(events);
-   si_meminfo(&i);
-
-   available = si_mem_available();
-   caches = global_node_page_state(NR_FILE_PAGES);
-
-#ifdef CONFIG_VM_EVENT_COUNTERS
update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN,
-   pages_to_bytes(events[PSWPIN]));
+   pages_to_bytes(events[PSWPIN]));
update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_OUT,
-   pages_to_bytes(events[PSWPOUT]));
+   pages_to_bytes(events[PSWPOUT]));
update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]);
update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]);
+
 #ifdef CONFIG_HUGETLB_PAGE
update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC,
events[HTLB_BUDDY_PGALLOC]);
update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL,
events[HTLB_BUDDY_PGALLOC_FAIL]);
-#endif
-#endif
+#endif /* CONFIG_HUGETLB_PAGE */
+
+   return idx - start;
+#else /* CONFIG_VM_EVENT_COUNTERS */
+
+   return 0;
+#endif /* CONFIG_VM_EVENT_COUNTERS */
+}
+
+static unsigned int update_balloon_stats(struct virtio_balloon *vb)
+{
+   struct sysinfo i;
+   unsigned int idx = 0;
+   long available;
+   unsigned long caches;
+
+   idx += update_balloon_vm_stats(vb, idx);
+
+   si_meminfo(&i);
+   available = si_mem_available();
+   caches = global_node_page_state(NR_FILE_PAGES);
update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE,
pages_to_bytes(i.freeram));
update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMTOT,
-- 
2.34.1




[PATCH v2 0/4] Improve memory statistics for virtio balloon

2024-04-22 Thread zhenwei pi
Hi,

v1 -> v2:
- Add a new patch 'virtio_balloon: separate vm events into a function'
  to avoid any compiler warnings (unused stack variable on
  CONFIG_VM_EVENT_COUNTERS=n)
- Suggested by David, use a loop 'for (zid = 0; zid < MAX_NR_ZONES; zid++)'
  to obtain all the stall events.

RFC -> v1:
- several text changes: oom-kill -> oom-kills, SCAN_ASYNC -> ASYNC_SCAN.
- move vm events codes into '#ifdef CONFIG_VM_EVENT_COUNTERS'

RFC version:
Link: 
https://lore.kernel.org/lkml/20240415084113.1203428-1-pizhen...@bytedance.com/T/#m1898963b3c27a989b1123db475135c3ca687ca84

zhenwei pi (4):
  virtio_balloon: separate vm events into a function
  virtio_balloon: introduce oom-kill invocations
  virtio_balloon: introduce memory allocation stall counter
  virtio_balloon: introduce memory scan/reclaim info

 drivers/virtio/virtio_balloon.c | 62 ++---
 include/uapi/linux/virtio_balloon.h | 16 +++-
 2 files changed, 61 insertions(+), 17 deletions(-)

-- 
2.34.1




Re: [PATCH net-next v3 1/2] ipvs: add READ_ONCE barrier for ipvs->sysctl_amemthresh

2024-04-22 Thread Aleksandr Mikhalitsyn
On Sun, Apr 21, 2024 at 1:06 PM Julian Anastasov  wrote:
>
>
> Hello,

Dear Julian,

Thanks a lot for the fast review and suggestions!

Kind regards,
Alex

>
> On Thu, 18 Apr 2024, Alexander Mikhalitsyn wrote:
>
> > Cc: Julian Anastasov 
> > Cc: Simon Horman 
> > Cc: Pablo Neira Ayuso 
> > Cc: Jozsef Kadlecsik 
> > Cc: Florian Westphal 
> > Suggested-by: Julian Anastasov 
> > Signed-off-by: Alexander Mikhalitsyn 
>
> Looks good to me, thanks!
>
> Acked-by: Julian Anastasov 
>
> > ---
> >  net/netfilter/ipvs/ip_vs_ctl.c | 14 +++---
> >  1 file changed, 7 insertions(+), 7 deletions(-)
> >
> > diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
> > index 143a341bbc0a..32be24f0d4e4 100644
> > --- a/net/netfilter/ipvs/ip_vs_ctl.c
> > +++ b/net/netfilter/ipvs/ip_vs_ctl.c
> > @@ -94,6 +94,7 @@ static void update_defense_level(struct netns_ipvs *ipvs)
> >  {
> >   struct sysinfo i;
> >   int availmem;
> > + int amemthresh;
> >   int nomem;
> >   int to_change = -1;
> >
> > @@ -105,7 +106,8 @@ static void update_defense_level(struct netns_ipvs 
> > *ipvs)
> >   /* si_swapinfo(); */
> >   /* availmem = availmem - (i.totalswap - i.freeswap); */
> >
> > - nomem = (availmem < ipvs->sysctl_amemthresh);
> > + amemthresh = max(READ_ONCE(ipvs->sysctl_amemthresh), 0);
> > + nomem = (availmem < amemthresh);
> >
> >   local_bh_disable();
> >
> > @@ -145,9 +147,8 @@ static void update_defense_level(struct netns_ipvs 
> > *ipvs)
> >   break;
> >   case 1:
> >   if (nomem) {
> > - ipvs->drop_rate = ipvs->drop_counter
> > - = ipvs->sysctl_amemthresh /
> > - (ipvs->sysctl_amemthresh-availmem);
> > + ipvs->drop_counter = amemthresh / (amemthresh - 
> > availmem);
> > + ipvs->drop_rate = ipvs->drop_counter;
> >   ipvs->sysctl_drop_packet = 2;
> >   } else {
> >   ipvs->drop_rate = 0;
> > @@ -155,9 +156,8 @@ static void update_defense_level(struct netns_ipvs 
> > *ipvs)
> >   break;
> >   case 2:
> >   if (nomem) {
> > - ipvs->drop_rate = ipvs->drop_counter
> > - = ipvs->sysctl_amemthresh /
> > - (ipvs->sysctl_amemthresh-availmem);
> > + ipvs->drop_counter = amemthresh / (amemthresh - 
> > availmem);
> > + ipvs->drop_rate = ipvs->drop_counter;
> >   } else {
> >   ipvs->drop_rate = 0;
> >   ipvs->sysctl_drop_packet = 1;
> > --
> > 2.34.1
>
> Regards
>
> --
> Julian Anastasov 
>



[PATCH v2 1/1] virtio: Add support for the virtio suspend feature

2024-04-22 Thread David Stevens
Add support for the VIRTIO_F_SUSPEND feature. When this feature is
negotiated, power management can use it to suspend virtio devices
instead of resorting to resetting the devices entirely.

Signed-off-by: David Stevens 
---
 drivers/virtio/virtio.c| 60 ++
 drivers/virtio/virtio_pci_common.c | 34 -
 drivers/virtio/virtio_pci_modern.c | 19 ++
 include/linux/virtio.h |  8 
 include/uapi/linux/virtio_config.h | 10 -
 5 files changed, 112 insertions(+), 19 deletions(-)

diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index f4080692b351..c7685a0dc995 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -498,6 +499,13 @@ void unregister_virtio_device(struct virtio_device *dev)
 }
 EXPORT_SYMBOL_GPL(unregister_virtio_device);
 
+void virtio_device_mark_removed(struct virtio_device *dev)
+{
+   /* Pairs with READ_ONCE() in virtio_device_set_suspend_bit(). */
+   WRITE_ONCE(dev->removed, true);
+}
+EXPORT_SYMBOL_GPL(virtio_device_mark_removed);
+
 #ifdef CONFIG_PM_SLEEP
 int virtio_device_freeze(struct virtio_device *dev)
 {
@@ -580,6 +588,58 @@ int virtio_device_restore(struct virtio_device *dev)
return ret;
 }
 EXPORT_SYMBOL_GPL(virtio_device_restore);
+
+static int virtio_device_set_suspend_bit(struct virtio_device *dev, bool 
enabled)
+{
+   u8 status, target;
+
+   status = dev->config->get_status(dev);
+   if (enabled)
+   target = status | VIRTIO_CONFIG_S_SUSPEND;
+   else
+   target = status & ~VIRTIO_CONFIG_S_SUSPEND;
+
+   if (target == status)
+   return 0;
+
+   dev->config->set_status(dev, target);
+
+   while ((status = dev->config->get_status(dev)) != target) {
+   if (status & VIRTIO_CONFIG_S_NEEDS_RESET)
+   return -EIO;
+   /* Pairs with WRITE_ONCE() in virtio_device_mark_removed(). */
+   if (READ_ONCE(dev->removed))
+   return -EIO;
+   msleep(10);
+   }
+   return 0;
+}
+
+bool virtio_device_can_suspend(struct virtio_device *dev)
+{
+   return virtio_has_feature(dev, VIRTIO_F_SUSPEND) &&
+  (dev->config->get_status(dev) & VIRTIO_CONFIG_S_FEATURES_OK);
+}
+EXPORT_SYMBOL_GPL(virtio_device_can_suspend);
+
+int virtio_device_suspend(struct virtio_device *dev)
+{
+   return virtio_device_set_suspend_bit(dev, true);
+}
+EXPORT_SYMBOL_GPL(virtio_device_suspend);
+
+bool virtio_device_can_resume(struct virtio_device *dev)
+{
+   return virtio_has_feature(dev, VIRTIO_F_SUSPEND) &&
+  (dev->config->get_status(dev) & VIRTIO_CONFIG_S_SUSPEND);
+}
+EXPORT_SYMBOL_GPL(virtio_device_can_resume);
+
+int virtio_device_resume(struct virtio_device *dev)
+{
+   return virtio_device_set_suspend_bit(dev, false);
+}
+EXPORT_SYMBOL_GPL(virtio_device_resume);
 #endif
 
 static int virtio_init(void)
diff --git a/drivers/virtio/virtio_pci_common.c 
b/drivers/virtio/virtio_pci_common.c
index b655fccaf773..a6ca7718b5dc 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -495,31 +495,26 @@ static int virtio_pci_restore(struct device *dev)
return virtio_device_restore(&vp_dev->vdev);
 }
 
-static bool vp_supports_pm_no_reset(struct device *dev)
+static int virtio_pci_suspend(struct device *dev)
 {
struct pci_dev *pci_dev = to_pci_dev(dev);
-   u16 pmcsr;
-
-   if (!pci_dev->pm_cap)
-   return false;
-
-   pci_read_config_word(pci_dev, pci_dev->pm_cap + PCI_PM_CTRL, &pmcsr);
-   if (PCI_POSSIBLE_ERROR(pmcsr)) {
-   dev_err(dev, "Unable to query pmcsr");
-   return false;
-   }
+   struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
 
-   return pmcsr & PCI_PM_CTRL_NO_SOFT_RESET;
-}
+   if (virtio_device_can_suspend(&vp_dev->vdev))
+   return virtio_device_suspend(&vp_dev->vdev);
 
-static int virtio_pci_suspend(struct device *dev)
-{
-   return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_freeze(dev);
+   return virtio_pci_freeze(dev);
 }
 
 static int virtio_pci_resume(struct device *dev)
 {
-   return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_restore(dev);
+   struct pci_dev *pci_dev = to_pci_dev(dev);
+   struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
+
+   if (virtio_device_can_resume(&vp_dev->vdev))
+   return virtio_device_resume(&vp_dev->vdev);
+
+   return virtio_pci_restore(dev);
 }
 
 static const struct dev_pm_ops virtio_pci_pm_ops = {
@@ -623,9 +618,12 @@ static void virtio_pci_remove(struct pci_dev *pci_dev)
 * Device is marked broken on surprise removal so that virtio upper
 * layers can abort any ongoing operation.
 */
-   if (!pci_device_is_present(pci_dev))
+   

[PATCH v2 0/1] virtio: Add suspend support

2024-04-22 Thread David Stevens
This series implements support for the virtio device suspend feature
that is under discussion. Unfortunately, the virtio mailing list is
currently being migrated, so recent discussion of the proposal is not
archived anywhere. The current version of the proposal is a
combination of [1] and [2].

[1] https://lore.kernel.org/all/20230906081637.32185-3-lingshan@intel.com/
[2] https://lists.oasis-open.org/archives/virtio-comment/202402/msg00088.html

v1 -> v2:
 - Check for device removal while waiting for suspend bit.
 - Don't try to suspend uninitialized devices.
 - Use msleep instead of mdelay.

David Stevens (1):
  virtio: Add support for the virtio suspend feature

 drivers/virtio/virtio.c| 60 ++
 drivers/virtio/virtio_pci_common.c | 34 -
 drivers/virtio/virtio_pci_modern.c | 19 ++
 include/linux/virtio.h |  8 
 include/uapi/linux/virtio_config.h | 10 -
 5 files changed, 112 insertions(+), 19 deletions(-)


base-commit: e8f897f4afef0031fe618a8e94127a0934896aba
-- 
2.44.0.769.g3c40516874-goog




<    1   2