The following patch seems applicable and might fix an issue observed in our enterprise support a while ago: containers run in their own cgroups and thus were probably not scanned by the kernel shrinker — this resulted in dnode cache sizes of 300+% of the limit being reported in arc_summary.
FWICT the issue was introduced in ZFS 2.2.7 (commit 5f73630e9cbea5efa23d16809f06e0d08523b241 see: https://github.com/openzfs/zfs/issues/17052#issuecomment-3065907783) but I assume that the increase of zfs_arc_max by default makes it trigger OOMs far easier. The discussion of the PR was quite instructive: https://github.com/openzfs/zfs/pull/17542 minimally tested on a pair of trixie VMs (building + running replication of a couple of containers) Suggested-by: Thomas Lamprecht <t.lampre...@proxmox.com> Signed-off-by: Stoiko Ivanov <s.iva...@proxmox.com> --- .../0010-enforce-arc_dnode_limit.patch | 216 ++++++++++++++++++ debian/patches/series | 1 + 2 files changed, 217 insertions(+) create mode 100644 debian/patches/0010-enforce-arc_dnode_limit.patch diff --git a/debian/patches/0010-enforce-arc_dnode_limit.patch b/debian/patches/0010-enforce-arc_dnode_limit.patch new file mode 100644 index 000000000..38814bfab --- /dev/null +++ b/debian/patches/0010-enforce-arc_dnode_limit.patch @@ -0,0 +1,216 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: shodanshok <g.da...@assyoma.it> +Date: Mon, 21 Jul 2025 19:32:01 +0200 +Subject: [PATCH] enforce arc_dnode_limit + +Linux kernel shrinker in the context of null/root memcg does not scan +dentry and inode caches added by a task running in non-root memcg. For +ZFS this means that dnode cache routinely overflows, evicting valuable +meta/data and putting additional memory pressure on the system. + +This patch restores zfs_prune_aliases as fallback when the kernel +shrinker does nothing, enabling zfs to actually free dnodes. Moreover, +it (indirectly) calls arc_evict when dnode_size > dnode_limit. 
+ +Reviewed-by: Rob Norris <r...@despairlabs.com> +Reviewed-by: Alexander Motin <m...@freebsd.org> +Reviewed-by: Brian Behlendorf <behlendo...@llnl.gov> +Signed-off-by: Gionatan Danti <g.da...@assyoma.it> +Closes #17487 +Closes #17542 +(cherry picked from commit a7a144e655850b4160943e4ba315eb9a5dc2b2fe) +Signed-off-by: Stoiko Ivanov <s.iva...@proxmox.com> +--- + include/sys/arc_impl.h | 2 +- + module/os/linux/zfs/zfs_vfsops.c | 65 ++++++++++++++++++++++++++++++++ + module/zfs/arc.c | 22 ++++++----- + 3 files changed, 78 insertions(+), 11 deletions(-) + +diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h +index 1b30389107c55d558ecc4a21f7471bc03e4155a4..b55d5da3378c0608c62b0004848baeca925eda4a 100644 +--- a/include/sys/arc_impl.h ++++ b/include/sys/arc_impl.h +@@ -954,7 +954,7 @@ typedef struct arc_sums { + wmsum_t arcstat_data_size; + wmsum_t arcstat_metadata_size; + wmsum_t arcstat_dbuf_size; +- wmsum_t arcstat_dnode_size; ++ aggsum_t arcstat_dnode_size; + wmsum_t arcstat_bonus_size; + wmsum_t arcstat_l2_hits; + wmsum_t arcstat_l2_misses; +diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c +index 56af4fe0a4648fa6a94ebc3e0bdb88fc95836aa6..7961549e637bd080f74e8a4fde9560f7bd86f179 100644 +--- a/module/os/linux/zfs/zfs_vfsops.c ++++ b/module/os/linux/zfs/zfs_vfsops.c +@@ -1176,6 +1176,63 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp) + return (error); + } + ++/* ++ * Dentry and inode caches referenced by a task in non-root memcg are ++ * not going to be scanned by the kernel-provided shrinker. So, if ++ * kernel prunes nothing, fall back to this manual walk to free dnodes. ++ * To avoid scanning the same znodes multiple times they are always rotated ++ * to the end of the z_all_znodes list. New znodes are inserted at the ++ * end of the list so we're always scanning the oldest znodes first. 
++ */ ++static int ++zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) ++{ ++ znode_t **zp_array, *zp; ++ int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *)); ++ int objects = 0; ++ int i = 0, j = 0; ++ ++ zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP); ++ ++ mutex_enter(&zfsvfs->z_znodes_lock); ++ while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) { ++ ++ if ((i++ > nr_to_scan) || (j >= max_array)) ++ break; ++ ++ ASSERT(list_link_active(&zp->z_link_node)); ++ list_remove(&zfsvfs->z_all_znodes, zp); ++ list_insert_tail(&zfsvfs->z_all_znodes, zp); ++ ++ /* Skip active znodes and .zfs entries */ ++ if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir) ++ continue; ++ ++ if (igrab(ZTOI(zp)) == NULL) ++ continue; ++ ++ zp_array[j] = zp; ++ j++; ++ } ++ mutex_exit(&zfsvfs->z_znodes_lock); ++ ++ for (i = 0; i < j; i++) { ++ zp = zp_array[i]; ++ ++ ASSERT3P(zp, !=, NULL); ++ d_prune_aliases(ZTOI(zp)); ++ ++ if (atomic_read(&ZTOI(zp)->i_count) == 1) ++ objects++; ++ ++ zrele(zp); ++ } ++ ++ vmem_free(zp_array, max_array * sizeof (znode_t *)); ++ ++ return (objects); ++} ++ + /* + * The ARC has requested that the filesystem drop entries from the dentry + * and inode caches. This can occur when the ARC needs to free meta data +@@ -1227,6 +1284,14 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) + *objects = (*shrinker->scan_objects)(shrinker, &sc); + #endif + ++ /* ++ * Fall back to zfs_prune_aliases if kernel's shrinker did nothing ++ * due to dentry and inode caches being referenced by a task running ++ * in non-root memcg. 
++ */ ++ if (*objects == 0) ++ *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); ++ + zfs_exit(zfsvfs, FTAG); + + dprintf_ds(zfsvfs->z_os->os_dsl_dataset, +diff --git a/module/zfs/arc.c b/module/zfs/arc.c +index 75be2b02a7e5c6ed3f97377c05c062b9199c3dd3..520171eae37b8b3a7eae8991d4a8b5dc71846db9 100644 +--- a/module/zfs/arc.c ++++ b/module/zfs/arc.c +@@ -2631,7 +2631,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type) + ARCSTAT_INCR(arcstat_bonus_size, space); + break; + case ARC_SPACE_DNODE: +- ARCSTAT_INCR(arcstat_dnode_size, space); ++ aggsum_add(&arc_sums.arcstat_dnode_size, space); + break; + case ARC_SPACE_DBUF: + ARCSTAT_INCR(arcstat_dbuf_size, space); +@@ -2677,7 +2677,7 @@ arc_space_return(uint64_t space, arc_space_type_t type) + ARCSTAT_INCR(arcstat_bonus_size, -space); + break; + case ARC_SPACE_DNODE: +- ARCSTAT_INCR(arcstat_dnode_size, -space); ++ aggsum_add(&arc_sums.arcstat_dnode_size, -space); + break; + case ARC_SPACE_DBUF: + ARCSTAT_INCR(arcstat_dbuf_size, -space); +@@ -4490,7 +4490,7 @@ arc_evict(void) + * target is not evictable or if they go over arc_dnode_limit. + */ + int64_t prune = 0; +- int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size); ++ int64_t dn = aggsum_value(&arc_sums.arcstat_dnode_size); + int64_t nem = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) + - zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) +@@ -5082,11 +5082,13 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve) + * in the ARC. In practice, that's in the tens of MB, which is low + * enough to be safe. + */ +- int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c - ++ int64_t arc_over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c - + zfs_max_recordsize; ++ int64_t dn_over = aggsum_lower_bound(&arc_sums.arcstat_dnode_size) - ++ arc_dnode_limit; + + /* Always allow at least one block of overflow. 
*/ +- if (over < 0) ++ if (arc_over < 0 && dn_over <= 0) + return (ARC_OVF_NONE); + + /* If we are under memory pressure, report severe overflow. */ +@@ -5097,7 +5099,7 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve) + int64_t overflow = (arc_c >> zfs_arc_overflow_shift) / 2; + if (use_reserve) + overflow *= 3; +- return (over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE); ++ return (arc_over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE); + } + + static abd_t * +@@ -7343,7 +7345,7 @@ arc_kstat_update(kstat_t *ksp, int rw) + #if defined(COMPAT_FREEBSD11) + as->arcstat_other_size.value.ui64 = + wmsum_value(&arc_sums.arcstat_bonus_size) + +- wmsum_value(&arc_sums.arcstat_dnode_size) + ++ aggsum_value(&arc_sums.arcstat_dnode_size) + + wmsum_value(&arc_sums.arcstat_dbuf_size); + #endif + +@@ -7385,7 +7387,7 @@ arc_kstat_update(kstat_t *ksp, int rw) + &as->arcstat_uncached_evictable_metadata); + + as->arcstat_dnode_size.value.ui64 = +- wmsum_value(&arc_sums.arcstat_dnode_size); ++ aggsum_value(&arc_sums.arcstat_dnode_size); + as->arcstat_bonus_size.value.ui64 = + wmsum_value(&arc_sums.arcstat_bonus_size); + as->arcstat_l2_hits.value.ui64 = +@@ -7755,7 +7757,7 @@ arc_state_init(void) + wmsum_init(&arc_sums.arcstat_data_size, 0); + wmsum_init(&arc_sums.arcstat_metadata_size, 0); + wmsum_init(&arc_sums.arcstat_dbuf_size, 0); +- wmsum_init(&arc_sums.arcstat_dnode_size, 0); ++ aggsum_init(&arc_sums.arcstat_dnode_size, 0); + wmsum_init(&arc_sums.arcstat_bonus_size, 0); + wmsum_init(&arc_sums.arcstat_l2_hits, 0); + wmsum_init(&arc_sums.arcstat_l2_misses, 0); +@@ -7914,7 +7916,7 @@ arc_state_fini(void) + wmsum_fini(&arc_sums.arcstat_data_size); + wmsum_fini(&arc_sums.arcstat_metadata_size); + wmsum_fini(&arc_sums.arcstat_dbuf_size); +- wmsum_fini(&arc_sums.arcstat_dnode_size); ++ aggsum_fini(&arc_sums.arcstat_dnode_size); + wmsum_fini(&arc_sums.arcstat_bonus_size); + wmsum_fini(&arc_sums.arcstat_l2_hits); + wmsum_fini(&arc_sums.arcstat_l2_misses); diff --git 
a/debian/patches/series b/debian/patches/series index f3f297e33..d59978cbb 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -7,3 +7,4 @@ 0007-Add-systemd-unit-for-importing-specific-pools.patch 0008-Patch-move-manpage-arcstat-1-to-arcstat-8.patch 0009-zpool-status-tighten-bounds-for-noalloc-stat-availab.patch +0010-enforce-arc_dnode_limit.patch -- 2.39.5 _______________________________________________ pve-devel mailing list pve-devel@lists.proxmox.com https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel