Re: [Cluster-devel] [PATCH v2 11/47] gfs2: dynamically allocate the gfs2-qd shrinker
On 2023/7/24 17:43, Qi Zheng wrote: Use new APIs to dynamically allocate the gfs2-qd shrinker. Signed-off-by: Qi Zheng --- fs/gfs2/main.c | 6 +++--- fs/gfs2/quota.c | 26 -- fs/gfs2/quota.h | 3 ++- 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index afcb32854f14..e47b1cc79f59 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -147,7 +147,7 @@ static int __init init_gfs2_fs(void) if (!gfs2_trans_cachep) goto fail_cachep8; - error = register_shrinker(&gfs2_qd_shrinker, "gfs2-qd"); + error = gfs2_qd_shrinker_init(); if (error) goto fail_shrinker; @@ -196,7 +196,7 @@ static int __init init_gfs2_fs(void) fail_wq2: destroy_workqueue(gfs_recovery_wq); fail_wq1: - unregister_shrinker(&gfs2_qd_shrinker); + gfs2_qd_shrinker_exit(); fail_shrinker: kmem_cache_destroy(gfs2_trans_cachep); fail_cachep8: @@ -229,7 +229,7 @@ static int __init init_gfs2_fs(void) static void __exit exit_gfs2_fs(void) { - unregister_shrinker(&gfs2_qd_shrinker); + gfs2_qd_shrinker_exit(); gfs2_glock_exit(); gfs2_unregister_debugfs(); unregister_filesystem(&gfs2_fs_type); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 704192b73605..bc9883cea847 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -186,13 +186,27 @@ static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc)); } -struct shrinker gfs2_qd_shrinker = { - .count_objects = gfs2_qd_shrink_count, - .scan_objects = gfs2_qd_shrink_scan, - .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_NUMA_AWARE, -}; +static struct shrinker *gfs2_qd_shrinker; + +int gfs2_qd_shrinker_init(void) It's better to declare this as __init. 
+{ + gfs2_qd_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "gfs2-qd"); + if (!gfs2_qd_shrinker) + return -ENOMEM; + + gfs2_qd_shrinker->count_objects = gfs2_qd_shrink_count; + gfs2_qd_shrinker->scan_objects = gfs2_qd_shrink_scan; + gfs2_qd_shrinker->seeks = DEFAULT_SEEKS; + + shrinker_register(gfs2_qd_shrinker); + return 0; +} + +void gfs2_qd_shrinker_exit(void) +{ + shrinker_unregister(gfs2_qd_shrinker); +} static u64 qd2index(struct gfs2_quota_data *qd) { diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 21ada332d555..f9cb863373f7 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -59,7 +59,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, } extern const struct quotactl_ops gfs2_quotactl_ops; -extern struct shrinker gfs2_qd_shrinker; +int gfs2_qd_shrinker_init(void); +void gfs2_qd_shrinker_exit(void); extern struct list_lru gfs2_qd_lru; extern void __init gfs2_quota_hash_init(void);
Re: [Cluster-devel] [PATCH v2 10/47] gfs2: dynamically allocate the gfs2-glock shrinker
> On Jul 24, 2023, at 17:43, Qi Zheng wrote: > > Use new APIs to dynamically allocate the gfs2-glock shrinker. > > Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Thanks.
[Cluster-devel] [PATCH v6 5/7] xfs: switch to multigrain timestamps
Enable multigrain timestamps, which should ensure that there is an apparent change to the timestamp whenever it has been written after being actively observed via getattr. Also, anytime the mtime changes, the ctime must also change, and those are now the only two options for xfs_trans_ichgtime. Have that function unconditionally bump the ctime, and ASSERT that XFS_ICHGTIME_CHG is always set. Signed-off-by: Jeff Layton --- fs/xfs/libxfs/xfs_trans_inode.c | 6 +++--- fs/xfs/xfs_iops.c | 4 ++-- fs/xfs/xfs_super.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 6b2296ff248a..ad22656376d3 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -62,12 +62,12 @@ xfs_trans_ichgtime( ASSERT(tp); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - tv = current_time(inode); + /* If the mtime changes, then ctime must also change */ + ASSERT(flags & XFS_ICHGTIME_CHG); + tv = inode_set_ctime_current(inode); if (flags & XFS_ICHGTIME_MOD) inode->i_mtime = tv; - if (flags & XFS_ICHGTIME_CHG) - inode_set_ctime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_CREATE) ip->i_crtime = tv; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 3a9363953ef2..3f89ef5a2820 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -573,10 +573,10 @@ xfs_vn_getattr( stat->gid = vfsgid_into_kgid(vfsgid); stat->ino = ip->i_ino; stat->atime = inode->i_atime; - stat->mtime = inode->i_mtime; - stat->ctime = inode_get_ctime(inode); stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks); + fill_mg_cmtime(request_mask, inode, stat); + if (xfs_has_v3inodes(mp)) { if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 818510243130..4b10edb2c972 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -2009,7 +2009,7 @@ static struct file_system_type xfs_fs_type = { .init_fs_context= xfs_init_fs_context, 
.parameters = xfs_fs_parameters, .kill_sb= kill_block_super, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("xfs"); -- 2.41.0
[Cluster-devel] [PATCH v6 6/7] ext4: switch to multigrain timestamps
Enable multigrain timestamps, which should ensure that there is an apparent change to the timestamp whenever it has been written after being actively observed via getattr. For ext4, we only need to enable the FS_MGTIME flag. Signed-off-by: Jeff Layton --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b54c70e1a74e..cb1ff47af156 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -7279,7 +7279,7 @@ static struct file_system_type ext4_fs_type = { .init_fs_context= ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb= kill_block_super, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("ext4"); -- 2.41.0
[Cluster-devel] [PATCH v6 7/7] btrfs: convert to multigrain timestamps
Enable multigrain timestamps, which should ensure that there is an apparent change to the timestamp whenever it has been written after being actively observed via getattr. Beyond enabling the FS_MGTIME flag, this patch eliminates update_time_for_write, which goes to great pains to avoid in-memory stores. Just have it overwrite the timestamps unconditionally. Signed-off-by: Jeff Layton --- fs/btrfs/file.c | 24 fs/btrfs/super.c | 5 +++-- 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d7a9ece7a40b..b9e75c9f95ac 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1106,25 +1106,6 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode) btrfs_drew_write_unlock(&inode->root->snapshot_lock); } -static void update_time_for_write(struct inode *inode) -{ - struct timespec64 now, ctime; - - if (IS_NOCMTIME(inode)) - return; - - now = current_time(inode); - if (!timespec64_equal(&inode->i_mtime, &now)) - inode->i_mtime = now; - - ctime = inode_get_ctime(inode); - if (!timespec64_equal(&ctime, &now)) - inode_set_ctime_to_ts(inode, now); - - if (IS_I_VERSION(inode)) - inode_inc_iversion(inode); -} - static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count) { @@ -1156,7 +1137,10 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, * need to start yet another transaction to update the inode as we will * update the inode when we finish writing whatever data we write. 
*/ - update_time_for_write(inode); + if (!IS_NOCMTIME(inode)) { + inode->i_mtime = inode_set_ctime_current(inode); + inode_inc_iversion(inode); + } start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f1dd172d8d5b..8eda51b095c9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2144,7 +2144,7 @@ static struct file_system_type btrfs_fs_type = { .name = "btrfs", .mount = btrfs_mount, .kill_sb= btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_MGTIME, }; static struct file_system_type btrfs_root_fs_type = { @@ -2152,7 +2152,8 @@ static struct file_system_type btrfs_root_fs_type = { .name = "btrfs", .mount = btrfs_mount_root, .kill_sb= btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | + FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("btrfs"); -- 2.41.0
[Cluster-devel] [PATCH v6 4/7] tmpfs: add support for multigrain timestamps
Enable multigrain timestamps, which should ensure that there is an apparent change to the timestamp whenever it has been written after being actively observed via getattr. tmpfs only requires the FS_MGTIME flag. Signed-off-by: Jeff Layton --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 654d9a585820..b6019c905058 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4264,7 +4264,7 @@ static struct file_system_type shmem_fs_type = { #endif .kill_sb= kill_litter_super, #ifdef CONFIG_SHMEM - .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP, + .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME, #else .fs_flags = FS_USERNS_MOUNT, #endif -- 2.41.0
[Cluster-devel] [PATCH v6 3/7] tmpfs: bump the mtime/ctime/iversion when page becomes writeable
Most filesystems that use the pagecache will update the mtime, ctime, and change attribute when a page becomes writeable. Add a page_mkwrite operation for tmpfs and just use it to bump the mtime, ctime and change attribute. This fixes xfstest generic/080 on tmpfs. Signed-off-by: Jeff Layton --- mm/shmem.c | 12 1 file changed, 12 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index b154af49d2df..654d9a585820 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2169,6 +2169,16 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) return ret; } +static vm_fault_t shmem_page_mkwrite(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct inode *inode = file_inode(vma->vm_file); + + file_update_time(vma->vm_file); + inode_inc_iversion(inode); + return 0; +} + unsigned long shmem_get_unmapped_area(struct file *file, unsigned long uaddr, unsigned long len, unsigned long pgoff, unsigned long flags) @@ -4210,6 +4220,7 @@ static const struct super_operations shmem_ops = { static const struct vm_operations_struct shmem_vm_ops = { .fault = shmem_fault, + .page_mkwrite = shmem_page_mkwrite, .map_pages = filemap_map_pages, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, @@ -4219,6 +4230,7 @@ static const struct vm_operations_struct shmem_vm_ops = { static const struct vm_operations_struct shmem_anon_vm_ops = { .fault = shmem_fault, + .page_mkwrite = shmem_page_mkwrite, .map_pages = filemap_map_pages, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, -- 2.41.0
[Cluster-devel] [PATCH v6 2/7] fs: add infrastructure for multigrain timestamps
The VFS always uses coarse-grained timestamps when updating the ctime and mtime after a change. This has the benefit of allowing filesystems to optimize away a lot of metadata updates, down to around 1 per jiffy, even when a file is under heavy writes. Unfortunately, this has always been an issue when we're exporting via NFSv3, which relies on timestamps to validate caches. A lot of changes can happen in a jiffy, so timestamps aren't sufficient to help the client decide to invalidate the cache. Even with NFSv4, a lot of exported filesystems don't properly support a change attribute and are subject to the same problems with timestamp granularity. Other applications have similar issues with timestamps (e.g. backup applications). If we were to always use fine-grained timestamps, that would improve the situation, but that becomes rather expensive, as the underlying filesystem would have to log a lot more metadata updates. What we need is a way to only use fine-grained timestamps when they are being actively queried. POSIX generally mandates that when the mtime changes, the ctime must also change. The kernel always stores normalized ctime values, so only the first 30 bits of the tv_nsec field are ever used. Use the 31st bit of the ctime tv_nsec field to indicate that something has queried the inode for the mtime or ctime. When this flag is set, on the next mtime or ctime update, the kernel will fetch a fine-grained timestamp instead of the usual coarse-grained one. Filesystems can opt into this behavior by setting the FS_MGTIME flag in the fstype. Filesystems that don't set this flag will continue to use coarse-grained timestamps. Later patches will convert individual filesystems to use the new infrastructure. 
Signed-off-by: Jeff Layton --- fs/inode.c | 98 ++ fs/stat.c | 41 +-- include/linux/fs.h | 45 +++-- 3 files changed, 151 insertions(+), 33 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index d4ab92233062..369621e7faf5 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1919,6 +1919,21 @@ int inode_update_time(struct inode *inode, struct timespec64 *time, int flags) } EXPORT_SYMBOL(inode_update_time); +/** + * current_coarse_time - Return FS time + * @inode: inode. + * + * Return the current coarse-grained time truncated to the time + * granularity supported by the fs. + */ +static struct timespec64 current_coarse_time(struct inode *inode) +{ + struct timespec64 now; + + ktime_get_coarse_real_ts64(&now); + return timestamp_truncate(now, inode); +} + /** * atime_needs_update - update the access time * @path: the &struct path to update @@ -1952,7 +1967,7 @@ bool atime_needs_update(const struct path *path, struct inode *inode) if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) return false; - now = current_time(inode); + now = current_coarse_time(inode); if (!relatime_need_update(mnt, inode, now)) return false; @@ -1986,7 +2001,7 @@ void touch_atime(const struct path *path) * We may also fail on filesystems that have the ability to make parts * of the fs read only, e.g. subvolumes in Btrfs. */ - now = current_time(inode); + now = current_coarse_time(inode); inode_update_time(inode, &now, S_ATIME); __mnt_drop_write(mnt); skip_update: @@ -2072,6 +2087,56 @@ int file_remove_privs(struct file *file) } EXPORT_SYMBOL(file_remove_privs); +/** + * current_mgtime - Return FS time (possibly fine-grained) + * @inode: inode. + * + * Return the current time truncated to the time granularity supported by + * the fs, as suitable for a ctime/mtime change. If the ctime is flagged + * as having been QUERIED, get a fine-grained timestamp. 
+ */ +static struct timespec64 current_mgtime(struct inode *inode) +{ + struct timespec64 now; + atomic_long_t *pnsec = (atomic_long_t *)&inode->__i_ctime.tv_nsec; + long nsec = atomic_long_read(pnsec); + + if (nsec & I_CTIME_QUERIED) { + ktime_get_real_ts64(&now); + } else { + struct timespec64 ctime; + + ktime_get_coarse_real_ts64(&now); + + /* +* If we've recently fetched a fine-grained timestamp +* then the coarse-grained one may still be earlier than the +* existing one. Just keep the existing ctime if so. +*/ + ctime = inode_get_ctime(inode); + if (timespec64_compare(&ctime, &now) > 0) + now = ctime; + } + + return timestamp_truncate(now, inode); +} + +/** + * current_time - Return timestamp suitable for ctime update + * @inode: inode to eventually be updated + * + * Return the current time, which is usually coarse-grained but may be fine + * grained if the filesystem uses multigra
[Cluster-devel] [PATCH v6 1/7] fs: pass the request_mask to generic_fillattr
generic_fillattr just fills in the entire stat struct indiscriminately today, copying data from the inode. There is at least one attribute (STATX_CHANGE_COOKIE) that can have side effects when it is reported, and we're looking at adding more with the addition of multigrain timestamps. Add a request_mask argument to generic_fillattr and have most callers just pass in the value that is passed to getattr. Have other callers (e.g. ksmbd) just pass in STATX_BASIC_STATS. Also move the setting of STATX_CHANGE_COOKIE into generic_fillattr. Signed-off-by: Jeff Layton --- fs/9p/vfs_inode.c | 4 ++-- fs/9p/vfs_inode_dotl.c | 4 ++-- fs/afs/inode.c | 2 +- fs/btrfs/inode.c| 2 +- fs/ceph/inode.c | 2 +- fs/coda/inode.c | 3 ++- fs/ecryptfs/inode.c | 5 +++-- fs/erofs/inode.c| 2 +- fs/exfat/file.c | 2 +- fs/ext2/inode.c | 2 +- fs/ext4/inode.c | 2 +- fs/f2fs/file.c | 2 +- fs/fat/file.c | 2 +- fs/fuse/dir.c | 2 +- fs/gfs2/inode.c | 2 +- fs/hfsplus/inode.c | 2 +- fs/kernfs/inode.c | 2 +- fs/libfs.c | 4 ++-- fs/minix/inode.c| 2 +- fs/nfs/inode.c | 2 +- fs/nfs/namespace.c | 3 ++- fs/ntfs3/file.c | 2 +- fs/ocfs2/file.c | 2 +- fs/orangefs/inode.c | 2 +- fs/proc/base.c | 4 ++-- fs/proc/fd.c| 2 +- fs/proc/generic.c | 2 +- fs/proc/proc_net.c | 2 +- fs/proc/proc_sysctl.c | 2 +- fs/proc/root.c | 3 ++- fs/smb/client/inode.c | 2 +- fs/smb/server/smb2pdu.c | 22 +++--- fs/smb/server/vfs.c | 3 ++- fs/stat.c | 18 ++ fs/sysv/itree.c | 3 ++- fs/ubifs/dir.c | 2 +- fs/udf/symlink.c| 2 +- fs/vboxsf/utils.c | 2 +- include/linux/fs.h | 2 +- mm/shmem.c | 2 +- 40 files changed, 70 insertions(+), 62 deletions(-) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 16d85e6033a3..d24d1f20e922 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1016,7 +1016,7 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { - generic_fillattr(&nop_mnt_idmap, inode, stat); 
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } else if (v9ses->cache & CACHE_WRITEBACK) { if (S_ISREG(inode->i_mode)) { @@ -1037,7 +1037,7 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path, return PTR_ERR(st); v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0); - generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat); p9stat_free(st); kfree(st); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 464ea73d1bf8..8e8d5d2a13d8 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -451,7 +451,7 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } else if (v9ses->cache) { if (S_ISREG(inode->i_mode)) { @@ -476,7 +476,7 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap, return PTR_ERR(st); v9fs_stat2inode_dotl(st, d_inode(dentry), 0); - generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat); /* Change block size to what the server returned */ stat->blksize = st->st_blksize; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 6b636f43f548..1c794a1896aa 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -773,7 +773,7 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path, do { read_seqbegin_or_lock(&vnode->cb_lock, &seq); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) && stat->nlink > 0) stat->nlink -= 1; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bcccd551f547..7346059209aa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8773,7 +8773,7 @@ static int 
btrfs_getattr(struct mnt_idmap *idmap, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(idmap, inode,
[Cluster-devel] [PATCH v6 0/7] fs: implement multigrain timestamps
The VFS always uses coarse-grained timestamps when updating the ctime and mtime after a change. This has the benefit of allowing filesystems to optimize away a lot of metadata updates, down to around 1 per jiffy, even when a file is under heavy writes. Unfortunately, this coarseness has always been an issue when we're exporting via NFSv3, which relies on timestamps to validate caches. A lot of changes can happen in a jiffy, so timestamps aren't sufficient to help the client decide to invalidate the cache. Even with NFSv4, a lot of exported filesystems don't properly support a change attribute and are subject to the same problems with timestamp granularity. Other applications have similar issues with timestamps (e.g. backup applications). If we were to always use fine-grained timestamps, that would improve the situation, but that becomes rather expensive, as the underlying filesystem would have to log a lot more metadata updates. What we need is a way to only use fine-grained timestamps when they are being actively queried. The idea is to use an unused bit in the ctime's tv_nsec field to mark when the mtime or ctime has been queried via getattr. Once that has been marked, the next m/ctime update will use a fine-grained timestamp. This patch series is based on top of Christian's vfs.all branch, which has the recent conversion to the new ctime accessors. It should apply cleanly on top of linux-next. The first two patches should probably go in via the vfs tree. Should the fs-specific patches go in that way as well, or should they go via maintainer trees? Either should be fine. The first two patches should probably go in via Christian's vfs tree. The rest could go via maintainer trees or the vfs tree. For now, I'd like to get these into linux-next. Christian, would you be willing to pick these up for now? Alternately, I can feed them there via the iversion branch that Stephen is already pulling in from my tree. 
Signed-off-by: Jeff Layton base-commit: cf22d118b89a09a0160586412160d89098f7c4c7 --- Changes in v6: - drop the patch that removed XFS_ICHGTIME_CHG - change WARN_ON_ONCE to ASSERT in xfs conversion patch --- Jeff Layton (7): fs: pass the request_mask to generic_fillattr fs: add infrastructure for multigrain timestamps tmpfs: bump the mtime/ctime/iversion when page becomes writeable tmpfs: add support for multigrain timestamps xfs: switch to multigrain timestamps ext4: switch to multigrain timestamps btrfs: convert to multigrain timestamps fs/9p/vfs_inode.c | 4 +- fs/9p/vfs_inode_dotl.c | 4 +- fs/afs/inode.c | 2 +- fs/btrfs/file.c | 24 ++ fs/btrfs/inode.c| 2 +- fs/btrfs/super.c| 5 ++- fs/ceph/inode.c | 2 +- fs/coda/inode.c | 3 +- fs/ecryptfs/inode.c | 5 ++- fs/erofs/inode.c| 2 +- fs/exfat/file.c | 2 +- fs/ext2/inode.c | 2 +- fs/ext4/inode.c | 2 +- fs/ext4/super.c | 2 +- fs/f2fs/file.c | 2 +- fs/fat/file.c | 2 +- fs/fuse/dir.c | 2 +- fs/gfs2/inode.c | 2 +- fs/hfsplus/inode.c | 2 +- fs/inode.c | 98 + fs/kernfs/inode.c | 2 +- fs/libfs.c | 4 +- fs/minix/inode.c| 2 +- fs/nfs/inode.c | 2 +- fs/nfs/namespace.c | 3 +- fs/ntfs3/file.c | 2 +- fs/ocfs2/file.c | 2 +- fs/orangefs/inode.c | 2 +- fs/proc/base.c | 4 +- fs/proc/fd.c| 2 +- fs/proc/generic.c | 2 +- fs/proc/proc_net.c | 2 +- fs/proc/proc_sysctl.c | 2 +- fs/proc/root.c | 3 +- fs/smb/client/inode.c | 2 +- fs/smb/server/smb2pdu.c | 22 - fs/smb/server/vfs.c | 3 +- fs/stat.c | 59 - fs/sysv/itree.c | 3 +- fs/ubifs/dir.c | 2 +- fs/udf/symlink.c| 2 +- fs/vboxsf/utils.c | 2 +- fs/xfs/libxfs/xfs_trans_inode.c | 6 +-- fs/xfs/xfs_iops.c | 4 +- fs/xfs/xfs_super.c | 2 +- include/linux/fs.h | 47 ++-- mm/shmem.c | 16 ++- 47 files changed, 248 insertions(+), 125 deletions(-) --- base-commit: 810b5fff7917119ea82ff96e312e2d4350d6b681 change-id: 20230713-mgctime-f2a9fc324918 Best regards, -- Jeff Layton
Re: [Cluster-devel] [PATCHv3 v6.5-rc2 3/3] fs: dlm: fix F_CANCELLK to cancel pending request
Hi, On Tue, Jul 18, 2023 at 2:07 PM Alexander Aring wrote: > > This patch fixes the current handling of F_CANCELLK by not just doing a > unlock as we need to try to cancel a lock at first. A unlock makes sense > on a non-blocking lock request but if it's a blocking lock request we > need to cancel the request until it's not granted yet. This patch is fixing > this behaviour by first try to cancel a lock request and if it's failed > it's unlocking the lock which seems to be granted. > > Note: currently the nfs locking handling was disabled by commit > 40595cdc93ed ("nfs: block notification on fs with its own ->lock"). > However DLM was never being updated regarding to this change. Future > patches will try to fix lockd lock requests for DLM. This patch is > currently assuming the upstream DLM lockd handling is correct. > > Signed-off-by: Alexander Aring > --- > fs/dlm/plock.c| 102 +- > fs/gfs2/file.c| 9 ++-- > fs/ocfs2/stack_user.c | 13 ++--- > include/linux/dlm_plock.h | 2 + > 4 files changed, 97 insertions(+), 29 deletions(-) > > diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c > index a8ffa0760913..84510994b177 100644 > --- a/fs/dlm/plock.c > +++ b/fs/dlm/plock.c > @@ -42,6 +42,27 @@ static inline void set_version(struct dlm_plock_info *info) > info->version[2] = DLM_PLOCK_VERSION_PATCH; > } > > +static struct plock_op *plock_lookup_waiter(const struct dlm_plock_info > *info) > +{ > + struct plock_op *op = NULL, *iter; > + > + list_for_each_entry(iter, &recv_list, list) { > + if (iter->info.fsid == info->fsid && > + iter->info.number == info->number && > + iter->info.owner == info->owner && > + iter->info.pid == info->pid && > + iter->info.start == info->start && > + iter->info.end == info->end && > + iter->info.ex == info->ex && > + iter->info.wait) { > + op = iter; > + break; > + } > + } > + > + return op; > +} > + > static int check_version(struct dlm_plock_info *info) > { > if ((DLM_PLOCK_VERSION_MAJOR != info->version[0]) || > @@ -334,6 +355,73 @@ int 
dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 > number, struct file *file, > } > EXPORT_SYMBOL_GPL(dlm_posix_unlock); > > +/* > + * NOTE: This implementation can only handle async lock requests as nfs > + * do it. It cannot handle cancellation of a pending lock request sitting > + * in wait_event(), but for now only nfs is the only user local kernel > + * user. > + */ > +int dlm_posix_cancel(dlm_lockspace_t *lockspace, u64 number, struct file > *file, > +struct file_lock *fl) > +{ > + struct dlm_plock_info info; > + struct plock_op *op; > + struct dlm_ls *ls; > + int rv; > + > + /* this only works for async request for now and nfs is the only > +* kernel user right now. > +*/ > + if (WARN_ON_ONCE(!fl->fl_lmops || !fl->fl_lmops->lm_grant)) > + return -EOPNOTSUPP; > + > + ls = dlm_find_lockspace_local(lockspace); > + if (!ls) > + return -EINVAL; > + > + info.pid = fl->fl_pid; > + info.ex = (fl->fl_type == F_WRLCK); > + info.fsid = ls->ls_global_id; > + dlm_put_lockspace(ls); > + info.number = number; > + info.start = fl->fl_start; > + info.end = fl->fl_end; > + info.owner = (__u64)fl->fl_pid; > + > + rv = do_lock_cancel(&info); > + switch (rv) { > + case 0: > + spin_lock(&ops_lock); > + /* lock request to cancel must be on recv_list because > +* do_lock_cancel() synchronizes it. > +*/ > + op = plock_lookup_waiter(&info); > + if (WARN_ON_ONCE(!op)) { > + rv = -ENOLCK; > + break; missing spin_unlock() here. I will add it to my upcoming patch series. - Alex
Re: [Cluster-devel] [PATCH v2 03/47] mm: shrinker: add infrastructure for dynamically allocating shrinker
Hi Muchun, On 2023/7/25 17:02, Muchun Song wrote: On 2023/7/24 17:43, Qi Zheng wrote: Currently, the shrinker instances can be divided into the following three types: a) global shrinker instance statically defined in the kernel, such as workingset_shadow_shrinker. b) global shrinker instance statically defined in the kernel modules, such as mmu_shrinker in x86. c) shrinker instance embedded in other structures. For case a, the memory of shrinker instance is never freed. For case b, the memory of shrinker instance will be freed after synchronize_rcu() when the module is unloaded. For case c, the memory of shrinker instance will be freed along with the structure it is embedded in. In preparation for implementing lockless slab shrink, we need to dynamically allocate those shrinker instances in case c, then the memory can be dynamically freed alone by calling kfree_rcu(). So this commit adds the following new APIs for dynamically allocating shrinker, and add a private_data field to struct shrinker to record and get the original embedded structure. 1. shrinker_alloc() Used to allocate shrinker instance itself and related memory, it will return a pointer to the shrinker instance on success and NULL on failure. 2. shrinker_free_non_registered() Used to destroy the non-registered shrinker instance. At least I don't like this name. I know you want to tell others this function only should be called when shrinker has not been registered but allocated. Maybe shrinker_free() is more simple. And add a comment to tell the users when to use it. OK, if no one else objects, I will change it to shrinker_free() in the next version. 3. shrinker_register() Used to register the shrinker instance, which is same as the current register_shrinker_prepared(). 4. shrinker_unregister() Used to unregister and free the shrinker instance. 
In order to simplify shrinker-related APIs and make shrinker more independent of other kernel mechanisms, subsequent submissions will use the above API to convert all shrinkers (including case a and b) to dynamically allocated, and then remove all existing APIs. This will also have another advantage mentioned by Dave Chinner: ``` The other advantage of this is that it will break all the existing out of tree code and third party modules using the old API and will no longer work with a kernel using lockless slab shrinkers. They need to break (both at the source and binary levels) to stop bad things from happening due to using uncoverted shrinkers in the new setup. ``` Signed-off-by: Qi Zheng --- include/linux/shrinker.h | 6 +++ mm/shrinker.c | 113 +++ 2 files changed, 119 insertions(+) diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 961cb84e51f5..296f5e163861 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -70,6 +70,8 @@ struct shrinker { int seeks; /* seeks to recreate an obj */ unsigned flags; + void *private_data; + /* These are for internal use */ struct list_head list; #ifdef CONFIG_MEMCG @@ -98,6 +100,10 @@ struct shrinker { unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); +struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...); +void shrinker_free_non_registered(struct shrinker *shrinker); +void shrinker_register(struct shrinker *shrinker); +void shrinker_unregister(struct shrinker *shrinker); extern int __printf(2, 3) prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...); diff --git a/mm/shrinker.c b/mm/shrinker.c index 0a32ef42f2a7..d820e4cc5806 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -548,6 +548,119 @@ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, return freed; } +struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...) 
+{ + struct shrinker *shrinker; + unsigned int size; + va_list __maybe_unused ap; + int err; + + shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL); + if (!shrinker) + return NULL; + +#ifdef CONFIG_SHRINKER_DEBUG + va_start(ap, fmt); + shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); + va_end(ap); + if (!shrinker->name) + goto err_name; +#endif So why not introduce another helper to handle this and declare it as a void function when !CONFIG_SHRINKER_DEBUG? Something like the following: #ifdef CONFIG_SHRINKER_DEBUG static int shrinker_debugfs_name_alloc(struct shrinker *shrinker, const char *fmt, va_list vargs) { shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, vargs); return shrinker->name ? 0 : -ENOMEM; } #else static int shrinker_debugfs_name_alloc(struct shrinker *shrinker, const char *fmt, va_list vargs) { return 0; } #endif Will do in the next versio
Re: [Cluster-devel] [PATCH v2 03/47] mm: shrinker: add infrastructure for dynamically allocating shrinker
On 2023/7/24 17:43, Qi Zheng wrote: Currently, the shrinker instances can be divided into the following three types: a) global shrinker instance statically defined in the kernel, such as workingset_shadow_shrinker. b) global shrinker instance statically defined in the kernel modules, such as mmu_shrinker in x86. c) shrinker instance embedded in other structures. For case a, the memory of shrinker instance is never freed. For case b, the memory of shrinker instance will be freed after synchronize_rcu() when the module is unloaded. For case c, the memory of shrinker instance will be freed along with the structure it is embedded in. In preparation for implementing lockless slab shrink, we need to dynamically allocate those shrinker instances in case c, then the memory can be dynamically freed alone by calling kfree_rcu(). So this commit adds the following new APIs for dynamically allocating shrinker, and add a private_data field to struct shrinker to record and get the original embedded structure. 1. shrinker_alloc() Used to allocate shrinker instance itself and related memory, it will return a pointer to the shrinker instance on success and NULL on failure. 2. shrinker_free_non_registered() Used to destroy the non-registered shrinker instance. At least I don't like this name. I know you want to tell others that this function should only be called when the shrinker has been allocated but not yet registered. Maybe shrinker_free() is simpler. And add a comment to tell users when to use it. 3. shrinker_register() Used to register the shrinker instance, which is same as the current register_shrinker_prepared(). 4. shrinker_unregister() Used to unregister and free the shrinker instance. In order to simplify shrinker-related APIs and make shrinker more independent of other kernel mechanisms, subsequent submissions will use the above API to convert all shrinkers (including case a and b) to dynamically allocated, and then remove all existing APIs. 
This will also have another advantage mentioned by Dave Chinner: ``` The other advantage of this is that it will break all the existing out of tree code and third party modules using the old API and will no longer work with a kernel using lockless slab shrinkers. They need to break (both at the source and binary levels) to stop bad things from happening due to using uncoverted shrinkers in the new setup. ``` Signed-off-by: Qi Zheng --- include/linux/shrinker.h | 6 +++ mm/shrinker.c| 113 +++ 2 files changed, 119 insertions(+) diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 961cb84e51f5..296f5e163861 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -70,6 +70,8 @@ struct shrinker { int seeks; /* seeks to recreate an obj */ unsigned flags; + void *private_data; + /* These are for internal use */ struct list_head list; #ifdef CONFIG_MEMCG @@ -98,6 +100,10 @@ struct shrinker { unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); +struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...); +void shrinker_free_non_registered(struct shrinker *shrinker); +void shrinker_register(struct shrinker *shrinker); +void shrinker_unregister(struct shrinker *shrinker); extern int __printf(2, 3) prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...); diff --git a/mm/shrinker.c b/mm/shrinker.c index 0a32ef42f2a7..d820e4cc5806 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -548,6 +548,119 @@ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, return freed; } +struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...) 
+{ + struct shrinker *shrinker; + unsigned int size; + va_list __maybe_unused ap; + int err; + + shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL); + if (!shrinker) + return NULL; + +#ifdef CONFIG_SHRINKER_DEBUG + va_start(ap, fmt); + shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); + va_end(ap); + if (!shrinker->name) + goto err_name; +#endif So why not introduce another helper to handle this and declare it as a void function when !CONFIG_SHRINKER_DEBUG? Something like the following: #ifdef CONFIG_SHRINKER_DEBUG static int shrinker_debugfs_name_alloc(struct shrinker *shrinker, const char *fmt, va_list vargs) { shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, vargs); return shrinker->name ? 0 : -ENOMEM; } #else static int shrinker_debugfs_name_alloc(struct shrinker *shrinker, const char *fmt, va_list vargs) { return 0; } #endif + shrinker->flags = flags; + + if (flags & SHRINKER_MEMCG_AWARE) { +
Re: [Cluster-devel] [PATCH v2 01/47] mm: vmscan: move shrinker-related code into a separate file
> On Jul 24, 2023, at 17:43, Qi Zheng wrote: > > The mm/vmscan.c file is too large, so separate the shrinker-related > code from it into a separate file. No functional changes. > > Signed-off-by: Qi Zheng > --- > include/linux/shrinker.h | 3 + > mm/Makefile | 4 +- > mm/shrinker.c| 707 +++ > mm/vmscan.c | 701 -- > 4 files changed, 712 insertions(+), 703 deletions(-) > create mode 100644 mm/shrinker.c > > diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h > index 224293b2dd06..961cb84e51f5 100644 > --- a/include/linux/shrinker.h > +++ b/include/linux/shrinker.h > @@ -96,6 +96,9 @@ struct shrinker { > */ > #define SHRINKER_NONSLAB (1 << 3) > > +unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, > +int priority); A good cleanup; vmscan.c is so huge. I'd like to introduce a new header in the mm/ directory that contains the declarations of these functions (like this one and the other debug functions in shrinker_debug.c), since they are used internally across mm. Thanks.
Re: [Cluster-devel] [PATCH v2 07/47] xenbus/backend: dynamically allocate the xen-backend shrinker
> On Jul 24, 2023, at 17:43, Qi Zheng wrote: > > Use new APIs to dynamically allocate the xen-backend shrinker. > > Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Thanks.
Re: [Cluster-devel] [PATCH v2 09/47] f2fs: dynamically allocate the f2fs-shrinker
> On Jul 24, 2023, at 17:43, Qi Zheng wrote: > > Use new APIs to dynamically allocate the f2fs-shrinker. > > Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Thanks.
Re: [Cluster-devel] [PATCH v2 08/47] erofs: dynamically allocate the erofs-shrinker
> On Jul 24, 2023, at 17:43, Qi Zheng wrote: > > Use new APIs to dynamically allocate the erofs-shrinker. > > Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Thanks.
[Cluster-devel] [syzbot] [gfs2?] BUG: sleeping function called from invalid context in gfs2_make_fs_ro
Hello, syzbot found the following issue on: HEAD commit:46670259519f Merge tag 'for-6.5-rc2-tag' of git://git.kern.. git tree: upstream console+strace: https://syzkaller.appspot.com/x/log.txt?x=16bf15aea8 kernel config: https://syzkaller.appspot.com/x/.config?x=a4507c291b5ab5d4 dashboard link: https://syzkaller.appspot.com/bug?extid=60369f4775c014dd1804 compiler: Debian clang version 15.0.6, GNU ld (GNU Binutils for Debian) 2.40 syz repro: https://syzkaller.appspot.com/x/repro.syz?x=1602904ea8 C reproducer: https://syzkaller.appspot.com/x/repro.c?x=12d67e9ea8 Downloadable assets: disk image: https://storage.googleapis.com/syzbot-assets/f3b4b06a5f02/disk-46670259.raw.xz vmlinux: https://storage.googleapis.com/syzbot-assets/4db334f36495/vmlinux-46670259.xz kernel image: https://storage.googleapis.com/syzbot-assets/5977e704aeb2/bzImage-46670259.xz mounted in repro: https://storage.googleapis.com/syzbot-assets/053f03da9748/mount_0.gz IMPORTANT: if you fix the issue, please add the following tag to the commit: Reported-by: syzbot+60369f4775c014dd1...@syzkaller.appspotmail.com gfs2: fsid=syz:syz.0: found 1 quota changes syz-executor154: attempt to access beyond end of device loop0: rw=1, sector=131324, nr_sectors = 4 limit=32768 gfs2: fsid=syz:syz.0: Error 10 writing to journal, jid=0 gfs2: fsid=syz:syz.0: fatal: I/O error(s) gfs2: fsid=syz:syz.0: about to withdraw this file system BUG: sleeping function called from invalid context at kernel/sched/completion.c:101 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 5019, name: syz-executor154 preempt_count: 1, expected: 0 RCU nest depth: 0, expected: 0 5 locks held by syz-executor154/5019: #0: 8880297960e0 (&type->s_umount_key#47){+.+.}-{3:3}, at: deactivate_super+0xad/0xf0 fs/super.c:360 #1: 88802854cb78 (&sdp->sd_quota_sync_mutex){+.+.}-{3:3}, at: gfs2_quota_sync+0xa1/0x700 fs/gfs2/quota.c:1304 #2: 88802854d060 (&sdp->sd_log_flush_lock){}-{3:3}, at: gfs2_log_flush+0x105/0x25f0 fs/gfs2/log.c:1042 #3: 88802854ce88 
(&sdp->sd_log_lock){+.+.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline] #3: 88802854ce88 (&sdp->sd_log_lock){+.+.}-{2:2}, at: gfs2_log_lock fs/gfs2/log.h:32 [inline] #3: 88802854ce88 (&sdp->sd_log_lock){+.+.}-{2:2}, at: gfs2_flush_revokes+0x53/0x90 fs/gfs2/log.c:814 #4: 88802854d248 (&sdp->sd_freeze_mutex){+.+.}-{3:3}, at: signal_our_withdraw fs/gfs2/util.c:151 [inline] #4: 88802854d248 (&sdp->sd_freeze_mutex){+.+.}-{3:3}, at: gfs2_withdraw+0x477/0x11e0 fs/gfs2/util.c:334 Preemption disabled at: [<>] 0x0 CPU: 1 PID: 5019 Comm: syz-executor154 Not tainted 6.5.0-rc2-syzkaller-00066-g46670259519f #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/12/2023 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1e7/0x2d0 lib/dump_stack.c:106 __might_resched+0x5cf/0x780 kernel/sched/core.c:10189 __wait_for_common kernel/sched/completion.c:101 [inline] wait_for_common kernel/sched/completion.c:117 [inline] wait_for_completion+0x1b/0x60 kernel/sched/completion.c:138 kthread_stop+0x18e/0x5a0 kernel/kthread.c:710 gfs2_make_fs_ro+0x183/0x680 fs/gfs2/super.c:555 signal_our_withdraw fs/gfs2/util.c:153 [inline] gfs2_withdraw+0x48a/0x11e0 fs/gfs2/util.c:334 gfs2_ail1_empty+0x7d0/0x860 fs/gfs2/log.c:377 gfs2_flush_revokes+0x5e/0x90 fs/gfs2/log.c:815 revoke_lo_before_commit+0x2c/0x5f0 fs/gfs2/lops.c:868 lops_before_commit fs/gfs2/lops.h:40 [inline] gfs2_log_flush+0xc93/0x25f0 fs/gfs2/log.c:1101 do_sync+0xa35/0xc80 fs/gfs2/quota.c:977 gfs2_quota_sync+0x30e/0x700 fs/gfs2/quota.c:1320 gfs2_sync_fs+0x4d/0xb0 fs/gfs2/super.c:680 sync_filesystem+0xec/0x220 fs/sync.c:56 generic_shutdown_super+0x6f/0x340 fs/super.c:472 kill_block_super+0x68/0xa0 fs/super.c:1417 deactivate_locked_super+0xa4/0x110 fs/super.c:330 cleanup_mnt+0x426/0x4c0 fs/namespace.c:1254 task_work_run+0x24a/0x300 kernel/task_work.c:179 exit_task_work include/linux/task_work.h:38 [inline] do_exit+0x68f/0x2290 kernel/exit.c:874 do_group_exit+0x206/0x2c0 
kernel/exit.c:1024 __do_sys_exit_group kernel/exit.c:1035 [inline] __se_sys_exit_group kernel/exit.c:1033 [inline] __x64_sys_exit_group+0x3f/0x40 kernel/exit.c:1033 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7fca0c3e4749 Code: Unable to access opcode bytes at 0x7fca0c3e471f. RSP: 002b:7ffdd6ff7a08 EFLAGS: 0246 ORIG_RAX: 00e7 RAX: ffda RBX: 0001 RCX: 7fca0c3e4749 RDX: 003c RSI: 00e7 RDI: 0001 RBP: 7fca0c47f2b0 R08: ffb8 R09: 0001f6db R10: R11: 0246 R12: 7fca0c47f2b0 R13: R14: 7fca0c480020 R15: 7fca0c3b2c90 BUG: scheduling while atomic: syz-executor154/5019/0x0002 5 locks h
Re: [Cluster-devel] [PATCH v2 01/47] mm: vmscan: move shrinker-related code into a separate file
> On Jul 25, 2023, at 11:09, Qi Zheng wrote: > > > > On 2023/7/25 10:35, Muchun Song wrote: >>> On Jul 24, 2023, at 17:43, Qi Zheng wrote: >>> >>> The mm/vmscan.c file is too large, so separate the shrinker-related >>> code from it into a separate file. No functional changes. >>> >>> Signed-off-by: Qi Zheng >>> --- >>> include/linux/shrinker.h | 3 + >>> mm/Makefile | 4 +- >>> mm/shrinker.c| 707 +++ >>> mm/vmscan.c | 701 -- >>> 4 files changed, 712 insertions(+), 703 deletions(-) >>> create mode 100644 mm/shrinker.c >>> >>> diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h >>> index 224293b2dd06..961cb84e51f5 100644 >>> --- a/include/linux/shrinker.h >>> +++ b/include/linux/shrinker.h >>> @@ -96,6 +96,9 @@ struct shrinker { >>> */ >>> #define SHRINKER_NONSLAB (1 << 3) >>> >>> +unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup >>> *memcg, >>> +int priority); >> A good cleanup, vmscan.c is so huge. >> I'd like to introduce a new header in mm/ directory and contains those >> declarations of functions (like this and other debug function in >> shrinker_debug.c) since they are used internally across mm. > > How about putting them in the mm/internal.h file? Either is fine with me. > >> Thanks.
Re: [Cluster-devel] [PATCH v2 06/47] drm/ttm: dynamically allocate the drm-ttm_pool shrinker
> On Jul 24, 2023, at 17:43, Qi Zheng wrote: > > Use new APIs to dynamically allocate the drm-ttm_pool shrinker. > > Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Thanks.
Re: [Cluster-devel] [PATCH v2 04/47] kvm: mmu: dynamically allocate the x86-mmu shrinker
> On Jul 24, 2023, at 17:43, Qi Zheng wrote: > > Use new APIs to dynamically allocate the x86-mmu shrinker. > > Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Thanks.
Re: [Cluster-devel] [PATCH v2 02/47] mm: shrinker: remove redundant shrinker_rwsem in debugfs operations
> On Jul 24, 2023, at 17:43, Qi Zheng wrote: > > The debugfs_remove_recursive() will wait for debugfs_file_put() to return, > so the shrinker will not be freed when doing debugfs operations (such as > shrinker_debugfs_count_show() and shrinker_debugfs_scan_write()), so there > is no need to hold shrinker_rwsem during debugfs operations. > > Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Thanks.