Re: [Cluster-devel] [PATCH v2 11/47] gfs2: dynamically allocate the gfs2-qd shrinker

2023-07-25 Thread Muchun Song




On 2023/7/24 17:43, Qi Zheng wrote:

Use new APIs to dynamically allocate the gfs2-qd shrinker.

Signed-off-by: Qi Zheng 
---
  fs/gfs2/main.c  |  6 +++---
  fs/gfs2/quota.c | 26 --
  fs/gfs2/quota.h |  3 ++-
  3 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index afcb32854f14..e47b1cc79f59 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -147,7 +147,7 @@ static int __init init_gfs2_fs(void)
if (!gfs2_trans_cachep)
goto fail_cachep8;
  
-	error = register_shrinker(&gfs2_qd_shrinker, "gfs2-qd");

+   error = gfs2_qd_shrinker_init();
if (error)
goto fail_shrinker;
  
@@ -196,7 +196,7 @@ static int __init init_gfs2_fs(void)

  fail_wq2:
destroy_workqueue(gfs_recovery_wq);
  fail_wq1:
-   unregister_shrinker(&gfs2_qd_shrinker);
+   gfs2_qd_shrinker_exit();
  fail_shrinker:
kmem_cache_destroy(gfs2_trans_cachep);
  fail_cachep8:
@@ -229,7 +229,7 @@ static int __init init_gfs2_fs(void)
  
  static void __exit exit_gfs2_fs(void)

  {
-   unregister_shrinker(&gfs2_qd_shrinker);
+   gfs2_qd_shrinker_exit();
gfs2_glock_exit();
gfs2_unregister_debugfs();
unregister_filesystem(&gfs2_fs_type);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 704192b73605..bc9883cea847 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -186,13 +186,27 @@ static unsigned long gfs2_qd_shrink_count(struct shrinker 
*shrink,
return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc));
  }
  
-struct shrinker gfs2_qd_shrinker = {

-   .count_objects = gfs2_qd_shrink_count,
-   .scan_objects = gfs2_qd_shrink_scan,
-   .seeks = DEFAULT_SEEKS,
-   .flags = SHRINKER_NUMA_AWARE,
-};
+static struct shrinker *gfs2_qd_shrinker;
+
+int gfs2_qd_shrinker_init(void)


It's better to declare this as __init.


+{
+   gfs2_qd_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "gfs2-qd");
+   if (!gfs2_qd_shrinker)
+   return -ENOMEM;
+
+   gfs2_qd_shrinker->count_objects = gfs2_qd_shrink_count;
+   gfs2_qd_shrinker->scan_objects = gfs2_qd_shrink_scan;
+   gfs2_qd_shrinker->seeks = DEFAULT_SEEKS;
+
+   shrinker_register(gfs2_qd_shrinker);
  
+	return 0;

+}
+
+void gfs2_qd_shrinker_exit(void)
+{
+   shrinker_unregister(gfs2_qd_shrinker);
+}
  
  static u64 qd2index(struct gfs2_quota_data *qd)

  {
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 21ada332d555..f9cb863373f7 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -59,7 +59,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip,
  }
  
  extern const struct quotactl_ops gfs2_quotactl_ops;

-extern struct shrinker gfs2_qd_shrinker;
+int gfs2_qd_shrinker_init(void);
+void gfs2_qd_shrinker_exit(void);
  extern struct list_lru gfs2_qd_lru;
  extern void __init gfs2_quota_hash_init(void);
  




Re: [Cluster-devel] [PATCH v2 10/47] gfs2: dynamically allocate the gfs2-glock shrinker

2023-07-25 Thread Muchun Song



> On Jul 24, 2023, at 17:43, Qi Zheng  wrote:
> 
> Use new APIs to dynamically allocate the gfs2-glock shrinker.
> 
> Signed-off-by: Qi Zheng 

Reviewed-by: Muchun Song 

Thanks.



[Cluster-devel] [PATCH v6 5/7] xfs: switch to multigrain timestamps

2023-07-25 Thread Jeff Layton
Enable multigrain timestamps, which should ensure that there is an
apparent change to the timestamp whenever it has been written after
being actively observed via getattr.

Also, anytime the mtime changes, the ctime must also change, and those
are now the only two options for xfs_trans_ichgtime. Have that function
unconditionally bump the ctime, and ASSERT that XFS_ICHGTIME_CHG is
always set.

Signed-off-by: Jeff Layton 
---
 fs/xfs/libxfs/xfs_trans_inode.c | 6 +++---
 fs/xfs/xfs_iops.c   | 4 ++--
 fs/xfs/xfs_super.c  | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 6b2296ff248a..ad22656376d3 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -62,12 +62,12 @@ xfs_trans_ichgtime(
ASSERT(tp);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
-   tv = current_time(inode);
+   /* If the mtime changes, then ctime must also change */
+   ASSERT(flags & XFS_ICHGTIME_CHG);
 
+   tv = inode_set_ctime_current(inode);
if (flags & XFS_ICHGTIME_MOD)
inode->i_mtime = tv;
-   if (flags & XFS_ICHGTIME_CHG)
-   inode_set_ctime_to_ts(inode, tv);
if (flags & XFS_ICHGTIME_CREATE)
ip->i_crtime = tv;
 }
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 3a9363953ef2..3f89ef5a2820 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -573,10 +573,10 @@ xfs_vn_getattr(
stat->gid = vfsgid_into_kgid(vfsgid);
stat->ino = ip->i_ino;
stat->atime = inode->i_atime;
-   stat->mtime = inode->i_mtime;
-   stat->ctime = inode_get_ctime(inode);
stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks);
 
+   fill_mg_cmtime(request_mask, inode, stat);
+
if (xfs_has_v3inodes(mp)) {
if (request_mask & STATX_BTIME) {
stat->result_mask |= STATX_BTIME;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 818510243130..4b10edb2c972 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -2009,7 +2009,7 @@ static struct file_system_type xfs_fs_type = {
.init_fs_context= xfs_init_fs_context,
.parameters = xfs_fs_parameters,
.kill_sb= kill_block_super,
-   .fs_flags   = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+   .fs_flags   = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
 };
 MODULE_ALIAS_FS("xfs");
 

-- 
2.41.0



[Cluster-devel] [PATCH v6 6/7] ext4: switch to multigrain timestamps

2023-07-25 Thread Jeff Layton
Enable multigrain timestamps, which should ensure that there is an
apparent change to the timestamp whenever it has been written after
being actively observed via getattr.

For ext4, we only need to enable the FS_MGTIME flag.

Signed-off-by: Jeff Layton 
---
 fs/ext4/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b54c70e1a74e..cb1ff47af156 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -7279,7 +7279,7 @@ static struct file_system_type ext4_fs_type = {
.init_fs_context= ext4_init_fs_context,
.parameters = ext4_param_specs,
.kill_sb= kill_block_super,
-   .fs_flags   = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+   .fs_flags   = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
 };
 MODULE_ALIAS_FS("ext4");
 

-- 
2.41.0



[Cluster-devel] [PATCH v6 7/7] btrfs: convert to multigrain timestamps

2023-07-25 Thread Jeff Layton
Enable multigrain timestamps, which should ensure that there is an
apparent change to the timestamp whenever it has been written after
being actively observed via getattr.

Beyond enabling the FS_MGTIME flag, this patch eliminates
update_time_for_write, which goes to great pains to avoid in-memory
stores. Just have it overwrite the timestamps unconditionally.

Signed-off-by: Jeff Layton 
---
 fs/btrfs/file.c  | 24 
 fs/btrfs/super.c |  5 +++--
 2 files changed, 7 insertions(+), 22 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d7a9ece7a40b..b9e75c9f95ac 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1106,25 +1106,6 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
btrfs_drew_write_unlock(&inode->root->snapshot_lock);
 }
 
-static void update_time_for_write(struct inode *inode)
-{
-   struct timespec64 now, ctime;
-
-   if (IS_NOCMTIME(inode))
-   return;
-
-   now = current_time(inode);
-   if (!timespec64_equal(&inode->i_mtime, &now))
-   inode->i_mtime = now;
-
-   ctime = inode_get_ctime(inode);
-   if (!timespec64_equal(&ctime, &now))
-   inode_set_ctime_to_ts(inode, now);
-
-   if (IS_I_VERSION(inode))
-   inode_inc_iversion(inode);
-}
-
 static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
 size_t count)
 {
@@ -1156,7 +1137,10 @@ static int btrfs_write_check(struct kiocb *iocb, struct 
iov_iter *from,
 * need to start yet another transaction to update the inode as we will
 * update the inode when we finish writing whatever data we write.
 */
-   update_time_for_write(inode);
+   if (!IS_NOCMTIME(inode)) {
+   inode->i_mtime = inode_set_ctime_current(inode);
+   inode_inc_iversion(inode);
+   }
 
start_pos = round_down(pos, fs_info->sectorsize);
oldsize = i_size_read(inode);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f1dd172d8d5b..8eda51b095c9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2144,7 +2144,7 @@ static struct file_system_type btrfs_fs_type = {
.name   = "btrfs",
.mount  = btrfs_mount,
.kill_sb= btrfs_kill_super,
-   .fs_flags   = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
+   .fs_flags   = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_MGTIME,
 };
 
 static struct file_system_type btrfs_root_fs_type = {
@@ -2152,7 +2152,8 @@ static struct file_system_type btrfs_root_fs_type = {
.name   = "btrfs",
.mount  = btrfs_mount_root,
.kill_sb= btrfs_kill_super,
-   .fs_flags   = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | 
FS_ALLOW_IDMAP,
+   .fs_flags   = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA |
+ FS_ALLOW_IDMAP | FS_MGTIME,
 };
 
 MODULE_ALIAS_FS("btrfs");

-- 
2.41.0



[Cluster-devel] [PATCH v6 4/7] tmpfs: add support for multigrain timestamps

2023-07-25 Thread Jeff Layton
Enable multigrain timestamps, which should ensure that there is an
apparent change to the timestamp whenever it has been written after
being actively observed via getattr.

tmpfs only requires the FS_MGTIME flag.

Signed-off-by: Jeff Layton 
---
 mm/shmem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 654d9a585820..b6019c905058 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -4264,7 +4264,7 @@ static struct file_system_type shmem_fs_type = {
 #endif
.kill_sb= kill_litter_super,
 #ifdef CONFIG_SHMEM
-   .fs_flags   = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
+   .fs_flags   = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME,
 #else
.fs_flags   = FS_USERNS_MOUNT,
 #endif

-- 
2.41.0



[Cluster-devel] [PATCH v6 3/7] tmpfs: bump the mtime/ctime/iversion when page becomes writeable

2023-07-25 Thread Jeff Layton
Most filesystems that use the pagecache will update the mtime, ctime,
and change attribute when a page becomes writeable. Add a page_mkwrite
operation for tmpfs and just use it to bump the mtime, ctime and change
attribute.

This fixes xfstest generic/080 on tmpfs.

Signed-off-by: Jeff Layton 
---
 mm/shmem.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/mm/shmem.c b/mm/shmem.c
index b154af49d2df..654d9a585820 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2169,6 +2169,16 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
return ret;
 }
 
+static vm_fault_t shmem_page_mkwrite(struct vm_fault *vmf)
+{
+   struct vm_area_struct *vma = vmf->vma;
+   struct inode *inode = file_inode(vma->vm_file);
+
+   file_update_time(vma->vm_file);
+   inode_inc_iversion(inode);
+   return 0;
+}
+
 unsigned long shmem_get_unmapped_area(struct file *file,
  unsigned long uaddr, unsigned long len,
  unsigned long pgoff, unsigned long flags)
@@ -4210,6 +4220,7 @@ static const struct super_operations shmem_ops = {
 
 static const struct vm_operations_struct shmem_vm_ops = {
.fault  = shmem_fault,
+   .page_mkwrite   = shmem_page_mkwrite,
.map_pages  = filemap_map_pages,
 #ifdef CONFIG_NUMA
.set_policy = shmem_set_policy,
@@ -4219,6 +4230,7 @@ static const struct vm_operations_struct shmem_vm_ops = {
 
 static const struct vm_operations_struct shmem_anon_vm_ops = {
.fault  = shmem_fault,
+   .page_mkwrite   = shmem_page_mkwrite,
.map_pages  = filemap_map_pages,
 #ifdef CONFIG_NUMA
.set_policy = shmem_set_policy,

-- 
2.41.0



[Cluster-devel] [PATCH v6 2/7] fs: add infrastructure for multigrain timestamps

2023-07-25 Thread Jeff Layton
The VFS always uses coarse-grained timestamps when updating the ctime
and mtime after a change. This has the benefit of allowing filesystems
to optimize away a lot of metadata updates, down to around 1 per jiffy,
even when a file is under heavy writes.

Unfortunately, this has always been an issue when we're exporting via
NFSv3, which relies on timestamps to validate caches. A lot of changes
can happen in a jiffy, so timestamps aren't sufficient to help the
client decide to invalidate the cache. Even with NFSv4, a lot of
exported filesystems don't properly support a change attribute and are
subject to the same problems with timestamp granularity. Other
applications have similar issues with timestamps (e.g backup
applications).

If we were to always use fine-grained timestamps, that would improve the
situation, but that becomes rather expensive, as the underlying
filesystem would have to log a lot more metadata updates.

What we need is a way to only use fine-grained timestamps when they are
being actively queried.

POSIX generally mandates that when the mtime changes, the ctime must
also change. The kernel always stores normalized ctime values, so only
the first 30 bits of the tv_nsec field are ever used.

Use the 31st bit of the ctime tv_nsec field to indicate that something
has queried the inode for the mtime or ctime. When this flag is set,
on the next mtime or ctime update, the kernel will fetch a fine-grained
timestamp instead of the usual coarse-grained one.

Filesystems can opt into this behavior by setting the FS_MGTIME flag in
the fstype. Filesystems that don't set this flag will continue to use
coarse-grained timestamps.

Later patches will convert individual filesystems to use the new
infrastructure.

Signed-off-by: Jeff Layton 
---
 fs/inode.c | 98 ++
 fs/stat.c  | 41 +--
 include/linux/fs.h | 45 +++--
 3 files changed, 151 insertions(+), 33 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index d4ab92233062..369621e7faf5 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1919,6 +1919,21 @@ int inode_update_time(struct inode *inode, struct 
timespec64 *time, int flags)
 }
 EXPORT_SYMBOL(inode_update_time);
 
+/**
+ * current_coarse_time - Return FS time
+ * @inode: inode.
+ *
+ * Return the current coarse-grained time truncated to the time
+ * granularity supported by the fs.
+ */
+static struct timespec64 current_coarse_time(struct inode *inode)
+{
+   struct timespec64 now;
+
+   ktime_get_coarse_real_ts64(&now);
+   return timestamp_truncate(now, inode);
+}
+
 /**
  * atime_needs_update  -   update the access time
  * @path: the &struct path to update
@@ -1952,7 +1967,7 @@ bool atime_needs_update(const struct path *path, struct 
inode *inode)
if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
return false;
 
-   now = current_time(inode);
+   now = current_coarse_time(inode);
 
if (!relatime_need_update(mnt, inode, now))
return false;
@@ -1986,7 +2001,7 @@ void touch_atime(const struct path *path)
 * We may also fail on filesystems that have the ability to make parts
 * of the fs read only, e.g. subvolumes in Btrfs.
 */
-   now = current_time(inode);
+   now = current_coarse_time(inode);
inode_update_time(inode, &now, S_ATIME);
__mnt_drop_write(mnt);
 skip_update:
@@ -2072,6 +2087,56 @@ int file_remove_privs(struct file *file)
 }
 EXPORT_SYMBOL(file_remove_privs);
 
+/**
+ * current_mgtime - Return FS time (possibly fine-grained)
+ * @inode: inode.
+ *
+ * Return the current time truncated to the time granularity supported by
+ * the fs, as suitable for a ctime/mtime change. If the ctime is flagged
+ * as having been QUERIED, get a fine-grained timestamp.
+ */
+static struct timespec64 current_mgtime(struct inode *inode)
+{
+   struct timespec64 now;
+   atomic_long_t *pnsec = (atomic_long_t *)&inode->__i_ctime.tv_nsec;
+   long nsec = atomic_long_read(pnsec);
+
+   if (nsec & I_CTIME_QUERIED) {
+   ktime_get_real_ts64(&now);
+   } else {
+   struct timespec64 ctime;
+
+   ktime_get_coarse_real_ts64(&now);
+
+   /*
+* If we've recently fetched a fine-grained timestamp
+* then the coarse-grained one may still be earlier than the
+* existing one. Just keep the existing ctime if so.
+*/
+   ctime = inode_get_ctime(inode);
+   if (timespec64_compare(&ctime, &now) > 0)
+   now = ctime;
+   }
+
+   return timestamp_truncate(now, inode);
+}
+
+/**
+ * current_time - Return timestamp suitable for ctime update
+ * @inode: inode to eventually be updated
+ *
+ * Return the current time, which is usually coarse-grained but may be fine
+ * grained if the filesystem uses multigrain timestamps.

[Cluster-devel] [PATCH v6 1/7] fs: pass the request_mask to generic_fillattr

2023-07-25 Thread Jeff Layton
generic_fillattr just fills in the entire stat struct indiscriminately
today, copying data from the inode. There is at least one attribute
(STATX_CHANGE_COOKIE) that can have side effects when it is reported,
and we're looking at adding more with the addition of multigrain
timestamps.

Add a request_mask argument to generic_fillattr and have most callers
just pass in the value that is passed to getattr. Have other callers
(e.g. ksmbd) just pass in STATX_BASIC_STATS. Also move the setting of
STATX_CHANGE_COOKIE into generic_fillattr.

Signed-off-by: Jeff Layton 
---
 fs/9p/vfs_inode.c   |  4 ++--
 fs/9p/vfs_inode_dotl.c  |  4 ++--
 fs/afs/inode.c  |  2 +-
 fs/btrfs/inode.c|  2 +-
 fs/ceph/inode.c |  2 +-
 fs/coda/inode.c |  3 ++-
 fs/ecryptfs/inode.c |  5 +++--
 fs/erofs/inode.c|  2 +-
 fs/exfat/file.c |  2 +-
 fs/ext2/inode.c |  2 +-
 fs/ext4/inode.c |  2 +-
 fs/f2fs/file.c  |  2 +-
 fs/fat/file.c   |  2 +-
 fs/fuse/dir.c   |  2 +-
 fs/gfs2/inode.c |  2 +-
 fs/hfsplus/inode.c  |  2 +-
 fs/kernfs/inode.c   |  2 +-
 fs/libfs.c  |  4 ++--
 fs/minix/inode.c|  2 +-
 fs/nfs/inode.c  |  2 +-
 fs/nfs/namespace.c  |  3 ++-
 fs/ntfs3/file.c |  2 +-
 fs/ocfs2/file.c |  2 +-
 fs/orangefs/inode.c |  2 +-
 fs/proc/base.c  |  4 ++--
 fs/proc/fd.c|  2 +-
 fs/proc/generic.c   |  2 +-
 fs/proc/proc_net.c  |  2 +-
 fs/proc/proc_sysctl.c   |  2 +-
 fs/proc/root.c  |  3 ++-
 fs/smb/client/inode.c   |  2 +-
 fs/smb/server/smb2pdu.c | 22 +++---
 fs/smb/server/vfs.c |  3 ++-
 fs/stat.c   | 18 ++
 fs/sysv/itree.c |  3 ++-
 fs/ubifs/dir.c  |  2 +-
 fs/udf/symlink.c|  2 +-
 fs/vboxsf/utils.c   |  2 +-
 include/linux/fs.h  |  2 +-
 mm/shmem.c  |  2 +-
 40 files changed, 70 insertions(+), 62 deletions(-)

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 16d85e6033a3..d24d1f20e922 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1016,7 +1016,7 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct 
path *path,
p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
v9ses = v9fs_dentry2v9ses(dentry);
if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
-   generic_fillattr(&nop_mnt_idmap, inode, stat);
+   generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
return 0;
} else if (v9ses->cache & CACHE_WRITEBACK) {
if (S_ISREG(inode->i_mode)) {
@@ -1037,7 +1037,7 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct 
path *path,
return PTR_ERR(st);
 
v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0);
-   generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat);
+   generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat);
 
p9stat_free(st);
kfree(st);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 464ea73d1bf8..8e8d5d2a13d8 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -451,7 +451,7 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap,
p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
v9ses = v9fs_dentry2v9ses(dentry);
if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
-   generic_fillattr(&nop_mnt_idmap, inode, stat);
+   generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
return 0;
} else if (v9ses->cache) {
if (S_ISREG(inode->i_mode)) {
@@ -476,7 +476,7 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap,
return PTR_ERR(st);
 
v9fs_stat2inode_dotl(st, d_inode(dentry), 0);
-   generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat);
+   generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat);
/* Change block size to what the server returned */
stat->blksize = st->st_blksize;
 
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 6b636f43f548..1c794a1896aa 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -773,7 +773,7 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path 
*path,
 
do {
read_seqbegin_or_lock(&vnode->cb_lock, &seq);
-   generic_fillattr(&nop_mnt_idmap, inode, stat);
+   generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) &&
stat->nlink > 0)
stat->nlink -= 1;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bcccd551f547..7346059209aa 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8773,7 +8773,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
  STATX_ATTR_IMMUTABLE |
  STATX_ATTR_NODUMP);
 
-   generic_fillattr(idmap, inode,

[Cluster-devel] [PATCH v6 0/7] fs: implement multigrain timestamps

2023-07-25 Thread Jeff Layton
The VFS always uses coarse-grained timestamps when updating the
ctime and mtime after a change. This has the benefit of allowing
filesystems to optimize away a lot of metadata updates, down to around 1
per jiffy, even when a file is under heavy writes.

Unfortunately, this coarseness has always been an issue when we're
exporting via NFSv3, which relies on timestamps to validate caches. A
lot of changes can happen in a jiffy, so timestamps aren't sufficient to
help the client decide to invalidate the cache.

Even with NFSv4, a lot of exported filesystems don't properly support a
change attribute and are subject to the same problems with timestamp
granularity. Other applications have similar issues with timestamps (e.g
backup applications).

If we were to always use fine-grained timestamps, that would improve the
situation, but that becomes rather expensive, as the underlying
filesystem would have to log a lot more metadata updates.

What we need is a way to only use fine-grained timestamps when they are
being actively queried. The idea is to use an unused bit in the ctime's
tv_nsec field to mark when the mtime or ctime has been queried via
getattr. Once that has been marked, the next m/ctime update will use a
fine-grained timestamp.

This patch series is based on top of Christian's vfs.all branch, which
has the recent conversion to the new ctime accessors. It should apply
cleanly on top of linux-next.

The first two patches should probably go in via the vfs tree. Should the
fs-specific patches go in that way as well, or should they go via
maintainer trees? Either should be fine.

The first two patches should probably go in via Christian's vfs tree.
The rest could go via maintainer trees or the vfs tree.

For now, I'd like to get these into linux-next. Christian, would you be
willing to pick these up for now? Alternately, I can feed them there via
the iversion branch that Stephen is already pulling in from my tree.

Signed-off-by: Jeff Layton 
base-commit: cf22d118b89a09a0160586412160d89098f7c4c7
---
Changes in v6:
- drop the patch that removed XFS_ICHGTIME_CHG
- change WARN_ON_ONCE to ASSERT in xfs conversion patch

---
Jeff Layton (7):
  fs: pass the request_mask to generic_fillattr
  fs: add infrastructure for multigrain timestamps
  tmpfs: bump the mtime/ctime/iversion when page becomes writeable
  tmpfs: add support for multigrain timestamps
  xfs: switch to multigrain timestamps
  ext4: switch to multigrain timestamps
  btrfs: convert to multigrain timestamps

 fs/9p/vfs_inode.c   |  4 +-
 fs/9p/vfs_inode_dotl.c  |  4 +-
 fs/afs/inode.c  |  2 +-
 fs/btrfs/file.c | 24 ++
 fs/btrfs/inode.c|  2 +-
 fs/btrfs/super.c|  5 ++-
 fs/ceph/inode.c |  2 +-
 fs/coda/inode.c |  3 +-
 fs/ecryptfs/inode.c |  5 ++-
 fs/erofs/inode.c|  2 +-
 fs/exfat/file.c |  2 +-
 fs/ext2/inode.c |  2 +-
 fs/ext4/inode.c |  2 +-
 fs/ext4/super.c |  2 +-
 fs/f2fs/file.c  |  2 +-
 fs/fat/file.c   |  2 +-
 fs/fuse/dir.c   |  2 +-
 fs/gfs2/inode.c |  2 +-
 fs/hfsplus/inode.c  |  2 +-
 fs/inode.c  | 98 +
 fs/kernfs/inode.c   |  2 +-
 fs/libfs.c  |  4 +-
 fs/minix/inode.c|  2 +-
 fs/nfs/inode.c  |  2 +-
 fs/nfs/namespace.c  |  3 +-
 fs/ntfs3/file.c |  2 +-
 fs/ocfs2/file.c |  2 +-
 fs/orangefs/inode.c |  2 +-
 fs/proc/base.c  |  4 +-
 fs/proc/fd.c|  2 +-
 fs/proc/generic.c   |  2 +-
 fs/proc/proc_net.c  |  2 +-
 fs/proc/proc_sysctl.c   |  2 +-
 fs/proc/root.c  |  3 +-
 fs/smb/client/inode.c   |  2 +-
 fs/smb/server/smb2pdu.c | 22 -
 fs/smb/server/vfs.c |  3 +-
 fs/stat.c   | 59 -
 fs/sysv/itree.c |  3 +-
 fs/ubifs/dir.c  |  2 +-
 fs/udf/symlink.c|  2 +-
 fs/vboxsf/utils.c   |  2 +-
 fs/xfs/libxfs/xfs_trans_inode.c |  6 +--
 fs/xfs/xfs_iops.c   |  4 +-
 fs/xfs/xfs_super.c  |  2 +-
 include/linux/fs.h  | 47 ++--
 mm/shmem.c  | 16 ++-
 47 files changed, 248 insertions(+), 125 deletions(-)
---
base-commit: 810b5fff7917119ea82ff96e312e2d4350d6b681
change-id: 20230713-mgctime-f2a9fc324918

Best regards,
-- 
Jeff Layton 



Re: [Cluster-devel] [PATCHv3 v6.5-rc2 3/3] fs: dlm: fix F_CANCELLK to cancel pending request

2023-07-25 Thread Alexander Aring
Hi,

On Tue, Jul 18, 2023 at 2:07 PM Alexander Aring  wrote:
>
> This patch fixes the current handling of F_CANCELLK by not just doing an
> unlock, as we need to try to cancel a lock first. An unlock makes sense
> on a non-blocking lock request, but if it's a blocking lock request we
> need to cancel the request while it has not been granted yet. This patch fixes
> this behaviour by first trying to cancel the lock request, and if that fails
> it unlocks the lock, which seems to have been granted.
>
> Note: currently the nfs locking handling was disabled by commit
> 40595cdc93ed ("nfs: block notification on fs with its own ->lock").
> However DLM was never being updated regarding to this change. Future
> patches will try to fix lockd lock requests for DLM. This patch is
> currently assuming the upstream DLM lockd handling is correct.
>
> Signed-off-by: Alexander Aring 
> ---
>  fs/dlm/plock.c| 102 +-
>  fs/gfs2/file.c|   9 ++--
>  fs/ocfs2/stack_user.c |  13 ++---
>  include/linux/dlm_plock.h |   2 +
>  4 files changed, 97 insertions(+), 29 deletions(-)
>
> diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
> index a8ffa0760913..84510994b177 100644
> --- a/fs/dlm/plock.c
> +++ b/fs/dlm/plock.c
> @@ -42,6 +42,27 @@ static inline void set_version(struct dlm_plock_info *info)
> info->version[2] = DLM_PLOCK_VERSION_PATCH;
>  }
>
> +static struct plock_op *plock_lookup_waiter(const struct dlm_plock_info 
> *info)
> +{
> +   struct plock_op *op = NULL, *iter;
> +
> +   list_for_each_entry(iter, &recv_list, list) {
> +   if (iter->info.fsid == info->fsid &&
> +   iter->info.number == info->number &&
> +   iter->info.owner == info->owner &&
> +   iter->info.pid == info->pid &&
> +   iter->info.start == info->start &&
> +   iter->info.end == info->end &&
> +   iter->info.ex == info->ex &&
> +   iter->info.wait) {
> +   op = iter;
> +   break;
> +   }
> +   }
> +
> +   return op;
> +}
> +
>  static int check_version(struct dlm_plock_info *info)
>  {
> if ((DLM_PLOCK_VERSION_MAJOR != info->version[0]) ||
> @@ -334,6 +355,73 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 
> number, struct file *file,
>  }
>  EXPORT_SYMBOL_GPL(dlm_posix_unlock);
>
> +/*
> + * NOTE: This implementation can only handle async lock requests as nfs
> + * do it. It cannot handle cancellation of a pending lock request sitting
> + * in wait_event(), but for now only nfs is the only user local kernel
> + * user.
> + */
> +int dlm_posix_cancel(dlm_lockspace_t *lockspace, u64 number, struct file 
> *file,
> +struct file_lock *fl)
> +{
> +   struct dlm_plock_info info;
> +   struct plock_op *op;
> +   struct dlm_ls *ls;
> +   int rv;
> +
> +   /* this only works for async request for now and nfs is the only
> +* kernel user right now.
> +*/
> +   if (WARN_ON_ONCE(!fl->fl_lmops || !fl->fl_lmops->lm_grant))
> +   return -EOPNOTSUPP;
> +
> +   ls = dlm_find_lockspace_local(lockspace);
> +   if (!ls)
> +   return -EINVAL;
> +
> +   info.pid = fl->fl_pid;
> +   info.ex = (fl->fl_type == F_WRLCK);
> +   info.fsid = ls->ls_global_id;
> +   dlm_put_lockspace(ls);
> +   info.number = number;
> +   info.start = fl->fl_start;
> +   info.end = fl->fl_end;
> +   info.owner = (__u64)fl->fl_pid;
> +
> +   rv = do_lock_cancel(&info);
> +   switch (rv) {
> +   case 0:
> +   spin_lock(&ops_lock);
> +   /* lock request to cancel must be on recv_list because
> +* do_lock_cancel() synchronizes it.
> +*/
> +   op = plock_lookup_waiter(&info);
> +   if (WARN_ON_ONCE(!op)) {
> +   rv = -ENOLCK;
> +   break;

missing spin_unlock() here. I will add it to my upcoming patch series.

- Alex



Re: [Cluster-devel] [PATCH v2 03/47] mm: shrinker: add infrastructure for dynamically allocating shrinker

2023-07-25 Thread Qi Zheng

Hi Muchun,

On 2023/7/25 17:02, Muchun Song wrote:



On 2023/7/24 17:43, Qi Zheng wrote:

Currently, the shrinker instances can be divided into the following three
types:

a) global shrinker instance statically defined in the kernel, such as
    workingset_shadow_shrinker.

b) global shrinker instance statically defined in the kernel modules, 
such

    as mmu_shrinker in x86.

c) shrinker instance embedded in other structures.

For case a, the memory of shrinker instance is never freed. For case b,
the memory of shrinker instance will be freed after synchronize_rcu() 
when

the module is unloaded. For case c, the memory of shrinker instance will
be freed along with the structure it is embedded in.

In preparation for implementing lockless slab shrink, we need to
dynamically allocate those shrinker instances in case c, then the memory
can be dynamically freed alone by calling kfree_rcu().

So this commit adds the following new APIs for dynamically allocating
shrinker, and add a private_data field to struct shrinker to record and
get the original embedded structure.

1. shrinker_alloc()

Used to allocate shrinker instance itself and related memory, it will
return a pointer to the shrinker instance on success and NULL on failure.

2. shrinker_free_non_registered()

Used to destroy the non-registered shrinker instance.


At least I don't like this name. I know you want to tell others
that this function should only be called when the shrinker has been
allocated but not registered. Maybe shrinker_free() is simpler.
And add a comment to tell the users when to use it.


OK, if no one else objects, I will change it to shrinker_free() in
the next version.





3. shrinker_register()

Used to register the shrinker instance, which is same as the current
register_shrinker_prepared().

4. shrinker_unregister()

Used to unregister and free the shrinker instance.

In order to simplify shrinker-related APIs and make shrinker more
independent of other kernel mechanisms, subsequent submissions will use
the above API to convert all shrinkers (including case a and b) to
dynamically allocated, and then remove all existing APIs.

This will also have another advantage mentioned by Dave Chinner:

```
The other advantage of this is that it will break all the existing
out of tree code and third party modules using the old API and will
no longer work with a kernel using lockless slab shrinkers. They
need to break (both at the source and binary levels) to stop bad
things from happening due to using unconverted shrinkers in the new
setup.
```

Signed-off-by: Qi Zheng 
---
  include/linux/shrinker.h |   6 +++
  mm/shrinker.c    | 113 +++
  2 files changed, 119 insertions(+)

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 961cb84e51f5..296f5e163861 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -70,6 +70,8 @@ struct shrinker {
  int seeks;    /* seeks to recreate an obj */
  unsigned flags;
+    void *private_data;
+
  /* These are for internal use */
  struct list_head list;
  #ifdef CONFIG_MEMCG
@@ -98,6 +100,10 @@ struct shrinker {
  unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup 
*memcg,

    int priority);
+struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, 
...);

+void shrinker_free_non_registered(struct shrinker *shrinker);
+void shrinker_register(struct shrinker *shrinker);
+void shrinker_unregister(struct shrinker *shrinker);
  extern int __printf(2, 3) prealloc_shrinker(struct shrinker *shrinker,
  const char *fmt, ...);
diff --git a/mm/shrinker.c b/mm/shrinker.c
index 0a32ef42f2a7..d820e4cc5806 100644
--- a/mm/shrinker.c
+++ b/mm/shrinker.c
@@ -548,6 +548,119 @@ unsigned long shrink_slab(gfp_t gfp_mask, int 
nid, struct mem_cgroup *memcg,

  return freed;
  }
+struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, 
...)

+{
+    struct shrinker *shrinker;
+    unsigned int size;
+    va_list __maybe_unused ap;
+    int err;
+
+    shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL);
+    if (!shrinker)
+    return NULL;
+
+#ifdef CONFIG_SHRINKER_DEBUG
+    va_start(ap, fmt);
+    shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
+    va_end(ap);
+    if (!shrinker->name)
+    goto err_name;
+#endif


So why not introduce another helper to handle this and declare it
as a void function when !CONFIG_SHRINKER_DEBUG? Something like the
following:

#ifdef CONFIG_SHRINKER_DEBUG
static int shrinker_debugfs_name_alloc(struct shrinker *shrinker, const 
char *fmt,

                                    va_list vargs)

{
     shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, vargs);
     return shrinker->name ? 0 : -ENOMEM;
}
#else
static int shrinker_debugfs_name_alloc(struct shrinker *shrinker, const 
char *fmt,

                                    va_list vargs)
{
     return 0;
}
#endif


Will do in the next version.

Re: [Cluster-devel] [PATCH v2 03/47] mm: shrinker: add infrastructure for dynamically allocating shrinker

2023-07-25 Thread Muchun Song




On 2023/7/24 17:43, Qi Zheng wrote:

Currently, the shrinker instances can be divided into the following three
types:

a) global shrinker instance statically defined in the kernel, such as
workingset_shadow_shrinker.

b) global shrinker instance statically defined in the kernel modules, such
as mmu_shrinker in x86.

c) shrinker instance embedded in other structures.

For case a, the memory of shrinker instance is never freed. For case b,
the memory of shrinker instance will be freed after synchronize_rcu() when
the module is unloaded. For case c, the memory of shrinker instance will
be freed along with the structure it is embedded in.

In preparation for implementing lockless slab shrink, we need to
dynamically allocate those shrinker instances in case c, then the memory
can be dynamically freed alone by calling kfree_rcu().

So this commit adds the following new APIs for dynamically allocating
shrinker, and add a private_data field to struct shrinker to record and
get the original embedded structure.

1. shrinker_alloc()

Used to allocate shrinker instance itself and related memory, it will
return a pointer to the shrinker instance on success and NULL on failure.

2. shrinker_free_non_registered()

Used to destroy the non-registered shrinker instance.


At least I don't like this name. I know you want to tell others
that this function should only be called when the shrinker has been
allocated but not yet registered. Maybe shrinker_free() is simpler.
And add a comment to tell the users when to use it.



3. shrinker_register()

Used to register the shrinker instance, which is same as the current
register_shrinker_prepared().

4. shrinker_unregister()

Used to unregister and free the shrinker instance.

In order to simplify shrinker-related APIs and make shrinker more
independent of other kernel mechanisms, subsequent submissions will use
the above API to convert all shrinkers (including case a and b) to
dynamically allocated, and then remove all existing APIs.

This will also have another advantage mentioned by Dave Chinner:

```
The other advantage of this is that it will break all the existing
out of tree code and third party modules using the old API and will
no longer work with a kernel using lockless slab shrinkers. They
need to break (both at the source and binary levels) to stop bad
things from happening due to using unconverted shrinkers in the new
setup.
```

Signed-off-by: Qi Zheng 
---
  include/linux/shrinker.h |   6 +++
  mm/shrinker.c| 113 +++
  2 files changed, 119 insertions(+)

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 961cb84e51f5..296f5e163861 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -70,6 +70,8 @@ struct shrinker {
int seeks;  /* seeks to recreate an obj */
unsigned flags;
  
+	void *private_data;

+
/* These are for internal use */
struct list_head list;
  #ifdef CONFIG_MEMCG
@@ -98,6 +100,10 @@ struct shrinker {
  
  unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,

  int priority);
+struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...);
+void shrinker_free_non_registered(struct shrinker *shrinker);
+void shrinker_register(struct shrinker *shrinker);
+void shrinker_unregister(struct shrinker *shrinker);
  
  extern int __printf(2, 3) prealloc_shrinker(struct shrinker *shrinker,

const char *fmt, ...);
diff --git a/mm/shrinker.c b/mm/shrinker.c
index 0a32ef42f2a7..d820e4cc5806 100644
--- a/mm/shrinker.c
+++ b/mm/shrinker.c
@@ -548,6 +548,119 @@ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct 
mem_cgroup *memcg,
return freed;
  }
  
+struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)

+{
+   struct shrinker *shrinker;
+   unsigned int size;
+   va_list __maybe_unused ap;
+   int err;
+
+   shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL);
+   if (!shrinker)
+   return NULL;
+
+#ifdef CONFIG_SHRINKER_DEBUG
+   va_start(ap, fmt);
+   shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
+   va_end(ap);
+   if (!shrinker->name)
+   goto err_name;
+#endif


So why not introduce another helper to handle this and declare it
as a void function when !CONFIG_SHRINKER_DEBUG? Something like the
following:

#ifdef CONFIG_SHRINKER_DEBUG
static int shrinker_debugfs_name_alloc(struct shrinker *shrinker, const 
char *fmt,

                                   va_list vargs)

{
    shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, vargs);
    return shrinker->name ? 0 : -ENOMEM;
}
#else
static int shrinker_debugfs_name_alloc(struct shrinker *shrinker, const 
char *fmt,

                                   va_list vargs)
{
    return 0;
}
#endif


+   shrinker->flags = flags;
+
+   if (flags & SHRINKER_MEMCG_AWARE) {
+ 

Re: [Cluster-devel] [PATCH v2 01/47] mm: vmscan: move shrinker-related code into a separate file

2023-07-25 Thread Muchun Song



> On Jul 24, 2023, at 17:43, Qi Zheng  wrote:
> 
> The mm/vmscan.c file is too large, so separate the shrinker-related
> code from it into a separate file. No functional changes.
> 
> Signed-off-by: Qi Zheng 
> ---
> include/linux/shrinker.h |   3 +
> mm/Makefile  |   4 +-
> mm/shrinker.c| 707 +++
> mm/vmscan.c  | 701 --
> 4 files changed, 712 insertions(+), 703 deletions(-)
> create mode 100644 mm/shrinker.c
> 
> diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
> index 224293b2dd06..961cb84e51f5 100644
> --- a/include/linux/shrinker.h
> +++ b/include/linux/shrinker.h
> @@ -96,6 +96,9 @@ struct shrinker {
>  */
> #define SHRINKER_NONSLAB (1 << 3)
> 
> +unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
> +int priority);

A good cleanup, vmscan.c is so huge.

I'd like to introduce a new header in mm/ directory and contains those
declarations of functions (like this and other debug function in
shrinker_debug.c) since they are used internally across mm.

Thanks.



Re: [Cluster-devel] [PATCH v2 07/47] xenbus/backend: dynamically allocate the xen-backend shrinker

2023-07-25 Thread Muchun Song



> On Jul 24, 2023, at 17:43, Qi Zheng  wrote:
> 
> Use new APIs to dynamically allocate the xen-backend shrinker.
> 
> Signed-off-by: Qi Zheng 

Reviewed-by: Muchun Song 

Thanks.



Re: [Cluster-devel] [PATCH v2 09/47] f2fs: dynamically allocate the f2fs-shrinker

2023-07-25 Thread Muchun Song



> On Jul 24, 2023, at 17:43, Qi Zheng  wrote:
> 
> Use new APIs to dynamically allocate the f2fs-shrinker.
> 
> Signed-off-by: Qi Zheng 

Reviewed-by: Muchun Song 

Thanks.



Re: [Cluster-devel] [PATCH v2 08/47] erofs: dynamically allocate the erofs-shrinker

2023-07-25 Thread Muchun Song



> On Jul 24, 2023, at 17:43, Qi Zheng  wrote:
> 
> Use new APIs to dynamically allocate the erofs-shrinker.
> 
> Signed-off-by: Qi Zheng 

Reviewed-by: Muchun Song 

Thanks.



[Cluster-devel] [syzbot] [gfs2?] BUG: sleeping function called from invalid context in gfs2_make_fs_ro

2023-07-25 Thread syzbot
Hello,

syzbot found the following issue on:

HEAD commit:46670259519f Merge tag 'for-6.5-rc2-tag' of git://git.kern..
git tree:   upstream
console+strace: https://syzkaller.appspot.com/x/log.txt?x=16bf15aea8
kernel config:  https://syzkaller.appspot.com/x/.config?x=a4507c291b5ab5d4
dashboard link: https://syzkaller.appspot.com/bug?extid=60369f4775c014dd1804
compiler:   Debian clang version 15.0.6, GNU ld (GNU Binutils for Debian) 
2.40
syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=1602904ea8
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=12d67e9ea8

Downloadable assets:
disk image: 
https://storage.googleapis.com/syzbot-assets/f3b4b06a5f02/disk-46670259.raw.xz
vmlinux: 
https://storage.googleapis.com/syzbot-assets/4db334f36495/vmlinux-46670259.xz
kernel image: 
https://storage.googleapis.com/syzbot-assets/5977e704aeb2/bzImage-46670259.xz
mounted in repro: 
https://storage.googleapis.com/syzbot-assets/053f03da9748/mount_0.gz

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+60369f4775c014dd1...@syzkaller.appspotmail.com

gfs2: fsid=syz:syz.0: found 1 quota changes
syz-executor154: attempt to access beyond end of device
loop0: rw=1, sector=131324, nr_sectors = 4 limit=32768
gfs2: fsid=syz:syz.0: Error 10 writing to journal, jid=0
gfs2: fsid=syz:syz.0: fatal: I/O error(s)
gfs2: fsid=syz:syz.0: about to withdraw this file system
BUG: sleeping function called from invalid context at 
kernel/sched/completion.c:101
in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 5019, name: 
syz-executor154
preempt_count: 1, expected: 0
RCU nest depth: 0, expected: 0
5 locks held by syz-executor154/5019:
 #0: 8880297960e0 (&type->s_umount_key#47){+.+.}-{3:3}, at: 
deactivate_super+0xad/0xf0 fs/super.c:360
 #1: 88802854cb78 (&sdp->sd_quota_sync_mutex){+.+.}-{3:3}, at: 
gfs2_quota_sync+0xa1/0x700 fs/gfs2/quota.c:1304
 #2: 88802854d060 (&sdp->sd_log_flush_lock){}-{3:3}, at: 
gfs2_log_flush+0x105/0x25f0 fs/gfs2/log.c:1042
 #3: 88802854ce88 (&sdp->sd_log_lock){+.+.}-{2:2}, at: spin_lock 
include/linux/spinlock.h:351 [inline]
 #3: 88802854ce88 (&sdp->sd_log_lock){+.+.}-{2:2}, at: gfs2_log_lock 
fs/gfs2/log.h:32 [inline]
 #3: 88802854ce88 (&sdp->sd_log_lock){+.+.}-{2:2}, at: 
gfs2_flush_revokes+0x53/0x90 fs/gfs2/log.c:814
 #4: 88802854d248 (&sdp->sd_freeze_mutex){+.+.}-{3:3}, at: 
signal_our_withdraw fs/gfs2/util.c:151 [inline]
 #4: 88802854d248 (&sdp->sd_freeze_mutex){+.+.}-{3:3}, at: 
gfs2_withdraw+0x477/0x11e0 fs/gfs2/util.c:334
Preemption disabled at:
[<>] 0x0
CPU: 1 PID: 5019 Comm: syz-executor154 Not tainted 
6.5.0-rc2-syzkaller-00066-g46670259519f #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
07/12/2023
Call Trace:
 
 __dump_stack lib/dump_stack.c:88 [inline]
 dump_stack_lvl+0x1e7/0x2d0 lib/dump_stack.c:106
 __might_resched+0x5cf/0x780 kernel/sched/core.c:10189
 __wait_for_common kernel/sched/completion.c:101 [inline]
 wait_for_common kernel/sched/completion.c:117 [inline]
 wait_for_completion+0x1b/0x60 kernel/sched/completion.c:138
 kthread_stop+0x18e/0x5a0 kernel/kthread.c:710
 gfs2_make_fs_ro+0x183/0x680 fs/gfs2/super.c:555
 signal_our_withdraw fs/gfs2/util.c:153 [inline]
 gfs2_withdraw+0x48a/0x11e0 fs/gfs2/util.c:334
 gfs2_ail1_empty+0x7d0/0x860 fs/gfs2/log.c:377
 gfs2_flush_revokes+0x5e/0x90 fs/gfs2/log.c:815
 revoke_lo_before_commit+0x2c/0x5f0 fs/gfs2/lops.c:868
 lops_before_commit fs/gfs2/lops.h:40 [inline]
 gfs2_log_flush+0xc93/0x25f0 fs/gfs2/log.c:1101
 do_sync+0xa35/0xc80 fs/gfs2/quota.c:977
 gfs2_quota_sync+0x30e/0x700 fs/gfs2/quota.c:1320
 gfs2_sync_fs+0x4d/0xb0 fs/gfs2/super.c:680
 sync_filesystem+0xec/0x220 fs/sync.c:56
 generic_shutdown_super+0x6f/0x340 fs/super.c:472
 kill_block_super+0x68/0xa0 fs/super.c:1417
 deactivate_locked_super+0xa4/0x110 fs/super.c:330
 cleanup_mnt+0x426/0x4c0 fs/namespace.c:1254
 task_work_run+0x24a/0x300 kernel/task_work.c:179
 exit_task_work include/linux/task_work.h:38 [inline]
 do_exit+0x68f/0x2290 kernel/exit.c:874
 do_group_exit+0x206/0x2c0 kernel/exit.c:1024
 __do_sys_exit_group kernel/exit.c:1035 [inline]
 __se_sys_exit_group kernel/exit.c:1033 [inline]
 __x64_sys_exit_group+0x3f/0x40 kernel/exit.c:1033
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x63/0xcd
RIP: 0033:0x7fca0c3e4749
Code: Unable to access opcode bytes at 0x7fca0c3e471f.
RSP: 002b:7ffdd6ff7a08 EFLAGS: 0246 ORIG_RAX: 00e7
RAX: ffda RBX: 0001 RCX: 7fca0c3e4749
RDX: 003c RSI: 00e7 RDI: 0001
RBP: 7fca0c47f2b0 R08: ffb8 R09: 0001f6db
R10:  R11: 0246 R12: 7fca0c47f2b0
R13:  R14: 7fca0c480020 R15: 7fca0c3b2c90
 
BUG: scheduling while atomic: syz-executor154/5019/0x0002
5 locks h

Re: [Cluster-devel] [PATCH v2 01/47] mm: vmscan: move shrinker-related code into a separate file

2023-07-25 Thread Muchun Song



> On Jul 25, 2023, at 11:09, Qi Zheng  wrote:
> 
> 
> 
> On 2023/7/25 10:35, Muchun Song wrote:
>>> On Jul 24, 2023, at 17:43, Qi Zheng  wrote:
>>> 
>>> The mm/vmscan.c file is too large, so separate the shrinker-related
>>> code from it into a separate file. No functional changes.
>>> 
>>> Signed-off-by: Qi Zheng 
>>> ---
>>> include/linux/shrinker.h |   3 +
>>> mm/Makefile  |   4 +-
>>> mm/shrinker.c| 707 +++
>>> mm/vmscan.c  | 701 --
>>> 4 files changed, 712 insertions(+), 703 deletions(-)
>>> create mode 100644 mm/shrinker.c
>>> 
>>> diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
>>> index 224293b2dd06..961cb84e51f5 100644
>>> --- a/include/linux/shrinker.h
>>> +++ b/include/linux/shrinker.h
>>> @@ -96,6 +96,9 @@ struct shrinker {
>>>  */
>>> #define SHRINKER_NONSLAB (1 << 3)
>>> 
>>> +unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup 
>>> *memcg,
>>> +int priority);
>> A good cleanup, vmscan.c is so huge.
>> I'd like to introduce a new header in mm/ directory and contains those
>> declarations of functions (like this and other debug function in
>> shrinker_debug.c) since they are used internally across mm.
> 
> How about putting them in the mm/internal.h file?

Either is fine to me.

> 
>> Thanks.




Re: [Cluster-devel] [PATCH v2 06/47] drm/ttm: dynamically allocate the drm-ttm_pool shrinker

2023-07-25 Thread Muchun Song



> On Jul 24, 2023, at 17:43, Qi Zheng  wrote:
> 
> Use new APIs to dynamically allocate the drm-ttm_pool shrinker.
> 
> Signed-off-by: Qi Zheng 

Reviewed-by: Muchun Song 

Thanks.



Re: [Cluster-devel] [PATCH v2 04/47] kvm: mmu: dynamically allocate the x86-mmu shrinker

2023-07-25 Thread Muchun Song



> On Jul 24, 2023, at 17:43, Qi Zheng  wrote:
> 
> Use new APIs to dynamically allocate the x86-mmu shrinker.
> 
> Signed-off-by: Qi Zheng 

Reviewed-by: Muchun Song 

Thanks.



Re: [Cluster-devel] [PATCH v2 02/47] mm: shrinker: remove redundant shrinker_rwsem in debugfs operations

2023-07-25 Thread Muchun Song



> On Jul 24, 2023, at 17:43, Qi Zheng  wrote:
> 
> The debugfs_remove_recursive() will wait for debugfs_file_put() to return,
> so the shrinker will not be freed when doing debugfs operations (such as
> shrinker_debugfs_count_show() and shrinker_debugfs_scan_write()), so there
> is no need to hold shrinker_rwsem during debugfs operations.
> 
> Signed-off-by: Qi Zheng 

Reviewed-by: Muchun Song 

Thanks.