Re: [PATCH 11/11] sysctl: treewide: constify the ctl_table argument of handlers

2024-03-27 Thread Dave Chinner
   int write,
>  void *buffer, size_t *lenp, loff_t *ppos)
>  {
>   int ret;

And this.

> @@ -474,8 +475,10 @@ int perf_event_max_sample_rate_handler(struct ctl_table 
> *table, int write,
>  
>  int sysctl_perf_cpu_time_max_percent __read_mostly = 
> DEFAULT_CPU_TIME_MAX_PERCENT;
>  
> -int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
> - void *buffer, size_t *lenp, loff_t *ppos)
> +int perf_cpu_time_max_percent_handler(const struct ctl_table *table,
> +   int write,
> +   void *buffer, size_t *lenp,
> +   loff_t *ppos)
>  {
>   int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
>  

And this.

> diff --git a/kernel/hung_task.c b/kernel/hung_task.c
> index b2fc2727d654..003f0f5cb111 100644
> --- a/kernel/hung_task.c
> +++ b/kernel/hung_task.c
> @@ -239,9 +239,10 @@ static long hung_timeout_jiffies(unsigned long 
> last_checked,
>  /*
>   * Process updating of timeout sysctl
>   */
> -static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
> -   void *buffer,
> -   size_t *lenp, loff_t *ppos)
> +static int proc_dohung_task_timeout_secs(const struct ctl_table *table,
> +  int write,
> +  void *buffer,
> +  size_t *lenp, loff_t *ppos)
>  {
>   int ret;
>  

And this.

> diff --git a/kernel/latencytop.c b/kernel/latencytop.c
> index 781249098cb6..0a5c22b19821 100644
> --- a/kernel/latencytop.c
> +++ b/kernel/latencytop.c
> @@ -65,8 +65,9 @@ static struct latency_record latency_record[MAXLR];
>  int latencytop_enabled;
>  
>  #ifdef CONFIG_SYSCTL
> -static int sysctl_latencytop(struct ctl_table *table, int write, void 
> *buffer,
> -     size_t *lenp, loff_t *ppos)
> +static int sysctl_latencytop(const struct ctl_table *table, int write,
> +  void *buffer,
> +  size_t *lenp, loff_t *ppos)
>  {
>   int err;
>  

And this.

I could go on, but there are so many examples of this in the patch
that I think it needs to be tossed away and regenerated in a way
that doesn't trash the existing function parameter formatting.
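
To illustrate (my example, not taken from the patch): where adding the
"const" pushes a prototype over 80 columns, the minimal rewrap keeps the
continuation line aligned with the opening parenthesis, e.g.:

-static int sysctl_latencytop(struct ctl_table *table, int write, void *buffer,
-                             size_t *lenp, loff_t *ppos)
+static int sysctl_latencytop(const struct ctl_table *table, int write,
+                             void *buffer, size_t *lenp, loff_t *ppos)

rather than splitting every parameter onto its own line with a new,
arbitrary indentation.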

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [PATCH v2] statx: stx_subvol

2024-03-10 Thread Dave Chinner
On Fri, Mar 08, 2024 at 08:56:33AM -0800, Darrick J. Wong wrote:
> Should the XFS data and rt volumes be reported with different stx_vol
> values?

No, because all the inodes are on the data volume and the same inode
can have data on the data volume or the rt volume. i.e. "data on rt,
truncate, clear rt, copy data back into data dev".  It's still the
same inode, and may have exactly the same data, so why should we change
stx_vol and make it appear to userspace as being a different inode?

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [PATCH 3/4] xattr: Use dedicated slab buckets for setxattr()

2024-03-04 Thread Dave Chinner
On Mon, Mar 04, 2024 at 10:49:31AM -0800, Kees Cook wrote:
> The setxattr() API can be used for exploiting[1][2][3] use-after-free
> type confusion flaws in the kernel. Avoid having a user-controlled size
> cache share the global kmalloc allocator by using a separate set of
> kmalloc buckets.
> 
> Link: https://duasynt.com/blog/linux-kernel-heap-spray [1]
> Link: https://etenal.me/archives/1336 [2]
> Link: 
> https://github.com/a13xp0p0v/kernel-hack-drill/blob/master/drill_exploit_uaf.c
>  [3]
> Signed-off-by: Kees Cook 
> ---
> Cc: Christian Brauner 
> Cc: Alexander Viro 
> Cc: Jan Kara 
> Cc: linux-fsde...@vger.kernel.org
> ---
>  fs/xattr.c | 12 +++-
>  1 file changed, 11 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/xattr.c b/fs/xattr.c
> index 09d927603433..2b06316f1d1f 100644
> --- a/fs/xattr.c
> +++ b/fs/xattr.c
> @@ -821,6 +821,16 @@ SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, 
> name,
>   return error;
>  }
>  
> +static struct kmem_buckets *xattr_buckets;
> +static int __init init_xattr_buckets(void)
> +{
> + xattr_buckets = kmem_buckets_create("xattr", 0, 0, 0,
> + XATTR_LIST_MAX, NULL);
> +
> + return 0;
> +}
> +subsys_initcall(init_xattr_buckets);
> +
>  /*
>   * Extended attribute LIST operations
>   */
> @@ -833,7 +843,7 @@ listxattr(struct dentry *d, char __user *list, size_t 
> size)
>   if (size) {
>   if (size > XATTR_LIST_MAX)
>   size = XATTR_LIST_MAX;
> - klist = kvmalloc(size, GFP_KERNEL);
> + klist = kmem_buckets_alloc(xattr_buckets, size, GFP_KERNEL);

There's a reason this uses kvmalloc() - allocations can be up to
64kB in size and it's not uncommon for large slab allocations to
fail on long-running machines. Hence this needs to fall back to
vmalloc() to ensure that large xattrs can always be read.

Essentially, you're trading a heap spraying vector that almost
no-one will ever see for a far more frequent -ENOMEM denial of
service that will be seen on production systems where large xattrs
are used.
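
For anyone following along, the behaviour being relied on here amounts
to the sketch below (illustrative only - kvmalloc() is the real
implementation, and xattr_list_buf_alloc() is just a made-up name for
this example):

	/* try a slab allocation first, fall back to vmalloc() on failure */
	static void *xattr_list_buf_alloc(size_t size)
	{
		void *buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);

		if (!buf)
			buf = vmalloc(size);
		return buf;
	}

A buckets-only kmem_buckets_alloc() has no such fallback, so a failed
64kB slab allocation becomes a hard -ENOMEM for the listxattr() call.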

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [PATCH v3 25/35] xfs: Memory allocation profiling fixups

2024-02-14 Thread Dave Chinner
On Mon, Feb 12, 2024 at 01:39:11PM -0800, Suren Baghdasaryan wrote:
> From: Kent Overstreet 
> 
> This adds an alloc_hooks() wrapper around kmem_alloc(), so that we can
> have allocations accounted to the proper callsite.
> 
> Signed-off-by: Kent Overstreet 
> Signed-off-by: Suren Baghdasaryan 
> ---
>  fs/xfs/kmem.c |  4 ++--
>  fs/xfs/kmem.h | 10 --
>  2 files changed, 6 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
> index c557a030acfe..9aa57a4e2478 100644
> --- a/fs/xfs/kmem.c
> +++ b/fs/xfs/kmem.c
> @@ -8,7 +8,7 @@
>  #include "xfs_trace.h"
>  
>  void *
> -kmem_alloc(size_t size, xfs_km_flags_t flags)
> +kmem_alloc_noprof(size_t size, xfs_km_flags_t flags)
>  {
>   int retries = 0;
>   gfp_t   lflags = kmem_flags_convert(flags);
> @@ -17,7 +17,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
>   trace_kmem_alloc(size, flags, _RET_IP_);
>  
>   do {
> - ptr = kmalloc(size, lflags);
> + ptr = kmalloc_noprof(size, lflags);
>   if (ptr || (flags & KM_MAYFAIL))
>   return ptr;
>   if (!(++retries % 100))
> diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
> index b987dc2c6851..c4cf1dc2a7af 100644
> --- a/fs/xfs/kmem.h
> +++ b/fs/xfs/kmem.h
> @@ -6,6 +6,7 @@
>  #ifndef __XFS_SUPPORT_KMEM_H__
>  #define __XFS_SUPPORT_KMEM_H__
>  
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -56,18 +57,15 @@ kmem_flags_convert(xfs_km_flags_t flags)
>   return lflags;
>  }
>  
> -extern void *kmem_alloc(size_t, xfs_km_flags_t);
>  static inline void  kmem_free(const void *ptr)
>  {
>   kvfree(ptr);
>  }
>  
> +extern void *kmem_alloc_noprof(size_t, xfs_km_flags_t);
> +#define kmem_alloc(...)  
> alloc_hooks(kmem_alloc_noprof(__VA_ARGS__))
>  
> -static inline void *
> -kmem_zalloc(size_t size, xfs_km_flags_t flags)
> -{
> - return kmem_alloc(size, flags | KM_ZERO);
> -}
> +#define kmem_zalloc(_size, _flags)   kmem_alloc((_size), (_flags) | KM_ZERO)
>  
>  /*
>   * Zone interfaces
> -- 
> 2.43.0.687.g38aa6559b0-goog

These changes can be dropped - the fs/xfs/kmem.[ch] stuff is now
gone in linux-xfs/for-next.

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [RFC 05/18] pkernfs: add file mmap callback

2024-02-05 Thread Dave Chinner
On Mon, Feb 05, 2024 at 12:01:50PM +, James Gowans wrote:
> Make the file data useable to userspace by adding mmap. That's all that
> QEMU needs for guest RAM, so that's all we bother implementing for now.
> 
> When mmaping the file the VMA is marked as PFNMAP to indicate that there
> are no struct pages for the memory in this VMA. Remap_pfn_range() is
> used to actually populate the page tables. All PTEs are pre-faulted into
> the pgtables at mmap time so that the pgtables are useable when this
> virtual address range is given to VFIO's MAP_DMA.

And so what happens when this file is truncated whilst it is mmap()d
by an application? Ain't that just a great big UAF waiting to be
exploited?

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [RFC PATCH v2 7/8] Introduce dcache_is_aliasing() across all architectures

2024-01-31 Thread Dave Chinner
On Wed, Jan 31, 2024 at 09:58:21AM -0500, Mathieu Desnoyers wrote:
> On 2024-01-30 21:48, Dave Chinner wrote:
> > On Tue, Jan 30, 2024 at 11:52:54AM -0500, Mathieu Desnoyers wrote:
> > > Introduce a generic way to query whether the dcache is virtually aliased
> > > on all architectures. Its purpose is to ensure that subsystems which
> > > are incompatible with virtually aliased data caches (e.g. FS_DAX) can
> > > reliably query this.
> > > 
> > > For dcache aliasing, there are three scenarios dependending on the
> > > architecture. Here is a breakdown based on my understanding:
> > > 
> > > A) The dcache is always aliasing:
> > > 
> > > * arc
> > > * csky
> > > * m68k (note: shared memory mappings are incoherent ? SHMLBA is missing 
> > > there.)
> > > * sh
> > > * parisc
> > 
> > /me wonders why the dentry cache aliasing has problems on these
> > systems.
> > 
> > Oh, dcache != fs/dcache.c (the VFS dentry cache).
> > 
> > Can you please rename this function appropriately so us dumb
> > filesystem people don't confuse cpu data cache configurations with
> > the VFS dentry cache aliasing when we read this code? Something like
> > cpu_dcache_is_aliased(), perhaps?
> 
> Good point, will do. I'm planning go rename as follows for v3 to
> eliminate confusion with dentry cache (and with "page cache" in
> general):
> 
> ARCH_HAS_CACHE_ALIASING -> ARCH_HAS_CPU_CACHE_ALIASING
> dcache_is_aliasing() -> cpu_dcache_is_aliasing()
> 
> I noticed that you suggested "aliased" rather than "aliasing",
> but I followed what arm64 did for icache_is_aliasing(). Do you
> have a strong preference one way or another ?

Not really.

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [RFC PATCH v2 8/8] dax: Fix incorrect list of dcache aliasing architectures

2024-01-30 Thread Dave Chinner
On Tue, Jan 30, 2024 at 11:52:55AM -0500, Mathieu Desnoyers wrote:
> commit d92576f1167c ("dax: does not work correctly with virtual aliasing 
> caches")
> prevents DAX from building on architectures with virtually aliased
> dcache with:
> 
>   depends on !(ARM || MIPS || SPARC)
> 
> This check is too broad (e.g. recent ARMv7 don't have virtually aliased
> dcaches), and also misses many other architectures with virtually
> aliased dcache.
> 
> This is a regression introduced in the v5.13 Linux kernel where the
> dax mount option is removed for 32-bit ARMv7 boards which have no dcache
> aliasing, and therefore should work fine with FS_DAX.
> 
> This was turned into the following implementation of dax_is_supported()
> by a preparatory change:
> 
> return !IS_ENABLED(CONFIG_ARM) &&
>!IS_ENABLED(CONFIG_MIPS) &&
>!IS_ENABLED(CONFIG_SPARC);
> 
> Use dcache_is_aliasing() instead to figure out whether the environment
> has aliasing dcaches.
> 
> Fixes: d92576f1167c ("dax: does not work correctly with virtual aliasing 
> caches")
> Signed-off-by: Mathieu Desnoyers 
> Cc: Andrew Morton 
> Cc: Linus Torvalds 
> Cc: linux...@kvack.org
> Cc: linux-a...@vger.kernel.org
> Cc: Dan Williams 
> Cc: Vishal Verma 
> Cc: Dave Jiang 
> Cc: Matthew Wilcox 
> Cc: Arnd Bergmann 
> Cc: Russell King 
> Cc: nvd...@lists.linux.dev
> Cc: linux-...@vger.kernel.org
> Cc: linux-fsde...@vger.kernel.org
> ---
>  include/linux/dax.h | 5 ++---
>  1 file changed, 2 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index cfc8cd4a3eae..f59e604662e4 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -5,6 +5,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  typedef unsigned long dax_entry_t;
>  
> @@ -80,9 +81,7 @@ static inline bool daxdev_mapping_supported(struct 
> vm_area_struct *vma,
>  }
>  static inline bool dax_is_supported(void)
>  {
> - return !IS_ENABLED(CONFIG_ARM) &&
> -!IS_ENABLED(CONFIG_MIPS) &&
> -!IS_ENABLED(CONFIG_SPARC);
> + return !dcache_is_aliasing();

Yeah, if this is just a one liner it should go into
fs_dax_get_by_bdev(), similar to the blk_queue_dax() check at the
start of the function.

I also noticed that device mapper uses fs_dax_get_by_bdev() to
determine if it can support DAX, but this patch set does not address
that case. Hence it really seems to me like fs_dax_get_by_bdev() is
the right place to put this check.
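
In other words, something of this shape (my sketch of the suggestion,
not a tested patch) at the top of fs_dax_get_by_bdev(), next to the
existing blk_queue_dax() check:

	/* DAX is not usable if the CPU data cache is virtually aliased */
	if (dcache_is_aliasing())
		return NULL;

so that every filesystem and the device mapper callers all get the same
behaviour without any per-caller changes.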

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [RFC PATCH v2 7/8] Introduce dcache_is_aliasing() across all architectures

2024-01-30 Thread Dave Chinner
On Tue, Jan 30, 2024 at 11:52:54AM -0500, Mathieu Desnoyers wrote:
> Introduce a generic way to query whether the dcache is virtually aliased
> on all architectures. Its purpose is to ensure that subsystems which
> are incompatible with virtually aliased data caches (e.g. FS_DAX) can
> reliably query this.
> 
> For dcache aliasing, there are three scenarios dependending on the
> architecture. Here is a breakdown based on my understanding:
> 
> A) The dcache is always aliasing:
> 
> * arc
> * csky
> * m68k (note: shared memory mappings are incoherent ? SHMLBA is missing 
> there.)
> * sh
> * parisc

/me wonders why the dentry cache aliasing has problems on these
systems.

Oh, dcache != fs/dcache.c (the VFS dentry cache).

Can you please rename this function appropriately so us dumb
filesystem people don't confuse cpu data cache configurations with
the VFS dentry cache aliasing when we read this code? Something like
cpu_dcache_is_aliased(), perhaps?

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [RFC PATCH v2 1/8] dax: Introduce dax_is_supported()

2024-01-30 Thread Dave Chinner
On Tue, Jan 30, 2024 at 11:52:48AM -0500, Mathieu Desnoyers wrote:
> Introduce a new dax_is_supported() static inline to check whether the
> architecture supports DAX.
> 
> This replaces the following fs/Kconfig:FS_DAX dependency:
> 
>   depends on !(ARM || MIPS || SPARC)
> 
> This is done in preparation for its use by each filesystem supporting
> the dax mount option to validate whether dax is indeed supported.
> 
> This is done in preparation for using dcache_is_aliasing() in a
> following change which will properly support architectures which detect
> dcache aliasing at runtime.
> 
> Fixes: d92576f1167c ("dax: does not work correctly with virtual aliasing 
> caches")
> Signed-off-by: Mathieu Desnoyers 
> Cc: Andrew Morton 
> Cc: Linus Torvalds 
> Cc: linux...@kvack.org
> Cc: linux-a...@vger.kernel.org
> Cc: Dan Williams 
> Cc: Vishal Verma 
> Cc: Dave Jiang 
> Cc: Matthew Wilcox 
> Cc: Arnd Bergmann 
> Cc: Russell King 
> Cc: nvd...@lists.linux.dev
> Cc: linux-...@vger.kernel.org
> Cc: linux-fsde...@vger.kernel.org
> ---
>  fs/Kconfig  |  1 -
>  include/linux/dax.h | 10 ++
>  2 files changed, 10 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/Kconfig b/fs/Kconfig
> index 42837617a55b..e5efdb3b276b 100644
> --- a/fs/Kconfig
> +++ b/fs/Kconfig
> @@ -56,7 +56,6 @@ endif # BLOCK
>  config FS_DAX
>   bool "File system based Direct Access (DAX) support"
>   depends on MMU
> - depends on !(ARM || MIPS || SPARC)
>   depends on ZONE_DEVICE || FS_DAX_LIMITED
>   select FS_IOMAP
>   select DAX
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index b463502b16e1..cfc8cd4a3eae 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -78,6 +78,12 @@ static inline bool daxdev_mapping_supported(struct 
> vm_area_struct *vma,
>   return false;
>   return dax_synchronous(dax_dev);
>  }
> +static inline bool dax_is_supported(void)
> +{
> + return !IS_ENABLED(CONFIG_ARM) &&
> +!IS_ENABLED(CONFIG_MIPS) &&
> +!IS_ENABLED(CONFIG_SPARC);
> +}

Uh, ok. Now I see what dax_is_supported() does.

I think this should be folded into fs_dax_get_by_bdev(), which
currently returns NULL if CONFIG_FS_DAX=n and so should be changed
to return NULL if any of these platform configs is enabled.

Then I don't think you need to change a single line of filesystem
code - they'll all just do what they do now if the block device
doesn't support DAX

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [RFC PATCH 7/7] xfs: Use dax_is_supported()

2024-01-29 Thread Dave Chinner
On Mon, Jan 29, 2024 at 04:06:31PM -0500, Mathieu Desnoyers wrote:
> Use dax_is_supported() to validate whether the architecture has
> virtually aliased caches at mount time.
> 
> This is relevant for architectures which require a dynamic check
> to validate whether they have virtually aliased data caches
> (ARCH_HAS_CACHE_ALIASING_DYNAMIC=y).

Where's the rest of this patchset? I have no idea what
dax_is_supported() actually does, how it interacts with
CONFIG_FS_DAX, etc.

If you are changing anything to do with FSDAX, then cc-ing the
-entire- patchset to linux-fsdevel is absolutely necessary so the
entire patchset lands in our inboxes and not just a random patch
from the middle of a bigger change.

> Fixes: d92576f1167c ("dax: does not work correctly with virtual aliasing 
> caches")
> Signed-off-by: Mathieu Desnoyers 
> Cc: Chandan Babu R 
> Cc: Darrick J. Wong 
> Cc: linux-...@vger.kernel.org
> Cc: Andrew Morton 
> Cc: Linus Torvalds 
> Cc: linux...@kvack.org
> Cc: linux-a...@vger.kernel.org
> Cc: Dan Williams 
> Cc: Vishal Verma 
> Cc: Dave Jiang 
> Cc: Matthew Wilcox 
> Cc: nvd...@lists.linux.dev
> Cc: linux-...@vger.kernel.org
> ---
>  fs/xfs/xfs_super.c | 20 ++--
>  1 file changed, 14 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 764304595e8b..b27ecb11db66 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -1376,14 +1376,22 @@ xfs_fs_parse_param(
>   case Opt_nodiscard:
>   parsing_mp->m_features &= ~XFS_FEAT_DISCARD;
>   return 0;
> -#ifdef CONFIG_FS_DAX
>   case Opt_dax:
> - xfs_mount_set_dax_mode(parsing_mp, XFS_DAX_ALWAYS);
> - return 0;
> + if (dax_is_supported()) {
> + xfs_mount_set_dax_mode(parsing_mp, XFS_DAX_ALWAYS);
> + return 0;
> + } else {
> + xfs_warn(parsing_mp, "dax option not supported.");
> + return -EINVAL;
> + }
>   case Opt_dax_enum:
> - xfs_mount_set_dax_mode(parsing_mp, result.uint_32);
> - return 0;
> -#endif
> + if (dax_is_supported()) {
> + xfs_mount_set_dax_mode(parsing_mp, result.uint_32);
> + return 0;
> + } else {
> + xfs_warn(parsing_mp, "dax option not supported.");
> + return -EINVAL;
> + }

Assuming that I understand what dax_is_supported() is doing, this
change isn't right.  We're just setting the DAX configuration flags
from the mount options here, we don't validate them until 
we've parsed all options and eliminated conflicts and rejected
conflicting options. We validate whether the options are
appropriate for the underlying hardware configuration later in the
mount process.

dax=always suitability is checked in xfs_setup_dax_always() called
later in the mount process when we have enough context and support
to open storage devices and check them for DAX support. If the
hardware does not support DAX then we simply turn off DAX
support, we do not reject the mount as this change does.

dax=inode and dax=never are valid options on all configurations,
even those without FSDAX support or with hardware that is not
capable of using DAX. dax=inode only affects how an inode is
instantiated in cache - if the inode has a flag that says "use DAX"
and dax is supportable by the hardware, then we turn on DAX for
that inode. Otherwise we just use the normal non-dax IO paths.

Again, we don't error out the filesystem if DAX is not supported,
we just don't turn it on. This check is done in
xfs_inode_should_enable_dax() and I think all you need to do is
replace the IS_ENABLED(CONFIG_FS_DAX) with a dax_is_supported()
call...
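
i.e. roughly this one-line swap in xfs_inode_should_enable_dax()
(sketch only, untested):

-	if (!IS_ENABLED(CONFIG_FS_DAX))
+	if (!dax_is_supported())
 		return false;

with the mount option parsing left exactly as it is today.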

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [f2fs-dev] [PATCH 1/5] zonefs: pass GFP_KERNEL to blkdev_zone_mgmt() call

2024-01-23 Thread Dave Chinner via Linux-f2fs-devel
On Tue, Jan 23, 2024 at 09:39:02PM +0100, Mikulas Patocka wrote:
> 
> 
> On Tue, 23 Jan 2024, Johannes Thumshirn wrote:
> 
> > Pass GFP_KERNEL instead of GFP_NOFS to the blkdev_zone_mgmt() call in
> > zonefs_zone_mgmt().
> > 
> > As zonefs_zone_mgmt() and zonefs_inode_zone_mgmt() are never called
> > from a place that can recurse back into the filesystem on memory reclaim,
> > it is safe to call blkdev_zone_mgmt() with GFP_KERNEL.
> > 
> > Signed-off-by: Johannes Thumshirn 
> > ---
> >  fs/zonefs/super.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
> > index 93971742613a..63fbac018c04 100644
> > --- a/fs/zonefs/super.c
> > +++ b/fs/zonefs/super.c
> > @@ -113,7 +113,7 @@ static int zonefs_zone_mgmt(struct super_block *sb,
> >  
> > trace_zonefs_zone_mgmt(sb, z, op);
> > ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector,
> > -  z->z_size >> SECTOR_SHIFT, GFP_NOFS);
> > +  z->z_size >> SECTOR_SHIFT, GFP_KERNEL);
> > if (ret) {
> > zonefs_err(sb,
> >"Zone management operation %s at %llu failed %d\n",
> > 
> > -- 
> > 2.43.0
> 
> zonefs_inode_zone_mgmt calls 
> lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex); - so, this 
> function is called with the mutex held - could it happen that the 
> GFP_KERNEL allocation recurses into the filesystem and attempts to take 
> i_truncate_mutex as well?
> 
> i.e. GFP_KERNEL -> iomap_do_writepage -> zonefs_write_map_blocks -> 
> zonefs_write_iomap_begin -> mutex_lock(&zi->i_truncate_mutex)

zonefs doesn't have a ->writepage method, so writeback can't be
called from memory reclaim like this.

-Dave.
-- 
Dave Chinner
da...@fromorbit.com




Re: [PATCH] bcachefs: fix incorrect usage of REQ_OP_FLUSH

2024-01-22 Thread Dave Chinner
On Mon, Jan 22, 2024 at 07:41:47PM +0100, Christoph Hellwig wrote:
> On Mon, Jan 22, 2024 at 12:42:24PM -0500, Kent Overstreet wrote:
> > updating my tests for the MD_FAULTY removal, then will do. Is there
> > anything you want me to look for?
> 
> Nothing fancy.  Just that you do data integrity operations and do
> not see an error.
> 
> > considering most tests won't break or won't clearly break if flush/fua
> > is being dropped (even generic/388 was passing reliably, of course,
> > since virtual block devices aren't going to reorder writes...) maybe we
> > could do some print statement sanity checking...
> 
> Well, xfstests is not very good about power fail testing, because it
> can't inject a power fail..  Which is a problem in it's own, but
> would need hardware power failing or at least some pretty good VM
> emulation of the same outside the scope of the actual xfstests runner.

We do actually have stuff in fstests that checks write vs flush
ordering as issued by the filesystem. We use dm-logwrites for
tracking the writes and flushes and to be able to replay arbitrary
groups of writes between flushes.  generic/482 is one of those
tests.
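
(For reference, on a configured fstests setup - TEST_DEV/SCRATCH_DEV
set up and dm-log-writes available - it runs like any other test:

	./check generic/482

so it is easy to point at a filesystem change under development.)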

These are the ordering problems that power failures expose,
so we do actually have tests that cover the same conditions that
pulling the power from a device would exercise.

I even wrote a debugging script (tools/dm-logwrite-replay) to
iterate write+fua groups in the test corpse to isolate the write
groups where the consistency failure occurs when doing work to
optimise flushes being issued by the XFS journal checkpoint writes.

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: file handle in statx

2023-12-12 Thread Dave Chinner
On Tue, Dec 12, 2023 at 05:39:27PM -0500, Kent Overstreet wrote:
> On Wed, Dec 13, 2023 at 09:23:18AM +1100, Dave Chinner wrote:
> > On Wed, Dec 13, 2023 at 08:57:43AM +1100, NeilBrown wrote:
> > > On Wed, 13 Dec 2023, Dave Chinner wrote:
> > > > On Tue, Dec 12, 2023 at 09:15:29AM -0800, Frank Filz wrote:
> > > > > > On Tue, Dec 12, 2023 at 10:10:23AM +0100, Donald Buczek wrote:
> > > > > > > On 12/12/23 06:53, Dave Chinner wrote:
> > > > > > >
> > > > > > > > So can someone please explain to me why we need to try to 
> > > > > > > > re-invent
> > > > > > > > a generic filehandle concept in statx when we already have a 
> > > > > > > > have
> > > > > > > > working and widely supported user API that provides exactly this
> > > > > > > > functionality?
> > > > > > >
> > > > > > > name_to_handle_at() is fine, but userspace could profit from being
> > > > > > > able to retrieve the filehandle together with the other metadata 
> > > > > > > in a
> > > > > > > single system call.
> > > > > > 
> > > > > > Can you say more?  What, specifically is the application that would 
> > > > > > want
> > > > > to do
> > > > > > that, and is it really in such a hot path that it would be a 
> > > > > > user-visible
> > > > > > improveable, let aloine something that can be actually be measured?
> > > > > 
> > > > > A user space NFS server like Ganesha could benefit from getting 
> > > > > attributes
> > > > > and file handle in a single system call.
> > > > 
> > > > At the cost of every other application that doesn't need those
> > > > attributes.
> > > 
> > > Why do you think there would be a cost?
> > 
> > It's as much maintenance and testing cost as it is a runtime cost.
> > We have to test and check this functionality works as advertised,
> > and we have to maintain that in working order forever more. That's
> > not free, especially if it is decided that the implementation needs
> > to be hyper-optimised in each individual filesystem because of
> > performance cost reasons.
> > 
> > Indeed, even the runtime "do we need to fetch this information"
> > checks have a measurable cost, especially as statx() is a very hot
> > kernel path. We've been optimising branches out of things like
> > setting up kiocbs because when that path is taken millions of times
> > every second each logic branch that decides if something needs to be
> > done or not has a direct measurable cost. statx() is a hot path that
> > can be called millions of times a second.
> 
> Like Neal mentioned we won't even be fetching the fh if it wasn't
> explicitly requested - and like I mentioned, we can avoid the
> .encode_fh() call for local filesystems with a bit of work at the VFS
> layer.
> 
> OTOH, when you're running rsync in incremental mode, and detecting
> hardlinks, your point that "statx can be called millions of times per
> second" would apply just as much to the additional name_to_handle_at()
> call - we'd be nearly doubling their overhead for scanning files that
> don't need to be sent.

Hardlinked files are indicated by st_nlink > 1, not by requiring
userspace to store every st_ino/dev it sees and having to compare
the st_ino/dev of every newly stat()d inode against that ino/dev
cache.

We only need ino/dev/filehandles for hardlink path disambiguation.

IOWs, this use case does not need name_to_handle_at() for millions
of inodes - it is just needed on the regular file inodes that have
st_nlink > 1.

Hence even for workloads like rsync with hardlink detection, we
don't need filehandles for every inode being stat()d.  And that's
ignoring the fact that, outside of certain niche use cases,
hardlinks are rare.
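
For illustration, the userspace side of that looks roughly like the
sketch below (not rsync's actual code; remember_hardlink() is a
placeholder):

	struct stat st;
	struct file_handle *fh = alloca(sizeof(*fh) + MAX_HANDLE_SZ);
	int mnt_id;

	if (fstatat(dirfd, name, &st, AT_SYMLINK_NOFOLLOW) < 0)
		return -1;
	if (S_ISREG(st.st_mode) && st.st_nlink > 1) {
		/* only hardlinked regular files need a stable identity */
		fh->handle_bytes = MAX_HANDLE_SZ;
		if (name_to_handle_at(dirfd, name, fh, &mnt_id, 0) == 0)
			remember_hardlink(fh, mnt_id, name);
	}

i.e. the extra syscall is only ever paid on the rare st_nlink > 1 case.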

I'm really struggling to see what filehandles in statx() actually
optimises in any meaningful manner

> > And then comes the cost of encoding dynamically sized information in
> > struct statx - filehandles are not fixed size - and statx is most
> > definitely not set up or intended for dynamically sized attribute
> > data. This adds more complexity to statx because it wasn't designed
> > or intended to handle dynamically sized attributes. Optional
attributes, yes, but not attributes that might vary in size from fs
to fs or even inode type to inode type within a filesystem (e.

Re: file handle in statx

2023-12-12 Thread Dave Chinner
On Tue, Dec 12, 2023 at 09:15:29AM -0800, Frank Filz wrote:
> > On Tue, Dec 12, 2023 at 10:10:23AM +0100, Donald Buczek wrote:
> > > On 12/12/23 06:53, Dave Chinner wrote:
> > >
> > > > So can someone please explain to me why we need to try to re-invent
> > > > a generic filehandle concept in statx when we already have a have
> > > > working and widely supported user API that provides exactly this
> > > > functionality?
> > >
> > > name_to_handle_at() is fine, but userspace could profit from being
> > > able to retrieve the filehandle together with the other metadata in a
> > > single system call.
> > 
> > Can you say more?  What, specifically is the application that would want
> to do
> > that, and is it really in such a hot path that it would be a user-visible
> > improveable, let aloine something that can be actually be measured?
> 
> A user space NFS server like Ganesha could benefit from getting attributes
> and file handle in a single system call.

At the cost of every other application that doesn't need those
attributes. That's not a good trade-off - the cost of a syscall is
tiny compared to the rest of the work that has to be done to create
a stable filehandle for an inode, even if we have a variant of
name_to_handle_at() that takes an open fd rather than a path.

> Potentially it could also avoid some of the challenges of using
> name_to_handle_at that is a privileged operation.

It isn't. open_by_handle_at() is privileged because it bypasses all
the path based access checking that a normal open() does. Anyone can
get a filehandle for a path from the kernel, but few can actually
use it for anything other than file uniqueness checks

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: file handle in statx (was: Re: How to cope with subvolumes and snapshots on muti-user systems?)

2023-12-12 Thread Dave Chinner
On Tue, Dec 12, 2023 at 10:21:53AM -0500, Kent Overstreet wrote:
> On Tue, Dec 12, 2023 at 04:53:28PM +1100, Dave Chinner wrote:
> > Doesn't anyone else see or hear the elephant trumpeting loudly in
> > the middle of the room?
> > 
> > I mean, we already have name_to_handle_at() for userspace to get a
> > unique, opaque, filesystem defined file handle for any given file.
> > It's the same filehandle that filesystems hand to the nfsd so nfs
> > clients can uniquely identify the file they are asking the nfsd to
> > operate on.
> > 
> > The contents of these filehandles is entirely defined by the file
> > system and completely opaque to the user. The only thing that
> > parses the internal contents of the handle is the filesystem itself.
> > Therefore, as long as the fs encodes the information it needs into the
> > handle to determine what subvol/snapshot the inode belongs to when
> > the handle is passed back to it (e.g. from open_by_handle_at()) then
> > nothing else needs to care how it is encoded.
> > 
> > So can someone please explain to me why we need to try to re-invent
> > a generic filehandle concept in statx when we already have a
> > have working and widely supported user API that provides exactly
> > this functionality?
> 
> Definitely should be part of the discussion :)
> 
> But I think it _does_ need to be in statx; because:
>  - we've determined that 64 bit ino_t just isn't a future proof
>interface, we're having real problems with it today
>  - statx is _the_ standard, future proofed interface for getting inode
>attributes

No, it most definitely isn't, and statx was never intended as a
dumping ground for anything and everything inode related. e.g. Any
inode attribute that can be modified needs to use a different
interface - one that has a corresponding "set" operation.

>  - therefore, if we want userspace programmers to be using filehandles,
>instead of inode numbers, so there code isn't broken, we need to be
>providing interfaces that guide them in that direction.

We already have a filehandle interface they can use for this
purpose. It is already used by some userspace applications for this
purpose.

Any new API functionality added to statx() will require application
changes, and the vast majority of applications aren't using statx()
directly - they are using stat() which glibc wraps to statx()
internally. So they are going to need a change of API, anyway.

So, fundamentally, there is a change of API for most applications
that need to do thorough inode uniqueness checks regardless of
anything else. They can do this right now - just continue using
stat() as they do today, and then use name_to_handle_at()
for uniqueness checks.

> Even assuming we can update all the documentation to say "filehandles
> are the correct way to test inode uniqueness", you know at least half of
> programmers will stick to stx_ino instead of the filehandle if the
> filehandle is an extra syscall.

Your argument is "programmers suck so we must design for the
lowest common denominator". That's an -awful- way to design APIs.

Further, this "programmers suck" design comes at a cost to every
statx() call that does not need filehandles. That's the vast
majority of statx() calls that are made on a system. Why should we
slow down statx() for all users when so few applications actually
need uniqueness and they can take the cost of robust uniqueness
tests with an extra syscall entirely themselves?

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: file handle in statx (was: Re: How to cope with subvolumes and snapshots on muti-user systems?)

2023-12-11 Thread Dave Chinner
On Tue, Dec 12, 2023 at 11:59:51AM +1100, NeilBrown wrote:
> On Tue, 12 Dec 2023, Kent Overstreet wrote:
> > On Tue, Dec 12, 2023 at 10:53:07AM +1100, NeilBrown wrote:
> > > On Tue, 12 Dec 2023, Kent Overstreet wrote:
> > > > On Tue, Dec 12, 2023 at 09:43:27AM +1100, NeilBrown wrote:
> > > > > On Sat, 09 Dec 2023, Kent Overstreet wrote:
> > > > Thoughts?
> > > > 
> > > 
> > > I'm completely in favour of exporting the (full) filehandle through
> > > statx. (If the application asked for the filehandle, it will expect a
> > > larger structure to be returned.  We don't need to use the currently
> > > reserved space).
> > > 
> > > I'm completely in favour of updating user-space tools to use the
> > > filehandle to check if two handles are for the same file.
> > > 
> > > I'm not in favour of any filesystem depending on this for correct
> > > functionality today.  As long as the filesystem isn't so large that
> > > inum+volnum simply cannot fit in 64 bits, we should make a reasonable
> > > effort to present them both in 64 bits.  Depending on the filehandle is a
> > > good plan for long term growth, not for basic functionality today.
> > 
> > My standing policy in these situations is that I'll do the stopgap/hacky
> > measure... but not before doing actual, real work on the longterm
> > solution :)
> 
> Eminently sensible.
> 
> > 
> > So if we're all in favor of statx as the real long term solution, how
> > about we see how far we get with that?
> > 
> 
> I suggest:
> 
>  STATX_ATTR_INUM_NOT_UNIQUE - it is possible that two files have the
>   same inode number
> 
>  
>  __u64 stx_vol Volume identifier.  Two files with same stx_vol and 
>stx_ino MUST be the same.  Exact meaning of volumes
>is filesys-specific
>  
>  STATX_VOL Want stx_vol
> 
>   __u8 stx_handle_len  Length of stx_handle if present
>   __u8 stx_handle[128] Unique stable identifier for this file.  Will
>NEVER be reused for a different file.
>This appears AFTER __statx_pad2, beyond
>the current 'struct statx'.
>  STATX_HANDLE  Want stx_handle_len and stx_handle. Buffer for
>receiving statx info has at least
>sizeof(struct statx)+128 bytes.

Hmmm.

Doesn't anyone else see or hear the elephant trumpeting loudly in
the middle of the room?

I mean, we already have name_to_handle_at() for userspace to get a
unique, opaque, filesystem defined file handle for any given file.
It's the same filehandle that filesystems hand to the nfsd so nfs
clients can uniquely identify the file they are asking the nfsd to
operate on.

The contents of these filehandles is entirely defined by the file
system and completely opaque to the user. The only thing that
parses the internal contents of the handle is the filesystem itself.
Therefore, as long as the fs encodes the information it needs into the
handle to determine what subvol/snapshot the inode belongs to when
the handle is passed back to it (e.g. from open_by_handle_at()) then
nothing else needs to care how it is encoded.

So can someone please explain to me why we need to try to re-invent
a generic filehandle concept in statx when we already have a
have working and widely supported user API that provides exactly
this functionality?

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: About the conflict between XFS inode recycle and VFS rcu-walk

2023-12-07 Thread Dave Chinner
On Tue, Dec 05, 2023 at 07:38:33PM +0800, alexjlzh...@gmail.com wrote:
> Hi, all
> 
> I would like to ask if the conflict between xfs inode recycle and vfs rcu-walk
> which can lead to null pointer references has been resolved?
> 
> I browsed through emails about the following patches and their discussions:
> - 
> https://lore.kernel.org/linux-xfs/20220217172518.3842951-2-bfos...@redhat.com/
> - 
> https://lore.kernel.org/linux-xfs/20220121142454.1994916-1-bfos...@redhat.com/
> - 
> https://lore.kernel.org/linux-xfs/164180589176.86426.501271559065590169.st...@mickey.themaw.net/
> 
> And then came to the conclusion that this problem has not been solved, am I
> right? Did I miss some patch that could solve this problem?

We fixed the known problems this caused by turning off the VFS
functionality that the rcu pathwalks kept tripping over. See commit
7b7820b83f23 ("xfs: don't expose internal symlink metadata buffers to
the vfs").

Apart from that issue, I'm not aware of any other issues that the
XFS inode recycling directly exposes.

> According to my understanding, the essence of this problem is that XFS reuses
> the inode evicted by VFS, but VFS rcu-walk assumes that this will not happen.

It assumes that the inode will not change identity during the RCU
grace period after the inode has been evicted from cache. We can
safely reinstantiate an evicted inode without waiting for an RCU
grace period as long as it is the same inode with the same content
and same state.

Problems *may* arise when we unlink the inode, then evict it, then a
new file is created and the old slab cache memory address is used
for the new inode. I describe the issue here:

https://lore.kernel.org/linux-xfs/20220118232547.gd59...@dread.disaster.area/

That said, we have exactly zero evidence that this is actually a
problem in production systems. We did get systems tripping over the
symlink issue, but there's no evidence that the
unlink->close->open(O_CREAT) issues are manifesting in the wild and
hence there hasn't been any particular urgency to address it.

> Are there any recommended workarounds until an elegant and efficient solution
> can be proposed? After all, causing a crash is extremely unacceptable in a
> production environment.

What crashes are you seeing in your production environment?

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [PATCH 04/11] lib/dlock-list: Make sibling CPUs share the same linked list

2023-12-06 Thread Dave Chinner
On Thu, Dec 07, 2023 at 12:42:59AM -0500, Kent Overstreet wrote:
> On Wed, Dec 06, 2023 at 05:05:33PM +1100, Dave Chinner wrote:
> > From: Waiman Long 
> > 
> > The dlock list needs one list for each of the CPUs available. However,
> > for sibling CPUs, they are sharing the L2 and probably L1 caches
> > too. As a result, there is not much to gain in term of avoiding
> > cacheline contention while increasing the cacheline footprint of the
> > L1/L2 caches as separate lists may need to be in the cache.
> > 
> > This patch makes all the sibling CPUs share the same list, thus
> > reducing the number of lists that need to be maintained in each
> > dlock list without having any noticeable impact on performance. It
> > also improves dlock list iteration performance as fewer lists need
> > to be iterated.
> 
> Seems Waiman was missed on the CC

Oops, I knew I missed someone important

> it looks like there's some duplication of this with list_lru
> functionality - similar list-sharded-by-node idea.

For completely different reasons. The list_lru is aligned to the mm
zone architecture which only partitions memory management accounting
and scanning actions down into NUMA nodes. It's also a per-node
ordered list (LRU), and it has intricate locking semantics that
expose internal list locks to external isolation functions that can
be called whilst a lock protected traversal is in progress.

Further, we have to consider that list-lru is tightly tied to
memcgs.  For a single NUMA- and memcg-aware list-lru, there is
actually nr_memcgs * nr_nodes LRUs per list. The memory footprint of
a superblock list_lru gets quite gigantic when we start talking
about machines with hundreds of nodes running tens of thousands of
containers each with tens of superblocks.

That's the biggest problem with using a more memory expensive
structure for the list_lru - we're talking gigabytes of memory just
for the superblock shrinker tracking structure overhead on large
machines. This was one of the reasons why we haven't tried to make
list_lrus any more fine-grained than they absolutely need to be to
provide acceptable scalability.
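
To put a very rough number on that (back of the envelope, assuming ~32
bytes for each per-node, per-memcg list): 256 nodes * 10,000 memcgs *
32 superblocks * 2 list_lrus per superblock (dentry + inode) * 32 bytes
is already over 5GB of list heads before a single object has been added
to any LRU.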

> list_lru does the sharding by page_to_nid() of the item, which
> saves a pointer and allows just using a list_head in the item.
> OTOH, it's less granular than what dlock-list is doing?

Sure, but there's a lot more to list_lrus than it being a "per-node
list".  OTOH, dlock_list is really nothing more than a "per-cpu
list"

> I think some attempt ought to be made to factor out the common
> ideas hear; perhaps reworking list_lru to use this thing, and I
> hope someone has looked at the page_nid idea vs. dlock_list using
> the current core.

I certainly have, and I haven't been able to justify the additional
memory footprint of a dlock_list over the existing per-node lists.
That may change given that XFS appears to be on the threshold of
per-node list-lru lock breakdown at 64 threads, but there's a lot
more to consider from a system perspective here than just
inode/dentry cache scalability

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [PATCH 08/11] vfs: inode cache conversion to hash-bl

2023-12-06 Thread Dave Chinner
On Wed, Dec 06, 2023 at 11:58:44PM -0500, Kent Overstreet wrote:
> On Wed, Dec 06, 2023 at 05:05:37PM +1100, Dave Chinner wrote:
> > From: Dave Chinner 
> > 
> > Scalability of the global inode_hash_lock really sucks for
> > filesystems that use the vfs inode cache (i.e. everything but XFS).
> 
> Ages ago, we talked about (and I attempted, but ended up swearing at
> inode lifetime rules) - conversion to rhashtable instead, which I still
> believe would be preferable since that code is fully lockless (and
> resizeable, of course). But it turned out to be a much bigger project...

I don't think that the size of the hash table is a big issue at the
moment. We already have RCU lookups for the inode cache
(find_inode_rcu() and find_inode_by_ino_rcu()) even before this
patchset, so we don't need rhashtable for that.

We still have to prevent duplicate inodes from being added to the cache
due to racing inserts, so I think we still need some form of
serialisation on the "lookup miss+insert" side. I've not thought
about it further than that - the hash-bl removes the existing
VFS contention points and the limitations move to
filesystem internal algorithms once again.

So until the filesystems can scale to much larger thread counts and
put the pressure back on the VFS inode cache scalability, I
don't see any need to try to do anything more complex or smarter...

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [PATCH 03/11] vfs: Use dlock list for superblock's inode list

2023-12-06 Thread Dave Chinner
On Thu, Dec 07, 2023 at 02:40:24AM +, Al Viro wrote:
> On Wed, Dec 06, 2023 at 05:05:32PM +1100, Dave Chinner wrote:
> 
> > @@ -303,6 +303,7 @@ static void destroy_unused_super(struct super_block *s)
> > super_unlock_excl(s);
> > 	list_lru_destroy(&s->s_dentry_lru);
> > 	list_lru_destroy(&s->s_inode_lru);
> > +	free_dlock_list_heads(&s->s_inodes);
> > security_sb_free(s);
> > put_user_ns(s->s_user_ns);
> > kfree(s->s_subtype);
> 
> Umm...  Who's going to do that on normal umount?

Huh. So neither KASAN nor kmemleak has told me that s->s_inodes was
being leaked.  I'm guessing a rebase sometime in the past silently
dropped a critical hunk from deactivate_locked_super() in the bit
bucket, but as nothing since whenever that happened has failed or
flagged a memory leak I didn't notice.

Such great tools we have..

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [PATCH 10/11] list_bl: don't use bit locks for PREEMPT_RT or lockdep

2023-12-06 Thread Dave Chinner
On Wed, Dec 06, 2023 at 11:16:50PM -0500, Kent Overstreet wrote:
> On Wed, Dec 06, 2023 at 05:05:39PM +1100, Dave Chinner wrote:
> > From: Dave Chinner 
> > 
> > hash-bl nests spinlocks inside the bit locks. This causes problems
> > for CONFIG_PREEMPT_RT which converts spin locks to sleeping locks,
> > and we're not allowed to sleep while holding a spinning lock.
> > 
> > Further, lockdep does not support bit locks, so we lose lockdep
> > coverage of the inode hash table with the hash-bl conversion.
> > 
> > To enable these configs to work, add an external per-chain spinlock
> > to the hlist_bl_head() and add helpers to use this instead of the
> > bit spinlock when preempt_rt or lockdep are enabled.
> > 
> > This converts all users of hlist-bl to use the external spinlock in
> > these situations, so we also gain lockdep coverage of things like
> > the dentry cache hash table with this change.
> > 
> > Signed-off-by: Dave Chinner 
> 
> Sleepable bit locks can be done with wait_on_bit(), is that worth
> considering for PREEMPT_RT? Or are the other features of real locks
> important there?

I think wait_on_bit() is not scalable. It hashes down to one of 256
shared struct wait_queue_heads which have thundering herd
behaviours, and it requires the locker to always run
prepare_to_wait() and finish_wait(). This means there is at least
one spinlock_irqsave()/unlock pair needed, sometimes two, just to
get an uncontended sleeping bit lock.

So as a fast path operation that requires lock scalability, it's
going to be better to use a straight spinlock that doesn't require
irq safety as it's far less expensive than a sleeping bit lock.

Whether CONFIG_PREEMPT_RT changes that equation at all is not at all
clear to me, and so I'll leave that consideration to RT people if
they see a need to address it. In the mean time, we need to use an
external spinlock for lockdep validation so it really doesn't make
any sense at all to add a third locking variant with completely
different semantics just for PREEMPT_RT...

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [PATCH 05/11] selinux: use dlist for isec inode list

2023-12-06 Thread Dave Chinner
On Wed, Dec 06, 2023 at 04:52:42PM -0500, Paul Moore wrote:
> On Wed, Dec 6, 2023 at 1:07 AM Dave Chinner  wrote:
> >
> > From: Dave Chinner 
> >
> > Because it's a horrible point of lock contention under heavily
> > concurrent directory traversals...
> >
> >   - 12.14% d_instantiate
> >  - 12.06% security_d_instantiate
> > - 12.13% selinux_d_instantiate
> >- 12.16% inode_doinit_with_dentry
> >   - 15.45% _raw_spin_lock
> >  - do_raw_spin_lock
> >       14.68% __pv_queued_spin_lock_slowpath
> >
> >
> > Signed-off-by: Dave Chinner 
> > ---
> >  include/linux/dlock-list.h|  9 
> >  security/selinux/hooks.c  | 72 +++
> >  security/selinux/include/objsec.h |  6 +--
> >  3 files changed, 47 insertions(+), 40 deletions(-)
> 
> In the cover letter you talk about testing, but I didn't see any
> mention of testing with SELinux enabled.  Given the lock contention
> stats in the description above I'm going to assume you did test this
> and pass along my ACK, but if you haven't tested the changes below
> please do before sending this anywhere important.

AFAIA, I've been testing with selinux enabled - I'm trying to run
these tests in an environment as close to typical production systems
as possible and that means selinux needs to be enabled.

As such, all the fstests and perf testing has been done with selinux
in permissive mode using "-o context=system_u:object_r:root_t:s0" as
the default context for the mount.

I see this sort of thing in the profiles:

- 87.13% path_lookupat
   - 86.46% walk_component
  - 84.20% lookup_slow
 - 84.05% __lookup_slow
- 80.81% xfs_vn_lookup
   - 77.84% xfs_lookup

   - 2.91% d_splice_alias
  - 1.52% security_d_instantiate
 - 1.50% selinux_d_instantiate
- 1.47% inode_doinit_with_dentry
   - 0.83% inode_doinit_use_xattr
0.52% __vfs_getxattr

Which tells me that selinux is definitely doing -something- on every
inode being instantiated, so I'm pretty sure the security and
selinux paths are getting exercised...

> Acked-by: Paul Moore 

Thanks!

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



[PATCH 03/11] vfs: Use dlock list for superblock's inode list

2023-12-05 Thread Dave Chinner
From: Waiman Long 

[dchinner: original commit message preserved]

When many threads are trying to add or delete inode to or from
a superblock's s_inodes list, spinlock contention on the list can
become a performance bottleneck.

This patch changes the s_inodes field to become a dlock list which
is a distributed set of lists with per-list spinlocks.  As a result,
the following superblock inode list (sb->s_inodes) iteration functions
in vfs are also being modified:

 1. iterate_bdevs()
 2. drop_pagecache_sb()
 3. evict_inodes()
 4. invalidate_inodes()
 5. fsnotify_unmount_inodes()
 6. add_dquot_ref()
 7. remove_dquot_ref()

With an exit microbenchmark that creates a large number of threads,
attaches many inodes to them in procfs and then exits. The runtimes of
that microbenchmark with various number of threads before and after
the patch on a 4-socket Intel E7-8867 v3 system (64 cores, 128 threads)
on a 4.19-rc3 based kernel were as follows:

  # of threads  Elapsed/Sys Time     Elapsed/Sys Time    Speedup
                Unpatched Kernel     Patched Kernel
  ------------  -----------------    ----------------    -------
      1000       59.17s/123m09.8s    18.90s/24m44.5s      3.13
      1200       73.20s/151m24.1s    27.54s/50m05.3s      2.66
      1400      102.04s/212m00.9s    36.75s/68m26.7s      2.78
      1600      131.13s/272m52.4s    50.16s/94m23.7s      2.61

[dchinner: forward port, add new inode list traversals, etc]
[dchinner: scalability results on current TOT XFS]

With 400k inodes per thread concurrent directory traversal workload,
scalability improves at >=16 threads on 6.7-rc4 on XFS. We only test
XFS here as it is the only filesystem that demonstrates sufficient
internal scalability for the superblock inode list to be a
scalability bottleneck.

Table contains test runtime in seconds; perfect scalability is
demonstrated by the runtime staying constant as thread count goes up.

Threads   6.4-rc7   patched
-------   -------   -------
   2      11.673    11.158
   4       9.665     9.444
   8      10.622     9.275
  16      12.148     9.508
  32      20.518    10.308

Unpatched kernel profile at 32 threads:

- 95.45% vfs_fstatat
  - 95.00% vfs_statx
 - 91.00% filename_lookup
- 90.90% path_lookupat
   - 90.40% walk_component
  - 89.05% lookup_slow
 - 88.95% __lookup_slow
- 86.38% xfs_vn_lookup
   - 84.05% xfs_lookup
  - 78.82% xfs_iget
 - 72.58% xfs_setup_inode
- 72.54% inode_sb_list_add
   - 71.12% _raw_spin_lock
  - 71.09% do_raw_spin_lock
 - 68.85% __pv_queued_spin_lock_slowpath

Patched kernel profile at 32 threads - the biggest single point of
contention is now the dentry cache LRU via dput():

-   21.59% 0.25%  [kernel]  [k] dput
   - 21.34% dput
  - 19.93% retain_dentry
 - d_lru_add
- 19.82% list_lru_add
   - 14.62% _raw_spin_lock
  - 14.47% do_raw_spin_lock
   10.89% __pv_queued_spin_lock_slowpath
 1.78% __list_add_valid_or_report
   - 0.81% _raw_spin_unlock
  - do_raw_spin_unlock
   0.77% __raw_callee_save___pv_queued_spin_unlock
  - 0.79% _raw_spin_unlock
 - 0.78% do_raw_spin_unlock
  0.67% __raw_callee_save___pv_queued_spin_unlock

Signed-off-by: Waiman Long 
Signed-off-by: Dave Chinner 
---
 block/bdev.c   | 24 
 fs/drop_caches.c   |  9 -
 fs/gfs2/ops_fstype.c   | 21 +++--
 fs/inode.c | 37 -
 fs/notify/fsnotify.c   | 12 ++--
 fs/quota/dquot.c   | 22 ++
 fs/super.c | 13 +++--
 include/linux/fs.h |  8 
 security/landlock/fs.c | 25 ++---
 9 files changed, 68 insertions(+), 103 deletions(-)

diff --git a/block/bdev.c b/block/bdev.c
index 750aec178b6a..07135fd6fda4 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -437,11 +437,11 @@ long nr_blockdev_pages(void)
 {
struct inode *inode;
long ret = 0;
+   DEFINE_DLOCK_LIST_ITER(iter, &blockdev_superblock->s_inodes);
 
-   spin_lock(&blockdev_superblock->s_inode_list_lock);
-   list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
+   dlist_for_each_entry(inode, &iter, i_sb_list) {
ret += inode->i_mapping->nrpages;
-   spin_unlock(&blockdev_superblock->s_inode_list_lock);
+   }
 
return ret;
 }
@@ -1032,9 +1032,9 @@ EXPORT_SYMBOL_GPL(bdev_mark_dead);
 void sync_bdevs(bool wait)
 {
struct inode *inode, *old_inode = NULL;
+   DEFINE_DLOCK_LIST_ITER(ite

[PATCH 10/11] list_bl: don't use bit locks for PREEMPT_RT or lockdep

2023-12-05 Thread Dave Chinner
From: Dave Chinner 

hash-bl nests spinlocks inside the bit locks. This causes problems
for CONFIG_PREEMPT_RT which converts spin locks to sleeping locks,
and we're not allowed to sleep while holding a spinning lock.

Further, lockdep does not support bit locks, so we lose lockdep
coverage of the inode hash table with the hash-bl conversion.

To enable these configs to work, add an external per-chain spinlock
to the hlist_bl_head() and add helpers to use this instead of the
bit spinlock when preempt_rt or lockdep are enabled.

This converts all users of hlist-bl to use the external spinlock in
these situations, so we also gain lockdep coverage of things like
the dentry cache hash table with this change.
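
Callers are unchanged either way. A minimal usage sketch (my_obj,
hash_node and the bucket variable are illustrative names, not from the
patch) that behaves the same whether the chain lock is the bit lock or
the dedicated spinlock:

        struct hlist_bl_head bucket;    /* must be INIT_HLIST_BL_HEAD()ed */
        struct hlist_bl_node *pos;
        struct my_obj *obj;

        hlist_bl_lock(&bucket);
        hlist_bl_for_each_entry(obj, pos, &bucket, hash_node) {
                /* lookup/match under the per-chain lock */
        }
        hlist_bl_unlock(&bucket);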

Signed-off-by: Dave Chinner 
---
 include/linux/list_bl.h| 126 -
 include/linux/rculist_bl.h |  13 
 2 files changed, 110 insertions(+), 29 deletions(-)

diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
index 8ee2bf5af131..990ad8e24e0b 100644
--- a/include/linux/list_bl.h
+++ b/include/linux/list_bl.h
@@ -4,14 +4,27 @@
 
 #include 
 #include 
+#include 
 
 /*
  * Special version of lists, where head of the list has a lock in the lowest
  * bit. This is useful for scalable hash tables without increasing memory
  * footprint overhead.
  *
- * For modification operations, the 0 bit of hlist_bl_head->first
- * pointer must be set.
+ * Whilst the general use of bit spin locking is considered safe, PREEMPT_RT
+ * introduces a problem with nesting spin locks inside bit locks: spin locks
+ * become sleeping locks, and we can't sleep inside spinning locks such as bit
+ * locks. However, for RTPREEMPT, performance is less of an issue than
+ * correctness, so we trade off the memory and cache footprint of a spinlock per
+ * list so the list locks are converted to sleeping locks and work correctly
+ * with PREEMPT_RT kernels.
+ *
+ * An added advantage of this is that we can use the same trick when lockdep is
+ * enabled (again, performance doesn't matter) and gain lockdep coverage of all
+ * the hash-bl operations.
+ *
+ * For modification operations when using pure bit locking, the 0 bit of
+ * hlist_bl_head->first pointer must be set.
  *
  * With some small modifications, this can easily be adapted to store several
  * arbitrary bits (not just a single lock bit), if the need arises to store
@@ -30,16 +43,21 @@
 #define LIST_BL_BUG_ON(x)
 #endif
 
+#undef LIST_BL_USE_SPINLOCKS
+#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_LOCKDEP)
+#define LIST_BL_USE_SPINLOCKS  1
+#endif
 
 struct hlist_bl_head {
struct hlist_bl_node *first;
+#ifdef LIST_BL_USE_SPINLOCKS
+   spinlock_t lock;
+#endif
 };
 
 struct hlist_bl_node {
struct hlist_bl_node *next, **pprev;
 };
-#define INIT_HLIST_BL_HEAD(ptr) \
-   ((ptr)->first = NULL)
 
 static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
 {
@@ -54,6 +72,69 @@ static inline bool  hlist_bl_unhashed(const struct hlist_bl_node *h)
return !h->pprev;
 }
 
+#ifdef LIST_BL_USE_SPINLOCKS
+#define INIT_HLIST_BL_HEAD(ptr) do { \
+   (ptr)->first = NULL; \
+   spin_lock_init(&(ptr)->lock); \
+} while (0)
+
+static inline void hlist_bl_lock(struct hlist_bl_head *b)
+{
+   spin_lock(&b->lock);
+}
+
+static inline void hlist_bl_unlock(struct hlist_bl_head *b)
+{
+   spin_unlock(&b->lock);
+}
+
+static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
+{
+   return spin_is_locked(&b->lock);
+}
+
+static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h)
+{
+   return h->first;
+}
+
+static inline void hlist_bl_set_first(struct hlist_bl_head *h,
+   struct hlist_bl_node *n)
+{
+   h->first = n;
+}
+
+static inline void hlist_bl_set_before(struct hlist_bl_node **pprev,
+   struct hlist_bl_node *n)
+{
+   WRITE_ONCE(*pprev, n);
+}
+
+static inline bool hlist_bl_empty(const struct hlist_bl_head *h)
+{
+   return !READ_ONCE(h->first);
+}
+
+#else /* !LIST_BL_USE_SPINLOCKS */
+
+#define INIT_HLIST_BL_HEAD(ptr) \
+   ((ptr)->first = NULL)
+
+static inline void hlist_bl_lock(struct hlist_bl_head *b)
+{
+   bit_spin_lock(0, (unsigned long *)b);
+}
+
+static inline void hlist_bl_unlock(struct hlist_bl_head *b)
+{
+   __bit_spin_unlock(0, (unsigned long *)b);
+}
+
+static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
+{
+   return bit_spin_is_locked(0, (unsigned long *)b);
+}
+
 static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h)
 {
return (struct hlist_bl_node *)
@@ -69,11 +150,21 @@ static inline void hlist_bl_set_first(struct hlist_bl_head *h,
h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK);
 }
 
+static inline void hlist_bl_set_before(struct hlist_bl_node **pprev,
+   struct hlist_bl_node *n)
+{
+  

[PATCH 08/11] vfs: inode cache conversion to hash-bl

2023-12-05 Thread Dave Chinner
From: Dave Chinner 

Scalability of the global inode_hash_lock really sucks for
filesystems that use the vfs inode cache (i.e. everything but XFS).

Profiles of a 32-way concurrent sharded directory walk (no contended
directories) on a couple of different filesystems. All numbers from
a 6.7-rc4 kernel.

Bcachefs:

  - 98.78% vfs_statx
 - 97.74% filename_lookup
- 97.70% path_lookupat
   - 97.54% walk_component
  - 97.06% lookup_slow
 - 97.03% __lookup_slow
- 96.21% bch2_lookup
   - 91.87% bch2_vfs_inode_get
  - 84.10% iget5_locked
 - 44.09% ilookup5
- 43.50% _raw_spin_lock
   - 43.49% do_raw_spin_lock
42.75% __pv_queued_spin_lock_slowpath
 - 39.06% inode_insert5
- 38.46% _raw_spin_lock
   - 38.46% do_raw_spin_lock
37.51% __pv_queued_spin_lock_slowpath

ext4:

  - 93.75% vfs_statx
 - 92.39% filename_lookup
- 92.34% path_lookupat
   - 92.09% walk_component
  - 91.48% lookup_slow
 - 91.43% __lookup_slow
- 90.18% ext4_lookup
   - 84.84% __ext4_iget
  - 83.67% iget_locked
 - 81.24% _raw_spin_lock
- 81.23% do_raw_spin_lock
   - 78.90% __pv_queued_spin_lock_slowpath


Both bcachefs and ext4 demonstrate poor scaling at >=8 threads on
concurrent lookup or create workloads.

Hence convert the inode hash table to a RCU-aware hash-bl table just
like the dentry cache. Note that we need to store a pointer to the
hlist_bl_head the inode has been added to in the inode so that when
it comes to unhash the inode we know what list to lock. We need to
do this because, unlike the dentry cache, the hash value that is
used to hash the inode is not generated from the inode itself. i.e.
filesystems can provide this themselves so we have to either store
the hashval or the hlist head pointer in the inode to be able to
find the right list head for removal...
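
In rough pseudo-C, the unhash side then becomes something like the
following sketch (i_hash_head is an illustrative name for the cached
chain pointer, and it assumes the inode's hash link has been converted
to a struct hlist_bl_node):

        static void remove_inode_hash_sketch(struct inode *inode)
        {
                struct hlist_bl_head *b = inode->i_hash_head;

                hlist_bl_lock(b);
                hlist_bl_del_init(&inode->i_hash);
                inode->i_hash_head = NULL;
                hlist_bl_unlock(b);
        }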

Concurrent walk of 400k files per thread with varying thread count
in seconds is as follows. Perfect scaling is an unchanged walk time
as thread count increases.

                  ext4                  bcachefs
threads    vanilla   patched    vanilla   patched
      2      7.923     7.358      8.003     7.276
      4      8.152     7.530      9.097     8.506
      8     13.090     7.871     11.752    10.015
     16     24.602     9.540     24.614    13.989
     32     49.536    19.314     49.179    25.982

The big wins here are at >= 8 threads, with both filesystems now
being limited by internal filesystem algorithms, not the VFS inode
cache scalability.

Ext4 contention moves to the buffer cache on directory block
lookups:

-   66.45% 0.44%  [kernel]  [k] __ext4_read_dirblock
   - 66.01% __ext4_read_dirblock
  - 66.01% ext4_bread
 - ext4_getblk
- 64.77% bdev_getblk
   - 64.69% __find_get_block
  - 63.01% _raw_spin_lock
 - 62.96% do_raw_spin_lock
  59.21% __pv_queued_spin_lock_slowpath

bcachefs contention moves to internal btree traversal locks.

 - 95.37% __lookup_slow
- 93.95% bch2_lookup
   - 82.57% bch2_vfs_inode_get
  - 65.44% bch2_inode_find_by_inum_trans
 - 65.41% bch2_inode_peek_nowarn
- 64.60% bch2_btree_iter_peek_slot
   - 64.55% bch2_btree_path_traverse_one
  - bch2_btree_path_traverse_cached
 - 63.02% bch2_btree_path_traverse_cached_slowpath
- 56.60% mutex_lock
   - 55.29% __mutex_lock_slowpath
  - 55.25% __mutex_lock
   50.29% osq_lock
   1.84% 
__raw_callee_save___kvm_vcpu_is_preempted
   0.54% mutex_spin_on_owner

Signed-off-by: Dave Chinner 
---
 fs/inode.c | 200 -
 include/linux/fs.h |   9 +-
 2 files changed, 132 insertions(+), 77 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index fead81550cf4..3eb9c4e5b279 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -56,8 +56,7 @@
 
 static unsigned int i_hash_mask __ro_after_init;
 static unsigned int i_hash_shift __ro_after_init;
-static struct hlist_head *inode_hashtable __ro_after_init;
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
+static struct hlist_bl_head *inode_hashtable __ro_after_init;
 
 static unsigned long hash(

[PATCH 01/11] lib/dlock-list: Distributed and lock-protected lists

2023-12-05 Thread Dave Chinner
From: Waiman Long 

Linked list is used everywhere in the Linux kernel. However, if many
threads are trying to add or delete entries into the same linked list,
it can create a performance bottleneck.

This patch introduces a new list APIs that provide a set of distributed
lists (one per CPU), each of which is protected by its own spinlock.
To the callers, however, the set of lists acts like a single
consolidated list.  This allows list entries insertion and deletion
operations to happen in parallel instead of being serialized with a
global list and lock.

List entry insertion is strictly per cpu. List deletion, however, can
happen in a cpu other than the one that did the insertion. So we still
need lock to protect the list. Because of that, there may still be
a small amount of contention when deletion is being done.

A new header file include/linux/dlock-list.h will be added with the
associated dlock_list_head and dlock_list_node structures. The following
functions are provided to manage the per-cpu list:

 1. int alloc_dlock_list_heads(struct dlock_list_heads *dlist)
 2. void free_dlock_list_heads(struct dlock_list_heads *dlist)
 3. void dlock_list_add(struct dlock_list_node *node,
struct dlock_list_heads *dlist)
 4. void dlock_list_del(struct dlock_list *node)

Iteration of all the list entries within a dlock list array
is done by calling either the dlist_for_each_entry() or
dlist_for_each_entry_safe() macros. They correspond to the
list_for_each_entry() and list_for_each_entry_safe() macros
respectively. The iteration states are kept in a dlock_list_iter
structure that is passed to the iteration macros.
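
A minimal usage sketch of the API described above (my_obj and its node
member are illustrative names; the add/del helpers are spelled
dlock_lists_add()/dlock_lists_del() in the code later in the series):

        struct dlock_list_heads objs;
        struct my_obj *obj;

        if (alloc_dlock_list_heads(&objs))
                return -ENOMEM;

        init_dlock_list_node(&obj->node);
        dlock_lists_add(&obj->node, &objs);     /* insert on this CPU's list */
        dlock_lists_del(&obj->node);            /* delete from any CPU */

        /* walk the whole distributed list as if it were one list */
        DEFINE_DLOCK_LIST_ITER(iter, &objs);
        dlist_for_each_entry(obj, &iter, node) {
                /* the sublist lock for the current entry is held here */
        }

        free_dlock_list_heads(&objs);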

Signed-off-by: Waiman Long 
Reviewed-by: Jan Kara 
---
 include/linux/dlock-list.h | 242 +
 lib/Makefile   |   2 +-
 lib/dlock-list.c   | 234 +++
 3 files changed, 477 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/dlock-list.h
 create mode 100644 lib/dlock-list.c

diff --git a/include/linux/dlock-list.h b/include/linux/dlock-list.h
new file mode 100644
index ..327cb9edc7e3
--- /dev/null
+++ b/include/linux/dlock-list.h
@@ -0,0 +1,242 @@
+/*
+ * Distributed and locked list
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2016 Hewlett-Packard Enterprise Development LP
+ * (C) Copyright 2017-2018 Red Hat, Inc.
+ *
+ * Authors: Waiman Long 
+ */
+#ifndef __LINUX_DLOCK_LIST_H
+#define __LINUX_DLOCK_LIST_H
+
+#include 
+#include 
+
+/*
+ * include/linux/dlock-list.h
+ *
+ * The dlock_list_head structure contains the spinlock. It is cacheline
+ * aligned to reduce contention among different CPUs. The other
+ * dlock_list_node structures contains a pointer to the head entry instead.
+ */
+struct dlock_list_head {
+   struct list_head list;
+   spinlock_t lock;
+} cacheline_aligned_in_smp;
+
+struct dlock_list_heads {
+   struct dlock_list_head *heads;
+};
+
+/*
+ * dlock list node data structure
+ */
+struct dlock_list_node {
+   struct list_head list;
+   struct dlock_list_head *head;
+};
+
+/*
+ * dlock list iteration state
+ *
+ * This is an opaque data structure that may change. Users of this structure
+ * should not access the structure members directly other than using the
+ * helper functions and macros provided in this header file.
+ */
+struct dlock_list_iter {
+   int index;
+   struct dlock_list_head *head, *entry;
+};
+
+#define DLOCK_LIST_ITER_INIT(dlist)\
+   {   \
+   .index = -1,\
+   .head = (dlist)->heads, \
+   }
+
+#define DEFINE_DLOCK_LIST_ITER(s, heads)   \
+   struct dlock_list_iter s = DLOCK_LIST_ITER_INIT(heads)
+
+static inline void init_dlock_list_iter(struct dlock_list_iter *iter,
+   struct dlock_list_heads *heads)
+{
+   *iter = (struct dlock_list_iter)DLOCK_LIST_ITER_INIT(heads);
+}
+
+#define DLOCK_LIST_NODE_INIT(name) \
+   {   \
+   .list = LIST_HEAD_INIT(name)\
+   }
+
+static inline void init_dlock_list_node(struct dlock_list_node *node)
+{
+   *node = (struct dlock_list_node)DLOCK_LIST_NODE_INIT(node->list);
+}
+
+/**
+ * dlock_list_unlock - unlock the spinlock that protects the current list
+ * @iter: Pointer to the dlock list iterator structure
+ */
+static inline void dlock_list_unlock(struct 

[PATCH 04/11] lib/dlock-list: Make sibling CPUs share the same linked list

2023-12-05 Thread Dave Chinner
From: Waiman Long 

The dlock list needs one list for each of the CPUs available. However,
for sibling CPUs, they are sharing the L2 and probably L1 caches
too. As a result, there is not much to gain in term of avoiding
cacheline contention while increasing the cacheline footprint of the
L1/L2 caches as separate lists may need to be in the cache.

This patch makes all the sibling CPUs share the same list, thus
reducing the number of lists that need to be maintained in each
dlock list without having any noticeable impact on performance. It
also improves dlock list iteration performance as fewer lists need
to be iterated.
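
As a purely hypothetical illustration (not from the patch): on a
4-core/8-thread box where CPU N and CPU N+4 are SMT siblings, the
mapping built by cpu2idx_init() collapses eight per-CPU lists to four:

        CPU:      0  1  2  3  4  5  6  7
        cpu2idx:  0  1  2  3  0  1  2  3
        nr_dlock_lists = 4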

Signed-off-by: Waiman Long 
Reviewed-by: Jan Kara 
---
 lib/dlock-list.c | 74 ++--
 1 file changed, 59 insertions(+), 15 deletions(-)

diff --git a/lib/dlock-list.c b/lib/dlock-list.c
index f64ea4cc5e79..e2860944ec9f 100644
--- a/lib/dlock-list.c
+++ b/lib/dlock-list.c
@@ -25,31 +25,65 @@
  * The distributed and locked list is a distributed set of lists each of
  * which is protected by its own spinlock, but acts like a single
  * consolidated list to the callers. For scaling purpose, the number of
- * lists used is equal to the number of possible CPUs in the system to
- * minimize contention.
+ * lists used is equal to the number of possible cores in the system to
+ * minimize contention. All threads of the same CPU core will share the
+ * same list.
  *
- * However, it is possible that individual CPU numbers may be equal to
- * or greater than the number of possible CPUs when there are holes in
- * the CPU number list. As a result, we need to map the CPU number to a
- * list index.
+ * We need to map each CPU number to a list index.
  */
 static DEFINE_PER_CPU_READ_MOSTLY(int, cpu2idx);
+static int nr_dlock_lists __read_mostly;
 
 /*
- * Initialize cpu2idx mapping table
+ * Initialize cpu2idx mapping table & nr_dlock_lists.
  *
  * It is possible that a dlock-list can be allocated before the cpu2idx is
  * initialized. In this case, all the cpus are mapped to the first entry
  * before initialization.
  *
+ * All the sibling CPUs of a sibling group will map to the same dlock list so
+ * as to reduce the number of dlock lists to be maintained while minimizing
+ * cacheline contention.
+ *
+ * As the sibling masks are set up in the core initcall phase, this function
+ * has to be done in the postcore phase to get the right data.
  */
 static int __init cpu2idx_init(void)
 {
int idx, cpu;
+   struct cpumask *sibling_mask;
+   static struct cpumask mask __initdata;
 
+   cpumask_clear(&mask);
idx = 0;
-   for_each_possible_cpu(cpu)
-   per_cpu(cpu2idx, cpu) = idx++;
+   for_each_possible_cpu(cpu) {
+   int scpu;
+
+   if (cpumask_test_cpu(cpu, &mask))
+   continue;
+   per_cpu(cpu2idx, cpu) = idx;
+   cpumask_set_cpu(cpu, &mask);
+
+   sibling_mask = topology_sibling_cpumask(cpu);
+   if (sibling_mask) {
+   for_each_cpu(scpu, sibling_mask) {
+   per_cpu(cpu2idx, scpu) = idx;
+   cpumask_set_cpu(scpu, &mask);
+   }
+   }
+   idx++;
+   }
+
+   /*
+* nr_dlock_lists can only be set after cpu2idx is properly
+* initialized.
+*/
+   smp_mb();
+   nr_dlock_lists = idx;
+   WARN_ON(nr_dlock_lists > nr_cpu_ids);
+
+   pr_info("dlock-list: %d head entries per dlock list.\n",
+   nr_dlock_lists);
return 0;
 }
 postcore_initcall(cpu2idx_init);
@@ -67,19 +101,23 @@ postcore_initcall(cpu2idx_init);
  *
  * Dynamically allocated locks need to have their own special lock class
  * to avoid lockdep warning.
+ *
+ * Since nr_dlock_lists will always be <= nr_cpu_ids, having more lists
+ * than necessary allocated is not a problem other than some wasted memory.
+ * The extra lists will not be ever used as all the cpu2idx entries will be
+ * 0 before initialization.
  */
 int __alloc_dlock_list_heads(struct dlock_list_heads *dlist,
 struct lock_class_key *key)
 {
-   int idx;
+   int idx, cnt = nr_dlock_lists ? nr_dlock_lists : nr_cpu_ids;
 
-   dlist->heads = kcalloc(nr_cpu_ids, sizeof(struct dlock_list_head),
-  GFP_KERNEL);
+   dlist->heads = kcalloc(cnt, sizeof(struct dlock_list_head), GFP_KERNEL);
 
if (!dlist->heads)
return -ENOMEM;
 
-   for (idx = 0; idx < nr_cpu_ids; idx++) {
+   for (idx = 0; idx < cnt; idx++) {
struct dlock_list_head *head = &dlist->heads[idx];
 
INIT_LIST_HEAD(&head->list);
@@ -117,7 +155,10 @@ bool dlock_lists_empty(struct dlock_list_heads *dlist)
 {
int idx;
 
-   for (idx = 0; idx < nr_cpu_ids; idx++)
+   /* Shouldn't be called before nr_dlock_lists is initialized */
+   

[PATCH 11/11] hlist-bl: introduced nested locking for dm-snap

2023-12-05 Thread Dave Chinner
From: Dave Chinner 

Testing with lockdep enabled threw this warning from generic/081 in
fstests:

[ 2369.724151] 
[ 2369.725805] WARNING: possible recursive locking detected
[ 2369.727125] 6.7.0-rc2-dgc+ #1952 Not tainted
[ 2369.728647] 
[ 2369.730197] systemd-udevd/389493 is trying to acquire lock:
[ 2369.732378] 888116a1a320 (&(et->table + i)->lock){+.+.}-{2:2}, at: 
snapshot_map+0x13e/0x7f0
[ 2369.736197]
   but task is already holding lock:
[ 2369.738657] 8881098a4fd0 (&(et->table + i)->lock){+.+.}-{2:2}, at: 
snapshot_map+0x136/0x7f0
[ 2369.742118]
   other info that might help us debug this:
[ 2369.744403]  Possible unsafe locking scenario:

[ 2369.746814]CPU0
[ 2369.747675]
[ 2369.748496]   lock(&(et->table + i)->lock);
[ 2369.749877]   lock(&(et->table + i)->lock);
[ 2369.751241]
*** DEADLOCK ***

[ 2369.753173]  May be due to missing lock nesting notation

[ 2369.754963] 4 locks held by systemd-udevd/389493:
[ 2369.756124]  #0: 88811b3a1f48 (mapping.invalidate_lock#2){}-{3:3}, 
at: page_cache_ra_unbounded+0x69/0x190
[ 2369.758516]  #1: 888121ceff10 (>io_barrier){.+.+}-{0:0}, at: 
dm_get_live_table+0x52/0xd0
[ 2369.760888]  #2: 888110240078 (>lock#2){}-{3:3}, at: 
snapshot_map+0x12e/0x7f0
[ 2369.763254]  #3: 8881098a4fd0 (&(et->table + i)->lock){+.+.}-{2:2}, at: 
snapshot_map+0x136/0x7f0
[ 2369.765896]
   stack backtrace:
[ 2369.767429] CPU: 3 PID: 389493 Comm: systemd-udevd Not tainted 
6.7.0-rc2-dgc+ #1952
[ 2369.770203] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.16.2-debian-1.16.2-1 04/01/2014
[ 2369.773771] Call Trace:
[ 2369.774657]  
[ 2369.775494]  dump_stack_lvl+0x5c/0xc0
[ 2369.776765]  dump_stack+0x10/0x20
[ 2369.778031]  print_deadlock_bug+0x220/0x2f0
[ 2369.779568]  __lock_acquire+0x1255/0x2180
[ 2369.781013]  lock_acquire+0xb9/0x2c0
[ 2369.782456]  ? snapshot_map+0x13e/0x7f0
[ 2369.783927]  ? snapshot_map+0x136/0x7f0
[ 2369.785240]  _raw_spin_lock+0x34/0x70
[ 2369.786413]  ? snapshot_map+0x13e/0x7f0
[ 2369.787482]  snapshot_map+0x13e/0x7f0
[ 2369.788462]  ? lockdep_init_map_type+0x75/0x250
[ 2369.789650]  __map_bio+0x1d7/0x200
[ 2369.790364]  dm_submit_bio+0x17d/0x570
[ 2369.791387]  __submit_bio+0x4a/0x80
[ 2369.792215]  submit_bio_noacct_nocheck+0x108/0x350
[ 2369.793357]  submit_bio_noacct+0x115/0x450
[ 2369.794334]  submit_bio+0x43/0x60
[ 2369.795112]  mpage_readahead+0xf1/0x130
[ 2369.796037]  ? blkdev_write_begin+0x30/0x30
[ 2369.797007]  blkdev_readahead+0x15/0x20
[ 2369.797893]  read_pages+0x5c/0x230
[ 2369.798703]  page_cache_ra_unbounded+0x143/0x190
[ 2369.799810]  force_page_cache_ra+0x9a/0xc0
[ 2369.800754]  page_cache_sync_ra+0x2e/0x50
[ 2369.801704]  filemap_get_pages+0x112/0x630
[ 2369.802696]  ? __lock_acquire+0x413/0x2180
[ 2369.803663]  filemap_read+0xfc/0x3a0
[ 2369.804527]  ? __might_sleep+0x42/0x70
[ 2369.805443]  blkdev_read_iter+0x6d/0x150
[ 2369.806370]  vfs_read+0x1a6/0x2d0
[ 2369.807148]  ksys_read+0x71/0xf0
[ 2369.807936]  __x64_sys_read+0x19/0x20
[ 2369.808810]  do_syscall_64+0x3c/0xe0
[ 2369.809746]  entry_SYSCALL_64_after_hwframe+0x63/0x6b
[ 2369.810914] RIP: 0033:0x7f9f14dbb03d

Turns out that dm-snap holds two hash-bl locks at the same time,
so we need nesting semantics to ensure lockdep understands what is
going on.
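
A minimal illustration of the annotation lockdep needs when two locks
of the same class are taken together (this mirrors the hunk below;
the unlock side is shown for completeness):

        hlist_bl_lock(lock->complete_slot);
        hlist_bl_lock_nested(lock->pending_slot, SINGLE_DEPTH_NESTING);
        /* ... */
        hlist_bl_unlock(lock->pending_slot);
        hlist_bl_unlock(lock->complete_slot);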

Signed-off-by: Dave Chinner 
---
 drivers/md/dm-snap.c|  2 +-
 include/linux/list_bl.h | 10 ++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index bf7a574499a3..cd97d5cb295d 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -645,7 +645,7 @@ static void dm_exception_table_lock_init(struct dm_snapshot 
*s, chunk_t chunk,
 static void dm_exception_table_lock(struct dm_exception_table_lock *lock)
 {
hlist_bl_lock(lock->complete_slot);
-   hlist_bl_lock(lock->pending_slot);
+   hlist_bl_lock_nested(lock->pending_slot, SINGLE_DEPTH_NESTING);
 }
 
 static void dm_exception_table_unlock(struct dm_exception_table_lock *lock)
diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
index 990ad8e24e0b..0e3e60c10563 100644
--- a/include/linux/list_bl.h
+++ b/include/linux/list_bl.h
@@ -83,6 +83,11 @@ static inline void hlist_bl_lock(struct hlist_bl_head *b)
spin_lock(&b->lock);
 }
 
+static inline void hlist_bl_lock_nested(struct hlist_bl_head *b, int subclass)
+{
+   spin_lock_nested(&b->lock, subclass);
+}
+
 static inline void hlist_bl_unlock(struct hlist_bl_head *b)
 {
spin_unlock(&b->lock);
@@ -125,6 +130,11 @@ static inline void hlist_bl_lock(struct hlist_bl_head *b)
bit_spin_lock(0, (unsigned long *)b);
 }
 
+static inline void hlist_bl_lock_nested(struct hlist_bl_head *b, int subclass)
+{
+   hlist_bl_lock(b);
+}
+
 static inline 

[PATCH 09/11] hash-bl: explicitly initialise hash-bl heads

2023-12-05 Thread Dave Chinner
From: Dave Chinner 

Because we are going to change how the structure is laid out to
support RTPREEMPT and LOCKDEP, just assuming that the hash table is
allocated as zeroed memory is no longer sufficient to initialise
a hash-bl table.

Signed-off-by: Dave Chinner 
---
 fs/dcache.c   | 21 -
 fs/fscache/cookie.c   |  8 
 fs/fscache/internal.h |  6 --
 fs/fscache/main.c |  3 +++
 fs/fscache/volume.c   |  8 
 fs/inode.c| 19 ++-
 6 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index c82ae731df9a..9059b3a55370 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3284,7 +3284,10 @@ __setup("dhash_entries=", set_dhash_entries);
 
 static void __init dcache_init_early(void)
 {
-   /* If hashes are distributed across NUMA nodes, defer
+   int i;
+
+   /*
+* If hashes are distributed across NUMA nodes, defer
 * hash allocation until vmalloc space is available.
 */
if (hashdist)
@@ -3300,11 +3303,20 @@ static void __init dcache_init_early(void)
NULL,
0,
0);
+   /*
+* The value returned in d_hash_shift tells us the size of the
+* hash table that was allocated as a log2 value.
+*/
+   for (i = 0; i < (1 << d_hash_shift); i++)
+   INIT_HLIST_BL_HEAD(&dentry_hashtable[i]);
+
d_hash_shift = 32 - d_hash_shift;
 }
 
 static void __init dcache_init(void)
 {
+   int i;
+
/*
 * A constructor could be added for stable state like the lists,
 * but it is probably not worth it because of the cache nature
@@ -3328,6 +3340,13 @@ static void __init dcache_init(void)
NULL,
0,
0);
+   /*
+* The value returned in d_hash_shift tells us the size of the
+* hash table that was allocated as a log2 value.
+*/
+   for (i = 0; i < (1 << d_hash_shift); i++)
+   INIT_HLIST_BL_HEAD(&dentry_hashtable[i]);
+
d_hash_shift = 32 - d_hash_shift;
 }
 
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index bce2492186d0..21617f7c88e4 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -32,6 +32,14 @@ static DECLARE_WORK(fscache_cookie_lru_work, 
fscache_cookie_lru_worker);
 static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] = 
"-LCAIFUWRD";
 static unsigned int fscache_lru_cookie_timeout = 10 * HZ;
 
+void fscache_cookie_hash_init(void)
+{
+   int i;
+
+   for (i = 0; i < (1 << fscache_cookie_hash_shift); i++)
+   INIT_HLIST_BL_HEAD(&fscache_cookie_hash[i]);
+}
+
 void fscache_print_cookie(struct fscache_cookie *cookie, char prefix)
 {
const u8 *k;
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 1336f517e9b1..6cbe07decc11 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -61,8 +61,9 @@ extern const struct seq_operations fscache_cookies_seq_ops;
 #endif
 extern struct timer_list fscache_cookie_lru_timer;
 
-extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix);
-extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie,
+void fscache_cookie_hash_init(void);
+void fscache_print_cookie(struct fscache_cookie *cookie, char prefix);
+bool fscache_begin_cookie_access(struct fscache_cookie *cookie,
enum fscache_access_trace why);
 
 static inline void fscache_see_cookie(struct fscache_cookie *cookie,
@@ -143,6 +144,7 @@ int fscache_stats_show(struct seq_file *m, void *v);
 extern const struct seq_operations fscache_volumes_seq_ops;
 #endif
 
+void fscache_volume_hash_init(void);
 struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
  enum fscache_volume_trace where);
 void fscache_put_volume(struct fscache_volume *volume,
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index dad85fd84f6f..7db2a4423315 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -92,6 +92,9 @@ static int __init fscache_init(void)
goto error_cookie_jar;
}
 
+   fscache_volume_hash_init();
+   fscache_cookie_hash_init();
+
pr_notice("Loaded\n");
return 0;
 
diff --git a/fs/fscache/volume.c b/fs/fscache/volume.c
index cdf991bdd9de..8b029c46a3a3 100644
--- a/fs/fscache/volume.c
+++ b/fs/fscache/volume.c
@@ -17,6 +17,14 @@ static LIST_HEAD(fscache_volumes);
 
 static void fscache_create_volume_work(struct work_struct *work);
 
+void fscache_volume_hash_init(void)
+{
+   int i;
+
+   for (i = 0; i < (1 << fscache_volume_hash_shift); i++)
+   INIT_HLIST_BL_HEAD(&fscache_volume_hash[i]);
+}
+
 struc

[PATCH 05/11] selinux: use dlist for isec inode list

2023-12-05 Thread Dave Chinner
From: Dave Chinner 

Because it's a horrible point of lock contention under heavily
concurrent directory traversals...

  - 12.14% d_instantiate
 - 12.06% security_d_instantiate
- 12.13% selinux_d_instantiate
   - 12.16% inode_doinit_with_dentry
  - 15.45% _raw_spin_lock
 - do_raw_spin_lock
  14.68% __pv_queued_spin_lock_slowpath


Signed-off-by: Dave Chinner 
---
 include/linux/dlock-list.h|  9 
 security/selinux/hooks.c  | 72 +++
 security/selinux/include/objsec.h |  6 +--
 3 files changed, 47 insertions(+), 40 deletions(-)

diff --git a/include/linux/dlock-list.h b/include/linux/dlock-list.h
index 327cb9edc7e3..7ad933b8875d 100644
--- a/include/linux/dlock-list.h
+++ b/include/linux/dlock-list.h
@@ -132,6 +132,15 @@ extern void dlock_lists_add(struct dlock_list_node *node,
struct dlock_list_heads *dlist);
 extern void dlock_lists_del(struct dlock_list_node *node);
 
+static inline void
+dlock_list_del_iter(struct dlock_list_iter *iter,
+   struct dlock_list_node *node)
+{
+   WARN_ON_ONCE((iter->entry != READ_ONCE(node->head)));
+   list_del_init(&node->list);
+   WRITE_ONCE(node->head, NULL);
+}
+
 /*
  * Find the first entry of the next available list.
  */
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index feda711c6b7b..0358d7c66949 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -340,26 +340,11 @@ static struct inode_security_struct 
*backing_inode_security(struct dentry *dentr
 static void inode_free_security(struct inode *inode)
 {
struct inode_security_struct *isec = selinux_inode(inode);
-   struct superblock_security_struct *sbsec;
 
if (!isec)
return;
-   sbsec = selinux_superblock(inode->i_sb);
-   /*
-* As not all inode security structures are in a list, we check for
-* empty list outside of the lock to make sure that we won't waste
-* time taking a lock doing nothing.
-*
-* The list_del_init() function can be safely called more than once.
-* It should not be possible for this function to be called with
-* concurrent list_add(), but for better safety against future changes
-* in the code, we use list_empty_careful() here.
-*/
-   if (!list_empty_careful(&isec->list)) {
-   spin_lock(&sbsec->isec_lock);
-   list_del_init(&isec->list);
-   spin_unlock(&sbsec->isec_lock);
-   }
+   if (!list_empty(&isec->list.list))
+   dlock_lists_del(&isec->list);
 }
 
 struct selinux_mnt_opts {
@@ -547,6 +532,8 @@ static int sb_finish_set_opts(struct super_block *sb)
struct superblock_security_struct *sbsec = selinux_superblock(sb);
struct dentry *root = sb->s_root;
struct inode *root_inode = d_backing_inode(root);
+   struct dlock_list_iter iter;
+   struct inode_security_struct *isec, *n;
int rc = 0;
 
if (sbsec->behavior == SECURITY_FS_USE_XATTR) {
@@ -570,27 +557,28 @@ static int sb_finish_set_opts(struct super_block *sb)
/* Initialize the root inode. */
rc = inode_doinit_with_dentry(root_inode, root);
 
-   /* Initialize any other inodes associated with the superblock, e.g.
-  inodes created prior to initial policy load or inodes created
-  during get_sb by a pseudo filesystem that directly
-  populates itself. */
-   spin_lock(&sbsec->isec_lock);
-   while (!list_empty(&sbsec->isec_head)) {
-   struct inode_security_struct *isec =
-   list_first_entry(&sbsec->isec_head,
-  struct inode_security_struct, list);
+   /*
+* Initialize any other inodes associated with the superblock, e.g.
+* inodes created prior to initial policy load or inodes created during
+* get_sb by a pseudo filesystem that directly populates itself.
+*/
+   init_dlock_list_iter(&iter, &sbsec->isec_head);
+   dlist_for_each_entry_safe(isec, n, &iter, list) {
struct inode *inode = isec->inode;
-   list_del_init(&isec->list);
-   spin_unlock(&sbsec->isec_lock);
+
+   dlock_list_del_iter(&iter, &isec->list);
+   dlock_list_unlock(&iter);
+
inode = igrab(inode);
if (inode) {
if (!IS_PRIVATE(inode))
inode_doinit_with_dentry(inode, NULL);
iput(inode);
}
-   spin_lock(&sbsec->isec_lock);
+
+   dlock_list_relock(&iter);
}
-   spin_unlock(&sbsec->isec_lock);
+   WARN_ON_ONCE(!dlock_lists_empty(&sbsec->isec_head));
return rc;
 }
 
@@ -1428,10 +1416,8 @@ static int inode_doinit_with_dentry(struct inode *inode, 
struct dentry *opt_dent
/* Defer initiali

[PATCH 0/11] vfs: inode cache scalability improvements

2023-12-05 Thread Dave Chinner
We all know that the global inode_hash_lock and the per-fs global 
sb->s_inode_list_lock locks are contention points in filesystem workloads
that stream inodes through memory, so it's about time we addressed these
limitations.

The first part of the patchset address the sb->s_inode_list_lock.
This was done a long time ago by Waiman Long by converting the
global linked list to a per-cpu linked list - those infrastructure
patches are pretty much unchanged from when Waiman first wrote them,
and as such they still carry the RVB that Jan Kara gave for them. I
have no idea if the problem that Waiman was trying to solve still
exists, but that's largely irrelevant because there are other
problems that I can easily reproduce.

That is, once at ~16 threads trying to instantiate or tear down
inodes at the same time in a filesystem, the sb->s_inode_list_lock
becomes a single point of contention. Adding an inode to the inode
cache requires adding it to the sb->s_inodes list, and removing an
inode from the cache requires removing it from the sb->s_inodes
list. That's two exclusive lock operations per inode we cycle
through the inode cache.

This creates a hard limit on the number of inodes we can cycle
through memory in a single filesystem. It tops out at around
600-700,000 inodes per second on XFS, and at that point we see
catastrophic cacheline contention breakdown and nothing goes any
faster. We can easily burn hundreds of CPUs on the sb->s_inodes list
operations, yet we still can only get 600-700k inodes/s through the
cache.

Converting the sb->s_inodes list to a dlist gets rid of this single
contention point and makes the sb->s_inodes list operations
disappear from the profiles completely. Prior to this change, at 32
threads XFS could pull 12.8 million inodes into cache in ~20s
(that's ~600k inodes/s - sound familiar?). With this change, those
12.8 million inodes are pulled into cache in ~10s. That's double the
rate at which XFS can pull inodes into memory from the
filesystem

I'm talking about XFS here, because none of the other filesystems
actually stress the sb->s_inode_list_lock at all. They all hit
catastrophic cacheline contention on the inode_hash_lock long before
they get anywhere near the sb->s_inodes limits. For ext4 and
bcachefs, the inode_hash_lock becomes a limiting factor at 8
threads. btrfs hits internal namespace tree contention limits at 2
threads, so it's not even stressing the inode_hash_lock unless
highly threaded workloads are manually sharded across subvolumes.

So to bring the rest of the filesystems up, we need to fix the
inode_hash_lock contention problems.  This patchset replaces the
global inode_hash_lock with the same lock-per-chain implementation
that the dentry cache uses. i.e. hash-bl lists. This is more complex
than the dentry cache implementation, however, because we nest spin
locks inside the inode_hash_lock. This conversion means we nest spin
locks inside bit spin locks in the inode cache.

Whilst this doesn't sound particularly problematic, the issue arises
on CONFIG_PREEMPT_RT kernels, where spinlocks are converted to
sleeping locks. We can't place sleeping locks inside spinning bit
locks, and that's exactly what happens if we use hash-bl lists in
the inode cache and then turn on CONFIG_PREEMPT_RT.

The other downside to converting to hash-bl is that we lose lockdep
coverage of the inode hash table - lockdep does not track bit locks
at all.

Both of these issues can be solved the same way: whenever either of
these two config options are turned on, we change the hash-bl
implementation from using a bit spin lock on the lowest bit of the
chain head pointer to using a dedicated spinlock per chain. This
trades off performance and memory footprint for configurations where
correctness is more important than performance, but allows optimal
implementations of hash-bl lists when performance is the primary
concern.
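
In code terms, the trade-off boils down to something like this
(condensed from the list_bl changes in the series):

        struct hlist_bl_head {
                struct hlist_bl_node *first;
        #if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_LOCKDEP)
                spinlock_t lock;        /* dedicated per-chain lock: RT-safe,
                                         * visible to lockdep */
        #endif
        };

        /* bit 0 of ->first is the lock in the default configuration */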

In making this conversion, we make all hash-bl implementations safe
for PREEMPT_RT usage and gain lockdep coverage of all hash-bl lists.
It also pointed out that several hash-bl list users did not actually
initialise the hash list heads correctly - they elided the
initialisation and only got away with it because they allocated
zeroed memory and the hash list head would still work from empty.
This needed fixing for lockdep
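
The fix is mechanical: each user now initialises its table explicitly,
along these lines (hash table and shift names are illustrative):

        for (i = 0; i < (1 << my_hash_shift); i++)
                INIT_HLIST_BL_HEAD(&my_hashtable[i]);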

The result of this conversion is that inode cache lookup heavy
workloads such as filesystem traversals and inode creation/removal
no longer contend on the inode_hash_lock to stream inodes through
the inode cache. This results in big performance improvements at
thread counts of >= 8.

I've run this through fstests with lockdep enabled on ext4 and XFS
without discovering any issues except for dm-snapshot needing
lockdep nesting annotations for list-bl locks. I've run a bunch of
"will-it-scale" like tests across XFS, ext4, bcachefs and btrfs, and
the raw table results for 6.7-rc4 are below.

The tests runs a fixed number of files per thread, so as the 

[PATCH 02/11] vfs: Remove unnecessary list_for_each_entry_safe() variants

2023-12-05 Thread Dave Chinner
From: Jan Kara 

evict_inodes() and invalidate_inodes() use list_for_each_entry_safe()
to iterate sb->s_inodes list. However, since we use i_lru list entry for
our local temporary list of inodes to destroy, the inode is guaranteed
to stay in sb->s_inodes list while we hold sb->s_inode_list_lock. So
there is no real need for safe iteration variant and we can use
list_for_each_entry() just fine.

Signed-off-by: Jan Kara 
Signed-off-by: Waiman Long 
---
 fs/inode.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index f238d987dec9..17c50a75514f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -712,12 +712,12 @@ static void dispose_list(struct list_head *head)
  */
 void evict_inodes(struct super_block *sb)
 {
-   struct inode *inode, *next;
+   struct inode *inode;
LIST_HEAD(dispose);
 
 again:
spin_lock(&sb->s_inode_list_lock);
-   list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
+   list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
if (atomic_read(&inode->i_count))
continue;
 
@@ -758,12 +758,12 @@ EXPORT_SYMBOL_GPL(evict_inodes);
  */
 void invalidate_inodes(struct super_block *sb)
 {
-   struct inode *inode, *next;
+   struct inode *inode;
LIST_HEAD(dispose);
 
 again:
spin_lock(&sb->s_inode_list_lock);
-   list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
+   list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
spin_lock(&inode->i_lock);
if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
spin_unlock(&inode->i_lock);
-- 
2.42.0




[PATCH 07/11] hlist-bl: add hlist_bl_fake()

2023-12-05 Thread Dave Chinner
From: Dave Chinner 

In preparation for switching the VFS inode cache over to hlist_bl
lists, we need to be able to fake a list node that looks like it is
hashed, for correct operation of filesystems that don't directly use
the VFS inode cache.

Signed-off-by: Dave Chinner 
---
 include/linux/list_bl.h | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
index ae1b541446c9..8ee2bf5af131 100644
--- a/include/linux/list_bl.h
+++ b/include/linux/list_bl.h
@@ -143,6 +143,28 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n)
}
 }
 
+/**
+ * hlist_bl_add_fake - create a fake list consisting of a single headless node
+ * @n: Node to make a fake list out of
+ *
+ * This makes @n appear to be its own predecessor on a headless hlist.
+ * The point of this is to allow things like hlist_bl_del() to work correctly
+ * in cases where there is no list.
+ */
+static inline void hlist_bl_add_fake(struct hlist_bl_node *n)
+{
+   n->pprev = >next;
+}
+
+/**
+ * hlist_fake: Is this node a fake hlist_bl?
+ * @h: Node to check for being a self-referential fake hlist.
+ */
+static inline bool hlist_bl_fake(struct hlist_bl_node *n)
+{
+   return n->pprev == >next;
+}
+
 static inline void hlist_bl_lock(struct hlist_bl_head *b)
 {
bit_spin_lock(0, (unsigned long *)b);
-- 
2.42.0




[PATCH 06/11] vfs: factor out inode hash head calculation

2023-12-05 Thread Dave Chinner
From: Dave Chinner 

In preparation for changing the inode hash table implementation.

Signed-off-by: Dave Chinner 
---
 fs/inode.c | 44 +---
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index 3426691fa305..fead81550cf4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -59,6 +59,22 @@ static unsigned int i_hash_shift __ro_after_init;
 static struct hlist_head *inode_hashtable __ro_after_init;
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
 
+static unsigned long hash(struct super_block *sb, unsigned long hashval)
+{
+   unsigned long tmp;
+
+   tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
+   L1_CACHE_BYTES;
+   tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
+   return tmp & i_hash_mask;
+}
+
+static inline struct hlist_head *i_hash_head(struct super_block *sb,
+   unsigned int hashval)
+{
+   return inode_hashtable + hash(sb, hashval);
+}
+
 /*
  * Empty aops. Can be used for the cases where the user does not
  * define any of the address_space operations.
@@ -502,16 +518,6 @@ static inline void inode_sb_list_del(struct inode *inode)
dlock_lists_del(&inode->i_sb_list);
 }
 
-static unsigned long hash(struct super_block *sb, unsigned long hashval)
-{
-   unsigned long tmp;
-
-   tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
-   L1_CACHE_BYTES;
-   tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
-   return tmp & i_hash_mask;
-}
-
 /**
  * __insert_inode_hash - hash an inode
  * @inode: unhashed inode
@@ -1187,7 +1193,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned 
long hashval,
int (*test)(struct inode *, void *),
int (*set)(struct inode *, void *), void *data)
 {
-   struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
+   struct hlist_head *head = i_hash_head(inode->i_sb, hashval);
struct inode *old;
 
 again:
@@ -1291,7 +1297,7 @@ EXPORT_SYMBOL(iget5_locked);
  */
 struct inode *iget_locked(struct super_block *sb, unsigned long ino)
 {
-   struct hlist_head *head = inode_hashtable + hash(sb, ino);
+   struct hlist_head *head = i_hash_head(sb, ino);
struct inode *inode;
 again:
spin_lock(_hash_lock);
@@ -1359,7 +1365,7 @@ EXPORT_SYMBOL(iget_locked);
  */
 static int test_inode_iunique(struct super_block *sb, unsigned long ino)
 {
-   struct hlist_head *b = inode_hashtable + hash(sb, ino);
+   struct hlist_head *b = i_hash_head(sb, ino);
struct inode *inode;
 
hlist_for_each_entry_rcu(inode, b, i_hash) {
@@ -1446,7 +1452,7 @@ EXPORT_SYMBOL(igrab);
 struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *), void *data)
 {
-   struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+   struct hlist_head *head = i_hash_head(sb, hashval);
struct inode *inode;
 
spin_lock(_hash_lock);
@@ -1501,7 +1507,7 @@ EXPORT_SYMBOL(ilookup5);
  */
 struct inode *ilookup(struct super_block *sb, unsigned long ino)
 {
-   struct hlist_head *head = inode_hashtable + hash(sb, ino);
+   struct hlist_head *head = i_hash_head(sb, ino);
struct inode *inode;
 again:
spin_lock(_hash_lock);
@@ -1550,7 +1556,7 @@ struct inode *find_inode_nowait(struct super_block *sb,
 void *),
void *data)
 {
-   struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+   struct hlist_head *head = i_hash_head(sb, hashval);
struct inode *inode, *ret_inode = NULL;
int mval;
 
@@ -1595,7 +1601,7 @@ EXPORT_SYMBOL(find_inode_nowait);
 struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval,
 int (*test)(struct inode *, void *), void *data)
 {
-   struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+   struct hlist_head *head = i_hash_head(sb, hashval);
struct inode *inode;
 
RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
@@ -1633,7 +1639,7 @@ EXPORT_SYMBOL(find_inode_rcu);
 struct inode *find_inode_by_ino_rcu(struct super_block *sb,
unsigned long ino)
 {
-   struct hlist_head *head = inode_hashtable + hash(sb, ino);
+   struct hlist_head *head = i_hash_head(sb, ino);
struct inode *inode;
 
RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
@@ -1653,7 +1659,7 @@ int insert_inode_locked(struct inode *inode)
 {
struct super_block *sb = inode->i_sb;
ino_t ino = inode->i_ino;
-   struct hlist_head *head = inode_hashtable + hash(sb, ino);
+   struct hlist_head *head = i_hash_head(sb, ino);
 
while (1) {
struct inode *old = NULL;
-- 
2.42.0




Re: [PATCH 2/5] pstore: inode: Convert mutex usage to guard(mutex)

2023-12-04 Thread Dave Chinner
> @@ -466,17 +455,14 @@ static struct dentry *pstore_mount(struct 
> file_system_type *fs_type,
>  
>  static void pstore_kill_sb(struct super_block *sb)
>  {
> - mutex_lock(&pstore_sb_lock);
> + guard(mutex)(&pstore_sb_lock);
>   WARN_ON(pstore_sb && pstore_sb != sb);
>  
>   kill_litter_super(sb);
>   pstore_sb = NULL;
>  
> - mutex_lock(&records_list_lock);
> + guard(mutex)(&records_list_lock);
>   INIT_LIST_HEAD(&records_list);
> - mutex_unlock(&records_list_lock);
> -
> - mutex_unlock(&pstore_sb_lock);
>  }

And this worries me, because guard() makes it harder to see
where locks are nested and the scope they apply to. At least with
lock/unlock pairs the scope of the critical sections and the
nestings are obvious.
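
For what it's worth, a scoped_guard() form would at least keep the
scope and nesting visible (sketch only, using the same lock names as
the quoted hunk):

        scoped_guard(mutex, &pstore_sb_lock) {
                WARN_ON(pstore_sb && pstore_sb != sb);

                kill_litter_super(sb);
                pstore_sb = NULL;

                scoped_guard(mutex, &records_list_lock)
                        INIT_LIST_HEAD(&records_list);
        }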

So, yeah, i see that there is a bit less code with these fancy new
macros, but I don't think it's made the code is easier to read and
maintain at all.

Just my 2c worth...

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [PATCH 2/3] MAINTAINERS: Require kvm-xfstests smoke for ext4

2023-11-22 Thread Dave Chinner
On Wed, Nov 22, 2023 at 08:17:46AM -0800, Darrick J. Wong wrote:
> On Wed, Nov 22, 2023 at 04:44:58PM +0200, Nikolai Kondrashov wrote:
> > On 11/20/23 00:54, Theodore Ts'o wrote:
> > > So as for *me*, I'm going to point people at:
> > > 
> > > https://github.com/tytso/xfstests-bld/blob/master/Documentation/kvm-quickstart.md
> > 
> > ...
> > 
> > > (And note that I keep the xfstests-bld repo's on kernel.org and
> > > github.com both uptodate, and I prefer using the using the github.com
> > > URL because it's easier for the new developer to read and understand
> > > it.)
> > 
> > I already queued a switch to the kernel.org URL, which Darrick has 
> > suggested.
> > I'll drop it now, but you guys would have to figure it out between 
> > yourselves,
> > which one you want :D
> > 
> > Personally, I agree that the one on GitHub is more reader-friendly, FWIW.
> 
> For xfstests-bld links, I'm ok with whichever domain Ted wants.
> 
> > > And similarly, just because the V: line might say, "kvm-xfstests
> > > smoke", someone could certainly use kdevops if they wanted to.  So
> > > perhaps we need to be a bit clearer about what we expect the V: line
> > > to mean?
> > 
> > I tried to handle some of that with the "subsets", so that you can run a 
> > wider
> > test suite and still pass the Tested-with: check. I think this has to be
> > balanced between allowing all the possible ways to run the tests and a
> > reasonable way to certify the commit was tested automatically.
> > 
> > E.g. name the test "xfstests", and list all the ways it can be executed, 
> > thus
> > communicating that it should still say "Tested-with: xfstests" regardless of
> > the way. And if there is a smaller required subset, name it just "xfstests
> > smoke" and list all the ways it can be run, including the simplest
> > "kvm-xfstests smoke", but accept just "Tested-with: xfstests-smoke".
> > 
> > I'm likely getting things wrong, but I hope you get what I'm trying to say.
> 
> Not entirely -- for drive-by contributions and obvious bugfixes, a quick
> "V: xfstests-bld: kvm-xfstests smoke" / "V: fstests: ./check -g smoke"
> run is probably sufficient.

For trivial drive-by contributions and obvious bug fixes, I think
this is an unnecessary burden on these potential contributors. If
it's trivial, then there's little burden on the reviewer/maintainer
to actually test it, whilst there is significant burden on the
one-off contributor to set up an entirely new, unfamiliar testing
environment just to fix something trivial.

If you want every patch tested, the follow the lead of the btrfs
developers: set up a CI mechanism on github and ask people to submit
changes there first and then provide a link to the successful test
completion ticket with the patch submission.

> (Insofar as n00bs running ./check isn't sufficient, but that's something
> that fstests needs to solve...)
> 
> For nontrivial code tidying, the author really ought to run the whole
> test suite.  It's still an open question as to whether xfs tidying
> should run the full fuzz suite too, since that increases the runtime
> from overnightish to a week.

Yes, the auto group tests should be run before non-trivial patch
sets are submitted. That is the entire premise of the the auto group
existing - it's the set of regression tests we expect to pass for
any change.

However, the full on disk format fuzz tests should not be required
to be run. Asking people to run tests that take a week for general
cleanups and code quality improvements is just crazy talk - the
cost-benefit equation is all out of whack here, especially if the
changes have no interaction with the on-disk format at all.

IMO, extensive fuzz testing is something that should be done
post-integration - requiring individual developers to run
tests that take at least a week to run before they can submit a
patchset for review/inclusion is an excessive burden, and we don't
need every developer to run such tests every time they do something
even slightly non-trivial.

It is the job of the release manager to co-ordinate with the testing
resources to run extensive, long running tests after code has been
integrated into the tree. Forcing individual developers to run this
sort of testing just isn't an efficient use of resources

> For /new features/, the developer(s) ought to come up with a testing
> plan and run that by the community.  Eventually those will merge into
> fstests or ktest or wherever.

That's how it already works, isn't it?

-Dave.
-- 
Dave Chinner
da...@fromorbit.com


Re: [PATCH v9 0/3] [PATCH v9 0/3] Introduce provisioning primitives

2023-11-20 Thread Dave Chinner
On Mon, Nov 13, 2023 at 01:26:51PM -0800, Sarthak Kukreti wrote:
> On Fri, Nov 10, 2023 at 4:56 PM Dave Chinner  wrote:
> >
> > On Thu, Nov 09, 2023 at 05:01:35PM -0800, Sarthak Kukreti wrote:
> > > Hi,
> > >
> > > This patch series is version 9 of the patch series to introduce
> > > block-level provisioning mechanism (original [1]), which is useful for
> > > provisioning space across thinly provisioned storage architectures (loop
> > > devices backed by sparse files, dm-thin devices, virtio-blk). This
> > > series has minimal changes over v8[2], with a couple of patches dropped
> > > (suggested by Dave).
> > >
> > > This patch series is rebased from the linux-dm/dm-6.5-provision-support
> > > [3] on to (a12deb44f973 Merge tag 'input-for-v6.7-rc0' ...). The final
> > > patch in the series is a blktest (suggested by Dave in 4) which was used
> > > to test out the provisioning flow for loop devices on sparse files on an
> > > ext4 filesystem.
> >
> > What happened to the XFS patch I sent to support provisioning for
> > fallocate() operations through XFS?
> >
> Apologies, I missed out on mentioning that the XFS patches work well
> with loop devices.
> 
> I might have misunderstood: were those patches only for sanity testing
> or would you prefer that I send those out as a part of this series? I
> can whip up a quick v10 if so!

I was implying that if you are going to be adding support to random
block devices for people to actually test out, then you should be
adding support to filesystems and writing new -fstests- to ensure
that loop devices are actually provisioning blocks at exactly the
locations that correspond to the physical file extents the
filesystem provisioned, too.

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [PATCH v9 0/3] [PATCH v9 0/3] Introduce provisioning primitives

2023-11-10 Thread Dave Chinner
On Thu, Nov 09, 2023 at 05:01:35PM -0800, Sarthak Kukreti wrote:
> Hi,
> 
> This patch series is version 9 of the patch series to introduce
> block-level provisioning mechanism (original [1]), which is useful for
> provisioning space across thinly provisioned storage architectures (loop
> devices backed by sparse files, dm-thin devices, virtio-blk). This
> series has minimal changes over v8[2], with a couple of patches dropped
> (suggested by Dave).
> 
> This patch series is rebased from the linux-dm/dm-6.5-provision-support
> [3] on to (a12deb44f973 Merge tag 'input-for-v6.7-rc0' ...). The final 
> patch in the series is a blktest (suggested by Dave in 4) which was used
> to test out the provisioning flow for loop devices on sparse files on an
> ext4 filesystem.

What happened to the XFS patch I sent to support provisioning for
fallocate() operations through XFS?

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: (subset) [PATCH 22/32] vfs: inode cache conversion to hash-bl

2023-11-04 Thread Dave Chinner
On Wed, Nov 01, 2023 at 10:36:15PM -0400, Kent Overstreet wrote:
> On Tue, Oct 31, 2023 at 12:02:47PM +0100, Christian Brauner wrote:
> > > The follow up including a statement about "being arsed" once more was
> > > to Christian, not you and was rather "tongue in cheek".
> > 
> > Fyi, I can't be arsed to be talked to like that.
> > 
> > > Whether the patch is ready for reviews and whatnot is your call to
> > > make as the author.
> > 
> > This is basically why that patch never stayed in -next. Dave said this
> > patch is meaningless without his other patchs and I had no reason to
> > doubt that claim nor currently the cycles to benchmark and disprove it.
> 
> It was a big benefit to bcachefs performance, and I've had it in my tree
> for quite some time. Was there any other holdup?

Plenty.

- A lack of recent validation against ext4, btrfs and other
filesystems.
- the loss of lockdep coverage by moving to bit locks
- it breaks CONFIG_PREEMPT_RT=y because we nest other spinlocks
  inside the inode_hash_lock and we can't do that if we convert the
  inode hash to bit locks because RT makes spinlocks sleeping locks.
- There's been additions for lockless RCU inode hash lookups from
  AFS and ext4 in weird, uncommon corner cases and I have no idea
  how to validate they still work correctly with hash-bl. I suspect
  they should just go away with hash-bl, but

There's more, but these are the big ones.

-Dave.
-- 
Dave Chinner
da...@fromorbit.com


Re: (subset) [PATCH 22/32] vfs: inode cache conversion to hash-bl

2023-10-22 Thread Dave Chinner
On Fri, Oct 20, 2023 at 07:49:18PM +0200, Mateusz Guzik wrote:
> On 10/20/23, Dave Chinner  wrote:
> > On Thu, Oct 19, 2023 at 05:59:58PM +0200, Mateusz Guzik wrote:
> >> > To be clear there is no urgency as far as I'm concerned, but I did run
> >> > into something which is primarily bottlenecked by inode hash lock and
> >> > looks like the above should sort it out.
> >> >
> >> > Looks like the patch was simply forgotten.
> >> >
> >> > tl;dr can this land in -next please
> >>
> >> In case you can't be arsed, here is something funny which may convince
> >> you to expedite. ;)
> >>
> >> I did some benching by running 20 processes in parallel, each doing stat
> >> on a tree of 1 million files (one tree per proc, 1000 dirs x 1000 files,
> >> so 20 mln inodes in total).  Box had 24 cores and 24G RAM.
> >>
> >> Best times:
> >> Linux:  7.60s user 1306.90s system 1863% cpu 1:10.55 total
> >> FreeBSD:3.49s user 345.12s system 1983% cpu 17.573 total
> >> OpenBSD:5.01s user 6463.66s system 2000% cpu 5:23.42 total
> >> DragonflyBSD:   11.73s user 1316.76s system 1023% cpu 2:09.78 total
> >> OmniosCE:   9.17s user 516.53s system 1550% cpu 33.905 total
> >>
> >> NetBSD failed to complete the run, OOM-killing workers:
> >> http://mail-index.netbsd.org/tech-kern/2023/10/19/msg029242.html
> >> OpenBSD is shafted by a big kernel lock, so no surprise it takes a long
> >> time.
> >>
> >> So what I find funny is that Linux needed more time than OmniosCE (an
> >> Illumos variant, fork of Solaris).
> >>
> >> It also needed more time than FreeBSD, which is not necessarily funny
> >> but not that great either.
> >>
> >> All systems were mostly busy contending on locks and in particular Linux
> >> was almost exclusively busy waiting on inode hash lock.
> >
> > Did you bother to test the patch, or are you just complaining
> > that nobody has already done the work for you?
> 
> Why are you giving me attitude?

Look in the mirror, mate.

Starting off with a derogatory statement like:

"In case you can't be arsed, ..."

is a really good way to start a fight.

I don't think anyone working on this stuff couldn't be bothered to
get their lazy arses off their couches to get it merged. Though you
may not have intended it that way, that's exactly what "can't be
arsed" means. 

I have not asked for this code to be merged because I'm not ready to
ask for it to be merged. I'm trying to be careful and cautious about
changing core kernel code that every linux installation out there
uses because I care about this code being robust and stable. That's
the exact opposite of "can't be arsed"

Further, you have asked for code that is not ready to be merged to
be merged without reviewing it or even testing it to see if it
solved your reported problem. This is pretty basic stuff - if you
want it merged, then *you also need to put effort into getting it
merged* regardless of who wrote the code. TANSTAAFL.

But you've done neither - you've just made demands and thrown
hypocritical shade implying busy people working on complex code are
lazy arses.

Perhaps you should consider your words more carefully in future?

> > Because if you tested the patch, you'd have realised that by itself
> > it does nothing to improve performance of the concurrent find+stat
> > workload. The lock contention simply moves to the sb_inode_list_lock
> > instead.
> >
> 
> Is that something you benched? While it may be there is no change,
> going from one bottleneck to another does not automatically mean there
> are no gains in performance.

Of course I have. I wouldn't have said anything if this wasn't a
subject I have specific knowledge and expertise in. As I've already
said, I've been running this specific "will it scale" find+stat
micro-benchmark for well over a decade. For example:

https://lore.kernel.org/linux-xfs/20130603074452.GZ29466@dastard/

That's dated June 2013, and the workload is:

"8-way 50 million zero-length file create, 8-way
find+stat of all the files, 8-unlink of all the files:"

Yeah, this workload only scaled to a bit over 4 CPUs a decade ago,
hence I only tested to 8-way

> For example, this thing on FreeBSD used to take over one minute (just
> like on Linux right now), vast majority of which was spent on
> multicore issues. I massaged it down to ~18 seconds, despite it still
> being mostly bottlenecked on locks.
> 
> So I benched the hashbl change and it provides a marked improvement:
> stock:  7.60s user 1306.90s system 1863% cpu 1:10.5

Re: (subset) [PATCH 22/32] vfs: inode cache conversion to hash-bl

2023-10-20 Thread Dave Chinner
s: outside micro-benchmarks, these
locks just don't show up in profiles on production machines.
Hence there's no urgency to "fix" these lock contention
problems despite the ease with which micro-benchmarks can reproduce
it...

I've kept the patches current for years, even though there hasn't
been a pressing need for them. The last "vfs-scale" version I did
some validation on is here:

https://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs.git/log/?h=vfs-scale

5.17 was the last kernel I did any serious validation and
measurement against, and that all needs to be repeated before
proposing it for inclusion because lots of stuff has changed since I
last did some serious multi-filesystem a/b testing of this code

-Dave.
-- 
Dave Chinner
da...@fromorbit.com


Re: [powerpc] kernel BUG fs/xfs/xfs_message.c:102! [4k block]

2023-10-12 Thread Dave Chinner
> xfs/238 test was executed when the crash was encountered.
> Devices were formatted with 4k block size.

Yeah, I've seen this once before, I think I know what the problem is
from analysis of that failure, but I've been unable to reproduce it
again so I've not been able to confirm the diagnosis nor test a fix.

tl;dr: we just unlinked an inode whose cluster buffer has been
invalidated by xfs_icluster_free(). We go to log the inode, but this
is the first time we've logged the inode since it was last cleaned,
so it goes to read the cluster buffer to attach it. It finds the
cluster buffer already marked stale in the transaction, so the DONE
flag is not set and the ASSERT fires.

i.e. it appears to me that this requires inode cluster buffer
writeback between the unlink() operation and the inodegc
inactivation process to set the initial conditions for the problem
to trigger, and then have just a single inode in the inobt chunk
that triggers freeing of the chunk whilst the inode itself is clean. 

I need to confirm that this is the case before trying to fix it,
because this inode log item vs stale inode cluster buffer path is
tricky and nasty and there might be something else going on.
However, I haven't been able to reproduce this to be able to confirm
this hypothesis yet.

I suspect the fix may well be to use xfs_trans_buf_get() in the
xfs_inode_item_precommit() path if XFS_ISTALE is already set on the
inode we are trying to log. We don't need a populated cluster buffer
to read data out of or write data into in this path - all we need to
do is attach the inode to the buffer so that when the buffer
invalidation is committed to the journal it will also correctly
finish the stale inode log item.

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com


Re: [dm-devel] [PATCH v8 0/5] Introduce provisioning primitives

2023-10-10 Thread Dave Chinner
On Tue, Oct 10, 2023 at 03:42:53PM -0700, Sarthak Kukreti wrote:
> On Sun, Oct 8, 2023 at 4:50 PM Dave Chinner  wrote:
> >
> > On Fri, Oct 06, 2023 at 06:28:12PM -0700, Sarthak Kukreti wrote:
> > > Hi,
> > >
> > > This patch series is version 8 of the patch series to introduce
> > > block-level provisioning mechanism (original [1]), which is useful for 
> > > provisioning
> > > space across thinly provisioned storage architectures (loop devices
> > > backed by sparse files, dm-thin devices, virtio-blk). This series has
> > > minimal changes over v7[2].
> > >
> > > This patch series is rebased from the linux-dm/dm-6.5-provision-support 
> > > [1] on to
> > > (cac405a3bfa2 Merge tag 'for-6.6-rc3-tag'). In addition, there's an
> > > additional patch to allow passing through an unshare intent via 
> > > REQ_OP_PROVISION
> > > (suggested by Darrick in [4]).
> >
> > The XFS patches I just posted were smoke tested a while back against
> > loop devices and then forward ported to this patchset. Good for
> > testing that userspace driven file preallocation gets propagated by
> > the filesystem down to the backing device correctly and that
> > subsequent IO to the file then does the right thing (e.g. fio
> > testing using fallocate() to set up the files being written to)
> >
> 
> Thanks! I've been testing with a WIP patch for ext4, I'll give your
> patches a try. Once we are closer to submitting the filesystem
> support, we can formalize the test into an xfstest (sparse file + loop
> + filesystem, fallocate() file, check the size of the underlying
> sparse file).

That's not really a valid test - there are so many optional filesystem
behaviours that can change the layout of the backing file for the
same upper filesystem operations.

What we actually need to test is the ENOSPC guarantees, not that
fallocate has been called by the loop device. i.e. that ENOSPC is
propagated from the underlying filesystem though the loop device to
the application running on the upper filesystem appropriately.  e.g.
when the lower filesystem is at ENOSPC, the writes into provisioned
space in the loop device backing file continue to succeed without
ENOSPC being reported to the upper filesystem.

i.e. this needs to be tested from the perspective of the API
presented to the upper filesystem, not by running an upper fs
operation and then trying to infer correct behaviour by peering at
the state of the lower filesystem...
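
To make that concrete, a minimal userspace sketch of the check that
actually matters is below (mount point and file name are hypothetical;
the test harness is assumed to have driven the lower filesystem to
ENOSPC before the overwrite loop runs):

/*
 * Minimal sketch only: exercise the ENOSPC guarantee from the upper
 * filesystem's point of view. The path is hypothetical, and the lower
 * filesystem is assumed to already be at ENOSPC when the overwrite
 * loop starts.
 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define PROVISIONED_BYTES	(64 * 1024 * 1024)

int main(void)
{
	const char *path = "/mnt/upper/provisioned";	/* hypothetical */
	char buf[4096];
	off_t off;
	int fd;

	fd = open(path, O_CREAT | O_RDWR, 0644);
	if (fd < 0 || fallocate(fd, 0, 0, PROVISIONED_BYTES) < 0) {
		perror("provision");
		return 1;
	}

	/* Lower filesystem is now at ENOSPC (set up by the harness). */
	memset(buf, 0xab, sizeof(buf));
	for (off = 0; off < PROVISIONED_BYTES; off += sizeof(buf)) {
		if (pwrite(fd, buf, sizeof(buf), off) != (ssize_t)sizeof(buf)) {
			fprintf(stderr, "overwrite at %lld failed: %s\n",
					(long long)off, strerror(errno));
			return 1;	/* ENOSPC here breaks the guarantee */
		}
	}
	if (fsync(fd) < 0) {		/* delalloc may surface ENOSPC here */
		perror("fsync");
		return 1;
	}
	return 0;
}

i.e. the pass/fail criteria live entirely in what the application on
the upper filesystem observes; how the lower filesystem happened to
lay out the backing file never enters into it.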

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com

--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel


Re: [dm-devel] [PATCH v8 5/5] block: Pass unshare intent via REQ_OP_PROVISION

2023-10-10 Thread Dave Chinner
On Tue, Oct 10, 2023 at 03:42:39PM -0700, Sarthak Kukreti wrote:
> On Sun, Oct 8, 2023 at 4:27 PM Dave Chinner  wrote:
> >
> > On Fri, Oct 06, 2023 at 06:28:17PM -0700, Sarthak Kukreti wrote:
> > > Allow REQ_OP_PROVISION to pass in an extra REQ_UNSHARE bit to
> > > annotate unshare requests to underlying layers. Layers that support
> > > FALLOC_FL_UNSHARE will be able to use this as an indicator of which
> > > fallocate() mode to use.
> > >
> > > Suggested-by: Darrick J. Wong 
> > > Signed-off-by: Sarthak Kukreti 
> > > ---
> > >  block/blk-lib.c   |  6 +-
> > >  block/fops.c  |  6 --
> > >  drivers/block/loop.c  | 35 +--
> > >  include/linux/blk_types.h |  3 +++
> > >  include/linux/blkdev.h|  3 ++-
> > >  5 files changed, 43 insertions(+), 10 deletions(-)
> >
> > I have no idea how filesystems (or even userspace applications, for
> > that matter) are supposed to use this - they have no idea if the
> > underlying block device has shared blocks for LBA ranges it already
> > has allocated and provisioned. IOWs, I don't know what the semantics
> > of this function are; it is not documented anywhere, and there is no
> > use case present that tells me how it might get used.
> >
> > Yes, unshare at the file level means the filesystem tries to break
> > internal data extent sharing, but if the block layers or backing
> > devices are doing deduplication and sharing unknown to the
> > application or filesystem, how do they ever know that this operation
> > might need to be performed? In what cases do we need to be able to
> > unshare block device ranges, and how is that different to the
> > guarantees that REQ_PROVISION is already supposed to give for
> > provisioned ranges that are then subsequently shared by the block
> > device (e.g. by snapshots)?
> >
> > Also, from an API perspective, this is an "unshare" data operation,
> > not a "provision" operation. Hence I'd suggest that the API should
> > be blkdev_issue_unshare() rather than optional behaviour to
> > _provision() which - before this patch - had clear and well defined
> > meaning
> >
> Fair points, the intent from the conversation with Darrick was the
> addition of support for FALLOC_FL_UNSHARE_RANGE in patch 2 of v4
> (originally suggested by Brian Forster in [1]): if we allow
> fallocate(UNSHARE_RANGE) on a loop device (ex. for creating a
> snapshot, similar in nature to the FICLONE example you mentioned on
> the loop patch), we'd (ideally) want to pass it through to the
> underlying layers and let them figure out what to do with it. But it
> is only for situations where we are explicitly know what the
> underlying layers are and what's the mecha
> 
> I agree though that it clouds the API a bit and I don't think it
> necessarily needs to be a part of the initial patch series: for now, I
> propose keeping just mode zero (and FALLOC_FL_KEEP_SIZE) handling in
> the block series patch and drop this patch for now. WDYT?

Until we have an actual use case for unsharing (which explicitly
breaks extent sharing) as opposed to provisioning (which ensures
overwrites always succeed regardless of extent state) then let's
leave it out of this -provisioning- series.

-Dave.
-- 
Dave Chinner
da...@fromorbit.com

--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel


Re: [dm-devel] [PATCH v8 3/5] loop: Add support for provision requests

2023-10-10 Thread Dave Chinner
On Tue, Oct 10, 2023 at 03:43:10PM -0700, Sarthak Kukreti wrote:
> On Sun, Oct 8, 2023 at 4:37 PM Dave Chinner  wrote:
> >
> > On Fri, Oct 06, 2023 at 06:28:15PM -0700, Sarthak Kukreti wrote:
> > > Add support for provision requests to loopback devices.
> > > Loop devices will configure provision support based on
> > > whether the underlying block device/file can support
> > > the provision request and upon receiving a provision bio,
> > > will map it to the backing device/storage. For loop devices
> > > over files, a REQ_OP_PROVISION request will translate to
> > > an fallocate mode 0 call on the backing file.
> > >
> > > Signed-off-by: Sarthak Kukreti 
> > > Signed-off-by: Mike Snitzer 
> >
> >
> > H.
> >
> > This doesn't actually implement the required semantics of
> > REQ_PROVISION. Yes, it passes the command to the filesystem
> > fallocate() implementation, but fallocate() at the filesystem level
> > does not have the same semantics as REQ_PROVISION.
> >
> > i.e. at the filesystem level, fallocate() only guarantees the next
> > write to the provisioned range will succeed without ENOSPC, it does
> > not guarantee *every* write to the range will succeed without
> > ENOSPC. If someone clones the loop file while it is in use (i.e.
> > snapshots it via cp --reflink) then all guarantees that the next
> > write to a provisioned LBA range will succeed without ENOSPC are
> > voided.
> >
> > So while this will work for basic testing that the filesystem is
> > issuing REQ_PROVISION based IO correctly, it can't actually be used
> > for hosting production filesystems that need full REQ_PROVISION
> > guarantees when the loop device backing file is independently
> > snapshotted via FICLONE
> >
> > At minimum, this set of implementation constraints needs to be
> > documented somewhere...
> >
> Fair point. I wanted to have a separate fallocate() mode
> (FALLOC_FL_PROVISION) in the earlier series of the patchset so that we
> can distinguish between a provision request and a regular fallocate()
> call; I dropped it from the series after feedback that the default
> case should suffice. But this might be one of the cases where we need
> an explicit intent that we want to provision space.

ISTR that I commented that filesystems like XFS can't implement
REQ_PROVISION semantics for extents without on-disk format
changes. Hence that needs to happen before we expose a new API to
userspace

> Given a separate FALLOC_FL_PROVISION mode in the scenario you
> mentioned, the filesystem could copy previously 'provisioned' blocks
> to new blocks (which implicitly provisions them) or reserve blocks for
> use (and passing through REQ_OP_PROVISION below). That also means that
> the filesystem should track 'provisioned' blocks and take appropriate
> actions to ensure the provisioning guarantees.

Yes, tracking provisioned ranges persistently and the reservations
they require needs on-disk filesystem format changes compared to just
preallocating space.  None of this functionality currently exists in
any filesystem that supports shared extents, and it's a fairly
significant chunk of development work to support it.

Nobody has planned to do this sort of complex surgery to XFS at
this point in time. I doubt that anyone on the btrfs side of
things is really even following this discussion because this is
largely for block device thinp and snapshot support
and btrfs just doesn't care about that.

> For filesystems without copy-on-write semantics (eg. ext4),
> REQ_OP_PROVISION should still be equivalent to mode == 0.

Well, yes. This is the same situation as "for non-sparse block
devices, REQ_PROVISION can just be ignored." This is not an
interesting use case, nor a use case that the functionality or APIs
should be designed around.

-Dave.
-- 
Dave Chinner
da...@fromorbit.com

--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel


Re: [dm-devel] [PATCH v8 0/5] Introduce provisioning primitives

2023-10-08 Thread Dave Chinner
On Fri, Oct 06, 2023 at 06:28:12PM -0700, Sarthak Kukreti wrote:
> Hi,
> 
> This patch series is version 8 of the patch series to introduce
> block-level provisioning mechanism (original [1]), which is useful for 
> provisioning
> space across thinly provisioned storage architectures (loop devices
> backed by sparse files, dm-thin devices, virtio-blk). This series has
> minimal changes over v7[2].
> 
> This patch series is rebased from the linux-dm/dm-6.5-provision-support [1] 
> on to
> (cac405a3bfa2 Merge tag 'for-6.6-rc3-tag'). In addition, there's an
> additional patch to allow passing through an unshare intent via 
> REQ_OP_PROVISION
> (suggested by Darrick in [4]).

The XFS patches I just posted were smoke tested a while back against
loop devices and then forward ported to this patchset. Good for
testing that userspace driven file preallocation gets propagated by
the filesystem down to the backing device correctly and that
subsequent IO to the file then does the right thing (e.g. fio
testing using fallocate() to set up the files being written to)

-Dave.
-- 
Dave Chinner
da...@fromorbit.com

--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel



[dm-devel] [RFC PATCH 7/5] xfs: add block device provisioning for fallocate

2023-10-08 Thread Dave Chinner
From: Dave Chinner 

Provision space in the block device for preallocated file space when
userspace asks for it. Make sure to do this outside of transaction
context so it can fail without causing a filesystem shutdown.

XXX: async provisioning submission/completion interface would be
really useful here

Signed-off-by: Dave Chinner 
---
 fs/xfs/xfs_bmap_util.c | 42 +-
 1 file changed, 37 insertions(+), 5 deletions(-)

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index fcefab687285..51e7bc47 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -772,6 +772,37 @@ xfs_free_eofblocks(
return error;
 }
 
+static int
+xfs_bmap_provision_blocks(
+   struct xfs_inode*ip,
+   struct xfs_bmbt_irec*imap,
+   int nimaps)
+{
+   struct xfs_mount*mp = ip->i_mount;
+   struct xfs_buftarg  *target;
+   int i;
+
+   if (!xfs_is_provisioning_blocks(mp))
+   return 0;
+
+   target = xfs_inode_buftarg(ip);
+   if (!target->bt_needs_provisioning)
+   return 0;
+
+   for (i = 0; i < nimaps; i++) {
+   int error;
+
+   error = blkdev_issue_provision(target->bt_bdev,
+   XFS_FSB_TO_DADDR(mp, imap->br_startblock),
+   XFS_FSB_TO_BB(mp, imap->br_blockcount),
+   GFP_KERNEL, 0);
+   ASSERT(error != -EOPNOTSUPP);
+   if (error)
+   return error;
+   }
+   return 0;
+}
+
 int
 xfs_alloc_file_space(
struct xfs_inode*ip,
@@ -780,7 +811,6 @@ xfs_alloc_file_space(
 {
xfs_mount_t *mp = ip->i_mount;
xfs_off_t   count;
-   xfs_filblks_t   allocated_fsb;
xfs_filblks_t   allocatesize_fsb;
xfs_extlen_textsz, temp;
xfs_fileoff_t   startoffset_fsb;
@@ -884,15 +914,17 @@ xfs_alloc_file_space(
if (error)
break;
 
-   allocated_fsb = imapp->br_blockcount;
-
if (nimaps == 0) {
error = -ENOSPC;
break;
}
 
-   startoffset_fsb += allocated_fsb;
-   allocatesize_fsb -= allocated_fsb;
+   error = xfs_bmap_provision_blocks(ip, imapp, nimaps);
+   if (error)
+   break;
+
+   startoffset_fsb += imapp->br_blockcount;
+   allocatesize_fsb -= imapp->br_blockcount;
}
 
return error;

--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel



[dm-devel] [RFC PATCH 6/5] xfs: detect block devices requiring provisioning

2023-10-08 Thread Dave Chinner
From: Dave Chinner 

Block device provisioning detection infrastructure.

Signed-off-by: Dave Chinner 
---
 fs/xfs/xfs_buf.c   |  2 ++
 fs/xfs/xfs_buf.h   |  1 +
 fs/xfs/xfs_mount.h | 11 ++-
 fs/xfs/xfs_super.c |  4 
 4 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index c1ece4a08ff4..f37edae6e68e 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -2014,6 +2014,8 @@ xfs_alloc_buftarg(
btp->bt_bdev = bdev;
btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off,
mp, ops);
+   if (bdev_max_provision_sectors(bdev))
+   btp->bt_needs_provisioning = true;
 
/*
 * Buffer IO error rate limiting. Limit it to no more than 10 messages
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index df8f47953bb4..1719a8fce49f 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -106,6 +106,7 @@ typedef struct xfs_buftarg {
size_t  bt_meta_sectormask;
size_t  bt_logical_sectorsize;
size_t  bt_logical_sectormask;
+   boolbt_needs_provisioning;
 
/* LRU control structures */
struct shrinker bt_shrinker;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index d19cca099bc3..f1eec563c61d 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -407,6 +407,13 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
 #define XFS_OPSTATE_WARNED_LARP9
 /* Mount time quotacheck is running */
 #define XFS_OPSTATE_QUOTACHECK_RUNNING 10
+/*
+ * If the block device underlying either the data or rt volume needs
+ * provisioning to guarantee space availability, this flag will be set.
+ * Operations that need to check, issue or free provisioning trigger off
+ * this flag.
+ */
+#define XFS_OPSTATE_PROVISION_BLOCKS   11
 
 #define __XFS_IS_OPSTATE(name, NAME) \
 static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@@ -434,6 +441,7 @@ __XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING)
 #else
 # define xfs_is_quotacheck_running(mp) (false)
 #endif
+__XFS_IS_OPSTATE(provisioning_blocks, PROVISION_BLOCKS)
 
 static inline bool
 xfs_should_warn(struct xfs_mount *mp, long nr)
@@ -452,7 +460,8 @@ xfs_should_warn(struct xfs_mount *mp, long nr)
{ (1UL << XFS_OPSTATE_WARNED_SCRUB),"wscrub" }, \
{ (1UL << XFS_OPSTATE_WARNED_SHRINK),   "wshrink" }, \
{ (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \
-   { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING),  "quotacheck" }
+   { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING),  "quotacheck" }, \
+   { (1UL << XFS_OPSTATE_PROVISION_BLOCKS),"provision" }
 
 /*
  * Max and min values for mount-option defined I/O
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 819a3568b28f..a5b15ddfb31e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -471,11 +471,15 @@ xfs_open_devices(
mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
if (!mp->m_ddev_targp)
goto out_close_rtdev;
+   if (mp->m_ddev_targp->bt_needs_provisioning)
+   xfs_set_provisioning_blocks(mp);
 
if (rtdev) {
mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev);
if (!mp->m_rtdev_targp)
goto out_free_ddev_targ;
+   if (mp->m_rtdev_targp->bt_needs_provisioning)
+   xfs_set_provisioning_blocks(mp);
}
 
if (logdev && logdev != ddev) {

--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel



Re: [dm-devel] [PATCH v8 3/5] loop: Add support for provision requests

2023-10-08 Thread Dave Chinner
On Fri, Oct 06, 2023 at 06:28:15PM -0700, Sarthak Kukreti wrote:
> Add support for provision requests to loopback devices.
> Loop devices will configure provision support based on
> whether the underlying block device/file can support
> the provision request and upon receiving a provision bio,
> will map it to the backing device/storage. For loop devices
> over files, a REQ_OP_PROVISION request will translate to
> an fallocate mode 0 call on the backing file.
> 
> Signed-off-by: Sarthak Kukreti 
> Signed-off-by: Mike Snitzer 


H.

This doesn't actually implement the required semantics of
REQ_PROVISION. Yes, it passes the command to the filesystem
fallocate() implementation, but fallocate() at the filesystem level
does not have the same semantics as REQ_PROVISION.

i.e. at the filesystem level, fallocate() only guarantees the next
write to the provisioned range will succeed without ENOSPC, it does
not guarantee *every* write to the range will succeed without
ENOSPC. If someone clones the loop file while it is in use (i.e.
snapshots it via cp --reflink) then all guarantees that the next
write to a provisioned LBA range will succeed without ENOSPC are
voided.
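
Purely as an illustration of that failure mode, a userspace sketch
(paths are hypothetical; the final write only trips ENOSPC if the
hosting filesystem really is out of space):

/*
 * Illustration only: how a clone voids the fallocate() guarantee on
 * the loop backing file. Paths are hypothetical; the final write only
 * fails with ENOSPC if the hosting filesystem is genuinely short of
 * space.
 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <linux/fs.h>		/* FICLONE */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int backing, snap;

	backing = open("/data/backing.img", O_RDWR);
	snap = open("/data/backing.snap", O_CREAT | O_RDWR, 0644);
	if (backing < 0 || snap < 0)
		return 1;

	/* Provision 1GiB: the next write to this range must not ENOSPC. */
	if (fallocate(backing, 0, 0, 1ULL << 30) < 0)
		return 1;

	/* Snapshot the backing file: all of its extents become shared. */
	if (ioctl(snap, FICLONE, backing) < 0)
		return 1;

	/*
	 * Overwriting the provisioned range now requires a COW
	 * allocation, so it can fail with ENOSPC despite the
	 * fallocate() above.
	 */
	memset(buf, 0xab, sizeof(buf));
	if (pwrite(backing, buf, sizeof(buf), 0) < 0 && errno == ENOSPC)
		fprintf(stderr, "guarantee voided by the clone\n");

	close(snap);
	close(backing);
	return 0;
}

The clone turns every provisioned extent into a shared extent, so the
overwrite needs a new allocation just like an ordinary sparse write.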

So while this will work for basic testing that the filesystem is
issuing REQ_PROVISION based IO correctly, it can't actually be used
for hosting production filesystems that need full REQ_PROVISION
guarantees when the loop device backing file is independently
snapshotted via FICLONE

At minimum, this set of implementation constraints needs to be
documented somewhere...

-Dave.
-- 
Dave Chinner
da...@fromorbit.com

--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel



Re: [dm-devel] [PATCH v8 5/5] block: Pass unshare intent via REQ_OP_PROVISION

2023-10-08 Thread Dave Chinner
On Fri, Oct 06, 2023 at 06:28:17PM -0700, Sarthak Kukreti wrote:
> Allow REQ_OP_PROVISION to pass in an extra REQ_UNSHARE bit to
> annotate unshare requests to underlying layers. Layers that support
> FALLOC_FL_UNSHARE will be able to use this as an indicator of which
> fallocate() mode to use.
> 
> Suggested-by: Darrick J. Wong 
> Signed-off-by: Sarthak Kukreti 
> ---
>  block/blk-lib.c   |  6 +-
>  block/fops.c  |  6 --
>  drivers/block/loop.c  | 35 +--
>  include/linux/blk_types.h |  3 +++
>  include/linux/blkdev.h|  3 ++-
>  5 files changed, 43 insertions(+), 10 deletions(-)

I have no idea how filesystems (or even userspace applications, for
that matter) are supposed to use this - they have no idea if the
underlying block device has shared blocks for LBA ranges it already
has allocated and provisioned. IOWs, I don't know what the semantics
of this function are; it is not documented anywhere, and there is no
use case present that tells me how it might get used.

Yes, unshare at the file level means the filesystem tries to break
internal data extent sharing, but if the block layers or backing
devices are doing deduplication and sharing unknown to the
application or filesystem, how do they ever know that this operation
might need to be performed? In what cases do we need to be able to
unshare block device ranges, and how is that different to the
guarantees that REQ_PROVISION is already supposed to give for
provisioned ranges that are then subsequently shared by the block
device (e.g. by snapshots)?

Also, from an API perspective, this is an "unshare" data operation,
not a "provision" operation. Hence I'd suggest that the API should
be blkdev_issue_unshare() rather than optional behaviour to
_provision() which - before this patch - had clear and well defined
meaning

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com

--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel



Re: [PATCH] xfs: drop experimental warning for FSDAX

2023-09-26 Thread Dave Chinner
On Tue, Sep 26, 2023 at 07:55:19AM -0700, Darrick J. Wong wrote:
> On Thu, Sep 21, 2023 at 04:33:04PM +0800, Shiyang Ruan wrote:
> > Hi,
> > 
> > Any comments?
> 
> I notice that xfs/55[0-2] still fail on my fakepmem machine:
> 
> --- /tmp/fstests/tests/xfs/550.out2023-09-23 09:40:47.839521305 -0700
> +++ /var/tmp/fstests/xfs/550.out.bad  2023-09-24 20:00:23.4 -0700
> @@ -3,7 +3,6 @@ Format and mount
>  Create the original files
>  Inject memory failure (1 page)
>  Inject poison...
> -Process is killed by signal: 7
>  Inject memory failure (2 pages)
>  Inject poison...
> -Process is killed by signal: 7
> +Memory failure didn't kill the process
> 
> (yes, rmap is enabled)

Yes, I see the same failures, too. I've just been ignoring them
because I thought that all the memory failure code was still not
complete

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [f2fs-dev] [PATCH 07/11] vfs: add nowait parameter for file_accessed()

2023-09-10 Thread Dave Chinner via Linux-f2fs-devel
On Fri, Sep 08, 2023 at 01:29:55AM +0100, Pavel Begunkov wrote:
> On 9/3/23 23:30, Dave Chinner wrote:
> > On Wed, Aug 30, 2023 at 02:11:31PM +0800, Hao Xu wrote:
> > > On 8/29/23 19:53, Matthew Wilcox wrote:
> > > > On Tue, Aug 29, 2023 at 03:46:13PM +0800, Hao Xu wrote:
> > > > > On 8/28/23 05:32, Matthew Wilcox wrote:
> > > > > > On Sun, Aug 27, 2023 at 09:28:31PM +0800, Hao Xu wrote:
> > > > > > > From: Hao Xu 
> > > > > > > 
> > > > > > > Add a boolean parameter for file_accessed() to support nowait 
> > > > > > > semantics.
> > > > > > > Currently it is true only with io_uring as its initial caller.
> > > > > > 
> > > > > > So why do we need to do this as part of this series?  Apparently it
> > > > > > hasn't caused any problems for filemap_read().
> > > > > > 
> > > > > 
> > > > > We need this parameter to indicate if nowait semantics should be 
> > > > > enforced in
> > > > > touch_atime(), There are locks and maybe IOs in it.
> > > > 
> > > > That's not my point.  We currently call file_accessed() and
> > > > touch_atime() for nowait reads and nowait writes.  You haven't done
> > > > anything to fix those.
> > > > 
> > > > I suspect you can trim this patchset down significantly by avoiding
> > > > fixing the file_accessed() problem.  And then come back with a later
> > > > patchset that fixes it for all nowait i/o.  Or do a separate prep series
> > > 
> > > I'm ok to do that.
> > > 
> > > > first that fixes it for the existing nowait users, and then a second
> > > > series to do all the directory stuff.
> > > > 
> > > > I'd do the first thing.  Just ignore the problem.  Directory atime
> > > > updates cause I/O so rarely that you can afford to ignore it.  Almost
> > > > everyone uses relatime or nodiratime.
> > > 
> > > Hi Matthew,
> > > The previous discussion shows this does cause issues in real
> > > producations: 
> > > https://lore.kernel.org/io-uring/2785f009-2ebb-028d-8250-d5f3a3051...@gmail.com/#:~:text=fwiw%2C%20we%27ve%20just%20recently%20had%20similar%20problems%20with%20io_uring%20read/write
> > > 
> > 
> > Then separate it out into its own patch set so we can have a
> > discussion on the merits of requiring using noatime, relatime or
> > lazytime for really latency sensitive IO applications. Changing code
> > is not always the right solution...
> 
> Separation sounds reasonable, but it can hardly be said that only
> latency sensitive apps would care about >1s nowait/async submission
> delays. Presumably, btrfs can improve on that, but it still looks
> like it's perfectly legit for filesystems do heavy stuff in
> timestamping like waiting for IO. Right?

Yes, it is, no-one is denying that. And some filesystems are worse
than others, but none of that means it has to be fixed so getdents
can be converted to NOWAIT semantics.

ie. this patchset is about the getdents NOWAIT machinery, and
fiddling around with timestamps has much, much wider scope than just
NOWAIT getdents machinery. We'll have this discussion about NOWAIT
timestamp updates when a RFC is proposed to address the wider
problem of how timestamp updates should behave in NOWAIT context.

-Dave.
-- 
Dave Chinner
da...@fromorbit.com


___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel



Re: [f2fs-dev] [PATCH 02/11] xfs: add NOWAIT semantics for readdir

2023-09-03 Thread Dave Chinner via Linux-f2fs-devel
On Sun, Aug 27, 2023 at 09:28:26PM +0800, Hao Xu wrote:
> From: Hao Xu 
> 
> Implement NOWAIT semantics for readdir. Return EAGAIN error to the
> caller if it would block, like failing to get locks, or going to
> do IO.
> 
> Co-developed-by: Dave Chinner 

Not really.

"Co-developed" implies equal development input between all the
parties, which is not the case here - this patch is based on a
prototype I wrote, whilst you're doing the refining, testing and
correctness work.

In these cases with XFS code, we add a line in the commit message to
say:

"This is based on a patch originally written by Dave Chinner."


> Signed-off-by: Dave Chinner 
> Signed-off-by: Hao Xu 
> [fixes deadlock issue, tweak code style]

With a signoff chain like you already have.

In the end you'll also get an RVB from me, which seems rather wrong
to me if I've apparently been "co-developing" the code



> @@ -156,7 +157,9 @@ xfs_dir2_block_getdents(
>   if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
>   return 0;
>  
> - error = xfs_dir3_block_read(args->trans, dp, &bp);
> + if (ctx->flags & DIR_CONTEXT_F_NOWAIT)
> + flags |= XFS_DABUF_NOWAIT;
> + error = xfs_dir3_block_read(args->trans, dp, flags, &bp);
>   if (error)
>   return error;
>  

Given we do this same check in both block and leaf formats to set
XFS_DABUF_NOWAIT, and we do the DIR_CONTEXT_F_NOWAIT check in
xfs_readdir() as well, we should probably do this check once at the
higher level and pass flags down from there with XFS_DABUF_NOWAIT
already set.

> @@ -240,6 +243,7 @@ xfs_dir2_block_getdents(
>  STATIC int
>  xfs_dir2_leaf_readbuf(
>   struct xfs_da_args  *args,
> + struct dir_context  *ctx,
>   size_t  bufsize,
>   xfs_dir2_off_t  *cur_off,
>   xfs_dablk_t *ra_blk,
> @@ -258,10 +262,15 @@ xfs_dir2_leaf_readbuf(
>   struct xfs_iext_cursor  icur;
>   int ra_want;
>   int error = 0;
> -
> - error = xfs_iread_extents(args->trans, dp, XFS_DATA_FORK);
> - if (error)
> - goto out;
> + unsigned intflags = 0;
> +
> + if (ctx->flags & DIR_CONTEXT_F_NOWAIT) {
> + flags |= XFS_DABUF_NOWAIT;
> + } else {
> + error = xfs_iread_extents(args->trans, dp, XFS_DATA_FORK);
> + if (error)
> + goto out;
> + }

Especially as, in hindsight, this doesn't make a whole lot of sense.
If XFS_DABUF_NOWAIT is set, we keep going until
xfs_ilock_data_map_shared_nowait() where we call
xfs_need_iread_extents() to see if we need to read the extents in
and abort at that point.

So, really, we shouldn't get this far with nowait semantics if
we haven't read the extents in yet - we're supposed to already have
the inode locked here and so we should have already checked this
condition before we bother locking the inode...

i.e. all we should be doing here is this:

if (!(flags & XFS_DABUF_NOWAIT)) {
error = xfs_iread_extents(args->trans, dp, XFS_DATA_FORK);
if (error)
goto out;
}

And then we don't need to pass the VFS dir_context down into low
level XFS functions unnecessarily.


>  
>   /*
>* Look for mapped directory blocks at or above the current offset.
> @@ -280,7 +289,7 @@ xfs_dir2_leaf_readbuf(
>   new_off = xfs_dir2_da_to_byte(geo, map.br_startoff);
>   if (new_off > *cur_off)
>   *cur_off = new_off;
> - error = xfs_dir3_data_read(args->trans, dp, map.br_startoff, 0, &bp);
> + error = xfs_dir3_data_read(args->trans, dp, map.br_startoff, flags, 
> &bp);
>   if (error)
>   goto out;
>  
> @@ -360,6 +369,7 @@ xfs_dir2_leaf_getdents(
>   int byteoff;/* offset in current block */
>   unsigned intoffset = 0;
>   int error = 0;  /* error return value */
> + int written = 0;
>  
>   /*
>* If the offset is at or past the largest allowed value,
> @@ -391,10 +401,17 @@ xfs_dir2_leaf_getdents(
>   bp = NULL;
>   }
>  
> - if (*lock_mode == 0)
> - *lock_mode = xfs_ilock_data_map_shared(dp);
> - error = xfs_dir2_leaf_readbuf(args, bufsize, ,
> - , );
> + if (*lock_mode == 0) {
> + *lock_mode =
> + xfs_ilock_data_map_shared_generic(dp,
> 

Re: [f2fs-dev] [PATCH 07/11] vfs: add nowait parameter for file_accessed()

2023-09-03 Thread Dave Chinner via Linux-f2fs-devel
On Wed, Aug 30, 2023 at 02:11:31PM +0800, Hao Xu wrote:
> On 8/29/23 19:53, Matthew Wilcox wrote:
> > On Tue, Aug 29, 2023 at 03:46:13PM +0800, Hao Xu wrote:
> > > On 8/28/23 05:32, Matthew Wilcox wrote:
> > > > On Sun, Aug 27, 2023 at 09:28:31PM +0800, Hao Xu wrote:
> > > > > From: Hao Xu 
> > > > > 
> > > > > Add a boolean parameter for file_accessed() to support nowait 
> > > > > semantics.
> > > > > Currently it is true only with io_uring as its initial caller.
> > > > 
> > > > So why do we need to do this as part of this series?  Apparently it
> > > > hasn't caused any problems for filemap_read().
> > > > 
> > > 
> > > We need this parameter to indicate if nowait semantics should be enforced 
> > > in
> > > touch_atime(), There are locks and maybe IOs in it.
> > 
> > That's not my point.  We currently call file_accessed() and
> > touch_atime() for nowait reads and nowait writes.  You haven't done
> > anything to fix those.
> > 
> > I suspect you can trim this patchset down significantly by avoiding
> > fixing the file_accessed() problem.  And then come back with a later
> > patchset that fixes it for all nowait i/o.  Or do a separate prep series
> 
> I'm ok to do that.
> 
> > first that fixes it for the existing nowait users, and then a second
> > series to do all the directory stuff.
> > 
> > I'd do the first thing.  Just ignore the problem.  Directory atime
> > updates cause I/O so rarely that you can afford to ignore it.  Almost
> > everyone uses relatime or nodiratime.
> 
> Hi Matthew,
> The previous discussion shows this does cause issues in real
> producations: 
> https://lore.kernel.org/io-uring/2785f009-2ebb-028d-8250-d5f3a3051...@gmail.com/#:~:text=fwiw%2C%20we%27ve%20just%20recently%20had%20similar%20problems%20with%20io_uring%20read/write
> 

Then separate it out into its own patch set so we can have a
discussion on the merits of requiring using noatime, relatime or
lazytime for really latency sensitive IO applications. Changing code
is not always the right solution...

-Dave.
-- 
Dave Chinner
da...@fromorbit.com


___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel



Re: [f2fs-dev] [PATCH RFC v5 00/29] io_uring getdents

2023-08-25 Thread Dave Chinner via Linux-f2fs-devel
On Fri, Aug 25, 2023 at 09:54:02PM +0800, Hao Xu wrote:
> From: Hao Xu 
> 
> This series introduce getdents64 to io_uring, the code logic is similar
> with the snychronized version's. It first try nowait issue, and offload
> it to io-wq threads if the first try fails.
> 
> Patch1 and Patch2 are some preparation
> Patch3 supports nowait for xfs getdents code
> Patch4-11 are vfs change, include adding helpers and trylock for locks
> Patch12-29 supports nowait for involved xfs journal stuff
> note, Patch24 and 27 are actually two questions, might be removed later.
> an xfs test may come later.

You need to drop all the XFS journal stuff. It's fundamentally
broken as it stands, and we cannot support non-blocking
transactional changes without first putting a massive investment in
transaction and intent chain rollback to allow correctly undoing
partially complete modifications.

Regardless, non-blocking transactions are completely unnecessary for
a non-blocking readdir implementation. readdir should only be
touching atime, and with relatime it should only occur once every 24
hours per inode. If that's a problem, then we have noatime mount
options. Hence I just don't see any point in worrying about having a
timestamp update block occasionally...

I also don't really see why you need to fiddle with xfs buffer
cache semantics - it already has the functionality "nowait" buffer
reads require (i.e.  XBF_INCORE|XBF_TRYLOCK).

However, the readahead IO that the xfs readdir code issues cannot
use your defined NOWAIT semantics - it must be able to allocate
memory and issue IO. Readahead already avoids blocking on memory
allocation and blocking on IO via the XBF_READ_AHEAD flag. This sets
__GFP_NORETRY for buffer allocation and REQ_RAHEAD for IO. Hence
readahead only needs the existing XBF_TRYLOCK flag to be set to be
compatible with the required NOWAIT semantics.
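
For illustration, a minimal sketch of the flag combination being
described - this is essentially what xfs_buf_readahead_map() already
does, with the call signature approximated from recent kernels, so
treat it as pseudocode rather than a drop-in snippet:

/*
 * Sketch: non-blocking directory readahead with the existing flags.
 * XBF_TRYLOCK avoids sleeping on the buffer lock, XBF_READ_AHEAD
 * selects __GFP_NORETRY allocation and REQ_RAHEAD for the IO, and
 * XBF_ASYNC means nobody waits for completion.
 */
static void dir_readahead_sketch(
        struct xfs_buftarg      *target,
        struct xfs_buf_map      *map,
        int                     nmaps,
        const struct xfs_buf_ops *ops)
{
        struct xfs_buf          *bp;

        xfs_buf_read_map(target, map, nmaps,
                         XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD,
                         &bp, ops, __this_address);
}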

As for the NOIO memory allocation restrictions io_uring requires,
that should be enforced at the io_uring layer before calling into
the VFS using memalloc_noio_save/restore.  At that point no memory
allocation will trigger IO and none of the code running under NOWAIT
conditions even needs to be aware that io_uring has a GFP_NOIO
restriction on memory allocation.
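
A minimal sketch of that suggestion, assuming a hypothetical io_uring
handler (io_uring_getdents_sketch() and vfs_getdents() are placeholder
names, not existing functions):

#include <linux/sched/mm.h>

static int io_uring_getdents_sketch(struct file *file, void __user *dirent,
                                    unsigned int count)
{
        unsigned int noio_flags;
        int ret;

        /* every allocation below this point behaves as GFP_NOIO */
        noio_flags = memalloc_noio_save();
        ret = vfs_getdents(file, dirent, count);  /* placeholder VFS helper */
        memalloc_noio_restore(noio_flags);

        return ret;
}

With the restriction scoped here, the filesystem code underneath does
not need to know anything about io_uring's allocation constraints.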

Please go back to the simple "do non-blocking buffer IO"
implementation we started with and don't try to solve every little
blocking problem that might exist in the VFS and filesystems...

-Dave
-- 
Dave Chinner
da...@fromorbit.com


___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


Re: [Cluster-devel] [PATCH RFC v5 00/29] io_uring getdents

2023-08-25 Thread Dave Chinner
On Fri, Aug 25, 2023 at 09:54:02PM +0800, Hao Xu wrote:
> From: Hao Xu 
> 
> This series introduce getdents64 to io_uring, the code logic is similar
> with the snychronized version's. It first try nowait issue, and offload
> it to io-wq threads if the first try fails.
> 
> Patch1 and Patch2 are some preparation
> Patch3 supports nowait for xfs getdents code
> Patch4-11 are vfs change, include adding helpers and trylock for locks
> Patch12-29 supports nowait for involved xfs journal stuff
> note, Patch24 and 27 are actually two questions, might be removed later.
> an xfs test may come later.

You need to drop all the XFS journal stuff. It's fundamentally
broken as it stands, and we cannot support non-blocking
transactional changes without first putting a massive investment in
transaction and intent chain rollback to allow correctly undoing
partially complete modifications.

Regardless, non-blocking transactions are completely unnecessary for
a non-blocking readdir implementation. readdir should only be
touching atime, and with relatime it should only occur once every 24
hours per inode. If that's a problem, then we have noatime mount
options. Hence I just don't see any point in worrying about having a
timestamp update block occasionally...

I also don't really see why you need to fiddle with xfs buffer
cache semantics - it already has the functionality "nowait" buffer
reads require (i.e.  XBF_INCORE|XBF_TRYLOCK).

However, the readahead IO that the xfs readdir code issues cannot
use your defined NOWAIT semantics - it must be able to allocate
memory and issue IO. Readahead already avoids blocking on memory
allocation and blocking on IO via the XBF_READ_AHEAD flag. This sets
__GFP_NORETRY for buffer allocation and REQ_RAHEAD for IO. Hence
readahead only needs the existing XBF_TRYLOCK flag to be set to be
compatible with the required NOWAIT semantics

As for the NOIO memory allocation restrictions io_uring requires,
that should be enforced at the io_uring layer before calling into
the VFS using memalloc_noio_save/restore.  At that point no memory
allocation will trigger IO and none of the code running under NOWAIT
conditions even needs to be aware that io_uring has a GFP_NOIO
restriction on memory allocation

Please go back to the simple "do non-blocking buffer IO"
implementation we started with and don't try to solve every little
blocking problem that might exist in the VFS and filesystems...

-Dave
-- 
Dave Chinner
da...@fromorbit.com



Re: [Linux-cachefs] [PATCH RFC v5 00/29] io_uring getdents

2023-08-25 Thread Dave Chinner
On Fri, Aug 25, 2023 at 09:54:02PM +0800, Hao Xu wrote:
> From: Hao Xu 
> 
> This series introduce getdents64 to io_uring, the code logic is similar
> with the snychronized version's. It first try nowait issue, and offload
> it to io-wq threads if the first try fails.
> 
> Patch1 and Patch2 are some preparation
> Patch3 supports nowait for xfs getdents code
> Patch4-11 are vfs change, include adding helpers and trylock for locks
> Patch12-29 supports nowait for involved xfs journal stuff
> note, Patch24 and 27 are actually two questions, might be removed later.
> an xfs test may come later.

You need to drop all the XFS journal stuff. It's fundamentally
broken as it stands, and we cannot support non-blocking
transactional changes without first putting a massive investment in
transaction and intent chain rollback to allow correctly undoing
partially complete modifications.

Regardless, non-blocking transactions are completely unnecessary for
a non-blocking readdir implementation. readdir should only be
touching atime, and with relatime it should only occur once every 24
hours per inode. If that's a problem, then we have noatime mount
options. Hence I just don't see any point in worrying about having a
timestamp update block occasionally...

I also don't really see why you need to fiddle with xfs buffer
cache semantics - it already has the functionality "nowait" buffer
reads require (i.e.  XBF_INCORE|XBF_TRYLOCK).

However, the readahead IO that the xfs readdir code issues cannot
use your defined NOWAIT semantics - it must be able to allocate
memory and issue IO. Readahead already avoids blocking on memory
allocation and blocking on IO via the XBF_READ_AHEAD flag. This sets
__GFP_NORETRY for buffer allocation and REQ_RAHEAD for IO. Hence
readahead only needs the existing XBF_TRYLOCK flag to be set to be
compatible with the required NOWAIT semantics

As for the NOIO memory allocation restrictions io_uring requires,
that should be enforced at the io_uring layer before calling into
the VFS using memalloc_noio_save/restore.  At that point no memory
allocation will trigger IO and none of the code running under NOWAIT
conditions even needs to be aware that io_uring has a GFP_NOIO
restriction on memory allocation

Please go back to the simple "do non-blocking buffer IO"
implementation we started with and don't try to solve every little
blocking problem that might exist in the VFS and filesystems...

-Dave
-- 
Dave Chinner
da...@fromorbit.com

--
Linux-cachefs mailing list
Linux-cachefs@redhat.com
https://listman.redhat.com/mailman/listinfo/linux-cachefs



Re: [f2fs-dev] [PATCH 25/29] xfs: support nowait for xfs_buf_item_init()

2023-08-25 Thread Dave Chinner via Linux-f2fs-devel
On Fri, Aug 25, 2023 at 09:54:27PM +0800, Hao Xu wrote:
> From: Hao Xu 
> 
> support nowait for xfs_buf_item_init() and error out -EAGAIN to
> _xfs_trans_bjoin() when it would block.
> 
> Signed-off-by: Hao Xu 
> ---
>  fs/xfs/xfs_buf_item.c |  9 +++--
>  fs/xfs/xfs_buf_item.h |  2 +-
>  fs/xfs/xfs_buf_item_recover.c |  2 +-
>  fs/xfs/xfs_trans_buf.c| 16 +---
>  4 files changed, 22 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
> index 023d4e0385dd..b1e63137d65b 100644
> --- a/fs/xfs/xfs_buf_item.c
> +++ b/fs/xfs/xfs_buf_item.c
> @@ -827,7 +827,8 @@ xfs_buf_item_free_format(
>  int
>  xfs_buf_item_init(
>   struct xfs_buf  *bp,
> - struct xfs_mount *mp)
> + struct xfs_mount *mp,
> + bool   nowait)
>  {
>   struct xfs_buf_log_item *bip = bp->b_log_item;
>   int chunks;
> @@ -847,7 +848,11 @@ xfs_buf_item_init(
>   return 0;
>   }
>  
> - bip = kmem_cache_zalloc(xfs_buf_item_cache, GFP_KERNEL | __GFP_NOFAIL);
> + bip = kmem_cache_zalloc(xfs_buf_item_cache,
> + GFP_KERNEL | (nowait ? 0 : __GFP_NOFAIL));
> + if (!bip)
> + return -EAGAIN;
> +
>   xfs_log_item_init(mp, >bli_item, XFS_LI_BUF, _buf_item_ops);
>   bip->bli_buf = bp;

I see filesystem shutdowns

> diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
> index 016371f58f26..a1e4f2e8629a 100644
> --- a/fs/xfs/xfs_trans_buf.c
> +++ b/fs/xfs/xfs_trans_buf.c
> @@ -57,13 +57,14 @@ xfs_trans_buf_item_match(
>   * If the buffer does not yet have a buf log item associated with it,
>   * then allocate one for it.  Then add the buf item to the transaction.
>   */
> -STATIC void
> +STATIC int
>  _xfs_trans_bjoin(
>   struct xfs_trans*tp,
>   struct xfs_buf  *bp,
>   int reset_recur)
>  {
>   struct xfs_buf_log_item *bip;
> + int ret;
>  
>   ASSERT(bp->b_transp == NULL);
>  
> @@ -72,7 +73,11 @@ _xfs_trans_bjoin(
>* it doesn't have one yet, then allocate one and initialize it.
>* The checks to see if one is there are in xfs_buf_item_init().
>*/
> - xfs_buf_item_init(bp, tp->t_mountp);
> + ret = xfs_buf_item_init(bp, tp->t_mountp,
> + tp->t_flags & XFS_TRANS_NOWAIT);
> + if (ret < 0)
> + return ret;
> +
>   bip = bp->b_log_item;
>   ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
>   ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
> @@ -92,6 +97,7 @@ _xfs_trans_bjoin(
>   xfs_trans_add_item(tp, >bli_item);
>   bp->b_transp = tp;
>  
> + return 0;
>  }
>  
>  void
> @@ -309,7 +315,11 @@ xfs_trans_read_buf_map(
>   }
>  
>   if (tp) {
> - _xfs_trans_bjoin(tp, bp, 1);
> + error = _xfs_trans_bjoin(tp, bp, 1);
> + if (error) {
> + xfs_buf_relse(bp);
> + return error;
> + }
>   trace_xfs_trans_read_buf(bp->b_log_item);

So what happens at the callers when we have a dirty transaction and
joining a buffer fails with -EAGAIN?

Apart from the fact this may well propagate -EAGAIN up to userspace,
cancelling a dirty transaction at this point will result in a
filesystem shutdown

Indeed, this can happen in the "simple" timestamp update case that
this "nowait" semantic is being aimed at. We log the inode in the
timestamp update, which dirties the log item and registers a
precommit operation to be run. We commit the
transaction, which then runs xfs_inode_item_precommit() and that
may need to attach the inode to the inode cluster buffer. This
results in:

xfs_inode_item_precommit
  xfs_imap_to_bp
    xfs_trans_read_buf_map
      _xfs_trans_bjoin
        xfs_buf_item_init(XFS_TRANS_NOWAIT)
          kmem_cache_zalloc(GFP_NOFS)
          <allocation fails>
        gets -EAGAIN error
      propagates -EAGAIN
    fails due to -EAGAIN

And now xfs_trans_commit() fails with a dirty transaction and the
filesystem shuts down.

IOWs, XFS_TRANS_NOWAIT as it stands is fundamentally broken. Once we
dirty an item in a transaction, we *cannot* back out of the
transaction. We *must block* in every place that could fail -
locking, memory allocation and/or IO - until the transaction
completes because we cannot undo the changes we've already made to
the dirty items in the transaction

It's even worse than that - once we have committed intents, the
whole chain of intent processing must be run to completion. Hence
we can't tolerate backing out of that deferred processing chain half
way through because we might have to block.


Re: [Linux-cachefs] [PATCH 25/29] xfs: support nowait for xfs_buf_item_init()

2023-08-25 Thread Dave Chinner
On Fri, Aug 25, 2023 at 09:54:27PM +0800, Hao Xu wrote:
> From: Hao Xu 
> 
> support nowait for xfs_buf_item_init() and error out -EAGAIN to
> _xfs_trans_bjoin() when it would block.
> 
> Signed-off-by: Hao Xu 
> ---
>  fs/xfs/xfs_buf_item.c |  9 +++--
>  fs/xfs/xfs_buf_item.h |  2 +-
>  fs/xfs/xfs_buf_item_recover.c |  2 +-
>  fs/xfs/xfs_trans_buf.c| 16 +---
>  4 files changed, 22 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
> index 023d4e0385dd..b1e63137d65b 100644
> --- a/fs/xfs/xfs_buf_item.c
> +++ b/fs/xfs/xfs_buf_item.c
> @@ -827,7 +827,8 @@ xfs_buf_item_free_format(
>  int
>  xfs_buf_item_init(
>   struct xfs_buf  *bp,
> - struct xfs_mount *mp)
> + struct xfs_mount *mp,
> + bool   nowait)
>  {
>   struct xfs_buf_log_item *bip = bp->b_log_item;
>   int chunks;
> @@ -847,7 +848,11 @@ xfs_buf_item_init(
>   return 0;
>   }
>  
> - bip = kmem_cache_zalloc(xfs_buf_item_cache, GFP_KERNEL | __GFP_NOFAIL);
> + bip = kmem_cache_zalloc(xfs_buf_item_cache,
> + GFP_KERNEL | (nowait ? 0 : __GFP_NOFAIL));
> + if (!bip)
> + return -EAGAIN;
> +
>   xfs_log_item_init(mp, >bli_item, XFS_LI_BUF, _buf_item_ops);
>   bip->bli_buf = bp;

I see filesystem shutdowns

> diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
> index 016371f58f26..a1e4f2e8629a 100644
> --- a/fs/xfs/xfs_trans_buf.c
> +++ b/fs/xfs/xfs_trans_buf.c
> @@ -57,13 +57,14 @@ xfs_trans_buf_item_match(
>   * If the buffer does not yet have a buf log item associated with it,
>   * then allocate one for it.  Then add the buf item to the transaction.
>   */
> -STATIC void
> +STATIC int
>  _xfs_trans_bjoin(
>   struct xfs_trans*tp,
>   struct xfs_buf  *bp,
>   int reset_recur)
>  {
>   struct xfs_buf_log_item *bip;
> + int ret;
>  
>   ASSERT(bp->b_transp == NULL);
>  
> @@ -72,7 +73,11 @@ _xfs_trans_bjoin(
>* it doesn't have one yet, then allocate one and initialize it.
>* The checks to see if one is there are in xfs_buf_item_init().
>*/
> - xfs_buf_item_init(bp, tp->t_mountp);
> + ret = xfs_buf_item_init(bp, tp->t_mountp,
> + tp->t_flags & XFS_TRANS_NOWAIT);
> + if (ret < 0)
> + return ret;
> +
>   bip = bp->b_log_item;
>   ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
>   ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
> @@ -92,6 +97,7 @@ _xfs_trans_bjoin(
>   xfs_trans_add_item(tp, >bli_item);
>   bp->b_transp = tp;
>  
> + return 0;
>  }
>  
>  void
> @@ -309,7 +315,11 @@ xfs_trans_read_buf_map(
>   }
>  
>   if (tp) {
> - _xfs_trans_bjoin(tp, bp, 1);
> + error = _xfs_trans_bjoin(tp, bp, 1);
> + if (error) {
> + xfs_buf_relse(bp);
> + return error;
> + }
>   trace_xfs_trans_read_buf(bp->b_log_item);

So what happens at the callers when we have a dirty transaction and
joining a buffer fails with -EAGAIN?

Apart from the fact this may well propagate -EAGAIN up to userspace,
cancelling a dirty transaction at this point will result in a
filesystem shutdown

Indeed, this can happen in the "simple" timestamp update case that
this "nowait" semantic is being aimed at. We log the inode in the
timestamp update, which dirties the log item and registers a
precommit operation to be run. We commit the
transaction, which then runs xfs_inode_item_precommit() and that
may need to attach the inode to the inode cluster buffer. This
results in:

xfs_inode_item_precommit
  xfs_imap_to_bp
    xfs_trans_read_buf_map
      _xfs_trans_bjoin
        xfs_buf_item_init(XFS_TRANS_NOWAIT)
          kmem_cache_zalloc(GFP_NOFS)
          <allocation fails>
        gets -EAGAIN error
      propagates -EAGAIN
    fails due to -EAGAIN

And now xfs_trans_commit() fails with a dirty transaction and the
filesystem shuts down.

IOWs, XFS_TRANS_NOWAIT as it stands is fundamentally broken. Once we
dirty an item in a transaction, we *cannot* back out of the
transaction. We *must block* in every place that could fail -
locking, memory allocation and/or IO - until the transaction
completes because we cannot undo the changes we've already made to
the dirty items in the transaction

It's even worse than that - once we have committed intents, the
whole chain of intent processing must be run to completion. Hence
we can't tolerate backing out of that deferred processing chain half
way through because we might have to block.

Re: [Cluster-devel] [PATCH 25/29] xfs: support nowait for xfs_buf_item_init()

2023-08-25 Thread Dave Chinner
On Fri, Aug 25, 2023 at 09:54:27PM +0800, Hao Xu wrote:
> From: Hao Xu 
> 
> support nowait for xfs_buf_item_init() and error out -EAGAIN to
> _xfs_trans_bjoin() when it would block.
> 
> Signed-off-by: Hao Xu 
> ---
>  fs/xfs/xfs_buf_item.c |  9 +++--
>  fs/xfs/xfs_buf_item.h |  2 +-
>  fs/xfs/xfs_buf_item_recover.c |  2 +-
>  fs/xfs/xfs_trans_buf.c| 16 +---
>  4 files changed, 22 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
> index 023d4e0385dd..b1e63137d65b 100644
> --- a/fs/xfs/xfs_buf_item.c
> +++ b/fs/xfs/xfs_buf_item.c
> @@ -827,7 +827,8 @@ xfs_buf_item_free_format(
>  int
>  xfs_buf_item_init(
>   struct xfs_buf  *bp,
> - struct xfs_mount *mp)
> + struct xfs_mount *mp,
> + bool   nowait)
>  {
>   struct xfs_buf_log_item *bip = bp->b_log_item;
>   int chunks;
> @@ -847,7 +848,11 @@ xfs_buf_item_init(
>   return 0;
>   }
>  
> - bip = kmem_cache_zalloc(xfs_buf_item_cache, GFP_KERNEL | __GFP_NOFAIL);
> + bip = kmem_cache_zalloc(xfs_buf_item_cache,
> + GFP_KERNEL | (nowait ? 0 : __GFP_NOFAIL));
> + if (!bip)
> + return -EAGAIN;
> +
>   xfs_log_item_init(mp, >bli_item, XFS_LI_BUF, _buf_item_ops);
>   bip->bli_buf = bp;

I see filesystem shutdowns

> diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
> index 016371f58f26..a1e4f2e8629a 100644
> --- a/fs/xfs/xfs_trans_buf.c
> +++ b/fs/xfs/xfs_trans_buf.c
> @@ -57,13 +57,14 @@ xfs_trans_buf_item_match(
>   * If the buffer does not yet have a buf log item associated with it,
>   * then allocate one for it.  Then add the buf item to the transaction.
>   */
> -STATIC void
> +STATIC int
>  _xfs_trans_bjoin(
>   struct xfs_trans*tp,
>   struct xfs_buf  *bp,
>   int reset_recur)
>  {
>   struct xfs_buf_log_item *bip;
> + int ret;
>  
>   ASSERT(bp->b_transp == NULL);
>  
> @@ -72,7 +73,11 @@ _xfs_trans_bjoin(
>* it doesn't have one yet, then allocate one and initialize it.
>* The checks to see if one is there are in xfs_buf_item_init().
>*/
> - xfs_buf_item_init(bp, tp->t_mountp);
> + ret = xfs_buf_item_init(bp, tp->t_mountp,
> + tp->t_flags & XFS_TRANS_NOWAIT);
> + if (ret < 0)
> + return ret;
> +
>   bip = bp->b_log_item;
>   ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
>   ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
> @@ -92,6 +97,7 @@ _xfs_trans_bjoin(
>   xfs_trans_add_item(tp, >bli_item);
>   bp->b_transp = tp;
>  
> + return 0;
>  }
>  
>  void
> @@ -309,7 +315,11 @@ xfs_trans_read_buf_map(
>   }
>  
>   if (tp) {
> - _xfs_trans_bjoin(tp, bp, 1);
> + error = _xfs_trans_bjoin(tp, bp, 1);
> + if (error) {
> + xfs_buf_relse(bp);
> + return error;
> + }
>   trace_xfs_trans_read_buf(bp->b_log_item);

So what happens at the callers when we have a dirty transaction and
joining a buffer fails with -EAGAIN?

Apart from the fact this may well propagate -EAGAIN up to userspace,
cancelling a dirty transaction at this point will result in a
filesystem shutdown

Indeed, this can happen in the "simple" timestamp update case that
this "nowait" semantic is being aimed at. We log the inode in the
timestamp update, which dirties the log item and registers a
precommit operation to be run. We commit the
transaction, which then runs xfs_inode_item_precommit() and that
may need to attach the inode to the inode cluster buffer. This
results in:

xfs_inode_item_precommit
  xfs_imap_to_bp
    xfs_trans_read_buf_map
      _xfs_trans_bjoin
        xfs_buf_item_init(XFS_TRANS_NOWAIT)
          kmem_cache_zalloc(GFP_NOFS)
          <allocation fails>
        gets -EAGAIN error
      propagates -EAGAIN
    fails due to -EAGAIN

And now xfs_trans_commit() fails with a dirty transaction and the
filesystem shuts down.

IOWs, XFS_TRANS_NOWAIT as it stands is fundamentally broken. Once we
dirty an item in a transaction, we *cannot* back out of the
transaction. We *must block* in every place that could fail -
locking, memory allocation and/or IO - until the transaction
completes because we cannot undo the changes we've already made to
the dirty items in the transaction

It's even worse than that - once we have committed intents, the
whole chain of intent processing must be run to completion. Hence
we can't tolerate backing out of that deferred processing chain half
way through because we might have to block.

Until we can roll back partial dirty transactions and partially
completed deferred intent chains at any random point of completion,
XFS_TRANS_NOWAIT will not work.

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [f2fs-dev] [PATCH 24/29] xfs: support nowait for xfs_buf_read_map()

2023-08-25 Thread Dave Chinner via Linux-f2fs-devel
On Fri, Aug 25, 2023 at 09:54:26PM +0800, Hao Xu wrote:
> From: Hao Xu 
> 
> This causes xfstests generic/232 hung in umount process, waiting for ail
> push, so I comment it for now, need some hints from xfs folks.
> Not a real patch.
> 
> Signed-off-by: Hao Xu 
> ---
>  fs/xfs/xfs_buf.c | 7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index cdad80e1ae25..284962a9f31a 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -828,6 +828,13 @@ xfs_buf_read_map(
>   trace_xfs_buf_read(bp, flags, _RET_IP_);
>  
>   if (!(bp->b_flags & XBF_DONE)) {
> +//   /*
> +//* Let's bypass the _xfs_buf_read() for now
> +//*/
> +//   if (flags & XBF_NOWAIT) {
> +//   xfs_buf_relse(bp);
> +//   return -EAGAIN;
> +//   }

This is *fundamentally broken*, and apart from anything else breaks
readahead.

IF we asked for a read, we cannot instantiate the buffer and then
*not issue any IO on it* and release it. That leaves an
uninitialised buffer in memory, and there's every chance that
something then trips over it and bad things happen.

A buffer like this *must* be errored out and marked stale so that
the next access to it will then re-initialise the buffer state and
trigger any preparatory work that needs to be done for the new
operation.
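
Roughly, the bail-out being asked for looks more like the sketch below
than a bare xfs_buf_relse() - simplified, not a drop-in patch, and the
real error path has more to it:

        if (flags & XBF_NOWAIT) {
                bp->b_flags &= ~XBF_DONE;  /* contents are not valid */
                xfs_buf_stale(bp);         /* force re-init on next lookup */
                xfs_buf_relse(bp);         /* unlock and drop our reference */
                return -EAGAIN;
        }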

This comes back to my first comments that XBF_TRYLOCK cannot simply
be replaced with XBF_NOWAIT semantics...

-Dave.
-- 
Dave Chinner
da...@fromorbit.com


___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


Re: [f2fs-dev] [PATCH 28/29] xfs: support nowait semantics for xc_ctx_lock in xlog_cil_commit()

2023-08-25 Thread Dave Chinner via Linux-f2fs-devel
On Fri, Aug 25, 2023 at 09:54:30PM +0800, Hao Xu wrote:
> From: Hao Xu 
> 
> Apply trylock logic for xc_ctx_lock in xlog_cil_commit() in nowait
> case and error out -EAGAIN for xlog_cil_commit().

Again, fundamentally broken. Any error from xlog_cil_commit() will
result in a filesystem shutdown as we cannot back out from failure
with dirty log items gracefully at this point.

-Dave.

-- 
Dave Chinner
da...@fromorbit.com


___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


Re: [Linux-cachefs] [PATCH 28/29] xfs: support nowait semantics for xc_ctx_lock in xlog_cil_commit()

2023-08-25 Thread Dave Chinner
On Fri, Aug 25, 2023 at 09:54:30PM +0800, Hao Xu wrote:
> From: Hao Xu 
> 
> Apply trylock logic for xc_ctx_lock in xlog_cil_commit() in nowait
> case and error out -EAGAIN for xlog_cil_commit().

Again, fundamentally broken. Any error from xlog_cil_commit() will
result in a filesystem shutdown as we cannot back out from failure
with dirty log items gracefully at this point.

-Dave.

-- 
Dave Chinner
da...@fromorbit.com

--
Linux-cachefs mailing list
Linux-cachefs@redhat.com
https://listman.redhat.com/mailman/listinfo/linux-cachefs



Re: [Cluster-devel] [PATCH 28/29] xfs: support nowait semantics for xc_ctx_lock in xlog_cil_commit()

2023-08-25 Thread Dave Chinner
On Fri, Aug 25, 2023 at 09:54:30PM +0800, Hao Xu wrote:
> From: Hao Xu 
> 
> Apply trylock logic for xc_ctx_lock in xlog_cil_commit() in nowait
> case and error out -EAGAIN for xlog_cil_commit().

Again, fundamentally broken. Any error from xlog_cil_commit() will
result in a filesystem shutdown as we cannot back out from failure
with dirty log items gracefully at this point.

-Dave.

-- 
Dave Chinner
da...@fromorbit.com



Re: [f2fs-dev] [PATCH 26/29] xfs: return -EAGAIN when nowait meets sync in transaction commit

2023-08-25 Thread Dave Chinner via Linux-f2fs-devel
On Fri, Aug 25, 2023 at 09:54:28PM +0800, Hao Xu wrote:
> From: Hao Xu 
> 
> if the log transaction is a sync one, let's fail the nowait try and
> return -EAGAIN directly since sync transaction means blocked by IO.
> 
> Signed-off-by: Hao Xu 
> ---
>  fs/xfs/xfs_trans.c | 14 +-
>  1 file changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
> index 7988b4c7f36e..f1f84a3dd456 100644
> --- a/fs/xfs/xfs_trans.c
> +++ b/fs/xfs/xfs_trans.c
> @@ -968,12 +968,24 @@ __xfs_trans_commit(
>   xfs_csn_t   commit_seq = 0;
>   int error = 0;
>   int sync = tp->t_flags & XFS_TRANS_SYNC;
> + boolnowait = tp->t_flags & XFS_TRANS_NOWAIT;
> + boolperm_log = tp->t_flags & XFS_TRANS_PERM_LOG_RES;
>  
>   trace_xfs_trans_commit(tp, _RET_IP_);
>  
> + if (nowait && sync) {
> + /*
> +  * Currently nowait is only from xfs_vn_update_time()
> +  * so perm_log is always false here, but let's make
> +  * code general.
> +  */
> + if (perm_log)
> + xfs_defer_cancel(tp);
> + goto out_unreserve;
> + }

This is fundamentally broken.  We cannot abort a transaction commit
with dirty items at this point without shutting down the filesystem.

This points to XFS_TRANS_NOWAIT being completely broken, too,
because we don't call xfs_trans_set_sync() until just before we
commit the transaction. At this point, it is -too late- for
nowait+sync to be handled gracefully, and it will *always* go bad.

IOWs, the whole transaction "nowait" semantics as designed and
implemented is not a workable solution

-Dave.
-- 
Dave Chinner
da...@fromorbit.com


___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


Re: [Cluster-devel] [PATCH 26/29] xfs: return -EAGAIN when nowait meets sync in transaction commit

2023-08-25 Thread Dave Chinner
On Fri, Aug 25, 2023 at 09:54:28PM +0800, Hao Xu wrote:
> From: Hao Xu 
> 
> if the log transaction is a sync one, let's fail the nowait try and
> return -EAGAIN directly since sync transaction means blocked by IO.
> 
> Signed-off-by: Hao Xu 
> ---
>  fs/xfs/xfs_trans.c | 14 +-
>  1 file changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
> index 7988b4c7f36e..f1f84a3dd456 100644
> --- a/fs/xfs/xfs_trans.c
> +++ b/fs/xfs/xfs_trans.c
> @@ -968,12 +968,24 @@ __xfs_trans_commit(
>   xfs_csn_t   commit_seq = 0;
>   int error = 0;
>   int sync = tp->t_flags & XFS_TRANS_SYNC;
> + boolnowait = tp->t_flags & XFS_TRANS_NOWAIT;
> + boolperm_log = tp->t_flags & XFS_TRANS_PERM_LOG_RES;
>  
>   trace_xfs_trans_commit(tp, _RET_IP_);
>  
> + if (nowait && sync) {
> + /*
> +  * Currently nowait is only from xfs_vn_update_time()
> +  * so perm_log is always false here, but let's make
> +  * code general.
> +  */
> + if (perm_log)
> + xfs_defer_cancel(tp);
> + goto out_unreserve;
> + }

This is fundamentally broken.  We cannot abort a transaction commit
with dirty items at this point without shutting down the filesystem.

This points to XFS_TRANS_NOWAIT being completely broken, too,
because we don't call xfs_trans_set_sync() until just before we
commit the transaction. At this point, it is -too late- for
nowait+sync to be handled gracefully, and it will *always* go bad.

IOWs, the whole transaction "nowait" semantics as designed and
implemented is not a workable solution

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [Linux-cachefs] [PATCH 26/29] xfs: return -EAGAIN when nowait meets sync in transaction commit

2023-08-25 Thread Dave Chinner
On Fri, Aug 25, 2023 at 09:54:28PM +0800, Hao Xu wrote:
> From: Hao Xu 
> 
> if the log transaction is a sync one, let's fail the nowait try and
> return -EAGAIN directly since sync transaction means blocked by IO.
> 
> Signed-off-by: Hao Xu 
> ---
>  fs/xfs/xfs_trans.c | 14 +-
>  1 file changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
> index 7988b4c7f36e..f1f84a3dd456 100644
> --- a/fs/xfs/xfs_trans.c
> +++ b/fs/xfs/xfs_trans.c
> @@ -968,12 +968,24 @@ __xfs_trans_commit(
>   xfs_csn_t   commit_seq = 0;
>   int error = 0;
>   int sync = tp->t_flags & XFS_TRANS_SYNC;
> + boolnowait = tp->t_flags & XFS_TRANS_NOWAIT;
> + boolperm_log = tp->t_flags & XFS_TRANS_PERM_LOG_RES;
>  
>   trace_xfs_trans_commit(tp, _RET_IP_);
>  
> + if (nowait && sync) {
> + /*
> +  * Currently nowait is only from xfs_vn_update_time()
> +  * so perm_log is always false here, but let's make
> +  * code general.
> +  */
> + if (perm_log)
> + xfs_defer_cancel(tp);
> + goto out_unreserve;
> + }

This is fundamentally broken.  We cannot abort a transaction commit
with dirty items at this point without shutting down the filesystem.

This points to XFS_TRANS_NOWAIT being completely broken, too,
because we don't call xfs_trans_set_sync() until just before we
commit the transaction. At this point, it is -too late- for
nowait+sync to be handled gracefully, and it will *always* go bad.

IOWs, the whole transaction "nowait" semantics as designed and
implemented is not a workable solution

-Dave.
-- 
Dave Chinner
da...@fromorbit.com

--
Linux-cachefs mailing list
Linux-cachefs@redhat.com
https://listman.redhat.com/mailman/listinfo/linux-cachefs



Re: [Linux-cachefs] [PATCH 24/29] xfs: support nowait for xfs_buf_read_map()

2023-08-25 Thread Dave Chinner
On Fri, Aug 25, 2023 at 09:54:26PM +0800, Hao Xu wrote:
> From: Hao Xu 
> 
> This causes xfstests generic/232 hung in umount process, waiting for ail
> push, so I comment it for now, need some hints from xfs folks.
> Not a real patch.
> 
> Signed-off-by: Hao Xu 
> ---
>  fs/xfs/xfs_buf.c | 7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index cdad80e1ae25..284962a9f31a 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -828,6 +828,13 @@ xfs_buf_read_map(
>   trace_xfs_buf_read(bp, flags, _RET_IP_);
>  
>   if (!(bp->b_flags & XBF_DONE)) {
> +//   /*
> +//* Let's bypass the _xfs_buf_read() for now
> +//*/
> +//   if (flags & XBF_NOWAIT) {
> +//   xfs_buf_relse(bp);
> +//   return -EAGAIN;
> +//   }

This is *fundamentally broken*, and apart from anything else breaks
readahead.

IF we asked for a read, we cannot instantiate the buffer and then
*not issue any IO on it* and release it. That leaves an
uninitialised buffer in memory, and there's every chance that
something then trips over it and bad things happen.

A buffer like this *must* be errored out and marked stale so that
the next access to it will then re-initialise the buffer state and
trigger any preparatory work that needs to be done for the new
operation.

This comes back to my first comments that XBF_TRYLOCK cannot simply
be replaced with XBF_NOWAIT semantics...

-Dave.
-- 
Dave Chinner
da...@fromorbit.com

--
Linux-cachefs mailing list
Linux-cachefs@redhat.com
https://listman.redhat.com/mailman/listinfo/linux-cachefs



Re: [Cluster-devel] [PATCH 24/29] xfs: support nowait for xfs_buf_read_map()

2023-08-25 Thread Dave Chinner
On Fri, Aug 25, 2023 at 09:54:26PM +0800, Hao Xu wrote:
> From: Hao Xu 
> 
> This causes xfstests generic/232 hung in umount process, waiting for ail
> push, so I comment it for now, need some hints from xfs folks.
> Not a real patch.
> 
> Signed-off-by: Hao Xu 
> ---
>  fs/xfs/xfs_buf.c | 7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index cdad80e1ae25..284962a9f31a 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -828,6 +828,13 @@ xfs_buf_read_map(
>   trace_xfs_buf_read(bp, flags, _RET_IP_);
>  
>   if (!(bp->b_flags & XBF_DONE)) {
> +//   /*
> +//* Let's bypass the _xfs_buf_read() for now
> +//*/
> +//   if (flags & XBF_NOWAIT) {
> +//   xfs_buf_relse(bp);
> +//   return -EAGAIN;
> +//   }

This is *fundamentally broken*, and apart from anything else breaks
readahead.

IF we asked for a read, we cannot instantiate the buffer and then
*not issue any IO on it* and release it. That leaves an
uninitialised buffer in memory, and there's every chance that
something then trips over it and bad things happen.

A buffer like this *must* be errored out and marked stale so that
the next access to it will then re-initialise the buffer state and
trigger any preparatory work that needs to be done for the new
operation.

This comes back to my first comments that XBF_TRYLOCK cannot simply
be replaced with XBF_NOWAIT semantics...

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [f2fs-dev] [PATCH 02/29] xfs: rename XBF_TRYLOCK to XBF_NOWAIT

2023-08-25 Thread Dave Chinner via Linux-f2fs-devel
45,7 @@ struct xfs_buf;
>  
>  /* flags used only as arguments to access routines */
>  #define XBF_INCORE(1u << 29)/* lookup only, return if found in cache */
> -#define XBF_TRYLOCK   (1u << 30)/* lock requested, but do not wait */
> +#define XBF_NOWAIT(1u << 30)/* mem/lock requested, but do not wait */

That's now a really poor comment. It doesn't describe the semantics
or constraints that NOWAIT might imply.
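
Something along these lines would be more informative (illustrative
wording only, not text from the series):

/*
 * XBF_NOWAIT: the caller cannot block. Buffer lookup must trylock the
 * buffer lock, memory allocation must not sleep waiting for reclaim,
 * and no IO completion may be waited on. Inability to make progress is
 * reported as -EAGAIN so the caller can retry from a blocking context.
 */
#define XBF_NOWAIT      (1u << 30)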

-Dave.
-- 
Dave Chinner
da...@fromorbit.com


___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


Re: [Cluster-devel] [PATCH 02/29] xfs: rename XBF_TRYLOCK to XBF_NOWAIT

2023-08-25 Thread Dave Chinner
45,7 @@ struct xfs_buf;
>  
>  /* flags used only as arguments to access routines */
>  #define XBF_INCORE(1u << 29)/* lookup only, return if found in cache */
> -#define XBF_TRYLOCK   (1u << 30)/* lock requested, but do not wait */
> +#define XBF_NOWAIT(1u << 30)/* mem/lock requested, but do not wait */

That's now a really poor comment. It doesn't describe the semantics
or constraints that NOWAIT might imply.

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [Linux-cachefs] [PATCH 02/29] xfs: rename XBF_TRYLOCK to XBF_NOWAIT

2023-08-25 Thread Dave Chinner
45,7 @@ struct xfs_buf;
>  
>  /* flags used only as arguments to access routines */
>  #define XBF_INCORE(1u << 29)/* lookup only, return if found in cache */
> -#define XBF_TRYLOCK   (1u << 30)/* lock requested, but do not wait */
> +#define XBF_NOWAIT(1u << 30)/* mem/lock requested, but do not wait */

That's now a really poor comment. It doesn't describe the semantics
or constraints that NOWAIT might imply.

-Dave.
-- 
Dave Chinner
da...@fromorbit.com

--
Linux-cachefs mailing list
Linux-cachefs@redhat.com
https://listman.redhat.com/mailman/listinfo/linux-cachefs



Re: [f2fs-dev] [PATCH v4 46/48] mm: shrinker: make memcg slab shrink lockless

2023-08-07 Thread Dave Chinner via Linux-f2fs-devel
ly(!shrinker || !shrinker_try_get(shrinker))) 
> {
> + clear_bit(offset, unit->map);
> + rcu_read_unlock();
>   continue;
>   }
> + rcu_read_unlock();
>  
>   /* Call non-slab shrinkers even though kmem is disabled 
> */
>   if (!memcg_kmem_online() &&
> @@ -523,15 +546,20 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, 
> int nid,
>   set_shrinker_bit(memcg, nid, 
> shrinker_id);
>   }
>   freed += ret;
> -
> - if (rwsem_is_contended(_rwsem)) {
> - freed = freed ? : 1;
> - goto unlock;
> - }
> + shrinker_put(shrinker);

Ok, so why is this safe to call without holding the rcu read lock?
The global shrinker has to hold the rcu_read_lock() whilst calling
shrinker_put() to guarantee the validity of the list next pointer,
but we don't hold off RCU here so what guarantees a racing global
shrinker walk doesn't trip over this shrinker_put() call dropping
the refcount to zero and freeing occurring in a different context...
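
For contrast, the ordering described above for the global walk looks
roughly like this (a sketch of the invariant, not the series' code;
SHRINK_EMPTY handling elided):

        rcu_read_lock();
        list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
                if (!shrinker_try_get(shrinker))
                        continue;
                rcu_read_unlock();      /* do_shrink_slab() may sleep */

                freed += do_shrink_slab(&sc, shrinker, priority);

                /*
                 * Re-enter the RCU read-side section *before* dropping
                 * the reference, so shrinker->list.next stays valid for
                 * the next iteration even if this put frees the shrinker.
                 */
                rcu_read_lock();
                shrinker_put(shrinker);
        }
        rcu_read_unlock();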


> + /*
> +  * We have already exited the read-side of rcu critical section
> +  * before calling do_shrink_slab(), the shrinker_info may be
> +  * released in expand_one_shrinker_info(), so reacquire the
> +  * shrinker_info.
> +  */
> + index++;
> + goto again;

With that, what makes the use of shrinker_info in
xchg_nr_deferred_memcg() in do_shrink_slab() coherent and valid?

-Dave.
-- 
Dave Chinner
da...@fromorbit.com


___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


Re: [PATCH v4 46/48] mm: shrinker: make memcg slab shrink lockless

2023-08-07 Thread Dave Chinner via Linux-erofs
ly(!shrinker || !shrinker_try_get(shrinker))) 
> {
> + clear_bit(offset, unit->map);
> + rcu_read_unlock();
>   continue;
>   }
> + rcu_read_unlock();
>  
>   /* Call non-slab shrinkers even though kmem is disabled 
> */
>   if (!memcg_kmem_online() &&
> @@ -523,15 +546,20 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, 
> int nid,
>   set_shrinker_bit(memcg, nid, 
> shrinker_id);
>   }
>   freed += ret;
> -
> - if (rwsem_is_contended(_rwsem)) {
> - freed = freed ? : 1;
> - goto unlock;
> - }
> + shrinker_put(shrinker);

Ok, so why is this safe to call without holding the rcu read lock?
The global shrinker has to hold the rcu_read_lock() whilst calling
shrinker_put() to guarantee the validity of the list next pointer,
but we don't hold off RCU here so what guarantees a racing global
shrinker walk doesn't trip over this shrinker_put() call dropping
the refcount to zero and freeing occurring in a different context...


> + /*
> +  * We have already exited the read-side of rcu critical section
> +  * before calling do_shrink_slab(), the shrinker_info may be
> +  * released in expand_one_shrinker_info(), so reacquire the
> +  * shrinker_info.
> +  */
> + index++;
> + goto again;

With that, what makes the use of shrinker_info in
xchg_nr_deferred_memcg() in do_shrink_slab() coherent and valid?

-Dave.
-- 
Dave Chinner
da...@fromorbit.com


Re: [Cluster-devel] [PATCH v4 46/48] mm: shrinker: make memcg slab shrink lockless

2023-08-07 Thread Dave Chinner
ly(!shrinker || !shrinker_try_get(shrinker))) 
> {
> + clear_bit(offset, unit->map);
> + rcu_read_unlock();
>   continue;
>   }
> + rcu_read_unlock();
>  
>   /* Call non-slab shrinkers even though kmem is disabled 
> */
>   if (!memcg_kmem_online() &&
> @@ -523,15 +546,20 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, 
> int nid,
>   set_shrinker_bit(memcg, nid, 
> shrinker_id);
>   }
>   freed += ret;
> -
> - if (rwsem_is_contended(_rwsem)) {
> - freed = freed ? : 1;
> - goto unlock;
> - }
> + shrinker_put(shrinker);

Ok, so why is this safe to call without holding the rcu read lock?
The global shrinker has to hold the rcu_read_lock() whilst calling
shrinker_put() to guarantee the validity of the list next pointer,
but we don't hold off RCU here so what guarantees a racing global
shrinker walk doesn't trip over this shrinker_put() call dropping
the refcount to zero and freeing occurring in a different context...


> + /*
> +  * We have already exited the read-side of rcu critical section
> +  * before calling do_shrink_slab(), the shrinker_info may be
> +  * released in expand_one_shrinker_info(), so reacquire the
> +  * shrinker_info.
> +  */
> + index++;
> + goto again;

With that, what makes the use of shrinker_info in
xchg_nr_deferred_memcg() in do_shrink_slab() coherent and valid?

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [dm-devel] [PATCH v4 46/48] mm: shrinker: make memcg slab shrink lockless

2023-08-07 Thread Dave Chinner
ly(!shrinker || !shrinker_try_get(shrinker))) 
> {
> + clear_bit(offset, unit->map);
> + rcu_read_unlock();
>   continue;
>   }
> + rcu_read_unlock();
>  
>   /* Call non-slab shrinkers even though kmem is disabled 
> */
>   if (!memcg_kmem_online() &&
> @@ -523,15 +546,20 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, 
> int nid,
>   set_shrinker_bit(memcg, nid, 
> shrinker_id);
>   }
>   freed += ret;
> -
> - if (rwsem_is_contended(_rwsem)) {
> - freed = freed ? : 1;
> - goto unlock;
> - }
> + shrinker_put(shrinker);

Ok, so why is this safe to call without holding the rcu read lock?
The global shrinker has to hold the rcu_read_lock() whilst calling
shrinker_put() to guarantee the validity of the list next pointer,
but we don't hold off RCU here so what guarantees a racing global
shrinker walk doesn't trip over this shrinker_put() call dropping
the refcount to zero and freeing occurring in a different context...


> + /*
> +  * We have already exited the read-side of rcu critical section
> +  * before calling do_shrink_slab(), the shrinker_info may be
> +  * released in expand_one_shrinker_info(), so reacquire the
> +  * shrinker_info.
> +  */
> + index++;
> + goto again;

With that, what makes the use of shrinker_info in
xchg_nr_deferred_memcg() in do_shrink_slab() coherent and valid?

-Dave.
-- 
Dave Chinner
da...@fromorbit.com

--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel



Re: [PATCH v4 46/48] mm: shrinker: make memcg slab shrink lockless

2023-08-07 Thread Dave Chinner
ly(!shrinker || !shrinker_try_get(shrinker))) 
> {
> + clear_bit(offset, unit->map);
> + rcu_read_unlock();
>   continue;
>   }
> + rcu_read_unlock();
>  
>   /* Call non-slab shrinkers even though kmem is disabled 
> */
>   if (!memcg_kmem_online() &&
> @@ -523,15 +546,20 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, 
> int nid,
>   set_shrinker_bit(memcg, nid, 
> shrinker_id);
>   }
>   freed += ret;
> -
> - if (rwsem_is_contended(_rwsem)) {
> - freed = freed ? : 1;
> - goto unlock;
> - }
> + shrinker_put(shrinker);

Ok, so why is this safe to call without holding the rcu read lock?
The global shrinker has to hold the rcu_read_lock() whilst calling
shrinker_put() to guarantee the validity of the list next pointer,
but we don't hold off RCU here so what guarantees a racing global
shrinker walk doesn't trip over this shrinker_put() call dropping
the refcount to zero and freeing occurring in a different context...


> + /*
> +  * We have already exited the read-side of rcu critical section
> +  * before calling do_shrink_slab(), the shrinker_info may be
> +  * released in expand_one_shrinker_info(), so reacquire the
> +  * shrinker_info.
> +  */
> + index++;
> + goto again;

With that, what makes the use of shrinker_info in
xchg_nr_deferred_memcg() in do_shrink_slab() coherent and valid?

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [PATCH v4 46/48] mm: shrinker: make memcg slab shrink lockless

2023-08-07 Thread Dave Chinner via Virtualization
ly(!shrinker || !shrinker_try_get(shrinker))) 
> {
> + clear_bit(offset, unit->map);
> + rcu_read_unlock();
>   continue;
>   }
> + rcu_read_unlock();
>  
>   /* Call non-slab shrinkers even though kmem is disabled 
> */
>   if (!memcg_kmem_online() &&
> @@ -523,15 +546,20 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, 
> int nid,
>   set_shrinker_bit(memcg, nid, 
> shrinker_id);
>   }
>   freed += ret;
> -
> - if (rwsem_is_contended(_rwsem)) {
> - freed = freed ? : 1;
> - goto unlock;
> - }
> + shrinker_put(shrinker);

Ok, so why is this safe to call without holding the rcu read lock?
The global shrinker has to hold the rcu_read_lock() whilst calling
shrinker_put() to guarantee the validity of the list next pointer,
but we don't hold off RCU here so what guarantees a racing global
shrinker walk doesn't trip over this shrinker_put() call dropping
the refcount to zero and freeing occurring in a different context...


> + /*
> +  * We have already exited the read-side of rcu critical section
> +  * before calling do_shrink_slab(), the shrinker_info may be
> +  * released in expand_one_shrinker_info(), so reacquire the
> +  * shrinker_info.
> +  */
> + index++;
> + goto again;

With that, what makes the use of shrinker_info in
xchg_nr_deferred_memcg() in do_shrink_slab() coherent and valid?

-Dave.
-- 
Dave Chinner
da...@fromorbit.com
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v4 46/48] mm: shrinker: make memcg slab shrink lockless

2023-08-07 Thread Dave Chinner
ly(!shrinker || !shrinker_try_get(shrinker))) 
> {
> + clear_bit(offset, unit->map);
> + rcu_read_unlock();
>   continue;
>   }
> + rcu_read_unlock();
>  
>   /* Call non-slab shrinkers even though kmem is disabled 
> */
>   if (!memcg_kmem_online() &&
> @@ -523,15 +546,20 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, 
> int nid,
>   set_shrinker_bit(memcg, nid, 
> shrinker_id);
>   }
>   freed += ret;
> -
> - if (rwsem_is_contended(_rwsem)) {
> - freed = freed ? : 1;
> - goto unlock;
> - }
> + shrinker_put(shrinker);

Ok, so why is this safe to call without holding the rcu read lock?
The global shrinker has to hold the rcu_read_lock() whilst calling
shrinker_put() to guarantee the validity of the list next pointer,
but we don't hold off RCU here so what guarantees a racing global
shrinker walk doesn't trip over this shrinker_put() call dropping
the refcount to zero and freeing occurring in a different context...


> + /*
> +  * We have already exited the read-side of rcu critical section
> +  * before calling do_shrink_slab(), the shrinker_info may be
> +  * released in expand_one_shrinker_info(), so reacquire the
> +  * shrinker_info.
> +  */
> + index++;
> + goto again;

With that, what makes the use of shrinker_info in
xchg_nr_deferred_memcg() in do_shrink_slab() coherent and valid?

-Dave.
-- 
Dave Chinner
da...@fromorbit.com


Re: [f2fs-dev] [PATCH v4 45/48] mm: shrinker: make global slab shrink lockless

2023-08-07 Thread Dave Chinner via Linux-f2fs-devel
On Mon, Aug 07, 2023 at 07:09:33PM +0800, Qi Zheng wrote:
> diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
> index eb342994675a..f06225f18531 100644
> --- a/include/linux/shrinker.h
> +++ b/include/linux/shrinker.h
> @@ -4,6 +4,8 @@
>  
>  #include 
>  #include 
> +#include 
> +#include 
>  
>  #define SHRINKER_UNIT_BITS   BITS_PER_LONG
>  
> @@ -87,6 +89,10 @@ struct shrinker {
>   int seeks;  /* seeks to recreate an obj */
>   unsigned flags;
>  
> + refcount_t refcount;
> + struct completion done;
> + struct rcu_head rcu;

Documentation, please. What does the refcount protect, what does the
completion provide, etc.
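
For illustration, the kind of documentation being asked for might look
like this (the wording is a sketch, not taken from the series):

        /*
         * @refcount: pins the shrinker while a shrink_slab() walker is
         * using it outside the RCU read-side critical section.
         * @done: completed when @refcount drops to zero, so
         * shrinker_free() can wait until no walker is still running
         * before tearing the shrinker down.
         * @rcu: defers the final free until concurrent RCU list walkers
         * can no longer observe this shrinker on shrinker_list.
         */
        refcount_t refcount;
        struct completion done;
        struct rcu_head rcu;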

> +
>   void *private_data;
>  
>   /* These are for internal use */
> @@ -120,6 +126,17 @@ struct shrinker *shrinker_alloc(unsigned int flags, 
> const char *fmt, ...);
>  void shrinker_register(struct shrinker *shrinker);
>  void shrinker_free(struct shrinker *shrinker);
>  
> +static inline bool shrinker_try_get(struct shrinker *shrinker)
> +{
> + return refcount_inc_not_zero(>refcount);
> +}
> +
> +static inline void shrinker_put(struct shrinker *shrinker)
> +{
> + if (refcount_dec_and_test(>refcount))
> + complete(>done);
> +}
> +
>  #ifdef CONFIG_SHRINKER_DEBUG
>  extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker,
> const char *fmt, ...);
> diff --git a/mm/shrinker.c b/mm/shrinker.c
> index 1911c06b8af5..d318f5621862 100644
> --- a/mm/shrinker.c
> +++ b/mm/shrinker.c
> @@ -2,6 +2,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  #include "internal.h"
> @@ -577,33 +578,42 @@ unsigned long shrink_slab(gfp_t gfp_mask, int nid, 
> struct mem_cgroup *memcg,
>   if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
>   return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>  
> - if (!down_read_trylock(_rwsem))
> - goto out;
> -
> - list_for_each_entry(shrinker, _list, list) {
> + rcu_read_lock();
> + list_for_each_entry_rcu(shrinker, _list, list) {
>   struct shrink_control sc = {
>   .gfp_mask = gfp_mask,
>   .nid = nid,
>   .memcg = memcg,
>   };
>  
> + if (!shrinker_try_get(shrinker))
> + continue;
> +
> + /*
> +  * We can safely unlock the RCU lock here since we already
> +  * hold the refcount of the shrinker.
> +  */
> + rcu_read_unlock();
> +
>   ret = do_shrink_slab(, shrinker, priority);
>   if (ret == SHRINK_EMPTY)
>   ret = 0;
>   freed += ret;
> +
>   /*
> -  * Bail out if someone want to register a new shrinker to
> -  * prevent the registration from being stalled for long periods
> -  * by parallel ongoing shrinking.
> +  * This shrinker may be deleted from shrinker_list and freed
> +  * after the shrinker_put() below, but this shrinker is still
> +  * used for the next traversal. So it is necessary to hold the
> +  * RCU lock first to prevent this shrinker from being freed,
> +  * which also ensures that the next shrinker that is traversed
> +  * will not be freed (even if it is deleted from shrinker_list
> +  * at the same time).
>*/

This needs to be moved to the head of the function, and document
the whole list walk, get, put and completion parts of the algorithm
that make it safe. There's more to this than "we hold a reference
count", especially the tricky "we might see the shrinker before it
is fully initialised" case


.
>  void shrinker_free(struct shrinker *shrinker)
>  {
>   struct dentry *debugfs_entry = NULL;
> @@ -686,9 +712,18 @@ void shrinker_free(struct shrinker *shrinker)
>   if (!shrinker)
>   return;
>  
> + if (shrinker->flags & SHRINKER_REGISTERED) {
> + shrinker_put(shrinker);
> + wait_for_completion(>done);
> + }

Needs a comment explaining why we need to wait here...
> +
>   down_write(_rwsem);
>   if (shrinker->flags & SHRINKER_REGISTERED) {
> - list_del(>list);
> + /*
> +  * Lookups on the shrinker are over and will fail in the future,
> +  * so we can now remove it from the lists and free it.
> +  */

 rather than here after the wait has been done and provided the
guarantee that no shrinker is running or will run again...
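
i.e. something like the following placement (a sketch of the suggested
structure; the comment wording is illustrative):

        if (shrinker->flags & SHRINKER_REGISTERED) {
                /*
                 * Drop the registration reference; once the in-flight
                 * walkers finish, shrinker_try_get() will fail.
                 */
                shrinker_put(shrinker);
                /*
                 * Wait until every in-flight shrink_slab() walker has
                 * dropped its reference. After this, lookups on the
                 * shrinker are over and will fail, so it is safe to
                 * remove it from the lists and free it.
                 */
                wait_for_completion(&shrinker->done);
        }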

-Dave.
-- 
Dave Chinner
da...@fromorbit.com


___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


Re: [dm-devel] [PATCH v4 45/48] mm: shrinker: make global slab shrink lockless

2023-08-07 Thread Dave Chinner
On Mon, Aug 07, 2023 at 07:09:33PM +0800, Qi Zheng wrote:
> diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
> index eb342994675a..f06225f18531 100644
> --- a/include/linux/shrinker.h
> +++ b/include/linux/shrinker.h
> @@ -4,6 +4,8 @@
>  
>  #include 
>  #include 
> +#include 
> +#include 
>  
>  #define SHRINKER_UNIT_BITS   BITS_PER_LONG
>  
> @@ -87,6 +89,10 @@ struct shrinker {
>   int seeks;  /* seeks to recreate an obj */
>   unsigned flags;
>  
> + refcount_t refcount;
> + struct completion done;
> + struct rcu_head rcu;

Documentation, please. What does the refcount protect, what does the
completion provide, etc.

> +
>   void *private_data;
>  
>   /* These are for internal use */
> @@ -120,6 +126,17 @@ struct shrinker *shrinker_alloc(unsigned int flags, 
> const char *fmt, ...);
>  void shrinker_register(struct shrinker *shrinker);
>  void shrinker_free(struct shrinker *shrinker);
>  
> +static inline bool shrinker_try_get(struct shrinker *shrinker)
> +{
> + return refcount_inc_not_zero(>refcount);
> +}
> +
> +static inline void shrinker_put(struct shrinker *shrinker)
> +{
> + if (refcount_dec_and_test(>refcount))
> + complete(>done);
> +}
> +
>  #ifdef CONFIG_SHRINKER_DEBUG
>  extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker,
> const char *fmt, ...);
> diff --git a/mm/shrinker.c b/mm/shrinker.c
> index 1911c06b8af5..d318f5621862 100644
> --- a/mm/shrinker.c
> +++ b/mm/shrinker.c
> @@ -2,6 +2,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  #include "internal.h"
> @@ -577,33 +578,42 @@ unsigned long shrink_slab(gfp_t gfp_mask, int nid, 
> struct mem_cgroup *memcg,
>   if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
>   return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>  
> - if (!down_read_trylock(_rwsem))
> - goto out;
> -
> - list_for_each_entry(shrinker, _list, list) {
> + rcu_read_lock();
> + list_for_each_entry_rcu(shrinker, _list, list) {
>   struct shrink_control sc = {
>   .gfp_mask = gfp_mask,
>   .nid = nid,
>   .memcg = memcg,
>   };
>  
> + if (!shrinker_try_get(shrinker))
> + continue;
> +
> + /*
> +  * We can safely unlock the RCU lock here since we already
> +  * hold the refcount of the shrinker.
> +  */
> + rcu_read_unlock();
> +
>   ret = do_shrink_slab(, shrinker, priority);
>   if (ret == SHRINK_EMPTY)
>   ret = 0;
>   freed += ret;
> +
>   /*
> -  * Bail out if someone want to register a new shrinker to
> -  * prevent the registration from being stalled for long periods
> -  * by parallel ongoing shrinking.
> +  * This shrinker may be deleted from shrinker_list and freed
> +  * after the shrinker_put() below, but this shrinker is still
> +  * used for the next traversal. So it is necessary to hold the
> +  * RCU lock first to prevent this shrinker from being freed,
> +  * which also ensures that the next shrinker that is traversed
> +  * will not be freed (even if it is deleted from shrinker_list
> +  * at the same time).
>*/

This needs to be moved to the head of the function, and document
the whole list walk, get, put and completion parts of the algorithm
that make it safe. There's more to this than "we hold a reference
count", especially the tricky "we might see the shrinker before it
is fully initialised" case


.
>  void shrinker_free(struct shrinker *shrinker)
>  {
>   struct dentry *debugfs_entry = NULL;
> @@ -686,9 +712,18 @@ void shrinker_free(struct shrinker *shrinker)
>   if (!shrinker)
>   return;
>  
> + if (shrinker->flags & SHRINKER_REGISTERED) {
> + shrinker_put(shrinker);
> + wait_for_completion(>done);
> + }

Needs a comment explaining why we need to wait here...
> +
>   down_write(_rwsem);
>   if (shrinker->flags & SHRINKER_REGISTERED) {
> - list_del(>list);
> + /*
> +  * Lookups on the shrinker are over and will fail in the future,
> +  * so we can now remove it from the lists and free it.
> +  */

 rather than here after the wait has been done and provided the
guarantee that no shrinker is running or will run again...

-Dave.
-- 
Dave Chinner
da...@fromorbit.com

--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel



Re: [Cluster-devel] [PATCH v4 45/48] mm: shrinker: make global slab shrink lockless

2023-08-07 Thread Dave Chinner
On Mon, Aug 07, 2023 at 07:09:33PM +0800, Qi Zheng wrote:
> diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
> index eb342994675a..f06225f18531 100644
> --- a/include/linux/shrinker.h
> +++ b/include/linux/shrinker.h
> @@ -4,6 +4,8 @@
>  
>  #include 
>  #include 
> +#include 
> +#include 
>  
>  #define SHRINKER_UNIT_BITS   BITS_PER_LONG
>  
> @@ -87,6 +89,10 @@ struct shrinker {
>   int seeks;  /* seeks to recreate an obj */
>   unsigned flags;
>  
> + refcount_t refcount;
> + struct completion done;
> + struct rcu_head rcu;

Documentation, please. What does the refcount protect, what does the
completion provide, etc.

> +
>   void *private_data;
>  
>   /* These are for internal use */
> @@ -120,6 +126,17 @@ struct shrinker *shrinker_alloc(unsigned int flags, 
> const char *fmt, ...);
>  void shrinker_register(struct shrinker *shrinker);
>  void shrinker_free(struct shrinker *shrinker);
>  
> +static inline bool shrinker_try_get(struct shrinker *shrinker)
> +{
> + return refcount_inc_not_zero(>refcount);
> +}
> +
> +static inline void shrinker_put(struct shrinker *shrinker)
> +{
> + if (refcount_dec_and_test(>refcount))
> + complete(>done);
> +}
> +
>  #ifdef CONFIG_SHRINKER_DEBUG
>  extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker,
> const char *fmt, ...);
> diff --git a/mm/shrinker.c b/mm/shrinker.c
> index 1911c06b8af5..d318f5621862 100644
> --- a/mm/shrinker.c
> +++ b/mm/shrinker.c
> @@ -2,6 +2,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  #include "internal.h"
> @@ -577,33 +578,42 @@ unsigned long shrink_slab(gfp_t gfp_mask, int nid, 
> struct mem_cgroup *memcg,
>   if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
>   return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>  
> - if (!down_read_trylock(_rwsem))
> - goto out;
> -
> - list_for_each_entry(shrinker, _list, list) {
> + rcu_read_lock();
> + list_for_each_entry_rcu(shrinker, _list, list) {
>   struct shrink_control sc = {
>   .gfp_mask = gfp_mask,
>   .nid = nid,
>   .memcg = memcg,
>   };
>  
> + if (!shrinker_try_get(shrinker))
> + continue;
> +
> + /*
> +  * We can safely unlock the RCU lock here since we already
> +  * hold the refcount of the shrinker.
> +  */
> + rcu_read_unlock();
> +
>   ret = do_shrink_slab(&sc, shrinker, priority);
>   if (ret == SHRINK_EMPTY)
>   ret = 0;
>   freed += ret;
> +
>   /*
> -  * Bail out if someone want to register a new shrinker to
> -  * prevent the registration from being stalled for long periods
> -  * by parallel ongoing shrinking.
> +  * This shrinker may be deleted from shrinker_list and freed
> +  * after the shrinker_put() below, but this shrinker is still
> +  * used for the next traversal. So it is necessary to hold the
> +  * RCU lock first to prevent this shrinker from being freed,
> +  * which also ensures that the next shrinker that is traversed
> +  * will not be freed (even if it is deleted from shrinker_list
> +  * at the same time).
>*/

This needs to be moved to the head of the function, and document
the whole list walk, get, put and completion parts of the algorithm
that make it safe. There's more to this than "we hold a reference
count", especially the tricky "we might see the shrinker before it
is fully initialised" case.
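
To make that concrete, the walk in this patch boils down to the shape
below (a sketch - the shrink_control setup and the SHRINK_EMPTY
handling are elided); it is this whole get/unlock/work/relock/put
protocol that wants describing in one place at the top of the
function:

	rcu_read_lock();
	list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
		if (!shrinker_try_get(shrinker))
			continue;		/* dying or not yet live */
		rcu_read_unlock();		/* the reference now pins it */

		freed += do_shrink_slab(&sc, shrinker, priority);

		rcu_read_lock();		/* protect the list cursor... */
		shrinker_put(shrinker);		/* ...before the ref goes away */
	}
	rcu_read_unlock();
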
>  void shrinker_free(struct shrinker *shrinker)
>  {
>   struct dentry *debugfs_entry = NULL;
> @@ -686,9 +712,18 @@ void shrinker_free(struct shrinker *shrinker)
>   if (!shrinker)
>   return;
>  
> + if (shrinker->flags & SHRINKER_REGISTERED) {
> + shrinker_put(shrinker);
> + wait_for_completion(&shrinker->done);
> + }

Needs a comment explaining why we need to wait here...
> +
>   down_write(&shrinker_rwsem);
>   if (shrinker->flags & SHRINKER_REGISTERED) {
> - list_del(&shrinker->list);
> + /*
> +  * Lookups on the shrinker are over and will fail in the future,
> +  * so we can now remove it from the lists and free it.
> +  */

 rather than here after the wait has been done and provided the
guarantee that no shrinker is running or will run again...
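
i.e. roughly this shape (a sketch only; it assumes the reference
being dropped here is the one taken when the shrinker was
registered):

	if (shrinker->flags & SHRINKER_REGISTERED) {
		/*
		 * Drop the registration reference and wait for every
		 * lookup that took its own reference via
		 * shrinker_try_get() to drop it. After this, nothing
		 * can be running do_shrink_slab() on this shrinker and
		 * no new lookup can succeed, so it is safe to unlink
		 * and free it below.
		 */
		shrinker_put(shrinker);
		wait_for_completion(&shrinker->done);
	}

	down_write(&shrinker_rwsem);
	if (shrinker->flags & SHRINKER_REGISTERED) {
		list_del(&shrinker->list);
		...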

-Dave.
-- 
Dave Chinner
da...@fromorbit.com



Re: [f2fs-dev] [PATCH v4 44/48] mm: shrinker: add a secondary array for shrinker_info::{map, nr_deferred}

2023-08-07 Thread Dave Chinner via Linux-f2fs-devel
  }
>   }
>   }
>   up_read(&shrinker_rwsem);
> @@ -407,7 +458,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, 
> int nid,
>  {
>   struct shrinker_info *info;
>   unsigned long ret, freed = 0;
> - int i;
> + int offset, index = 0;
>  
>   if (!mem_cgroup_online(memcg))
>   return 0;
> @@ -419,56 +470,63 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, 
> int nid,
>   if (unlikely(!info))
>   goto unlock;
>  
> - for_each_set_bit(i, info->map, info->map_nr_max) {
> - struct shrink_control sc = {
> - .gfp_mask = gfp_mask,
> - .nid = nid,
> - .memcg = memcg,
> - };
> - struct shrinker *shrinker;
> + for (; index < shriner_id_to_index(info->map_nr_max); index++) {
> + struct shrinker_info_unit *unit;

This adds another layer of indent to shrink_slab_memcg(). Please
factor it first so that the code ends up being readable. Doing that
first as a separate patch will also make the actual algorithm
changes in this patch be much more obvious - this huge hunk of
diff is pretty much impossible to review...
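
Something like the following first, purely as an illustration - the
helper name, its argument list and the unit->map field are assumed
here; only shrinker_info_unit, SHRINKER_UNIT_BITS, shrink_control and
the bit iteration come from the series:

	/* run all shrinkers with a bit set in one shrinker_info_unit */
	static unsigned long shrink_slab_memcg_unit(struct shrinker_info_unit *unit,
						    int index, gfp_t gfp_mask,
						    int nid, struct mem_cgroup *memcg,
						    int priority)
	{
		unsigned long freed = 0;
		int offset;

		for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) {
			struct shrink_control sc = {
				.gfp_mask = gfp_mask,
				.nid = nid,
				.memcg = memcg,
			};

			/*
			 * Look up the shrinker for this bit and run it,
			 * accumulating into freed, exactly as the
			 * existing loop body does.
			 */
		}
		return freed;
	}

so that shrink_slab_memcg() itself is just the outer index loop
calling the helper, and the locking/refcount changes in this series
then show up as a small, reviewable diff to that helper.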

-Dave.
-- 
Dave Chinner
da...@fromorbit.com


___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel





  1   2   3   4   5   6   7   8   9   10   >