Re: [Cluster-devel] [PATCH 2/8] xfs: abstract PF_FSTRANS to PF_MEMALLOC_NOFS
On Fri, Jan 06, 2017 at 03:11:01PM +0100, Michal Hocko wrote: > From: Michal Hocko > > xfs has defined PF_FSTRANS to declare a scope GFP_NOFS semantic quite > some time ago. We would like to make this concept more generic and use > it for other filesystems as well. Let's start by giving the flag a > more generic name PF_MEMALLOC_NOFS which is in line with an exiting > PF_MEMALLOC_NOIO already used for the same purpose for GFP_NOIO > contexts. Replace all PF_FSTRANS usage from the xfs code in the first > step before we introduce a full API for it as xfs uses the flag directly > anyway. > > This patch doesn't introduce any functional change. > > Signed-off-by: Michal Hocko > Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong > --- > fs/xfs/kmem.c | 4 ++-- > fs/xfs/kmem.h | 2 +- > fs/xfs/libxfs/xfs_btree.c | 2 +- > fs/xfs/xfs_aops.c | 6 +++--- > fs/xfs/xfs_trans.c| 12 ++-- > include/linux/sched.h | 2 ++ > 6 files changed, 15 insertions(+), 13 deletions(-) > > diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c > index 339c696bbc01..a76a05dae96b 100644 > --- a/fs/xfs/kmem.c > +++ b/fs/xfs/kmem.c > @@ -80,13 +80,13 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) >* context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering >* the filesystem here and potentially deadlocking. >*/ > - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) > + if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) > noio_flag = memalloc_noio_save(); > > lflags = kmem_flags_convert(flags); > ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); > > - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) > + if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) > memalloc_noio_restore(noio_flag); > > return ptr; > diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h > index 689f746224e7..d973dbfc2bfa 100644 > --- a/fs/xfs/kmem.h > +++ b/fs/xfs/kmem.h > @@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags) > lflags = GFP_ATOMIC | __GFP_NOWARN; > } else { > lflags = GFP_KERNEL | __GFP_NOWARN; > - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) > + if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) > lflags &= ~__GFP_FS; > } > > diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c > index 21e6a6ab6b9a..a2672ba4dc33 100644 > --- a/fs/xfs/libxfs/xfs_btree.c > +++ b/fs/xfs/libxfs/xfs_btree.c > @@ -2866,7 +2866,7 @@ xfs_btree_split_worker( > struct xfs_btree_split_args *args = container_of(work, > struct xfs_btree_split_args, > work); > unsigned long pflags; > - unsigned long new_pflags = PF_FSTRANS; > + unsigned long new_pflags = PF_MEMALLOC_NOFS; > > /* >* we are in a transaction context here, but may also be doing work > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c > index ef382bfb402b..d4094bb55033 100644 > --- a/fs/xfs/xfs_aops.c > +++ b/fs/xfs/xfs_aops.c > @@ -189,7 +189,7 @@ xfs_setfilesize_trans_alloc( >* We hand off the transaction to the completion thread now, so >* clear the flag here. >*/ > - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); > + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); > return 0; > } > > @@ -252,7 +252,7 @@ xfs_setfilesize_ioend( >* thus we need to mark ourselves as being in a transaction manually. >* Similarly for freeze protection. 
>*/ > - current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); > + current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); > __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); > > /* we abort the update if there was an IO error */ > @@ -1015,7 +1015,7 @@ xfs_do_writepage( >* Given that we do not allow direct reclaim to call us, we should >* never be called while in a filesystem transaction. >*/ > - if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) > + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS)) > goto redirty; > > /* > diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c > index 70f42ea86dfb..f5969c8274fc 100644 > --- a/fs/xfs/xfs_trans.c > +++ b/fs/xfs/xfs_trans.c > @@ -134,7 +134,7 @@ xfs_trans_reserve( > boolrsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; > > /* Mark this thread as being in a transaction */ > - current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); > + current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); > > /* >* Attempt to reserve the needed disk blocks by decrementing > @@ -144,7 +144,7 @@ xfs_trans_reserve( > if (blocks > 0) { > error = xfs_mod_fdblocks(tp->t_mountp
Re: [Cluster-devel] [PATCH 4/8] xfs: use memalloc_nofs_{save, restore} instead of memalloc_noio*
On Fri, Jan 06, 2017 at 03:11:03PM +0100, Michal Hocko wrote: > From: Michal Hocko > > kmem_zalloc_large and _xfs_buf_map_pages use memalloc_noio_{save,restore} > API to prevent from reclaim recursion into the fs because vmalloc can > invoke unconditional GFP_KERNEL allocations and these functions might be > called from the NOFS contexts. The memalloc_noio_save will enforce > GFP_NOIO context which is even weaker than GFP_NOFS and that seems to be > unnecessary. Let's use memalloc_nofs_{save,restore} instead as it should > provide exactly what we need here - implicit GFP_NOFS context. > > Changes since v1 > - s@memalloc_noio_restore@memalloc_nofs_restore@ in _xfs_buf_map_pages > as per Brian Foster > > Signed-off-by: Michal Hocko Reviewed-by: Darrick J. Wong > --- > fs/xfs/kmem.c| 10 +- > fs/xfs/xfs_buf.c | 8 > 2 files changed, 9 insertions(+), 9 deletions(-) > > diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c > index a76a05dae96b..d69ed5e76621 100644 > --- a/fs/xfs/kmem.c > +++ b/fs/xfs/kmem.c > @@ -65,7 +65,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) > void * > kmem_zalloc_large(size_t size, xfs_km_flags_t flags) > { > - unsigned noio_flag = 0; > + unsigned nofs_flag = 0; > void*ptr; > gfp_t lflags; > > @@ -80,14 +80,14 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) >* context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering >* the filesystem here and potentially deadlocking. >*/ > - if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) > - noio_flag = memalloc_noio_save(); > + if (flags & KM_NOFS) > + nofs_flag = memalloc_nofs_save(); > > lflags = kmem_flags_convert(flags); > ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); > > - if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) > - memalloc_noio_restore(noio_flag); > + if (flags & KM_NOFS) > + memalloc_nofs_restore(nofs_flag); > > return ptr; > } > diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c > index 7f0a01f7b592..8cb8dd4cdfd8 100644 > --- a/fs/xfs/xfs_buf.c > +++ b/fs/xfs/xfs_buf.c > @@ -441,17 +441,17 @@ _xfs_buf_map_pages( > bp->b_addr = NULL; > } else { > int retried = 0; > - unsigned noio_flag; > + unsigned nofs_flag; > > /* >* vm_map_ram() will allocate auxillary structures (e.g. >* pagetables) with GFP_KERNEL, yet we are likely to be under >* GFP_NOFS context here. Hence we need to tell memory reclaim > - * that we are in such a context via PF_MEMALLOC_NOIO to prevent > + * that we are in such a context via PF_MEMALLOC_NOFS to prevent >* memory reclaim re-entering the filesystem here and >* potentially deadlocking. >*/ > - noio_flag = memalloc_noio_save(); > + nofs_flag = memalloc_nofs_save(); > do { > bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, > -1, PAGE_KERNEL); > @@ -459,7 +459,7 @@ _xfs_buf_map_pages( > break; > vm_unmap_aliases(); > } while (retried++ <= 1); > - memalloc_noio_restore(noio_flag); > + memalloc_nofs_restore(nofs_flag); > > if (!bp->b_addr) > return -ENOMEM; > -- > 2.11.0 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-ext4" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
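For reference, the "weaker" wording above is about how much reclaim activity each mask permits: GFP_NOIO is the most restrictive of the three, GFP_NOFS re-enables block I/O but still forbids recursing into filesystems, and GFP_KERNEL allows full direct reclaim. Roughly, as defined in include/linux/gfp.h of that era:

	#define GFP_NOIO	(__GFP_RECLAIM)				/* reclaim may not start any I/O */
	#define GFP_NOFS	(__GFP_RECLAIM | __GFP_IO)		/* I/O allowed, no filesystem recursion */
	#define GFP_KERNEL	(__GFP_RECLAIM | __GFP_IO | __GFP_FS)	/* full direct reclaim */

So switching kmem_zalloc_large() and _xfs_buf_map_pages() from the NOIO scope to the NOFS scope keeps the deadlock protection described in the comments while still letting reclaim issue block I/O.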
Re: [Cluster-devel] [PATCH 4/8] xfs: use memalloc_nofs_{save, restore} instead of memalloc_noio*
On Fri, Jan 06, 2017 at 03:11:03PM +0100, Michal Hocko wrote: > From: Michal Hocko > > kmem_zalloc_large and _xfs_buf_map_pages use memalloc_noio_{save,restore} > API to prevent from reclaim recursion into the fs because vmalloc can > invoke unconditional GFP_KERNEL allocations and these functions might be > called from the NOFS contexts. The memalloc_noio_save will enforce > GFP_NOIO context which is even weaker than GFP_NOFS and that seems to be > unnecessary. Let's use memalloc_nofs_{save,restore} instead as it should > provide exactly what we need here - implicit GFP_NOFS context. > > Changes since v1 > - s@memalloc_noio_restore@memalloc_nofs_restore@ in _xfs_buf_map_pages > as per Brian Foster > > Signed-off-by: Michal Hocko > --- Looks fine to me: Reviewed-by: Brian Foster > fs/xfs/kmem.c| 10 +- > fs/xfs/xfs_buf.c | 8 > 2 files changed, 9 insertions(+), 9 deletions(-) > > diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c > index a76a05dae96b..d69ed5e76621 100644 > --- a/fs/xfs/kmem.c > +++ b/fs/xfs/kmem.c > @@ -65,7 +65,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) > void * > kmem_zalloc_large(size_t size, xfs_km_flags_t flags) > { > - unsigned noio_flag = 0; > + unsigned nofs_flag = 0; > void*ptr; > gfp_t lflags; > > @@ -80,14 +80,14 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) >* context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering >* the filesystem here and potentially deadlocking. >*/ > - if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) > - noio_flag = memalloc_noio_save(); > + if (flags & KM_NOFS) > + nofs_flag = memalloc_nofs_save(); > > lflags = kmem_flags_convert(flags); > ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); > > - if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) > - memalloc_noio_restore(noio_flag); > + if (flags & KM_NOFS) > + memalloc_nofs_restore(nofs_flag); > > return ptr; > } > diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c > index 7f0a01f7b592..8cb8dd4cdfd8 100644 > --- a/fs/xfs/xfs_buf.c > +++ b/fs/xfs/xfs_buf.c > @@ -441,17 +441,17 @@ _xfs_buf_map_pages( > bp->b_addr = NULL; > } else { > int retried = 0; > - unsigned noio_flag; > + unsigned nofs_flag; > > /* >* vm_map_ram() will allocate auxillary structures (e.g. >* pagetables) with GFP_KERNEL, yet we are likely to be under >* GFP_NOFS context here. Hence we need to tell memory reclaim > - * that we are in such a context via PF_MEMALLOC_NOIO to prevent > + * that we are in such a context via PF_MEMALLOC_NOFS to prevent >* memory reclaim re-entering the filesystem here and >* potentially deadlocking. >*/ > - noio_flag = memalloc_noio_save(); > + nofs_flag = memalloc_nofs_save(); > do { > bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, > -1, PAGE_KERNEL); > @@ -459,7 +459,7 @@ _xfs_buf_map_pages( > break; > vm_unmap_aliases(); > } while (retried++ <= 1); > - memalloc_noio_restore(noio_flag); > + memalloc_nofs_restore(nofs_flag); > > if (!bp->b_addr) > return -ENOMEM; > -- > 2.11.0 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Cluster-devel] [PATCH 4/8] xfs: use memalloc_nofs_{save, restore} instead of memalloc_noio*
On Mon 09-01-17 15:08:27, Vlastimil Babka wrote: > On 01/06/2017 03:11 PM, Michal Hocko wrote: > > From: Michal Hocko > > > > kmem_zalloc_large and _xfs_buf_map_pages use memalloc_noio_{save,restore} > > API to prevent from reclaim recursion into the fs because vmalloc can > > invoke unconditional GFP_KERNEL allocations and these functions might be > > called from the NOFS contexts. The memalloc_noio_save will enforce > > GFP_NOIO context which is even weaker than GFP_NOFS and that seems to be > > unnecessary. Let's use memalloc_nofs_{save,restore} instead as it should > > provide exactly what we need here - implicit GFP_NOFS context. > > > > Changes since v1 > > - s@memalloc_noio_restore@memalloc_nofs_restore@ in _xfs_buf_map_pages > > as per Brian Foster > > > > Signed-off-by: Michal Hocko > > Not a xfs expert, but seems correct. > > Acked-by: Vlastimil Babka Thanks! > > Nit below: > > > --- > > fs/xfs/kmem.c| 10 +- > > fs/xfs/xfs_buf.c | 8 > > 2 files changed, 9 insertions(+), 9 deletions(-) > > > > diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c > > index a76a05dae96b..d69ed5e76621 100644 > > --- a/fs/xfs/kmem.c > > +++ b/fs/xfs/kmem.c > > @@ -65,7 +65,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) > > void * > > kmem_zalloc_large(size_t size, xfs_km_flags_t flags) > > { > > - unsigned noio_flag = 0; > > + unsigned nofs_flag = 0; > > void*ptr; > > gfp_t lflags; > > > > @@ -80,14 +80,14 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) > > * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering > > * the filesystem here and potentially deadlocking. > > The comment above is now largely obsolete, or minimally should be > changed to PF_MEMALLOC_NOFS? --- diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index d69ed5e76621..0c9f94f41b6c 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -77,7 +77,7 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) * __vmalloc() will allocate data pages and auxillary structures (e.g. * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context * here. Hence we need to tell memory reclaim that we are in such a -* context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering +* context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering * the filesystem here and potentially deadlocking. */ if (flags & KM_NOFS) I will fold it into the original patch. Thanks! -- Michal Hocko SUSE Labs
Re: [Cluster-devel] [PATCH 2/8] xfs: abstract PF_FSTRANS to PF_MEMALLOC_NOFS
On Mon 09-01-17 13:59:05, Vlastimil Babka wrote: > On 01/06/2017 03:11 PM, Michal Hocko wrote: > > From: Michal Hocko > > > > xfs has defined PF_FSTRANS to declare a scope GFP_NOFS semantic quite > > some time ago. We would like to make this concept more generic and use > > it for other filesystems as well. Let's start by giving the flag a > > more generic name PF_MEMALLOC_NOFS which is in line with an exiting > > PF_MEMALLOC_NOIO already used for the same purpose for GFP_NOIO > > contexts. Replace all PF_FSTRANS usage from the xfs code in the first > > step before we introduce a full API for it as xfs uses the flag directly > > anyway. > > > > This patch doesn't introduce any functional change. > > > > Signed-off-by: Michal Hocko > > Reviewed-by: Brian Foster > > Acked-by: Vlastimil Babka Thanks! > > A nit: > > > --- a/include/linux/sched.h > > +++ b/include/linux/sched.h > > @@ -2320,6 +2320,8 @@ extern void thread_group_cputime_adjusted(struct > > task_struct *p, cputime_t *ut, > > #define PF_FREEZER_SKIP0x4000 /* Freezer should not count it > > as freezable */ > > #define PF_SUSPEND_TASK 0x8000 /* this thread called > > freeze_processes and should not be frozen */ > > > > +#define PF_MEMALLOC_NOFS PF_FSTRANS/* Transition to a more generic > > GFP_NOFS scope semantic */ > > I don't see why this transition is needed, as there are already no users > of PF_FSTRANS after this patch. The next patch doesn't remove any more, > so this is just extra churn IMHO. But not a strong objection. I just wanted to have this transparent for the xfs in this patch. AFAIR Dave wanted to have xfs and generic parts separated. So it was the easiest to simply keep the flag and then remove it in a separate patach. -- Michal Hocko SUSE Labs
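To make the scope semantics concrete, the pattern being renamed looks roughly like the sketch below (not a literal copy of the xfs source). The nested save/restore is what lets a transaction hand the flag off to a workqueue or I/O completion thread without clobbering an outer scope:

	unsigned long pflags;

	/* save the previous PF_MEMALLOC_NOFS state and enter the NOFS scope */
	current_set_flags_nested(&pflags, PF_MEMALLOC_NOFS);

	/* ... transaction work: every allocation here is treated as GFP_NOFS ... */

	/* drop back to whatever scope the task was in before */
	current_restore_flags_nested(&pflags, PF_MEMALLOC_NOFS);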
Re: [Cluster-devel] [GFS2 PATCH] GFS2: Expand and make useful the log_flush kernel trace point
On 09/01/17 14:10, Bob Peterson wrote: Hi, This patch changes the log_flush kernel trace point so that it actually gives you meaningful statistics about the journal: Output looks like this: gfs2_logd-10580 [009] 1390.398627: gfs2_log_flush: 253,8 log flush start 8532 r:0 cr:0 v:0 f:5002 n:6153 h:5570/5570 t:2380 a1:2 a2:0 o:1 l:0 i:1 r: sd_log_blks_reserved cr: sd_log_commited_revoke v: sd_log_num_revoke f: sd_log_blks_free n: sd_log_blks_needed h: sd_log_head / sd_log_flush_head t: sd_log_tail a1: count of items on the sd_ail1_list a2: count of items on the sd_ail2_list o: count of items on the sd_log_le_ordered list l: sd_log_in_flight i: sd_log_idle I think several of those letters have been used before. Another concern is that a lot of those numbers are very implementation specific, and might have to change in due course. Adding the head/tail of the log seems like a useful thing to do though. No point adding "log in flight" since that will be 0 on both entry and exit to a log flush because we always wait for all the I/O. Another issue is the tracepoints are supposed to be fast, counting the number of items in a list which is already so long that it causes performance issues upon occasion is definitely not a good plan. We already have a pin/unpin tracepoint to track pinned blocks, so that is already tracking what is going into the AIL, so maybe we just need something when blocks exit the AIL? Perhaps better to keep AIL tracking separate from log tracing too. The only other possible thing we might want to track is writes to the log which are not associated with pinned blocks, such as log headers, etc., Steve. Signed-off-by: Bob Peterson --- diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 49ac55d..cdecd1c 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -73,6 +73,16 @@ static inline u8 glock_trace_state(unsigned int state) } return DLM_LOCK_NL; } + +static inline const unsigned int list_count(const struct list_head *head) +{ + const struct list_head *tmp; + unsigned int count = 0; + list_for_each(tmp, head) + count++; + return count; +} + #endif /* Section 1 - Locking @@ -359,18 +369,58 @@ TRACE_EVENT(gfs2_log_flush, __field(dev_t, dev ) __field(int,start ) __field(u64,log_seq ) + __field(unsigned int, sd_log_blks_reserved) + __field(int,sd_log_commited_revoke ) + __field(unsigned int, sd_log_num_revoke ) + __field(int,sd_log_blks_free) + __field(int,sd_log_blks_needed ) + __field(unsigned int, sd_log_head ) + __field(unsigned int, sd_log_flush_head ) + __field(unsigned int, sd_log_tail ) + __field(unsigned int, ail1_count ) + __field(unsigned int, ail2_count ) + __field(unsigned int, ordered_count ) + __field(int,sd_log_in_flight) + __field(int,sd_log_idle ) ), TP_fast_assign( __entry->dev= sdp->sd_vfs->s_dev; __entry->start = start; __entry->log_seq = sdp->sd_log_sequence; + __entry->sd_log_blks_reserved= sdp->sd_log_blks_reserved; + __entry->sd_log_commited_revoke = sdp->sd_log_commited_revoke; + __entry->sd_log_num_revoke = sdp->sd_log_num_revoke; + __entry->sd_log_blks_free= atomic_read(&sdp->sd_log_blks_free); + __entry->sd_log_blks_needed = atomic_read(&sdp->sd_log_blks_needed); + __entry->sd_log_head = sdp->sd_log_head; + __entry->sd_log_flush_head = sdp->sd_log_flush_head; + __entry->sd_log_tail = sdp->sd_log_tail; + __entry->ail1_count = list_count(&sdp->sd_ail1_list); + __entry->ail2_count = list_count(&sdp->sd_ail2_list); + __entry->ordered_count = list_count(&sdp->sd_log_le_ordered); + __entry->sd_log_in_flight = 
atomic_read(&sdp->sd_log_in_flight); + __entry->sd_log_idle = sdp->sd_log_idle; ), - TP_printk("%u,%u log flush %s %llu", + TP_printk("%u,%u log flush %s %llu r:%u cr:%d v:%u " + "f:%d n:%d h:%u/%u t:%u a1:%u a2:%u o:%u l:%d i:%d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->start ? "start" : "end", - (unsigned long long)__entry->log_seq) + (unsigned long long)__entry->log_seq, + __entry->sd_log_blks_reserved, +
[Cluster-devel] [GFS2 PATCH] GFS2: Expand and make useful the log_flush kernel trace point
Hi, This patch changes the log_flush kernel trace point so that it actually gives you meaningful statistics about the journal: Output looks like this: gfs2_logd-10580 [009] 1390.398627: gfs2_log_flush: 253,8 log flush start 8532 r:0 cr:0 v:0 f:5002 n:6153 h:5570/5570 t:2380 a1:2 a2:0 o:1 l:0 i:1 r: sd_log_blks_reserved cr: sd_log_commited_revoke v: sd_log_num_revoke f: sd_log_blks_free n: sd_log_blks_needed h: sd_log_head / sd_log_flush_head t: sd_log_tail a1: count of items on the sd_ail1_list a2: count of items on the sd_ail2_list o: count of items on the sd_log_le_ordered list l: sd_log_in_flight i: sd_log_idle Signed-off-by: Bob Peterson --- diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 49ac55d..cdecd1c 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -73,6 +73,16 @@ static inline u8 glock_trace_state(unsigned int state) } return DLM_LOCK_NL; } + +static inline const unsigned int list_count(const struct list_head *head) +{ + const struct list_head *tmp; + unsigned int count = 0; + list_for_each(tmp, head) + count++; + return count; +} + #endif /* Section 1 - Locking @@ -359,18 +369,58 @@ TRACE_EVENT(gfs2_log_flush, __field(dev_t, dev ) __field(int,start ) __field(u64,log_seq ) + __field(unsigned int, sd_log_blks_reserved) + __field(int,sd_log_commited_revoke ) + __field(unsigned int, sd_log_num_revoke ) + __field(int,sd_log_blks_free) + __field(int,sd_log_blks_needed ) + __field(unsigned int, sd_log_head ) + __field(unsigned int, sd_log_flush_head ) + __field(unsigned int, sd_log_tail ) + __field(unsigned int, ail1_count ) + __field(unsigned int, ail2_count ) + __field(unsigned int, ordered_count ) + __field(int,sd_log_in_flight) + __field(int,sd_log_idle ) ), TP_fast_assign( __entry->dev= sdp->sd_vfs->s_dev; __entry->start = start; __entry->log_seq= sdp->sd_log_sequence; + __entry->sd_log_blks_reserved = sdp->sd_log_blks_reserved; + __entry->sd_log_commited_revoke = sdp->sd_log_commited_revoke; + __entry->sd_log_num_revoke = sdp->sd_log_num_revoke; + __entry->sd_log_blks_free = atomic_read(&sdp->sd_log_blks_free); + __entry->sd_log_blks_needed = atomic_read(&sdp->sd_log_blks_needed); + __entry->sd_log_head= sdp->sd_log_head; + __entry->sd_log_flush_head = sdp->sd_log_flush_head; + __entry->sd_log_tail= sdp->sd_log_tail; + __entry->ail1_count = list_count(&sdp->sd_ail1_list); + __entry->ail2_count = list_count(&sdp->sd_ail2_list); + __entry->ordered_count = list_count(&sdp->sd_log_le_ordered); + __entry->sd_log_in_flight = atomic_read(&sdp->sd_log_in_flight); + __entry->sd_log_idle = sdp->sd_log_idle; ), - TP_printk("%u,%u log flush %s %llu", + TP_printk("%u,%u log flush %s %llu r:%u cr:%d v:%u " + "f:%d n:%d h:%u/%u t:%u a1:%u a2:%u o:%u l:%d i:%d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->start ? "start" : "end", - (unsigned long long)__entry->log_seq) + (unsigned long long)__entry->log_seq, + __entry->sd_log_blks_reserved, + __entry->sd_log_commited_revoke, + __entry->sd_log_num_revoke, + __entry->sd_log_blks_free, + __entry->sd_log_blks_needed, + __entry->sd_log_head, + __entry->sd_log_flush_head, + __entry->sd_log_tail, + __entry->ail1_count, + __entry->ail2_count, + __entry->ordered_count, + __entry->sd_log_in_flight, + __entry->sd_log_idle) ); /* Reserving/releasing blocks in the log */
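One way to address the list_count() cost that Steven Whitehouse raises in his review, sketched here purely for illustration (neither this patch nor the existing gfs2 code does this), is to maintain a counter next to each list so the tracepoint can read the length in O(1):

	atomic_t sd_ail1_count;	/* hypothetical new field in struct gfs2_sbd */

	/* wherever an item is queued onto or removed from sd_ail1_list */
	list_add(&item->list, &sdp->sd_ail1_list);
	atomic_inc(&sdp->sd_ail1_count);

	list_del(&item->list);
	atomic_dec(&sdp->sd_ail1_count);

	/* in TP_fast_assign(), replacing the O(n) list walk */
	__entry->ail1_count = atomic_read(&sdp->sd_ail1_count);

The trade-off is one extra atomic operation on every add/remove in exchange for a constant-time tracepoint.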
Re: [Cluster-devel] [PATCH 4/8] xfs: use memalloc_nofs_{save, restore} instead of memalloc_noio*
On 01/06/2017 03:11 PM, Michal Hocko wrote: > From: Michal Hocko > > kmem_zalloc_large and _xfs_buf_map_pages use memalloc_noio_{save,restore} > API to prevent from reclaim recursion into the fs because vmalloc can > invoke unconditional GFP_KERNEL allocations and these functions might be > called from the NOFS contexts. The memalloc_noio_save will enforce > GFP_NOIO context which is even weaker than GFP_NOFS and that seems to be > unnecessary. Let's use memalloc_nofs_{save,restore} instead as it should > provide exactly what we need here - implicit GFP_NOFS context. > > Changes since v1 > - s@memalloc_noio_restore@memalloc_nofs_restore@ in _xfs_buf_map_pages > as per Brian Foster > > Signed-off-by: Michal Hocko Not a xfs expert, but seems correct. Acked-by: Vlastimil Babka Nit below: > --- > fs/xfs/kmem.c| 10 +- > fs/xfs/xfs_buf.c | 8 > 2 files changed, 9 insertions(+), 9 deletions(-) > > diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c > index a76a05dae96b..d69ed5e76621 100644 > --- a/fs/xfs/kmem.c > +++ b/fs/xfs/kmem.c > @@ -65,7 +65,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) > void * > kmem_zalloc_large(size_t size, xfs_km_flags_t flags) > { > - unsigned noio_flag = 0; > + unsigned nofs_flag = 0; > void*ptr; > gfp_t lflags; > > @@ -80,14 +80,14 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) >* context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering >* the filesystem here and potentially deadlocking. The comment above is now largely obsolete, or minimally should be changed to PF_MEMALLOC_NOFS? >*/ > - if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) > - noio_flag = memalloc_noio_save(); > + if (flags & KM_NOFS) > + nofs_flag = memalloc_nofs_save(); > > lflags = kmem_flags_convert(flags); > ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); > > - if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) > - memalloc_noio_restore(noio_flag); > + if (flags & KM_NOFS) > + memalloc_nofs_restore(nofs_flag); > > return ptr; > } > diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c > index 7f0a01f7b592..8cb8dd4cdfd8 100644 > --- a/fs/xfs/xfs_buf.c > +++ b/fs/xfs/xfs_buf.c > @@ -441,17 +441,17 @@ _xfs_buf_map_pages( > bp->b_addr = NULL; > } else { > int retried = 0; > - unsigned noio_flag; > + unsigned nofs_flag; > > /* >* vm_map_ram() will allocate auxillary structures (e.g. >* pagetables) with GFP_KERNEL, yet we are likely to be under >* GFP_NOFS context here. Hence we need to tell memory reclaim > - * that we are in such a context via PF_MEMALLOC_NOIO to prevent > + * that we are in such a context via PF_MEMALLOC_NOFS to prevent >* memory reclaim re-entering the filesystem here and >* potentially deadlocking. >*/ > - noio_flag = memalloc_noio_save(); > + nofs_flag = memalloc_nofs_save(); > do { > bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, > -1, PAGE_KERNEL); > @@ -459,7 +459,7 @@ _xfs_buf_map_pages( > break; > vm_unmap_aliases(); > } while (retried++ <= 1); > - memalloc_noio_restore(noio_flag); > + memalloc_nofs_restore(nofs_flag); > > if (!bp->b_addr) > return -ENOMEM; >
Re: [Cluster-devel] [PATCH 3/8] mm: introduce memalloc_nofs_{save, restore} API
On 01/09/2017 02:42 PM, Michal Hocko wrote: > On Mon 09-01-17 14:04:21, Vlastimil Babka wrote: > [...] >>> +static inline unsigned int memalloc_nofs_save(void) >>> +{ >>> + unsigned int flags = current->flags & PF_MEMALLOC_NOFS; >>> + current->flags |= PF_MEMALLOC_NOFS; >> >> So this is not new, as same goes for memalloc_noio_save, but I've >> noticed that e.g. exit_signal() does tsk->flags |= PF_EXITING; >> So is it possible that there's a r-m-w hazard here? > > exit_signals operates on current and all task_struct::flags should be > used only on the current. > [...] Ah, good to know. > >>> @@ -3029,7 +3029,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct >>> mem_cgroup *memcg, >>> int nid; >>> struct scan_control sc = { >>> .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), >>> - .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | >>> + .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | >> >> So this function didn't do memalloc_noio_flags() before? Is it a bug >> that should be fixed separately or at least mentioned? Because that >> looks like a functional change... > > We didn't need it. Kmem charges are opt-in and current all of them > support GFP_IO. The LRU pages are not charged in NOIO context either. > We need it now because there will be callers to charge GFP_KERNEL while > being inside the NOFS scope. I see. > Now that you have opened this I have noticed that the code is wrong > here because GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK would overwrite > the removed GFP_FS. I guess it would be better and less error prone > to move the current_gfp_context part into the direct reclaim entry - > do_try_to_free_pages - and put the comment like this Agree with your "So let's just scratch this follow up fix" in the next e-mail. So, for the unchanged patch: Acked-by: Vlastimil Babka
Re: [Cluster-devel] [PATCH 3/8] mm: introduce memalloc_nofs_{save, restore} API
On Mon 09-01-17 14:42:10, Michal Hocko wrote: > On Mon 09-01-17 14:04:21, Vlastimil Babka wrote: [...] > Now that you have opened this I have noticed that the code is wrong > here because GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK would overwrite > the removed GFP_FS. Blee, it wouldn't, because ~GFP_RECLAIM_MASK contains neither GFP_FS nor GFP_IO. So all is good here. > I guess it would be better and less error prone > to move the current_gfp_context part into the direct reclaim entry - > do_try_to_free_pages - and put the comment like this Well, after more thinking about it, we should probably keep it where it is. If nothing else, try_to_free_mem_cgroup_pages has a tracepoint which prints the gfp mask, so we should use the filtered one. So let's just scratch this follow up fix. > --- > diff --git a/mm/vmscan.c b/mm/vmscan.c > index 4ea6b610f20e..df7975185f11 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -2756,6 +2756,13 @@ static unsigned long do_try_to_free_pages(struct > zonelist *zonelist, > int initial_priority = sc->priority; > unsigned long total_scanned = 0; > unsigned long writeback_threshold; > + > + /* > + * Make sure that the gfp context properly handles scope gfp mask. > + * This might weaken the reclaim context (e.g. make it GFP_NOFS or > + * GFP_NOIO). > + */ > + sc->gfp_mask = current_gfp_context(sc->gfp_mask); > retry: > delayacct_freepages_start(); > > @@ -2949,7 +2956,7 @@ unsigned long try_to_free_pages(struct zonelist > *zonelist, int order, > unsigned long nr_reclaimed; > struct scan_control sc = { > .nr_to_reclaim = SWAP_CLUSTER_MAX, > - .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), > + .gfp_mask = gfp_mask, > .reclaim_idx = gfp_zone(gfp_mask), > .order = order, > .nodemask = nodemask, > @@ -3029,8 +3036,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct > mem_cgroup *memcg, > int nid; > struct scan_control sc = { > .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), > - .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | > - (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), > + .gfp_mask = GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK, > .reclaim_idx = MAX_NR_ZONES - 1, > .target_mem_cgroup = memcg, > .priority = DEF_PRIORITY, > @@ -3723,7 +3729,7 @@ static int __node_reclaim(struct pglist_data *pgdat, > gfp_t gfp_mask, unsigned in > int classzone_idx = gfp_zone(gfp_mask); > struct scan_control sc = { > .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), > - .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), > + .gfp_mask = gfp_mask, > .order = order, > .priority = NODE_RECLAIM_PRIORITY, > .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), > -- > Michal Hocko > SUSE Labs -- Michal Hocko SUSE Labs
[Cluster-devel] [PATCH] fs/dlm: Fix kernel memory disclosure
Clear the 'unused' field to avoid leaking memory to userland in copy_result_to_user(). Signed-off-by: Vlad Tsyrklevich --- fs/dlm/user.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 1ce908c..0570711 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -138,6 +138,8 @@ static void compat_output(struct dlm_lock_result *res, res32->lksb.sb_flags = res->lksb.sb_flags; res32->lksb.sb_lkid = res->lksb.sb_lkid; res32->lksb.sb_lvbptr = (__u32)(long)res->lksb.sb_lvbptr; + + memset(&res32->unused, 0, sizeof(res32->unused)); } #endif -- 2.7.0
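The bug class here is worth spelling out: the conversion helper fills a result structure field by field on the kernel side and the whole structure is later copied to userspace, so any member that is never written goes out containing whatever was previously in that memory. A minimal illustration of the pattern (hypothetical code, not the dlm source):

	struct result32 {
		__u32 length;
		__u32 unused[4];	/* analogous to the dlm 'unused' field */
	};

	static int copy_result(struct result32 __user *dst, __u32 length)
	{
		struct result32 res;	/* starts out as uninitialized kernel memory */

		res.length = length;
		/* without this memset, res.unused is copied out still holding old data */
		memset(res.unused, 0, sizeof(res.unused));

		return copy_to_user(dst, &res, sizeof(res)) ? -EFAULT : 0;
	}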
Re: [Cluster-devel] [PATCH 3/8] mm: introduce memalloc_nofs_{save, restore} API
On Mon 09-01-17 14:04:21, Vlastimil Babka wrote: [...] > > +static inline unsigned int memalloc_nofs_save(void) > > +{ > > + unsigned int flags = current->flags & PF_MEMALLOC_NOFS; > > + current->flags |= PF_MEMALLOC_NOFS; > > So this is not new, as same goes for memalloc_noio_save, but I've > noticed that e.g. exit_signal() does tsk->flags |= PF_EXITING; > So is it possible that there's a r-m-w hazard here? exit_signals operates on current and all task_struct::flags should be used only on the current. [...] > > @@ -3029,7 +3029,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct > > mem_cgroup *memcg, > > int nid; > > struct scan_control sc = { > > .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), > > - .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | > > + .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | > > So this function didn't do memalloc_noio_flags() before? Is it a bug > that should be fixed separately or at least mentioned? Because that > looks like a functional change... We didn't need it. Kmem charges are opt-in and current all of them support GFP_IO. The LRU pages are not charged in NOIO context either. We need it now because there will be callers to charge GFP_KERNEL while being inside the NOFS scope. Now that you have opened this I have noticed that the code is wrong here because GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK would overwrite the removed GFP_FS. I guess it would be better and less error prone to move the current_gfp_context part into the direct reclaim entry - do_try_to_free_pages - and put the comment like this --- diff --git a/mm/vmscan.c b/mm/vmscan.c index 4ea6b610f20e..df7975185f11 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2756,6 +2756,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, int initial_priority = sc->priority; unsigned long total_scanned = 0; unsigned long writeback_threshold; + + /* +* Make sure that the gfp context properly handles scope gfp mask. +* This might weaken the reclaim context (e.g. make it GFP_NOFS or +* GFP_NOIO). +*/ + sc->gfp_mask = current_gfp_context(sc->gfp_mask); retry: delayacct_freepages_start(); @@ -2949,7 +2956,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, unsigned long nr_reclaimed; struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, - .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), + .gfp_mask = gfp_mask, .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, @@ -3029,8 +3036,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, int nid; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | - (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), + .gfp_mask = GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK, .reclaim_idx = MAX_NR_ZONES - 1, .target_mem_cgroup = memcg, .priority = DEF_PRIORITY, @@ -3723,7 +3729,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in int classzone_idx = gfp_zone(gfp_mask); struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), + .gfp_mask = gfp_mask, .order = order, .priority = NODE_RECLAIM_PRIORITY, .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), -- Michal Hocko SUSE Labs
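For readers following the placement discussion: the helper at the centre of it, current_gfp_context(), behaves roughly like the sketch below (not a verbatim copy of the patch), which is why filtering the mask once on entry to reclaim is enough to honour both scope flags:

	static inline gfp_t current_gfp_context(gfp_t flags)
	{
		/* NOIO is the stronger scope, so it takes precedence */
		if (unlikely(current->flags & PF_MEMALLOC_NOIO))
			flags &= ~(__GFP_IO | __GFP_FS);
		else if (unlikely(current->flags & PF_MEMALLOC_NOFS))
			flags &= ~__GFP_FS;
		return flags;
	}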
Re: [Cluster-devel] [PATCH 3/8] mm: introduce memalloc_nofs_{save, restore} API
On 01/06/2017 03:11 PM, Michal Hocko wrote: > From: Michal Hocko > > GFP_NOFS context is used for the following 5 reasons currently > - to prevent from deadlocks when the lock held by the allocation > context would be needed during the memory reclaim > - to prevent from stack overflows during the reclaim because > the allocation is performed from a deep context already > - to prevent lockups when the allocation context depends on > other reclaimers to make a forward progress indirectly > - just in case because this would be safe from the fs POV > - silence lockdep false positives > > Unfortunately overuse of this allocation context brings some problems > to the MM. Memory reclaim is much weaker (especially during heavy FS > metadata workloads), OOM killer cannot be invoked because the MM layer > doesn't have enough information about how much memory is freeable by the > FS layer. > > In many cases it is far from clear why the weaker context is even used > and so it might be used unnecessarily. We would like to get rid of > those as much as possible. One way to do that is to use the flag in > scopes rather than isolated cases. Such a scope is declared when really > necessary, tracked per task and all the allocation requests from within > the context will simply inherit the GFP_NOFS semantic. > > Not only this is easier to understand and maintain because there are > much less problematic contexts than specific allocation requests, this > also helps code paths where FS layer interacts with other layers (e.g. > crypto, security modules, MM etc...) and there is no easy way to convey > the allocation context between the layers. > > Introduce memalloc_nofs_{save,restore} API to control the scope > of GFP_NOFS allocation context. This is basically copying > memalloc_noio_{save,restore} API we have for other restricted allocation > context GFP_NOIO. The PF_MEMALLOC_NOFS flag already exists and it is > just an alias for PF_FSTRANS which has been xfs specific until recently. > There are no more PF_FSTRANS users anymore so let's just drop it. > > PF_MEMALLOC_NOFS is now checked in the MM layer and drops __GFP_FS > implicitly same as PF_MEMALLOC_NOIO drops __GFP_IO. memalloc_noio_flags > is renamed to current_gfp_context because it now cares about both > PF_MEMALLOC_NOFS and PF_MEMALLOC_NOIO contexts. Xfs code paths preserve > their semantic. kmem_flags_convert() doesn't need to evaluate the flag > anymore. > > This patch shouldn't introduce any functional changes. > > Let's hope that filesystems will drop direct GFP_NOFS (resp. ~__GFP_FS) > usage as much as possible and only use a properly documented > memalloc_nofs_{save,restore} checkpoints where they are appropriate. > > Signed-off-by: Michal Hocko [...] > +static inline unsigned int memalloc_nofs_save(void) > +{ > + unsigned int flags = current->flags & PF_MEMALLOC_NOFS; > + current->flags |= PF_MEMALLOC_NOFS; So this is not new, as same goes for memalloc_noio_save, but I've noticed that e.g. exit_signal() does tsk->flags |= PF_EXITING; So is it possible that there's a r-m-w hazard here? > + return flags; > +} > + > +static inline void memalloc_nofs_restore(unsigned int flags) > +{ > + current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; > +} > + > /* Per-process atomic flags. */ > #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ > #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ [...] 
> @@ -3029,7 +3029,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct > mem_cgroup *memcg, > int nid; > struct scan_control sc = { > .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), > - .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | > + .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | So this function didn't do memalloc_noio_flags() before? Is it a bug that should be fixed separately or at least mentioned? Because that looks like a functional change... Thanks! > (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), > .reclaim_idx = MAX_NR_ZONES - 1, > .target_mem_cgroup = memcg, > @@ -3723,7 +3723,7 @@ static int __node_reclaim(struct pglist_data *pgdat, > gfp_t gfp_mask, unsigned in > int classzone_idx = gfp_zone(gfp_mask); > struct scan_control sc = { > .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), > - .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), > + .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), > .order = order, > .priority = NODE_RECLAIM_PRIORITY, > .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), >
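For completeness, the calling convention the new API establishes looks roughly like this (a sketch, not code taken from the patch):

	unsigned int nofs_flag;

	nofs_flag = memalloc_nofs_save();	/* enter the NOFS scope */

	/*
	 * Any allocation below this point -- including GFP_KERNEL requests
	 * made by library code that knows nothing about the caller -- is
	 * implicitly restricted to GFP_NOFS by the page allocator.
	 */
	ptr = kmalloc(size, GFP_KERNEL);

	memalloc_nofs_restore(nofs_flag);	/* leave the scope; safe to nest */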
Re: [Cluster-devel] [PATCH 2/8] xfs: abstract PF_FSTRANS to PF_MEMALLOC_NOFS
On 01/06/2017 03:11 PM, Michal Hocko wrote: > From: Michal Hocko > > xfs has defined PF_FSTRANS to declare a scope GFP_NOFS semantic quite > some time ago. We would like to make this concept more generic and use > it for other filesystems as well. Let's start by giving the flag a > more generic name PF_MEMALLOC_NOFS which is in line with an exiting > PF_MEMALLOC_NOIO already used for the same purpose for GFP_NOIO > contexts. Replace all PF_FSTRANS usage from the xfs code in the first > step before we introduce a full API for it as xfs uses the flag directly > anyway. > > This patch doesn't introduce any functional change. > > Signed-off-by: Michal Hocko > Reviewed-by: Brian Foster Acked-by: Vlastimil Babka A nit: > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -2320,6 +2320,8 @@ extern void thread_group_cputime_adjusted(struct > task_struct *p, cputime_t *ut, > #define PF_FREEZER_SKIP 0x4000 /* Freezer should not count it > as freezable */ > #define PF_SUSPEND_TASK 0x8000 /* this thread called > freeze_processes and should not be frozen */ > > +#define PF_MEMALLOC_NOFS PF_FSTRANS /* Transition to a more generic > GFP_NOFS scope semantic */ I don't see why this transition is needed, as there are already no users of PF_FSTRANS after this patch. The next patch doesn't remove any more, so this is just extra churn IMHO. But not a strong objection. > + > /* > * Only the _current_ task can read/write to tsk->flags, but other > * tasks can access tsk->flags in readonly mode for example >
Re: [Cluster-devel] [PATCH 1/8] lockdep: allow to disable reclaim lockup detection
On 01/06/2017 03:11 PM, Michal Hocko wrote: > From: Michal Hocko > > The current implementation of the reclaim lockup detection can lead to > false positives and those even happen and usually lead to tweak the > code to silence the lockdep by using GFP_NOFS even though the context > can use __GFP_FS just fine. See > http://lkml.kernel.org/r/20160512080321.GA18496@dastard as an example. > > = > [ INFO: inconsistent lock state ] > 4.5.0-rc2+ #4 Tainted: G O > - > inconsistent {RECLAIM_FS-ON-R} -> {IN-RECLAIM_FS-W} usage. > kswapd0/543 [HC0[0]:SC0[0]:HE1:SE1] takes: > > (&xfs_nondir_ilock_class){-+}, at: [] > xfs_ilock+0x177/0x200 [xfs] > > {RECLAIM_FS-ON-R} state was registered at: > [] mark_held_locks+0x79/0xa0 > [] lockdep_trace_alloc+0xb3/0x100 > [] kmem_cache_alloc+0x33/0x230 > [] kmem_zone_alloc+0x81/0x120 [xfs] > [] xfs_refcountbt_init_cursor+0x3e/0xa0 [xfs] > [] __xfs_refcount_find_shared+0x75/0x580 [xfs] > [] xfs_refcount_find_shared+0x84/0xb0 [xfs] > [] xfs_getbmap+0x608/0x8c0 [xfs] > [] xfs_vn_fiemap+0xab/0xc0 [xfs] > [] do_vfs_ioctl+0x498/0x670 > [] SyS_ioctl+0x79/0x90 > [] entry_SYSCALL_64_fastpath+0x12/0x6f > >CPU0 > > lock(&xfs_nondir_ilock_class); > > lock(&xfs_nondir_ilock_class); > > *** DEADLOCK *** > > 3 locks held by kswapd0/543: > > stack backtrace: > CPU: 0 PID: 543 Comm: kswapd0 Tainted: G O4.5.0-rc2+ #4 > > Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 > > 82a34f10 88003aa078d0 813a14f9 88003d8551c0 > 88003aa07920 8110ec65 0001 > 8801 000b 0008 88003d855aa0 > Call Trace: > [] dump_stack+0x4b/0x72 > [] print_usage_bug+0x215/0x240 > [] mark_lock+0x1f5/0x660 > [] ? print_shortest_lock_dependencies+0x1a0/0x1a0 > [] __lock_acquire+0xa80/0x1e50 > [] ? kmem_cache_alloc+0x15e/0x230 > [] ? kmem_zone_alloc+0x81/0x120 [xfs] > [] lock_acquire+0xd8/0x1e0 > [] ? xfs_ilock+0x177/0x200 [xfs] > [] ? xfs_reflink_cancel_cow_range+0x150/0x300 [xfs] > [] down_write_nested+0x5e/0xc0 > [] ? xfs_ilock+0x177/0x200 [xfs] > [] xfs_ilock+0x177/0x200 [xfs] > [] xfs_reflink_cancel_cow_range+0x150/0x300 [xfs] > [] xfs_fs_evict_inode+0xdc/0x1e0 [xfs] > [] evict+0xc5/0x190 > [] dispose_list+0x39/0x60 > [] prune_icache_sb+0x4b/0x60 > [] super_cache_scan+0x14f/0x1a0 > [] shrink_slab.part.63.constprop.79+0x1e9/0x4e0 > [] shrink_zone+0x15e/0x170 > [] kswapd+0x4f1/0xa80 > [] ? zone_reclaim+0x230/0x230 > [] kthread+0xf2/0x110 > [] ? kthread_create_on_node+0x220/0x220 > [] ret_from_fork+0x3f/0x70 > [] ? kthread_create_on_node+0x220/0x220 > > To quote Dave: > " > Ignoring whether reflink should be doing anything or not, that's a > "xfs_refcountbt_init_cursor() gets called both outside and inside > transactions" lockdep false positive case. The problem here is > lockdep has seen this allocation from within a transaction, hence a > GFP_NOFS allocation, and now it's seeing it in a GFP_KERNEL context. > Also note that we have an active reference to this inode. > > So, because the reclaim annotations overload the interrupt level > detections and it's seen the inode ilock been taken in reclaim > ("interrupt") context, this triggers a reclaim context warning where > it thinks it is unsafe to do this allocation in GFP_KERNEL context > holding the inode ilock... > " > > This sounds like a fundamental problem of the reclaim lock detection. > It is really impossible to annotate such a special usecase IMHO unless > the reclaim lockup detection is reworked completely. Until then it > is much better to provide a way to add "I know what I am doing flag" > and mark problematic places. 
This would prevent from abusing GFP_NOFS > flag which has a runtime effect even on configurations which have > lockdep disabled. > > Introduce __GFP_NOLOCKDEP flag which tells the lockdep gfp tracking to > skip the current allocation request. > > While we are at it also make sure that the radix tree doesn't > accidentaly override tags stored in the upper part of the gfp_mask. > > Suggested-by: Peter Zijlstra > Acked-by: Peter Zijlstra (Intel) > Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka
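As a usage note, the new flag is intended as a narrow annotation at the specific allocation site that trips the false positive, rather than a blanket downgrade of the gfp mask. A hypothetical example (not taken from this series):

	/*
	 * This allocation is known to be safe from reclaim recursion, but
	 * lockdep has seen the same lock class taken under GFP_NOFS
	 * elsewhere and would report a false positive.  __GFP_NOLOCKDEP
	 * only skips lockdep's reclaim tracking; it has no effect on how
	 * reclaim itself behaves at runtime.
	 */
	ptr = kmalloc(size, GFP_KERNEL | __GFP_NOLOCKDEP);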
Re: [Cluster-devel] [GFS2 PATCH] GFS2: Wake up io waiters whenever a flush is done
Hi, Acked-by: Steven Whitehouse Steve. On 07/01/17 03:19, Bob Peterson wrote: Hi, Before this patch, if a process called the function gfs2_log_reserve to reserve some journal blocks, but not enough journal blocks were free, it would call io_schedule. However, the log flush daemon woke up the waiters only if a gfs2_ail_flush was no longer required. This resulted in situations where processes would wait forever because the number of blocks required was so high that it pushed the journal into a perpetual state of flush being required. This patch changes the logd daemon so that it wakes up io waiters every time the log is actually flushed. Signed-off-by: Bob Peterson --- diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 4df349c..5028a9d 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -918,12 +918,15 @@ int gfs2_logd(void *data) struct gfs2_sbd *sdp = data; unsigned long t = 1; DEFINE_WAIT(wait); + bool did_flush; while (!kthread_should_stop()) { + did_flush = false; if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { gfs2_ail1_empty(sdp); gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + did_flush = true; } if (gfs2_ail_flush_reqd(sdp)) { @@ -931,9 +934,10 @@ int gfs2_logd(void *data) gfs2_ail1_wait(sdp); gfs2_ail1_empty(sdp); gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); + did_flush = true; } - if (!gfs2_ail_flush_reqd(sdp)) + if (!gfs2_ail_flush_reqd(sdp) || did_flush) wake_up(&sdp->sd_log_waitq); t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;