Re: [PATCH 2/2] xfs: add support for sub-pagesize writeback without buffer_heads

2018-05-25 Thread Brian Foster
On Wed, May 23, 2018 at 04:46:46PM +0200, Christoph Hellwig wrote:
> Switch to using the iomap_page structure for checking sub-page uptodate
> status and track sub-page I/O completion status, and remove large
> quantities of boilerplate code working around buffer heads.
> 
> Signed-off-by: Christoph Hellwig 
> ---
>  fs/xfs/xfs_aops.c  | 536 +++--
>  fs/xfs/xfs_buf.h   |   1 -
>  fs/xfs/xfs_iomap.c |   3 -
>  fs/xfs/xfs_super.c |   2 +-
>  fs/xfs/xfs_trace.h |  18 +-
>  5 files changed, 79 insertions(+), 481 deletions(-)
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index efa2cbb27d67..d279929e53fb 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
...
> @@ -768,7 +620,7 @@ xfs_aops_discard_page(
>   int error;
>  
>   if (XFS_FORCED_SHUTDOWN(mp))
> - goto out_invalidate;
> + goto out;
>  
>   xfs_alert(mp,
>   "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
> @@ -778,15 +630,15 @@ xfs_aops_discard_page(
>   PAGE_SIZE / i_blocksize(inode));
>   if (error && !XFS_FORCED_SHUTDOWN(mp))
>   xfs_alert(mp, "page discard unable to remove delalloc mapping.");
> -out_invalidate:
> - xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
> +out:
> + iomap_invalidatepage(page, 0, PAGE_SIZE);

All this does is lose the tracepoint. I don't think this call needs to
change. The rest looks Ok to me, but I still need to run some tests on
the whole thing.
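
(For reference, the wrapper in question is basically just the trace point
plus the generic call -- roughly the following shape once buffer heads are
gone, not necessarily the exact body:

STATIC void
xfs_vm_invalidatepage(
	struct page		*page,
	unsigned int		offset,
	unsigned int		length)
{
	/* this trace event is what the direct iomap_invalidatepage() call drops */
	trace_xfs_invalidatepage(page->mapping->host, page, offset, length);
	iomap_invalidatepage(page, offset, length);
}

...so keeping the xfs_vm_invalidatepage() call here costs nothing and keeps
the trace event.)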

Brian

>  }
>  
>  /*
>   * We implement an immediate ioend submission policy here to avoid needing to
>   * chain multiple ioends and hence nest mempool allocations which can violate
>   * forward progress guarantees we need to provide. The current ioend we are
> - * adding buffers to is cached on the writepage context, and if the new buffer
> + * adding blocks to is cached on the writepage context, and if the new block
>   * does not append to the cached ioend it will create a new ioend and cache that
>   * instead.
>   *
> @@ -807,41 +659,28 @@ xfs_writepage_map(
>   uint64_t end_offset)
>  {
>   LIST_HEAD(submit_list);
> + struct iomap_page   *iop = to_iomap_page(page);
> + unsigned len = i_blocksize(inode);
>   struct xfs_ioend*ioend, *next;
> - struct buffer_head  *bh = NULL;
> - ssize_t len = i_blocksize(inode);
> - int error = 0;
> - int count = 0;
> - loff_t  file_offset;/* file offset of page */
> - unsigned poffset; /* offset into page */
> + int error = 0, count = 0, i;
> + u64 file_offset;/* file offset of page */
>  
> - if (page_has_buffers(page))
> - bh = page_buffers(page);
> + ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
> + ASSERT(!iop || atomic_read(&iop->write_count) == 0);
>  
>   /*
> -  * Walk the blocks on the page, and we we run off then end of the
> -  * current map or find the current map invalid, grab a new one.
> -  * We only use bufferheads here to check per-block state - they no
> -  * longer control the iteration through the page. This allows us to
> -  * replace the bufferhead with some other state tracking mechanism in
> -  * future.
> +  * Walk through the page to find areas to write back. If we run off the
> +  * end of the current map or find the current map invalid, grab a new
> +  * one.
>*/
> - for (poffset = 0, file_offset = page_offset(page);
> -  poffset < PAGE_SIZE;
> -  poffset += len, file_offset += len) {
> - /* past the range we are writing, so nothing more to write. */
> - if (file_offset >= end_offset)
> - break;
> -
> + for (i = 0, file_offset = page_offset(page);
> +  i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
> +  i++, file_offset += len) {
>   /*
>* Block does not contain valid data, skip it.
>*/
> - if (bh && !buffer_uptodate(bh)) {
> - if (PageUptodate(page))
> - ASSERT(buffer_mapped(bh));
> - bh = bh->b_this_page;
> + if (iop && !test_bit(i, iop->uptodate))
>   continue;
> - }
>  
>   /*
>* If we don't have a valid map, now it's time to get a new one
> @@ -854,52 +693,33 @@ xfs_writepage_map(
>   error = xfs_map_blocks(inode, file_offset, &wpc->imap,
>   &wpc->io_type);
>   if (error)
> - goto out;
> + break;
>   }
>  
> - if (wpc->io_type == XFS_IO_HOLE) {
> -  

Re: [PATCH 1/2] iomap: add support for sub-pagesize buffered I/O without buffer heads

2018-05-25 Thread Brian Foster
On Wed, May 23, 2018 at 04:46:45PM +0200, Christoph Hellwig wrote:
> After already supporting a simple implementation of buffered writes for
> the blocksize == PAGE_SIZE case in the last commit this adds full support
> even for smaller block sizes.   There are three bits of per-block
> information in the buffer_head structure that really matter for the iomap
> read and write path:
> 
>  - uptodate status (BH_uptodate)
>  - marked as currently under read I/O (BH_Async_Read)
>  - marked as currently under write I/O (BH_Async_Write)
> 
> Instead of having new per-block structures this now adds a per-page
> structure called struct iomap_page to track this information in a slightly
> different form:
> 
>  - a bitmap for the per-block uptodate status.  For worst case of a 64k
>page size system this bitmap needs to contain 128 bits.  For the
>typical 4k page size case it only needs 8 bits, although we still
>need a full unsigned long due to the way the atomic bitmap API works.
>  - two atomic_t counters are used to track the outstanding read and write
>counts
> 
> There is quite a bit of boilerplate code as the buffered I/O path uses
> various helper methods, but the actual code is very straight forward.
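
Just to restate the layout being described, the per-page structure would look
roughly like this (a sketch pieced together from the description and the
helpers quoted below, not necessarily the exact definition in the patch):

struct iomap_page {
	atomic_t	read_count;	/* outstanding read I/O count */
	atomic_t	write_count;	/* outstanding write I/O count */
	DECLARE_BITMAP(uptodate, PAGE_SIZE / SECTOR_SIZE);
};

i.e. one uptodate bit per 512-byte sector, which is where the 128 bits for a
64k page (and 8 bits for a 4k page) come from.
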
> 
> Signed-off-by: Christoph Hellwig 
> ---
>  fs/iomap.c| 247 +++---
>  include/linux/iomap.h |  31 ++
>  2 files changed, 260 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/iomap.c b/fs/iomap.c
> index debb859a8a14..ea746e0287f9 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
...
> @@ -104,6 +105,107 @@ iomap_sector(struct iomap *iomap, loff_t pos)
>   return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
>  }
>  
> +static struct iomap_page *
> +iomap_page_create(struct inode *inode, struct page *page)
> +{
> + struct iomap_page *iop = to_iomap_page(page);
> +
> + if (iop || i_blocksize(inode) == PAGE_SIZE)
> + return iop;
> +
> + iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL);
> + atomic_set(&iop->read_count, 0);
> + atomic_set(&iop->write_count, 0);
> + bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
> + set_page_private(page, (unsigned long)iop);
> + SetPagePrivate(page);

The buffer head implementation does a get/put page when the private
state is set. I'm not quite sure why that is tbh, but do you know
whether we need that here or not?
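
(FWIW, the buffer head side does that in attach_page_buffers(), which is
roughly:

static inline void attach_page_buffers(struct page *page,
		struct buffer_head *head)
{
	get_page(page);		/* the extra page reference in question */
	SetPagePrivate(page);
	set_page_private(page, (unsigned long)head);
}

...so the question is whether iomap_page_create() needs the same get_page()
before setting page private, with a matching put when the iomap_page is torn
down.)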

> + return iop;
> +}
> +
...
> @@ -142,18 +244,19 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>  {
>   struct iomap_readpage_ctx *ctx = data;
>   struct page *page = ctx->cur_page;
> - unsigned poff = pos & (PAGE_SIZE - 1);
> - unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
> + struct iomap_page *iop = iomap_page_create(inode, page);
>   bool is_contig = false;
> + loff_t orig_pos = pos;
> + unsigned poff, plen;
>   sector_t sector;
>  
> - /* we don't support blocksize < PAGE_SIZE quite yet: */
> - WARN_ON_ONCE(pos != page_offset(page));
> - WARN_ON_ONCE(plen != PAGE_SIZE);
> + iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen);
> + if (plen == 0)
> + goto done;
>  
>   if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) {
>   zero_user(page, poff, plen);
> - SetPageUptodate(page);
> + iomap_set_range_uptodate(page, poff, plen);
>   goto done;
>   }
>  
> @@ -169,6 +272,14 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>   is_contig = true;
>   }
>  
> + /*
> +  * If we start a new segment we need to increase the read count, and we
> +  * need to do so before submitting any previous full bio to make sure
> +  * that we don't prematurely unlock the page.
> +  */
> + if (iop)
> + atomic_inc(&iop->read_count);
> +
>   if (!ctx->bio || !is_contig || bio_full(ctx->bio)) {
>   if (ctx->bio)
>   submit_bio(ctx->bio);
> @@ -177,7 +288,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>  
>   __bio_add_page(ctx->bio, page, plen, poff);
>  done:
> - return plen;
> + return pos - orig_pos + plen;

A brief comment here (or above the adjust_read_range() call) to explain
the final length calculation would be helpful. E.g., it looks like
leading uptodate blocks are part of the read while trailing uptodate
blocks can be truncated by the above call.
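
For instance, assuming 1k blocks on a 4k page where block 0 and block 3 are
already uptodate: the adjust call advances pos past block 0 and truncates
plen so that only blocks 1-2 are submitted, and the actor reports
(pos - orig_pos) + plen = 1k + 2k = 3k consumed. The trailing uptodate block
is then skipped as a leading block on the next iteration.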

>  }
>  
>  int
> @@ -188,8 +299,6 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops)
>   unsigned poff;
>   loff_t ret;
>  
> - WARN_ON_ONCE(page_has_buffers(page));
> -
>   for (poff = 0; poff < PAGE_SIZE; poff += ret) {
>   ret = iomap_apply(inode, page_offset(page) + poff,
> PAGE_SIZE - poff, 0, ops, &ctx,
> @@ -295,6 +404,92 @@ iomap_readpages(struct address_space 

Re: [PATCH 08/34] mm: split ->readpages calls to avoid non-contiguous pages lists

2018-05-22 Thread Brian Foster
On Fri, May 18, 2018 at 06:48:04PM +0200, Christoph Hellwig wrote:
> That way file systems don't have to go spotting for non-contiguous pages
> and work around them.  It also kicks off I/O earlier, allowing it to
> finish earlier and reduce latency.
> 
> Signed-off-by: Christoph Hellwig 
> ---
>  mm/readahead.c | 12 +++-
>  1 file changed, 11 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/readahead.c b/mm/readahead.c
> index fa4d4b767130..044ab0c137cc 100644
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -177,8 +177,18 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping,
>   rcu_read_lock();
> page = radix_tree_lookup(&mapping->i_pages, page_offset);
>   rcu_read_unlock();
> - if (page && !radix_tree_exceptional_entry(page))
> + if (page && !radix_tree_exceptional_entry(page)) {
> + /*
> +  * Page already present?  Kick off the current batch of
> +  * contiguous pages before continuing with the next
> +  * batch.
> +  */
> + if (nr_pages)
> + read_pages(mapping, filp, &page_pool, nr_pages,
> + gfp_mask);
> + nr_pages = 0;
>   continue;
> + }

The comment at the top of this function explicitly states that we don't
submit I/Os before all of the pages are allocated. That probably needs
an update, at least.

That aside, couldn't this introduce that kind of problematic read/write
behavior if the mapping was sparsely populated for whatever reason
(every other page, for example)? Perhaps that's just too unlikely to
matter.

Brian

>  
>   page = __page_cache_alloc(gfp_mask);
>   if (!page)
> -- 
> 2.17.0
> 


Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag

2018-05-01 Thread Brian Foster
On Tue, May 01, 2018 at 09:23:20AM +1000, Dave Chinner wrote:
> On Mon, Apr 30, 2018 at 05:00:14PM -0600, Jens Axboe wrote:
> > On 4/30/18 4:40 PM, Jens Axboe wrote:
> > > On 4/30/18 4:28 PM, Dave Chinner wrote:
> > >> Yes, it does, but so would having the block layer to throttle device
> > >> discard requests in flight to a queue depth of 1. And then we don't
> > >> have to change XFS at all.
> > > 
> > > I'm perfectly fine with making that change by default, and much easier
> > > for me since I don't have to patch file systems.
> > 
> > Totally untested, but this should do the trick. It ensures we have
> > a QD of 1 (per caller), which should be sufficient.
> > 
> > If people tune down the discard size, then you'll be blocking waiting
> > for discards on issue.
> > 
> > diff --git a/block/blk-lib.c b/block/blk-lib.c
> > index a676084d4740..0bf9befcc863 100644
> > --- a/block/blk-lib.c
> > +++ b/block/blk-lib.c
> > @@ -11,16 +11,19 @@
> >  #include "blk.h"
> >  
> >  static struct bio *next_bio(struct bio *bio, unsigned int nr_pages,
> > -   gfp_t gfp)
> > +   gfp_t gfp)
> >  {
> > -   struct bio *new = bio_alloc(gfp, nr_pages);
> > -
> > +   /*
> > +* Devices suck at discard, so if we have to break up the bio
> > +* size due to the max discard size setting, wait for the
> > +* previous one to finish first.
> > +*/
> > if (bio) {
> > -   bio_chain(bio, new);
> > -   submit_bio(bio);
> > +   submit_bio_wait(bio);
> > +   bio_put(bio);
> > }
> 
> This only addresses the case where __blkdev_issue_discard() breaks
> up a single large discard, right? It seems like a brute force
> solution, too, because it will do so even when the underlying device
> is idle and there's no need to throttle.
> 

I think the above serializes the discards regardless of whether
__blkdev_issue_discard() splits up a single range itself or the caller
passes discontiguous ranges as part of a single chain.

That said, I agree that it seems brute force. ISTM that this wouldn't
address separate callers causing the same problem (i.e., if the discard
burst doesn't happen to be part of a single chain)...? Alternatively,
what if the discard chain is large but covers a very small range (e.g.,
consider a chain of single block discards, for example)? Given your
previous comments around the size of the range being the determining
factor, would we really want to serialize in that case?

BTW, what happens with submit_bio_wait() if the block device is plugged?
Note that xlog_discard_busy_extents() currently plugs around the discard
bio chain construction.

> Shouldn't the throttling logic at least look at device congestion?
> i.e. if the device is not backlogged, then we should be able to
> issue the discard without problems. 
> 
> I ask this because this only addresses throttling the "discard large
> extent" case when the discard limit is set low. i.e. your exact
> problem case. We know that XFS can issue large numbers of
> discontiguous async discards in a single batch - this patch does not
> address that case and so it will still cause starvation problems.
> 
> If we look at device congestion in determining how to throttle/back
> off during discard issuing, then it doesn't matter what
> max_discard_sectors is set to - it will throttle in all situations
> that cause device overloads and starvations
> 

Indeed, this all seems more robust to me. TBH, even something more
simple like allowing one discard chain at a time where the chain itself
is size limited (and so puts the onus on the caller a bit to construct
larger chains to trade off for more simple block layer throttling logic)
seems reasonable, but I'm not sure if/how feasible that is (or if it
really is any more simple). Just a thought.

Note that however the implementation, I think that channeling the
backpressure in the form of sync submission has the potential to
reintroduce the filesystem latencies that have been brought up a couple
times in this thread. Unless I'm missing something, I still think there's
value in trying to separate discard submission from log I/O completion
in XFS.

Dave, do you have any fundamental thoughts on that? For example, I'm
wondering if there's any danger in creating a sort of intermediary busy
state between log I/O completion and some time later when a background
wq job or something discards the pending busy extents and removes them
from the pag tree. That actually might open the door for more
interesting optimizations like eliminating the need for certain discards
at all if the associated blocks are reallocated (after the associated log
I/O completed but before the background discard happened to occur).
Actually, I'm told that explicit reuse of such blocks might be a
significant win for things like VDO (dm dedup), where discards are
apparently extra double ungood. Thoughts?
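
To make the handwaving a bit more concrete, I'm picturing something along
these lines -- purely a sketch, where the m_discard_* fields, the workqueue
and all of the error handling and throttling are hypothetical:

/*
 * Log I/O completion queues the busy extents and returns immediately; a
 * worker issues the discards later and only then clears the busy state.
 */
static void
xfs_discard_worker(
	struct work_struct	*work)
{
	struct xfs_mount	*mp = container_of(work, struct xfs_mount,
						   m_discard_work);
	struct xfs_extent_busy	*busyp;
	struct bio		*bio = NULL;
	LIST_HEAD(list);

	spin_lock(&mp->m_discard_lock);
	list_splice_init(&mp->m_discard_list, &list);
	spin_unlock(&mp->m_discard_lock);

	list_for_each_entry(busyp, &list, list)
		__blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
				XFS_FSB_TO_BB(mp, busyp->length),
				GFP_NOFS, 0, &bio);	/* errors ignored here */
	if (bio) {
		submit_bio_wait(bio);
		bio_put(bio);
	}

	/* only now release the busy state for these extents */
	xfs_extent_busy_clear(mp, &list, false);
}

That would also be the natural place to skip the discard entirely for extents
that were reallocated in the meantime, as described above.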

Brian

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> da...@fromorbit.com

Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag

2018-04-30 Thread Brian Foster
On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote:
> On 4/30/18 11:19 AM, Brian Foster wrote:
> > On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
> >> XFS recently added support for async discards. While this can be
> >> a win for some workloads and devices, there are also cases where
> >> async bursty discard will severly harm the latencies of reads
> >> and writes.
> >>
> >> Add a 'discard_sync' mount flag to revert to using sync discard,
> >> issuing them one at the time and waiting for each one. This fixes
> >> a big performance regression we had moving to kernels that include
> >> the XFS async discard support.
> >>
> >> Signed-off-by: Jens Axboe <ax...@kernel.dk>
> >> ---
> > 
> > Hm, I figured the async discard stuff would have been a pretty clear win
> > all around, but then again I'm not terribly familiar with what happens
> > with discards beneath the fs. I do know that the previous behavior would
> > cause fs level latencies due to holding up log I/O completion while
> > discards completed one at a time. My understanding is that this led to
> > online discard being pretty much universally "not recommended" in favor
> > of fstrim.
> 
> It's not a secret that most devices suck at discard. While the async
> discard is nifty and I bet works well for some cases, it can also cause
> a flood of discards on the device side which does not work well for
> other cases.
> 

Heh, Ok.

> > Do you have any more data around the workload where the old sync discard
> > behavior actually provides an overall win over the newer behavior? Is it
> > purely a matter of workload, or is it a workload+device thing with how
> > discards may be handled by certain devices?
> 
> The worse read latencies were observed on more than one device type,
> making it sync again was universally a win. We've had many issues
> with discard, one trick that is often used is to chop up file deletion
> into smaller chunks. Say you want to kill 10GB of data, you do it
> incrementally, since 10G of discard usually doesn't behave very nicely.
> If you make that async, then you're back to square one.
> 

Makes sense, so there's not much win in chopping up huge discard ranges
into smaller, async requests that cover the same overall size/range..

> > I'm ultimately not against doing this if it's useful for somebody and is
> > otherwise buried under a mount option, but it would be nice to see if
> > there's opportunity to improve the async mechanism before resorting to
> > that. Is the issue here too large bio chains, too many chains at once,
> > or just simply too many discards (regardless of chaining) at the same
> > time?
> 
> Well, ultimately you'd need better scheduling of the discards, but for
> most devices what really works the best is a simple "just do one at
> the time". The size constraint was added to further limit the impact.
> 

... but presumably there is some value in submitting some number of
requests together provided they adhere to some size constraint..? Is
there a typical size constraint for the average ssd, or is this value
all over the place? (Is there a field somewhere in the bdev that the fs
can query?)

(I guess I'll defer to Christoph's input on this, I assume he measured
some kind of improvement in the previous async work..)

> Honestly, I think the only real improvement would be on the device
> side. Most folks want discard as an advisory hint, and it should not
> impact current workloads at all. In reality, many implementations
> are much more strict and even include substantial flash writes. For
> the cases where we can't just turn it off (I'd love to), we at least
> need to make it less intrusive.
> 
> > I'm wondering if xfs could separate discard submission from log I/O
> > completion entirely and then perhaps limit/throttle submission somehow
> > or another (Christoph, thoughts?) via a background task. Perhaps doing
> > something like that may even eliminate the need for some discards on
> > busy filesystems with a lot of block free -> reallocation activity, but
> > I'm just handwaving atm.
> 
> Just having the sync vs async option is the best split imho. The async
> could potentially be scheduled. I don't think more involved logic
> belongs in the fs.
> 

The more interesting part to me is whether we can safely separate
discard from log I/O completion in XFS. Then we can release the log
buffer locks and whatnot and let the fs proceed without waiting on any
number of discards to complete. In theory, I think the background task
could issue discards one at a time (or N at a time, or N blocks at a
time, whateve

Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag

2018-04-30 Thread Brian Foster
On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
> XFS recently added support for async discards. While this can be
> a win for some workloads and devices, there are also cases where
> async bursty discard will severly harm the latencies of reads
> and writes.
> 
> Add a 'discard_sync' mount flag to revert to using sync discard,
> issuing them one at the time and waiting for each one. This fixes
> a big performance regression we had moving to kernels that include
> the XFS async discard support.
> 
> Signed-off-by: Jens Axboe 
> ---

Hm, I figured the async discard stuff would have been a pretty clear win
all around, but then again I'm not terribly familiar with what happens
with discards beneath the fs. I do know that the previous behavior would
cause fs level latencies due to holding up log I/O completion while
discards completed one at a time. My understanding is that this led to
online discard being pretty much universally "not recommended" in favor
of fstrim.

Do you have any more data around the workload where the old sync discard
behavior actually provides an overall win over the newer behavior? Is it
purely a matter of workload, or is it a workload+device thing with how
discards may be handled by certain devices?

I'm ultimately not against doing this if it's useful for somebody and is
otherwise buried under a mount option, but it would be nice to see if
there's opportunity to improve the async mechanism before resorting to
that. Is the issue here too large bio chains, too many chains at once,
or just simply too many discards (regardless of chaining) at the same
time?

I'm wondering if xfs could separate discard submission from log I/O
completion entirely and then perhaps limit/throttle submission somehow
or another (Christoph, thoughts?) via a background task. Perhaps doing
something like that may even eliminate the need for some discards on
busy filesystems with a lot of block free -> reallocation activity, but
I'm just handwaving atm.

Brian

>  fs/xfs/xfs_log_cil.c | 18 ++
>  fs/xfs/xfs_mount.h   |  1 +
>  fs/xfs/xfs_super.c   |  7 ++-
>  3 files changed, 21 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> index 4668403b1741..4eced3eceea5 100644
> --- a/fs/xfs/xfs_log_cil.c
> +++ b/fs/xfs/xfs_log_cil.c
> @@ -552,10 +552,16 @@ xlog_discard_busy_extents(
>   struct bio  *bio = NULL;
>   struct blk_plug plug;
>   int error = 0;
> + int discard_flags = 0;
> + bool sync_discard = mp->m_flags & XFS_MOUNT_DISCARD_SYNC;
>  
>   ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
>  
> - blk_start_plug(&plug);
> + if (!sync_discard)
> + blk_start_plug(&plug);
> + else
> + discard_flags = BLKDEV_DISCARD_SYNC;
> +
>   list_for_each_entry(busyp, list, list) {
>   trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
>busyp->length);
> @@ -563,7 +569,7 @@ xlog_discard_busy_extents(
>   error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
>   XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
>   XFS_FSB_TO_BB(mp, busyp->length),
> - GFP_NOFS, 0, &bio);
> + GFP_NOFS, discard_flags, &bio);
>   if (error && error != -EOPNOTSUPP) {
>   xfs_info(mp,
>"discard failed for extent [0x%llx,%u], error %d",
> @@ -574,14 +580,18 @@ xlog_discard_busy_extents(
>   }
>   }
>  
> - if (bio) {
> + if (sync_discard) {
> + xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
> + kmem_free(ctx);
> + } else if (bio) {
>   bio->bi_private = ctx;
>   bio->bi_end_io = xlog_discard_endio;
>   submit_bio(bio);
> + blk_finish_plug(&plug);
>   } else {
>   xlog_discard_endio_work(&ctx->discard_endio_work);
> + blk_finish_plug(&plug);
>   }
> - blk_finish_plug(&plug);
>  }
>  
>  /*
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 10b90bbc5162..3f0b7b9106c7 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -219,6 +219,7 @@ typedef struct xfs_mount {
>  operations, typically for
>  disk errors in metadata */
>  #define XFS_MOUNT_DISCARD	(1ULL << 5)	/* discard unused blocks */
> +#define XFS_MOUNT_DISCARD_SYNC	(1ULL << 6)	/* discard unused blocks sync */
>  #define XFS_MOUNT_NOALIGN	(1ULL << 7)	/* turn off stripe alignment
>  allocations */
>  #define XFS_MOUNT_ATTR2	(1ULL << 8)	/* allow use of attr2 format */
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 

Re: [PATCH] A tentative patch to bypass REQ_META in blk throttle

2018-04-27 Thread Brian Foster
On Thu, Apr 26, 2018 at 12:01:18PM -0700, Tejun Heo wrote:
> Hello,
> 
> On Fri, Apr 20, 2018 at 06:06:01PM +0800, Benlong Zhang wrote:
> > One problem with cgwb is how fs should treat metadata bios.
> > For example in xfs, the log might be partially stuck in one
> > group, leaving threads in other groups waiting for too long.
> > Please refer to the linux-xfs discussion:
> > "[PATCH V2] xfs: implement cgroup writeback support"
> > 
> > One approach is to correctly tag bio->bi_css from within the
> > filesystems (for xfs log should be blkcg_root), and the other
> > is to skip them, but relies on REQ_META being set.
> > 
> > It works with xfs on a cgwb porting implementation to 3.10.0.
> > But really not sure about other filesystems...
> 
> Yeah, Josef (cc'd) is working on a similar approach but in a more
> generic way.
> 

I thought Josef's work had more to do with tracking/managing dirty
metadata (i.e., driving writeback/reclaim and whatnot). Perhaps I just
haven't followed it close enough... how does that relate to cgroup bio
ownership?

The issue addressed by this patch (wrt to XFS) is that log I/O that
happens to be submitted via a throttled cgroup ends up blocking
unthrottled tasks because the log is essentially a global/shared
mechanism. We therefore want to somehow exempt metadata I/O from
throttling in that particular case.

Note that I'm not familiar enough with the block layer code to know
whether this patch is the right way to do that, but I think (as noted in
the commit log above) that skipping the default bio association for
metadata I/O when the fs hasn't set one (or associating such I/O with the
root group) would also be sufficient. Hm?
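
For illustration, the bypass under discussion boils down to an early check
along these lines somewhere in the throttling path (a sketch, not the actual
patch, which isn't quoted here):

/* sketch: exempt metadata I/O (e.g. the XFS log) from cgroup throttling */
static inline bool bio_throttle_exempt(struct bio *bio)
{
	return (bio->bi_opf & REQ_META) != 0;
}

...with the open question being whether REQ_META is set consistently enough
across filesystems to key off of.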

Brian

> Thanks.
> 
> -- 
> tejun


Re: [PATCH v2] xfs: fix incorrect log_flushed on fsync

2017-09-06 Thread Brian Foster
On Wed, Sep 06, 2017 at 03:19:37PM +0200, Christoph Hellwig wrote:
> > You could also say that flush sequence counting code doesn't belong
> > to xfs code at all. There is nothing xfs specific about it.
> > 
> > If we had an API:
> > 
> > flush_seq = blkdev_get_flush_seq(bdev, flush_seq);
> > blkdev_issue_flush_after(bdev, flush_seq);
> > 
> > I am sure it would have been useful to more fs.
> > 
> > In fact, block drivers that use blk_insert_flush(),
> > already have serialized flush requests, so implementing
> > the above functionality would be quite trivial for those.
> > 
> > I am not fluent enough in block layer to say if this makes sense.
> > Christoph? Should we add some block people to this discussion?
> 
> Note that the interface can't be based on blkdev_issue_flush as
> our most important flushes are submitted asynchronously.
> 
> But except for that tying into the flush state machine sounds
> very interesting.  Let me look into that as the designate
> xfs <-> block layer liaison.

That sounds like a nice idea to me, particularly if there is potential
for other users. I'll wait to look into doing anything in XFS until we
see how this turns out. Thanks.
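
(To sketch how a filesystem might consume that -- the two calls and the
flush_seq variable are all hypothetical, per the proposal quoted above:

	/* at log I/O completion: remember where the flush sequence stood */
	flush_seq = blkdev_get_flush_seq(bdev, flush_seq);

	/* at fsync: a no-op if a newer flush already covers that sequence */
	error = blkdev_issue_flush_after(bdev, flush_seq);

...i.e. the "has our data already been flushed" bookkeeping would live in the
block layer flush state machine rather than in the xfs log code.)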

Brian



Re: 4.11.0-rc1 boot resulted in WARNING: CPU: 14 PID: 1722 at fs/sysfs/dir.c:31 .sysfs_warn_dup+0x78/0xb0

2017-03-09 Thread Brian Foster
cc linux-block

On Thu, Mar 09, 2017 at 04:20:06PM +0530, Abdul Haleem wrote:
> On Wed, 2017-03-08 at 08:17 -0500, Brian Foster wrote:
> > On Tue, Mar 07, 2017 at 10:01:04PM +0530, Abdul Haleem wrote:
> > > 
> > > Hi,
> > > 
> > > Today's mainline (4.11.0-rc1) booted with warnings on Power7 LPAR.
> > > 
> > > Issue is not reproducible all the time.
> > > 
> > > traces:
> > > 
> > > Found device VDASD 5.
> > > Mounting /home...
> > > Reached target Swap.
> > > Found device VDASD 2.
> > > 
> > > Mounting /boot...
> > > 
> > > sysfs: cannot create duplicate filename '/fs/xfs/sda'
> > 
> > That is strange. The sysfs name is ultimately based on the superblock id
> > (sb->s_id), which afaik should reflect the underlying device and thus be
> > unique. Just to be sure, do you otherwise have a mounted and functional
> > sysfs dir?
> > 
> > I assume after this boot you have a mounted 'sda' device somewhere.. If
> > so, what content already exists under the sysfs fs/xfs/sda subdir when
> > the duplicate warning occurs? Has the associated device been
> > mounted/unmounted before this occurs, or is there anything abnormal
> > going on during boot up, such as device repartitioning, devices coming
> > and going, etc..?
> 
> Brian, Thanks for looking into this issue.
> 
> No, the system is not in same state, it was overwritten by other runs of
> Automation framework.
> 
> However the boot logs are preserved and attached here.
> 
> > 
> > What does 'mount' show for this system when the problem occurs vs. when
> > it doesn't? It might be useful to post the full boot log for when this
> > occurs as well.
> 
> I do not have any sysfs info when warning occurs, but on a normal system
> (with out warning):
> 
> # mount
> sysfs on /sys type sysfs (rw,nosuid,nodev,noexec,relatime)
> proc on /proc type proc (rw,nosuid,nodev,noexec,relatime)
> devtmpfs on /dev type devtmpfs
> (rw,nosuid,size=41833088k,nr_inodes=653642,mode=755)
> securityfs on /sys/kernel/security type securityfs
> (rw,nosuid,nodev,noexec,relatime)
> tmpfs on /dev/shm type tmpfs (rw,nosuid,nodev)
> devpts on /dev/pts type devpts
> (rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=000)
> tmpfs on /run type tmpfs (rw,nosuid,nodev,mode=755)
> tmpfs on /sys/fs/cgroup type tmpfs (ro,nosuid,nodev,noexec,mode=755)
> cgroup on /sys/fs/cgroup/systemd type cgroup
> (rw,nosuid,nodev,noexec,relatime,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd)
> pstore on /sys/fs/pstore type pstore (rw,nosuid,nodev,noexec,relatime)
> cgroup on /sys/fs/cgroup/cpuset type cgroup
> (rw,nosuid,nodev,noexec,relatime,cpuset)
> cgroup on /sys/fs/cgroup/cpu,cpuacct type cgroup
> (rw,nosuid,nodev,noexec,relatime,cpu,cpuacct)
> cgroup on /sys/fs/cgroup/freezer type cgroup
> (rw,nosuid,nodev,noexec,relatime,freezer)
> cgroup on /sys/fs/cgroup/hugetlb type cgroup
> (rw,nosuid,nodev,noexec,relatime,hugetlb)
> cgroup on /sys/fs/cgroup/devices type cgroup
> (rw,nosuid,nodev,noexec,relatime,devices)
> cgroup on /sys/fs/cgroup/net_cls type cgroup
> (rw,nosuid,nodev,noexec,relatime,net_cls)
> cgroup on /sys/fs/cgroup/blkio type cgroup
> (rw,nosuid,nodev,noexec,relatime,blkio)
> cgroup on /sys/fs/cgroup/memory type cgroup
> (rw,nosuid,nodev,noexec,relatime,memory)
> cgroup on /sys/fs/cgroup/perf_event type cgroup
> (rw,nosuid,nodev,noexec,relatime,perf_event)
> configfs on /sys/kernel/config type configfs (rw,relatime)
> /dev/sda3 on / type ext4 (rw,relatime,data=ordered)
> systemd-1 on /proc/sys/fs/binfmt_misc type autofs
> (rw,relatime,fd=24,pgrp=1,timeout=300,minproto=5,maxproto=5,direct)
> debugfs on /sys/kernel/debug type debugfs (rw,relatime)
> mqueue on /dev/mqueue type mqueue (rw,relatime)
> hugetlbfs on /dev/hugepages type hugetlbfs (rw,relatime)
> binfmt_misc on /proc/sys/fs/binfmt_misc type binfmt_misc (rw,relatime)
> sunrpc on /var/lib/nfs/rpc_pipefs type rpc_pipefs (rw,relatime)
> nfsd on /proc/fs/nfsd type nfsd (rw,relatime)
> /dev/sda2 on /boot type xfs (rw,relatime,attr2,inode64,noquota)
> /dev/sda5 on /home type xfs (rw,relatime,attr2,inode64,noquota)

So you have two XFS filesystems mounted on partitions of this device,
sda2 and sda5.

> tmpfs on /run/user/0 type tmpfs
> (rw,nosuid,nodev,relatime,size=8375680k,mode=700)
> 
> # ls /sys/fs/xfs
> sda  sda5  stats
> 

Yet under /sys/fs/xfs we have sda and sda5. Presumably sda here refers
to sda2.

> # df -h
> Filesystem  Size  Used Avail Use% Mounted on
> devtmpfs 40G 0   40G   0% /dev
> tmpfs40G 0   40G   0% /dev/shm
>