Re: [PATCH v2 1/3] btrfs: remove always true if branch in find_delalloc_range

2018-11-28 Thread Nikolay Borisov



On 29.11.18 г. 5:33 ч., Lu Fengqi wrote:
> The @found is always false when it comes to the if branch. Besides, the
> bool type is more suitable for @found. Change the return value of the
> function and its caller to bool as well.
> 
> Signed-off-by: Lu Fengqi 

Reviewed-by: Nikolay Borisov 

> ---
>  fs/btrfs/extent_io.c | 31 +++
>  fs/btrfs/extent_io.h |  2 +-
>  fs/btrfs/tests/extent-io-tests.c |  2 +-
>  3 files changed, 17 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index b2769e92b556..4b6b87e63b4a 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -1452,16 +1452,16 @@ int find_first_extent_bit(struct extent_io_tree 
> *tree, u64 start,
>   * find a contiguous range of bytes in the file marked as delalloc, not
>   * more than 'max_bytes'.  start and end are used to return the range,
>   *
> - * 1 is returned if we find something, 0 if nothing was in the tree
> + * true is returned if we find something, false if nothing was in the tree
>   */
> -static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
> +static noinline bool find_delalloc_range(struct extent_io_tree *tree,
>   u64 *start, u64 *end, u64 max_bytes,
>   struct extent_state **cached_state)
>  {
>   struct rb_node *node;
>   struct extent_state *state;
>   u64 cur_start = *start;
> - u64 found = 0;
> + bool found = false;
>   u64 total_bytes = 0;
>  
>   spin_lock(&tree->lock);
> @@ -1472,8 +1472,7 @@ static noinline u64 find_delalloc_range(struct 
> extent_io_tree *tree,
>*/
>   node = tree_search(tree, cur_start);
>   if (!node) {
> - if (!found)
> - *end = (u64)-1;
> + *end = (u64)-1;
>   goto out;
>   }
>  
> @@ -1493,7 +1492,7 @@ static noinline u64 find_delalloc_range(struct 
> extent_io_tree *tree,
>   *cached_state = state;
>   refcount_inc(&state->refs);
>   }
> - found++;
> + found = true;
>   *end = state->end;
>   cur_start = state->end + 1;
>   node = rb_next(node);
> @@ -1551,13 +1550,13 @@ static noinline int lock_delalloc_pages(struct inode 
> *inode,
>  }
>  
>  /*
> - * find a contiguous range of bytes in the file marked as delalloc, not
> - * more than 'max_bytes'.  start and end are used to return the range,
> + * find and lock a contiguous range of bytes in the file marked as delalloc,
> + * not more than 'max_bytes'.  start and end are used to return the range,
>   *
> - * 1 is returned if we find something, 0 if nothing was in the tree
> + * true is returned if we find something, false if nothing was in the tree
>   */
>  EXPORT_FOR_TESTS
> -noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode,
> +noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
>   struct extent_io_tree *tree,
>   struct page *locked_page, u64 *start,
>   u64 *end)
> @@ -1565,7 +1564,7 @@ noinline_for_stack u64 find_lock_delalloc_range(struct 
> inode *inode,
>   u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
>   u64 delalloc_start;
>   u64 delalloc_end;
> - u64 found;
> + bool found;
>   struct extent_state *cached_state = NULL;
>   int ret;
>   int loops = 0;
> @@ -1580,7 +1579,7 @@ noinline_for_stack u64 find_lock_delalloc_range(struct 
> inode *inode,
>   *start = delalloc_start;
>   *end = delalloc_end;
>   free_extent_state(cached_state);
> - return 0;
> + return false;
>   }
>  
>   /*
> @@ -1612,7 +1611,7 @@ noinline_for_stack u64 find_lock_delalloc_range(struct 
> inode *inode,
>   loops = 1;
>   goto again;
>   } else {
> - found = 0;
> + found = false;
>   goto out_failed;
>   }
>   }
> @@ -3195,7 +3194,7 @@ static noinline_for_stack int writepage_delalloc(struct 
> inode *inode,
>  {
>   struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
>   u64 page_end = delalloc_start + PAGE_SIZE - 1;
> - u64 nr_delalloc;
> + bool found;
>   u64 delalloc_to_write = 0;
>   u64 delalloc_end = 0;
>   int ret;
> @@ -3203,11 +3202,11 @@ static noinline_for_stack int 
> writepage_delalloc(struct inode *inode,
>  
>  
>   while (delalloc_end < page_end) {
> - nr_delalloc = find_lock_delalloc_range(inode, tree,
> + found = find_lock_delalloc_range(inode, tree,
>  page,
>  &delalloc_start,
>  &delalloc_end);
> - if (nr_delalloc == 0) 

[PATCH v2 1/3] btrfs: remove always true if branch in find_delalloc_range

2018-11-28 Thread Lu Fengqi
The @found is always false when it comes to the if branch. Besides, the
bool type is more suitable for @found. Change the return value of the
function and its caller to bool as well.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/extent_io.c | 31 +++
 fs/btrfs/extent_io.h |  2 +-
 fs/btrfs/tests/extent-io-tests.c |  2 +-
 3 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b2769e92b556..4b6b87e63b4a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1452,16 +1452,16 @@ int find_first_extent_bit(struct extent_io_tree *tree, 
u64 start,
  * find a contiguous range of bytes in the file marked as delalloc, not
  * more than 'max_bytes'.  start and end are used to return the range,
  *
- * 1 is returned if we find something, 0 if nothing was in the tree
+ * true is returned if we find something, false if nothing was in the tree
  */
-static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
+static noinline bool find_delalloc_range(struct extent_io_tree *tree,
u64 *start, u64 *end, u64 max_bytes,
struct extent_state **cached_state)
 {
struct rb_node *node;
struct extent_state *state;
u64 cur_start = *start;
-   u64 found = 0;
+   bool found = false;
u64 total_bytes = 0;
 
	spin_lock(&tree->lock);
@@ -1472,8 +1472,7 @@ static noinline u64 find_delalloc_range(struct 
extent_io_tree *tree,
 */
node = tree_search(tree, cur_start);
if (!node) {
-   if (!found)
-   *end = (u64)-1;
+   *end = (u64)-1;
goto out;
}
 
@@ -1493,7 +1492,7 @@ static noinline u64 find_delalloc_range(struct 
extent_io_tree *tree,
*cached_state = state;
			refcount_inc(&state->refs);
}
-   found++;
+   found = true;
*end = state->end;
cur_start = state->end + 1;
node = rb_next(node);
@@ -1551,13 +1550,13 @@ static noinline int lock_delalloc_pages(struct inode 
*inode,
 }
 
 /*
- * find a contiguous range of bytes in the file marked as delalloc, not
- * more than 'max_bytes'.  start and end are used to return the range,
+ * find and lock a contiguous range of bytes in the file marked as delalloc,
+ * not more than 'max_bytes'.  start and end are used to return the range,
  *
- * 1 is returned if we find something, 0 if nothing was in the tree
+ * true is returned if we find something, false if nothing was in the tree
  */
 EXPORT_FOR_TESTS
-noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode,
+noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
u64 *end)
@@ -1565,7 +1564,7 @@ noinline_for_stack u64 find_lock_delalloc_range(struct 
inode *inode,
u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
u64 delalloc_start;
u64 delalloc_end;
-   u64 found;
+   bool found;
struct extent_state *cached_state = NULL;
int ret;
int loops = 0;
@@ -1580,7 +1579,7 @@ noinline_for_stack u64 find_lock_delalloc_range(struct 
inode *inode,
*start = delalloc_start;
*end = delalloc_end;
free_extent_state(cached_state);
-   return 0;
+   return false;
}
 
/*
@@ -1612,7 +1611,7 @@ noinline_for_stack u64 find_lock_delalloc_range(struct 
inode *inode,
loops = 1;
goto again;
} else {
-   found = 0;
+   found = false;
goto out_failed;
}
}
@@ -3195,7 +3194,7 @@ static noinline_for_stack int writepage_delalloc(struct 
inode *inode,
 {
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
u64 page_end = delalloc_start + PAGE_SIZE - 1;
-   u64 nr_delalloc;
+   bool found;
u64 delalloc_to_write = 0;
u64 delalloc_end = 0;
int ret;
@@ -3203,11 +3202,11 @@ static noinline_for_stack int writepage_delalloc(struct 
inode *inode,
 
 
while (delalloc_end < page_end) {
-   nr_delalloc = find_lock_delalloc_range(inode, tree,
+   found = find_lock_delalloc_range(inode, tree,
   page,
			   &delalloc_start,
			   &delalloc_end);
-   if (nr_delalloc == 0) {
+   if (!found) {
delalloc_start = delalloc_end + 1;
continue;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h

Re: [PATCH 2/3] btrfs: wakeup cleaner thread when adding delayed iput

2018-11-28 Thread Qu Wenruo



On 2018/11/29 上午4:08, Filipe Manana wrote:
> On Wed, Nov 28, 2018 at 7:09 PM David Sterba  wrote:
>>
>> On Tue, Nov 27, 2018 at 03:08:08PM -0500, Josef Bacik wrote:
>>> On Tue, Nov 27, 2018 at 07:59:42PM +, Chris Mason wrote:
 On 27 Nov 2018, at 14:54, Josef Bacik wrote:

> On Tue, Nov 27, 2018 at 10:26:15AM +0200, Nikolay Borisov wrote:
>>
>>
>> On 21.11.18 г. 21:09 ч., Josef Bacik wrote:
>>> The cleaner thread usually takes care of delayed iputs, with the
>>> exception of the btrfs_end_transaction_throttle path.  The cleaner
>>> thread only gets woken up every 30 seconds, so instead wake it up to
>>> do
>>> it's work so that we can free up that space as quickly as possible.
>>
>> Have you done any measurements how this affects the overall system. I
>> suspect this introduces a lot of noise since now we are going to be
>> doing a thread wakeup on every iput, does this give a chance to have
>> nice, large batches of iputs that  the cost of wake up can be
>> amortized
>> across?
>
> I ran the whole patchset with our A/B testing stuff and the patchset
> was a 5%
> win overall, so I'm inclined to think it's fine.  Thanks,

 It's a good point though, a delayed wakeup may be less overhead.
>>>
>>> Sure, but how do we go about that without it sometimes messing up?  In 
>>> practice
>>> the only time we're doing this is at the end of finish_ordered_io, so 
>>> likely to
>>> not be a constant stream.  I suppose since we have places where we force it 
>>> to
>>> run that we don't really need this.  IDK, I'm fine with dropping it.  
>>> Thanks,
>>
>> The transaction thread wakes up cleaner periodically (commit interval,
>> 30s by default), so the time to process iputs is not unbounded.
>>
>> I have the same concerns as Nikolay, coupling the wakeup to all delayed
>> iputs could result in smaller batches. But some of the callers of
>> btrfs_add_delayed_iput might benefit from the extra wakeup, like
>> btrfs_remove_block_group, so I don't want to leave the idea yet.
> 
> I'm curious, why do you think it would benefit btrfs_remove_block_group()?

Just as Filipe said, I'm not sure why btrfs_remove_block_group() would
get some benefit from more frequent cleaner thread wake up.

For an empty block group to really be removed, it also needs to have 0
pinned bytenr, which is only possible after a transaction get committed.

Thanks,
Qu


Re: [PATCH 2/3] btrfs: wakeup cleaner thread when adding delayed iput

2018-11-28 Thread Filipe Manana
On Wed, Nov 28, 2018 at 7:09 PM David Sterba  wrote:
>
> On Tue, Nov 27, 2018 at 03:08:08PM -0500, Josef Bacik wrote:
> > On Tue, Nov 27, 2018 at 07:59:42PM +, Chris Mason wrote:
> > > On 27 Nov 2018, at 14:54, Josef Bacik wrote:
> > >
> > > > On Tue, Nov 27, 2018 at 10:26:15AM +0200, Nikolay Borisov wrote:
> > > >>
> > > >>
> > > >> On 21.11.18 г. 21:09 ч., Josef Bacik wrote:
> > > >>> The cleaner thread usually takes care of delayed iputs, with the
> > > >>> exception of the btrfs_end_transaction_throttle path.  The cleaner
> > > >>> thread only gets woken up every 30 seconds, so instead wake it up to
> > > >>> do
> > > >>> it's work so that we can free up that space as quickly as possible.
> > > >>
> > > >> Have you done any measurements how this affects the overall system. I
> > > >> suspect this introduces a lot of noise since now we are going to be
> > > >> doing a thread wakeup on every iput, does this give a chance to have
> > > >> nice, large batches of iputs that  the cost of wake up can be
> > > >> amortized
> > > >> across?
> > > >
> > > > I ran the whole patchset with our A/B testing stuff and the patchset
> > > > was a 5%
> > > > win overall, so I'm inclined to think it's fine.  Thanks,
> > >
> > > It's a good point though, a delayed wakeup may be less overhead.
> >
> > Sure, but how do we go about that without it sometimes messing up?  In 
> > practice
> > the only time we're doing this is at the end of finish_ordered_io, so 
> > likely to
> > not be a constant stream.  I suppose since we have places where we force it 
> > to
> > run that we don't really need this.  IDK, I'm fine with dropping it.  
> > Thanks,
>
> The transaction thread wakes up cleaner periodically (commit interval,
> 30s by default), so the time to process iputs is not unbounded.
>
> I have the same concerns as Nikolay, coupling the wakeup to all delayed
> iputs could result in smaller batches. But some of the callers of
> btrfs_add_delayed_iput might benefit from the extra wakeup, like
> btrfs_remove_block_group, so I don't want to leave the idea yet.

I'm curious, why do you think it would benefit btrfs_remove_block_group()?



-- 
Filipe David Manana,

“Whether you think you can, or you think you can't — you're right.”


Re: [PATCH 2/3] btrfs: wakeup cleaner thread when adding delayed iput

2018-11-28 Thread Chris Mason
On 28 Nov 2018, at 14:06, David Sterba wrote:

> On Tue, Nov 27, 2018 at 03:08:08PM -0500, Josef Bacik wrote:
>> On Tue, Nov 27, 2018 at 07:59:42PM +, Chris Mason wrote:
>>> On 27 Nov 2018, at 14:54, Josef Bacik wrote:
>>>
 On Tue, Nov 27, 2018 at 10:26:15AM +0200, Nikolay Borisov wrote:
>
>
> On 21.11.18 г. 21:09 ч., Josef Bacik wrote:
>> The cleaner thread usually takes care of delayed iputs, with the
>> exception of the btrfs_end_transaction_throttle path.  The 
>> cleaner
>> thread only gets woken up every 30 seconds, so instead wake it up 
>> to
>> do
>> it's work so that we can free up that space as quickly as 
>> possible.
>
> Have you done any measurements how this affects the overall 
> system. I
> suspect this introduces a lot of noise since now we are going to 
> be
> doing a thread wakeup on every iput, does this give a chance to 
> have
> nice, large batches of iputs that  the cost of wake up can be
> amortized
> across?

 I ran the whole patchset with our A/B testing stuff and the 
 patchset
 was a 5%
 win overall, so I'm inclined to think it's fine.  Thanks,
>>>
>>> It's a good point though, a delayed wakeup may be less overhead.
>>
>> Sure, but how do we go about that without it sometimes messing up?  
>> In practice
>> the only time we're doing this is at the end of finish_ordered_io, so 
>> likely to
>> not be a constant stream.  I suppose since we have places where we 
>> force it to
>> run that we don't really need this.  IDK, I'm fine with dropping it.  
>> Thanks,
>
> The transaction thread wakes up cleaner periodically (commit interval,
> 30s by default), so the time to process iputs is not unbounded.
>
> I have the same concerns as Nikolay, coupling the wakeup to all 
> delayed
> iputs could result in smaller batches. But some of the callers of
> btrfs_add_delayed_iput might benefit from the extra wakeup, like
> btrfs_remove_block_group, so I don't want to leave the idea yet.

Yeah, I love the idea, I'm just worried about a tiny bit of rate 
limiting.  Since we're only waking up a fixed process and not creating 
new work queue items every time, it's probably fine, but a delay of HZ/2 
would probably be even better.

-chris


Re: [PATCH 2/3] btrfs: wakeup cleaner thread when adding delayed iput

2018-11-28 Thread David Sterba
On Tue, Nov 27, 2018 at 03:08:08PM -0500, Josef Bacik wrote:
> On Tue, Nov 27, 2018 at 07:59:42PM +, Chris Mason wrote:
> > On 27 Nov 2018, at 14:54, Josef Bacik wrote:
> > 
> > > On Tue, Nov 27, 2018 at 10:26:15AM +0200, Nikolay Borisov wrote:
> > >>
> > >>
> > >> On 21.11.18 г. 21:09 ч., Josef Bacik wrote:
> > >>> The cleaner thread usually takes care of delayed iputs, with the
> > >>> exception of the btrfs_end_transaction_throttle path.  The cleaner
> > >>> thread only gets woken up every 30 seconds, so instead wake it up to 
> > >>> do
> > >>> it's work so that we can free up that space as quickly as possible.
> > >>
> > >> Have you done any measurements how this affects the overall system. I
> > >> suspect this introduces a lot of noise since now we are going to be
> > >> doing a thread wakeup on every iput, does this give a chance to have
> > >> nice, large batches of iputs that  the cost of wake up can be 
> > >> amortized
> > >> across?
> > >
> > > I ran the whole patchset with our A/B testing stuff and the patchset 
> > > was a 5%
> > > win overall, so I'm inclined to think it's fine.  Thanks,
> > 
> > It's a good point though, a delayed wakeup may be less overhead.
> 
> Sure, but how do we go about that without it sometimes messing up?  In 
> practice
> the only time we're doing this is at the end of finish_ordered_io, so likely 
> to
> not be a constant stream.  I suppose since we have places where we force it to
> run that we don't really need this.  IDK, I'm fine with dropping it.  Thanks,

The transaction thread wakes up cleaner periodically (commit interval,
30s by default), so the time to process iputs is not unbounded.

I have the same concerns as Nikolay, coupling the wakeup to all delayed
iputs could result in smaller batches. But some of the callers of
btrfs_add_delayed_iput might benefit from the extra wakeup, like
btrfs_remove_block_group, so I don't want to leave the idea yet.


Re: [PATCH] btrfs: Remove unnecessary code from __btrfs_rebalance

2018-11-28 Thread David Sterba
On Thu, Nov 22, 2018 at 10:17:33AM +0200, Nikolay Borisov wrote:
> The first step of the rebalance process, ensuring there is 1mb free on
> each device. This number seems rather small. And in fact when talking
> to the original authors their opinions were:
> 
> "man that's a little bonkers"
> "i don't think we even need that code anymore"
> "I think it was there to make sure we had room for the blank 1M at the
> beginning. I bet it goes all the way back to v0"
> "we just don't need any of that tho, i say we just delete it"
>
> Clearly, this piece of code has lost its original intent throughout
> the years. It doesn't really bring any real practical benefits to the
> relocation process.

That I would agree, 

> No functional changes.

though this sounds too bold given what code it removes. Shrink, grow in
a transaction, all of that can fail for various reasons and would have
visible effects. I'd rather reserve the 'no functional changes'
statement for cases where it's really just renaming or restructuring the
code without any side effects.

> - ret = btrfs_shrink_device(device, old_size - size_to_free);
> - if (ret == -ENOSPC)
> - break;

Shrink can be pretty heavy even before it fails with ENOSPC, I think the
runtime of balance will go down. And it could be even noticeable, as it
has to go through all chunks twice before it really fails.

Measuring that would be good as it's a thing that can be a highlight of
pull request/change overviews.

I'll add the patch to for-next for testing, please update the changelog
with the potential effects. Thanks.


Re: [PATCH v2] btrfs: improve error handling of btrfs_add_link()

2018-11-28 Thread David Sterba
On Tue, Nov 27, 2018 at 09:56:43AM +0100, Johannes Thumshirn wrote:
> On 26/11/2018 19:37, David Sterba wrote:
> > I though a transaction abort would be here, as the state is not
> > consistent. Also I'm not sure what I as a user would get from such error
> > message after calling link(). If the error handling in the error
> > handling fails, there's not much left to do and the abort either
> > happened earlier in the callees or is necessary here.
> 
> OK, now you've lost me. Isn't that what my previous patch did?

Yes (call abort in fail_dir_item:) but it also merged all the abort
calls -- this is the part that I wanted to be updated.


Re: [PATCH] Fix typos

2018-11-28 Thread Andrea Gelmini
On Wed, Nov 28, 2018 at 06:02:47PM +0100, David Sterba wrote:
> You've put your time to find and fix the typos so you deserve the
> credit. Because you sent them with the signed-off-by line I will use

Ok, David, thanks a lot.
So, I'm going to rework the patch with the various suggestion, and
then re-send it.

Thanks a lot,
Andrea


Re: [PATCH] Fix typos

2018-11-28 Thread David Sterba
On Wed, Nov 28, 2018 at 05:17:09PM +0100, Andrea Gelmini wrote:
> On Wed, Nov 28, 2018 at 04:56:20PM +0100, David Sterba wrote:
> > Thanks, such patches are accepted once in a while when the amount of new
> > typo fixes becomes noticeable.
> 
> About this patch (and the others sent to this m/l), I don't
> care to have them committed with my name.

You've put your time to find and fix the typos so you deserve the
credit. Because you sent them with the signed-off-by line I will use
that and maybe write a short changelog but otherwise I don't see a
reason why you should not be mentioned.

> Feel free to copy/integrate/commit with your/someone else name.

Unless you really want to stay out of the linux git history, I'm somehow
obliged by the development process to stick with the patch author name.

> It's just - probably - the best way to let the real devs to know
> about the typos, but it's not valuable effort that must be
> recognized.

The typos can be annoying and distracting while reading code or prevent
some function names to be greppable (which is usually the reason to fix
them). My first patch to linux was also fixing typos, and see where I am
now :)


Re: [PATCH] btrfs: Refactor btrfs_merge_bio_hook

2018-11-28 Thread Nikolay Borisov



On 28.11.18 г. 18:46 ч., David Sterba wrote:
> On Tue, Nov 27, 2018 at 08:57:58PM +0200, Nikolay Borisov wrote:
>> This function really checks whether adding more data to the bio will
>> straddle a stripe/chunk. So first let's give it a more appropriate
>> name - btrfs_bio_fits_in_stripe. Secondly, the offset parameter was
>> never used so just remove it. Thirdly, pages are submitted to either
>> btree or data inodes so it's guaranteed that tree->ops is set so replace
>> the check with an ASSERT. Finally, document the parameters of the
>> function. No functional changes.
>>
>> Signed-off-by: Nikolay Borisov 
> 
> Reviewed-by: David Sterba 
> 
>> -submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE, bio, 
>> 0);
>> +submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
>> +  0);
> 
>> -submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE,
>> -comp_bio, 0);
>> +submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE,
>> +  comp_bio, 0);
> 
> 
>> -if (tree->ops && btrfs_merge_bio_hook(page, offset, page_size,
>> -  bio, bio_flags))
>> +ASSERT(tree->ops);
>> +if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
>>  can_merge = false;
> 
> Got me curious if we could get rid of the size parameter, it's 2x
> PAGE_SIZE and could be something else in one case but it's not obvious
> if it really happens.
> 
> Another thing I noticed is lack of proper error handling in all callers,
> as its' 0, 1, and negative errno. The error would be interpreted as true
> ie. add page to bio and continue.

Actually anything other than 0 is returned then the current bio is
actually submitted (I presume you refer to the code in compression.c).
As a matter of fact I think btrfs_bio_fits_in_stripe could even be
turned to return a bool value.

The only time this function could return an error is if the mapping
logic goes haywire which could happen 'if (offset < stripe_offset) {' or
we don't find a chunk for the given offset, which is unlikely.

> 


Re: [PATCH] btrfs: Refactor btrfs_merge_bio_hook

2018-11-28 Thread David Sterba
On Tue, Nov 27, 2018 at 08:57:58PM +0200, Nikolay Borisov wrote:
> This function really checks whether adding more data to the bio will
> straddle a stripe/chunk. So first let's give it a more appropriate
> name - btrfs_bio_fits_in_stripe. Secondly, the offset parameter was
> never used so just remove it. Thirdly, pages are submitted to either
> btree or data inodes so it's guaranteed that tree->ops is set so replace
> the check with an ASSERT. Finally, document the parameters of the
> function. No functional changes.
> 
> Signed-off-by: Nikolay Borisov 

Reviewed-by: David Sterba 

> - submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE, bio, 
> 0);
> + submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
> +   0);

> - submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE,
> - comp_bio, 0);
> + submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE,
> +   comp_bio, 0);


> - if (tree->ops && btrfs_merge_bio_hook(page, offset, page_size,
> -   bio, bio_flags))
> + ASSERT(tree->ops);
> + if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
>   can_merge = false;

Got me curious if we could get rid of the size parameter, it's 2x
PAGE_SIZE and could be something else in one case but it's not obvious
if it really happens.

Another thing I noticed is lack of proper error handling in all callers,
as its' 0, 1, and negative errno. The error would be interpreted as true
ie. add page to bio and continue.


Re: [PATCH 2/3] btrfs: cleanup the useless DEFINE_WAIT in cleanup_transaction

2018-11-28 Thread David Sterba
On Wed, Nov 28, 2018 at 11:22:54AM +0800, Lu Fengqi wrote:
> When it is introduced at commit f094ac32aba3 ("Btrfs: fix NULL pointer
> after aborting a transaction"), it's useless.
> 
> Signed-off-by: Lu Fengqi 

Added to misc-next, thanks.


Re: [PATCH 3/3] btrfs: remove redundant nowait check for buffered_write

2018-11-28 Thread David Sterba
On Wed, Nov 28, 2018 at 09:04:31AM +0200, Nikolay Borisov wrote:
> On 28.11.18 г. 5:23 ч., Lu Fengqi wrote:
> > The generic_write_checks will check the combination of IOCB_NOWAIT and
> > !IOCB_DIRECT.
> 
> True, however btrfs will return ENOSUPP whereas the generic code returns
> EINVAL. I guess this is not a big deal and it's likely generic code is
> correct, so:

I tried to check the git history if there are some clues for that. The
duplicate checks could be a intermediate step before the aio/nowait
feature was complete but somehow not removed at the end.

The btrfs part was Added in commit
91f9943e1c7b6638f27312d03fe71fcc67b23571 "fs: support RWF_NOWAIT for
buffered reads" (4.13)

The generic checks were added in
6be96d3ad34a124450028dabba43f07fe1d0c86d (4.12), so from that it seems
that it was added to btrfs as redundant already.


Re: [PATCH] Fix typos

2018-11-28 Thread Andrea Gelmini
On Wed, Nov 28, 2018 at 04:56:20PM +0100, David Sterba wrote:
> 
> Thanks, such patches are accepted once in a while when the amount of new
> typo fixes becomes noticeable.

About this patch (and the others sent to this m/l), I don't
care to have them committed with my name.

Feel free to copy/integrate/commit with your/someone else name.

It's just - probably - the best way to let the real devs to know
about the typos, but it's not valuable effort that must be
recognized.

So, use them as you prefer.

Thanks a lot for your work,
Gelma


Re: Linux-next regression?

2018-11-28 Thread Andrea Gelmini
On Tue, Nov 27, 2018 at 10:16:52PM +0800, Qu Wenruo wrote:
>
> But it's less a concerning problem since it doesn't reach latest RC, so
> if you could reproduce it stably, I'd recommend to do a bisect.

No problem to bisect, usually.
But right now it's not possible for me, I explain further.
Anyway, here the rest of the story.

So, in the end I:
a) booted with 4.20.0-rc4
b) updated backup
c) did the btrfs check --read-only
d) seven steps, everything is perfect
e) no complains on screen or in logs (never had)
f) so, started to compile linux-next 20181128 (on another partition)
e) without using (reading or writing) on /home, I started
f) btrfs filesystem defrag -v -r -t 128M /home
g) it worked without complain (in screen or logs)
h) then, reboot with kernel tag 20181128
i) and no way to mount:

--
nov 28 15:44:03 glet kernel: BTRFS: device label home devid 1 transid 37360 
/dev/mapper/cry-home
nov 28 15:44:04 glet kernel: BTRFS info (device dm-3): use lzo compression, 
level 0
nov 28 15:44:04 glet kernel: BTRFS info (device dm-3): turning on discard
nov 28 15:44:04 glet kernel: BTRFS info (device dm-3): enabling auto defrag
nov 28 15:44:04 glet kernel: BTRFS info (device dm-3): disk space caching is 
enabled
nov 28 15:44:04 glet kernel: BTRFS info (device dm-3): has skinny extents
nov 28 15:44:04 glet kernel: BTRFS error (device dm-3): bad tree block start, 
want 2150302023680 have 17816181330383341936
nov 28 15:44:04 glet kernel: BTRFS error (device dm-3): failed to read block 
groups: -5
nov 28 15:44:04 glet kernel: BTRFS error (device dm-3): open_ctree failed
--

l) get back to 4.20.0-rc4
m) mounted, but after a few minutes, I get this:

--
nov 28 15:51:23 glet kernel: BTRFS warning (device dm-3): block group 
2199347265536 has wrong amount of free space
nov 28 15:51:23 glet kernel: BTRFS warning (device dm-3): failed to load free 
space cache for block group 2199347265536, rebuilding it now
nov 28 15:51:23 glet kernel: BTRFS warning (device dm-3): block group 
2196126040064 has wrong amount of free space
nov 28 15:51:23 glet kernel: BTRFS warning (device dm-3): failed to load free 
space cache for block group 2196126040064, rebuilding it now
nov 28 15:52:09 glet kernel: BTRFS warning (device dm-3): block group 
218431488 has wrong amount of free space
nov 28 15:52:09 glet kernel: BTRFS warning (device dm-3): failed to load free 
space cache for block group 218431488, rebuilding it now
nov 28 15:52:09 glet kernel: BTRFS warning (device dm-3): block group 
2183241138176 has wrong amount of free space
nov 28 15:52:09 glet kernel: BTRFS warning (device dm-3): failed to load free 
space cache for block group 2183241138176, rebuilding it now
nov 28 15:52:53 glet kernel: BTRFS warning (device dm-3): block group 
2152102625280 has wrong amount of free space
nov 28 15:52:53 glet kernel: BTRFS warning (device dm-3): failed to load free 
space cache for block group 2152102625280, rebuilding it now
nov 28 15:54:13 glet kernel: BTRFS warning (device dm-3): block group 
2530059747328 has wrong amount of free space
nov 28 15:54:13 glet kernel: BTRFS warning (device dm-3): failed to load free 
space cache for block group 2530059747328, rebuilding it now
nov 28 15:55:10 glet kernel: BTRFS warning (device dm-3): block group 
2151028883456 has wrong amount of free space
nov 28 15:55:10 glet kernel: BTRFS warning (device dm-3): failed to load free 
space cache for block group 2151028883456, rebuilding it now
nov 28 15:55:48 glet kernel: BTRFS warning (device dm-3): block group 
2203642232832 has wrong amount of free space
nov 28 15:55:48 glet kernel: BTRFS warning (device dm-3): failed to load free 
space cache for block group 2203642232832, rebuilding it now
--

n) and then read-only mode:

--
[ 1058.996960] BTRFS error (device dm-3): bad tree block start, want 
2150382092288 have 159161645701828393
[ 1058.996967] BTRFS: error (device dm-3) in __btrfs_free_extent:6831: errno=-5 
IO failure
[ 1058.996969] BTRFS info (device dm-3): forced readonly
[ 1058.996971] BTRFS: error (device dm-3) in btrfs_run_delayed_refs:2978: 
errno=-5 IO failure
[ 1059.002857] BTRFS error (device dm-3): pending csums is 97832960
--

So, ok, for the moment I'm very sorry I can't help you with bisect, because I 
have to
revert to ext4. This is the laptop I use to work with.

If I can help you investigating, just tell me.

Thanks for your time,
Gelma


signature.asc
Description: PGP signature


Re: [PATCH] Fix typos

2018-11-28 Thread David Sterba
On Wed, Nov 28, 2018 at 12:05:13PM +0100, Andrea Gelmini wrote:
> Signed-off-by: Andrea Gelmini 
> ---
> 
> Stupid fixes. Made on 4.20-rc4, and ported on linux-next (next-20181128).

Thanks, such patches are accepted once in a while when the amount of new
typo fixes becomes noticeable.

>  fs/btrfs/backref.c |  4 ++--
>  fs/btrfs/check-integrity.c |  2 +-
>  fs/btrfs/compression.c |  4 ++--
>  fs/btrfs/ctree.c   |  4 ++--
>  fs/btrfs/dev-replace.c |  2 +-
>  fs/btrfs/disk-io.c |  4 ++--
>  fs/btrfs/extent-tree.c | 28 ++--
>  fs/btrfs/extent_io.c   |  4 ++--
>  fs/btrfs/extent_io.h   |  2 +-
>  fs/btrfs/extent_map.c  |  2 +-
>  fs/btrfs/file.c|  6 +++---
>  fs/btrfs/free-space-tree.c |  2 +-
>  fs/btrfs/inode.c   | 10 +-
>  fs/btrfs/ioctl.c   |  4 ++--
>  fs/btrfs/lzo.c |  2 +-
>  fs/btrfs/qgroup.c  | 14 +++---
>  fs/btrfs/qgroup.h  |  4 ++--
>  fs/btrfs/raid56.c  |  2 +-
>  fs/btrfs/ref-verify.c  |  6 +++---
>  fs/btrfs/relocation.c  |  2 +-
>  fs/btrfs/scrub.c   |  2 +-
>  fs/btrfs/send.c|  4 ++--
>  fs/btrfs/super.c   |  8 
>  fs/btrfs/transaction.c |  4 ++--
>  fs/btrfs/tree-checker.c|  6 +++---
>  fs/btrfs/tree-log.c|  4 ++--
>  fs/btrfs/volumes.c |  8 
>  27 files changed, 72 insertions(+), 72 deletions(-)

Approves.

> @@ -1010,14 +1010,14 @@ int btrfs_lookup_extent_info(struct 
> btrfs_trans_handle *trans,
>   *
>   * - multiple snapshots, subvolumes, or different generations in one subvol
>   * - different files inside a single subvolume
> - * - different offsets inside a file (bookend extents in file.c)
> + * - different offsets inside a file (booked extents in file.c)
>   *
>   * The extent ref structure for the implicit back refs has fields for:
>   *
>   * - Objectid of the subvolume root
>   * - objectid of the file holding the reference
>   * - original offset in the file
> - * - how many bookend extents
> + * - how many booked extents

Please note that 'bookend' is not a typo here. The meaning used for
btrfs is close to 2a or 2b in https://www.merriam-webster.com/dictionary/bookend

and refers to the mechanism how blocks are shared by extents and
lengthier explanation would show why snapshots are so cheap (I really
tried to write a tl;dr version, but nope).


Re: [PATCH v2 0/3] Misc cosmetic changes for map_private_extent_buffer

2018-11-28 Thread David Sterba
On Wed, Nov 28, 2018 at 09:54:53AM +0100, Johannes Thumshirn wrote:
> While trying to understand the checksum code I came across some oddities
> regarding map_private_extent_buffer() and it's interaction with
> csum_tree_block().
> 
> These patches address them but are either purely cosmetic or only add a
> comment documenting behaviour.
> 
> Changes to v1:
> * Add Reviewed-by tags
> * Change wording of commit message in patch 3/3
> 
> Johannes Thumshirn (3):
>   btrfs: don't initialize 'offset' in map_private_extent_buffer()
>   btrfs: use offset_in_page for start_offset in
> map_private_extent_buffer()
>   btrfs: document extent mapping assumptions in checksum

1 and 3 applied, thanks.


Re: [PATCH v2 2/3] btrfs: use offset_in_page for start_offset in map_private_extent_buffer()

2018-11-28 Thread David Sterba
On Wed, Nov 28, 2018 at 09:54:55AM +0100, Johannes Thumshirn wrote:
> In map_private_extent_buffer() use offset_in_page() to initialize
> 'start_offset' instead of open-coding it.

Can you please fix all instances where it's opencoded? Grepping for
'PAGE_SIZE - 1' finds a number of them. Thanks.


Re: [PATCH 1/9] btrfs: switch BTRFS_FS_STATE_* to enums

2018-11-28 Thread David Sterba
On Tue, Nov 27, 2018 at 04:24:03PM -0800, Omar Sandoval wrote:
> On Tue, Nov 27, 2018 at 08:53:41PM +0100, David Sterba wrote:
> > We can use simple enum for values that are not part of on-disk format:
> > global filesystem states.
> 
> Reviewed-by: Omar Sandoval 
> 
> Some typos/wording suggestions below.
> 
> > Signed-off-by: David Sterba 
> > ---
> >  fs/btrfs/ctree.h | 25 +++--
> >  1 file changed, 19 insertions(+), 6 deletions(-)
> > 
> > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> > index a98507fa9192..f82ec5e41b0c 100644
> > --- a/fs/btrfs/ctree.h
> > +++ b/fs/btrfs/ctree.h
> > @@ -109,13 +109,26 @@ static inline unsigned long btrfs_chunk_item_size(int 
> > num_stripes)
> >  }
> >  
> >  /*
> > - * File system states
> > + * Runtime (in-memory) states of filesystem
> >   */
> > -#define BTRFS_FS_STATE_ERROR   0
> > -#define BTRFS_FS_STATE_REMOUNTING  1
> > -#define BTRFS_FS_STATE_TRANS_ABORTED   2
> > -#define BTRFS_FS_STATE_DEV_REPLACING   3
> > -#define BTRFS_FS_STATE_DUMMY_FS_INFO   4
> > +enum {
> > +   /* Global indicator of serious filesysystem errors */
> 
> filesysystem -> filesystem
> 
> > +   BTRFS_FS_STATE_ERROR,
> > +   /*
> > +* Filesystem is being remounted, allow to skip some operations, like
> > +* defrag
> > +*/
> > +   BTRFS_FS_STATE_REMOUNTING,
> > +   /* Track if the transaction abort has been reported */
> 
> Which one is "the" transaction abort? This gives me the impression that
> this is a flag on the transaction, but it's actually filesystem state.
> Maybe "Track if a transaction abort has been reported on this
> filesystem"?

Your wording is more clear and I've used it in the patch.

> > +   BTRFS_FS_STATE_TRANS_ABORTED,
> > +   /*
> > +* Indicate that replace source or target device state is changed and
> > +* allow to block bio operations
> > +*/
> 
> Again, this makes it sound like it's device state, but it's actually
> filesystem state. How about "Bio operations should be blocked on this
> filesystem because a source or target device is being destroyed as part
> of a device replace"?

Same. Thanks.


[PATCH v2] Btrfs: fix fsync of files with multiple hard links in new directories

2018-11-28 Thread fdmanana
From: Filipe Manana 

The log tree has a long standing problem that when a file is fsync'ed we
only check for new ancestors, created in the current transaction, by
following only the hard link for which the fsync was issued. We follow the
ancestors using the VFS' dget_parent() API. This means that if we create a
new link for a file in a directory that is new (or in any other new
ancestor directory) and then fsync the file using an old hard link, we end
up not logging the new ancestor, and on log replay that new hard link and
ancestor do not exist. In some cases, involving renames, the file will not
exist at all.

Example:

  mkfs.btrfs -f /dev/sdb
  mount /dev/sdb /mnt

  mkdir /mnt/A
  touch /mnt/foo
  ln /mnt/foo /mnt/A/bar
  xfs_io -c fsync /mnt/foo

  

In this example after log replay only the hard link named 'foo' exists
and directory A does not exist, which is unexpected. In other major linux
filesystems, such as ext4, xfs and f2fs for example, both hard links exist
and so does directory A after mounting again the filesystem.

Checking if any new ancestors are new and need to be logged was added in
2009 by commit 12fcfd22fe5b ("Btrfs: tree logging unlink/rename fixes"),
however only for the ancestors of the hard link (dentry) for which the
fsync was issued, instead of checking for all ancestors for all of the
inode's hard links.

So fix this by tracking the id of the last transaction where a hard link
was created for an inode and then on fsync fallback to a full transaction
commit when an inode has more than one hard link and at least one new hard
link was created in the current transaction. This is the simplest solution
since this is not a common use case (adding frequently hard links for
which there's an ancestor created in the current transaction and then
fsync the file). In case it ever becomes a common use case, a solution
that consists of iterating the fs/subvol btree for each hard link and
check if any ancestor is new, could be implemented.

This solves many unexpected scenarios reported by Jayashree Mohan and
Vijay Chidambaram, and for which there is a new test case for fstests
under review.

Reported-by: Vijay Chidambaram 
Reported-by: Jayashree Mohan 
Fixes: 12fcfd22fe5b ("Btrfs: tree logging unlink/rename fixes")
Signed-off-by: Filipe Manana 
---

V2: Added missing case to set last_link_trans after an inode is evicted and
loaded again.

 fs/btrfs/btrfs_inode.h |  6 ++
 fs/btrfs/inode.c   | 17 +
 fs/btrfs/tree-log.c| 16 
 3 files changed, 39 insertions(+)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1343ac57b438..7177d1d33584 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -147,6 +147,12 @@ struct btrfs_inode {
u64 last_unlink_trans;
 
/*
+* Track the transaction id of the last transaction used to create a
+* hard link for the inode. This is used by the log tree (fsync).
+*/
+   u64 last_link_trans;
+
+   /*
 * Number of bytes outstanding that are going to need csums.  This is
 * used in ENOSPC accounting.
 */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 64ea749c1ba4..51f4628be2d2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3685,6 +3685,21 @@ static int btrfs_read_locked_inode(struct inode *inode,
 * inode is not a directory, logging its parent unnecessarily.
 */
BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+   /*
+* Similar reasoning for last_link_trans, needs to be set otherwise
+* for a case like the following:
+*
+* mkdir A
+* touch foo
+* ln foo A/bar
+* echo 2 > /proc/sys/vm/drop_caches
+* fsync foo
+* 
+*
+* Would result in link bar and directory A not existing after the power
+* failure.
+*/
+   BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans;
 
path->slots[0]++;
if (inode->i_nlink != 1 ||
@@ -6651,6 +,7 @@ static int btrfs_link(struct dentry *old_dentry, struct 
inode *dir,
if (err)
goto fail;
}
+   BTRFS_I(inode)->last_link_trans = trans->transid;
d_instantiate(dentry, inode);
ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
 true, NULL);
@@ -9179,6 +9195,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->index_cnt = (u64)-1;
ei->dir_index = 0;
ei->last_unlink_trans = 0;
+   ei->last_link_trans = 0;
ei->last_log_commit = 0;
 
spin_lock_init(>lock);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index aac3749f697f..896d79144052 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5760,6 +5760,22 @@ static int btrfs_log_inode_parent(struct 
btrfs_trans_handle *trans,
   

Re: [PATCH v4] Btrfs: fix deadlock with memory reclaim during scrub

2018-11-28 Thread Filipe Manana
On Wed, Nov 28, 2018 at 2:22 PM David Sterba  wrote:
>
> On Mon, Nov 26, 2018 at 08:10:30PM +, Filipe Manana wrote:
> > On Mon, Nov 26, 2018 at 6:17 PM David Sterba  wrote:
> > >
> > > On Fri, Nov 23, 2018 at 06:25:40PM +, fdman...@kernel.org wrote:
> > > > From: Filipe Manana 
> > > >
> > > > When a transaction commit starts, it attempts to pause scrub and it 
> > > > blocks
> > > > until the scrub is paused. So while the transaction is blocked waiting 
> > > > for
> > > > scrub to pause, we can not do memory allocation with GFP_KERNEL from 
> > > > scrub,
> > > > otherwise we risk getting into a deadlock with reclaim.
> > > >
> > > > Checking for scrub pause requests is done early at the beginning of the
> > > > while loop of scrub_stripe() and later in the loop, scrub_extent() and
> > > > scrub_raid56_parity() are called, which in turn call scrub_pages() and
> > > > scrub_pages_for_parity() respectively. These last two functions do 
> > > > memory
> > > > allocations using GFP_KERNEL. Same problem could happen while scrubbing
> > > > the super blocks, since it calls scrub_pages().
> > > >
> > > > So make sure GFP_NOFS is used for the memory allocations because at any
> > > > time a scrub pause request can happen from another task that started to
> > > > commit a transaction.
> > > >
> > > > Fixes: 58c4e173847a ("btrfs: scrub: use GFP_KERNEL on the submission 
> > > > path")
> > > > Signed-off-by: Filipe Manana 
> > > > ---
> > > >
> > > > V2: Make using GFP_NOFS unconditional. Previous version was racy, as 
> > > > pausing
> > > > requests might happen just after we checked for them.
> > > >
> > > > V3: Use memalloc_nofs_save() just like V1 did.
> > > >
> > > > V4: Similar problem happened for raid56, which was previously missed, so
> > > > deal with it as well as the case for scrub_supers().
> > >
> > > Enclosing the whole scrub to 'nofs' seems like the best option and
> > > future proof. What I missed in 58c4e173847a was the "don't hold big lock
> > > under GFP_KERNEL allocation" pattern.
> > >
> > > >  fs/btrfs/scrub.c | 12 
> > > >  1 file changed, 12 insertions(+)
> > > >
> > > > diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
> > > > index 3be1456b5116..e08b7502d1f0 100644
> > > > --- a/fs/btrfs/scrub.c
> > > > +++ b/fs/btrfs/scrub.c
> > > > @@ -3779,6 +3779,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info 
> > > > *fs_info, u64 devid, u64 start,
> > > >   struct scrub_ctx *sctx;
> > > >   int ret;
> > > >   struct btrfs_device *dev;
> > > > + unsigned int nofs_flag;
> > > >
> > > >   if (btrfs_fs_closing(fs_info))
> > > >   return -EINVAL;
> > > > @@ -3882,6 +3883,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info 
> > > > *fs_info, u64 devid, u64 start,
> > > >   atomic_inc(_info->scrubs_running);
> > > >   mutex_unlock(_info->scrub_lock);
> > > >
> > > > + /*
> > > > +  * In order to avoid deadlock with reclaim when there is a 
> > > > transaction
> > > > +  * trying to pause scrub, make sure we use GFP_NOFS for all the
> > > > +  * allocations done at btrfs_scrub_pages() and 
> > > > scrub_pages_for_parity()
> > > > +  * invoked by our callees. The pausing request is done when the
> > > > +  * transaction commit starts, and it blocks the transaction until 
> > > > scrub
> > > > +  * is paused (done at specific points at scrub_stripe() or right 
> > > > above
> > > > +  * before incrementing fs_info->scrubs_running).
> > >
> > > This highlights why there's perhaps no point in trying to make the nofs
> > > section smaller, handling all the interactions between scrub and
> > > transaction would be too complex.
> > >
> > > Reviewed-by: David Sterba 
> >
> > Well, the worker tasks can also not use gfp_kernel, since the scrub
> > task waits for them to complete before pausing.
> > I missed this, and 2 reviewers as well, so perhaps it wasn't that
> > trivial and I shouldn't feel that I miserably failed to identify all
> > cases for something rather trivial. V5 sent.
>
> You can say that you left it there intentionally, such cookies are a
> good drill for reviewers to stay sharp.

Unfortunately for me, it was not on purpose.

However there's the small drill of, for the workers only, potentially
moving the memalloc_nofs_save() and
restore to scrub_handle_errored_block(), since the only two possible
gfp_kernel allocations for workers
are during the case where a repair is needed:

scrub_bio_end_io_worker()
  scrub_block_complete()
scrub_handle_errored_block()
  lock_full_stripe()
insert_full_stripe_lock()
  -> kmalloc with GFP_KERNEL


scrub_bio_end_io_worker()
   scrub_block_complete()
 scrub_handle_errored_block()
   scrub_write_page_to_dev_replace()
 scrub_add_page_to_wr_bio()
   -> kzalloc with GFP_KERNEL

Just to avoid some duplication.

>
> When I started the conversions of GFP_NOFS -> GFP_KERNEL, scrub looked
> quite safe in this respect but turns out it's not. 

[PATCH] Btrfs: fix fsync of files with multiple hard links in new directories

2018-11-28 Thread fdmanana
From: Filipe Manana 

The log tree has a long standing problem that when a file is fsync'ed we
only check for new ancestors, created in the current transaction, by
following only the hard link for which the fsync was issued. We follow the
ancestors using the VFS' dget_parent() API. This means that if we create a
new link for a file in a directory that is new (or in any other new
ancestor directory) and then fsync the file using an old hard link, we end
up not logging the new ancestor, and on log replay that new hard link and
ancestor do not exist. In some cases, involving renames, the file will not
exist at all.

Example:

  mkfs.btrfs -f /dev/sdb
  mount /dev/sdb /mnt

  mkdir /mnt/A
  touch /mnt/foo
  ln /mnt/foo /mnt/A/bar
  xfs_io -c fsync /mnt/foo

  

In this example after log replay only the hard link named 'foo' exists
and directory A does not exist, which is unexpected. In other major linux
filesystems, such as ext4, xfs and f2fs for example, both hard links exist
and so does directory A after mounting again the filesystem.

Checking if any new ancestors are new and need to be logged was added in
2009 by commit 12fcfd22fe5b ("Btrfs: tree logging unlink/rename fixes"),
however only for the ancestors of the hard link (dentry) for which the
fsync was issued, instead of checking for all ancestors for all of the
inode's hard links.

So fix this by tracking the id of the last transaction where a hard link
was created for an inode and then on fsync fallback to a full transaction
commit when an inode has more than one hard link and at least one new hard
link was created in the current transaction. This is the simplest solution
since this is not a common use case (adding frequently hard links for
which there's an ancestor created in the current transaction and then
fsync the file). In case it ever becomes a common use case, a solution
that consists of iterating the fs/subvol btree for each hard link and
check if any ancestor is new, could be implemented.

This solves many unexpected scenarios reported by Jayashree Mohan and
Vijay Chidambaram, and for which there is a new test case for fstests
under review.

Reported-by: Vijay Chidambaram 
Reported-by: Jayashree Mohan 
Fixes: 12fcfd22fe5b ("Btrfs: tree logging unlink/rename fixes")
Signed-off-by: Filipe Manana 
---
 fs/btrfs/btrfs_inode.h |  6 ++
 fs/btrfs/inode.c   |  2 ++
 fs/btrfs/tree-log.c| 16 
 3 files changed, 24 insertions(+)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1343ac57b438..7177d1d33584 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -147,6 +147,12 @@ struct btrfs_inode {
u64 last_unlink_trans;
 
/*
+* Track the transaction id of the last transaction used to create a
+* hard link for the inode. This is used by the log tree (fsync).
+*/
+   u64 last_link_trans;
+
+   /*
 * Number of bytes outstanding that are going to need csums.  This is
 * used in ENOSPC accounting.
 */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 64ea749c1ba4..2e5660eb5aa1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6651,6 +6651,7 @@ static int btrfs_link(struct dentry *old_dentry, struct 
inode *dir,
if (err)
goto fail;
}
+   BTRFS_I(inode)->last_link_trans = trans->transid;
d_instantiate(dentry, inode);
ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
 true, NULL);
@@ -9179,6 +9180,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->index_cnt = (u64)-1;
ei->dir_index = 0;
ei->last_unlink_trans = 0;
+   ei->last_link_trans = 0;
ei->last_log_commit = 0;
 
spin_lock_init(>lock);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index aac3749f697f..896d79144052 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5760,6 +5760,22 @@ static int btrfs_log_inode_parent(struct 
btrfs_trans_handle *trans,
goto end_trans;
}
 
+   /*
+* If a new hard link was added to the inode in the current transaction
+* and its link count is now greater than 1, we need to fallback to a
+* transaction commit, otherwise we can end up not logging all its new
+* parents for all the hard links. Here just from the dentry used to
+* fsync, we can not visit the ancestor inodes for all the other hard
+* links to figure out if any is new, so we fallback to a transaction
+* commit (instead of adding a lot of complexity of scanning a btree,
+* since this scenario is not a common use case).
+*/
+   if (inode->vfs_inode.i_nlink > 1 &&
+   inode->last_link_trans > last_committed) {
+   ret = -EMLINK;
+   goto end_trans;
+   }
+
while (1) {
 

Re: [PATCH v4] Btrfs: fix deadlock with memory reclaim during scrub

2018-11-28 Thread David Sterba
On Mon, Nov 26, 2018 at 08:10:30PM +, Filipe Manana wrote:
> On Mon, Nov 26, 2018 at 6:17 PM David Sterba  wrote:
> >
> > On Fri, Nov 23, 2018 at 06:25:40PM +, fdman...@kernel.org wrote:
> > > From: Filipe Manana 
> > >
> > > When a transaction commit starts, it attempts to pause scrub and it blocks
> > > until the scrub is paused. So while the transaction is blocked waiting for
> > > scrub to pause, we can not do memory allocation with GFP_KERNEL from 
> > > scrub,
> > > otherwise we risk getting into a deadlock with reclaim.
> > >
> > > Checking for scrub pause requests is done early at the beginning of the
> > > while loop of scrub_stripe() and later in the loop, scrub_extent() and
> > > scrub_raid56_parity() are called, which in turn call scrub_pages() and
> > > scrub_pages_for_parity() respectively. These last two functions do memory
> > > allocations using GFP_KERNEL. Same problem could happen while scrubbing
> > > the super blocks, since it calls scrub_pages().
> > >
> > > So make sure GFP_NOFS is used for the memory allocations because at any
> > > time a scrub pause request can happen from another task that started to
> > > commit a transaction.
> > >
> > > Fixes: 58c4e173847a ("btrfs: scrub: use GFP_KERNEL on the submission 
> > > path")
> > > Signed-off-by: Filipe Manana 
> > > ---
> > >
> > > V2: Make using GFP_NOFS unconditional. Previous version was racy, as 
> > > pausing
> > > requests might happen just after we checked for them.
> > >
> > > V3: Use memalloc_nofs_save() just like V1 did.
> > >
> > > V4: Similar problem happened for raid56, which was previously missed, so
> > > deal with it as well as the case for scrub_supers().
> >
> > Enclosing the whole scrub to 'nofs' seems like the best option and
> > future proof. What I missed in 58c4e173847a was the "don't hold big lock
> > under GFP_KERNEL allocation" pattern.
> >
> > >  fs/btrfs/scrub.c | 12 
> > >  1 file changed, 12 insertions(+)
> > >
> > > diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
> > > index 3be1456b5116..e08b7502d1f0 100644
> > > --- a/fs/btrfs/scrub.c
> > > +++ b/fs/btrfs/scrub.c
> > > @@ -3779,6 +3779,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, 
> > > u64 devid, u64 start,
> > >   struct scrub_ctx *sctx;
> > >   int ret;
> > >   struct btrfs_device *dev;
> > > + unsigned int nofs_flag;
> > >
> > >   if (btrfs_fs_closing(fs_info))
> > >   return -EINVAL;
> > > @@ -3882,6 +3883,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, 
> > > u64 devid, u64 start,
> > >   atomic_inc(_info->scrubs_running);
> > >   mutex_unlock(_info->scrub_lock);
> > >
> > > + /*
> > > +  * In order to avoid deadlock with reclaim when there is a 
> > > transaction
> > > +  * trying to pause scrub, make sure we use GFP_NOFS for all the
> > > +  * allocations done at btrfs_scrub_pages() and 
> > > scrub_pages_for_parity()
> > > +  * invoked by our callees. The pausing request is done when the
> > > +  * transaction commit starts, and it blocks the transaction until 
> > > scrub
> > > +  * is paused (done at specific points at scrub_stripe() or right 
> > > above
> > > +  * before incrementing fs_info->scrubs_running).
> >
> > This highlights why there's perhaps no point in trying to make the nofs
> > section smaller, handling all the interactions between scrub and
> > transaction would be too complex.
> >
> > Reviewed-by: David Sterba 
> 
> Well, the worker tasks can also not use gfp_kernel, since the scrub
> task waits for them to complete before pausing.
> I missed this, and 2 reviewers as well, so perhaps it wasn't that
> trivial and I shouldn't feel that I miserably failed to identify all
> cases for something rather trivial. V5 sent.

You can say that you left it there intentionally, such cookies are a
good drill for reviewers to stay sharp.

When I started the conversions of GFP_NOFS -> GFP_KERNEL, scrub looked
quite safe in this respect but turns out it's not. I was wondering if we
could add some lock assertions before GFP_KERNEL allocations, like:

assert_lock_not_held(fs_info->device_list_mutex)
kmalloc(GFP_KERNEL)

and add more locks per subsystem (eg. the scrub lock) and possibly some
convenience wrappers. Michal's scope GFS_NOFS patch series has a
debugging warning where NOFS is used in context where it does not need
to, while having the 'must not hold an important lock' would be a good
debugging helper too.


Re: [PATCH] btrfs: adjust order of unlocks in do_trimming()

2018-11-28 Thread David Sterba
On Wed, Nov 28, 2018 at 11:21:12AM +0800, Su Yue wrote:
> In function do_trimming(), block_group->lock should be unlocked first.

Please also write why this is correct and if there are any bad
consequences of the current behaviour. Thanks.


Re: Balance: invalid convert data profile raid10

2018-11-28 Thread Qu Wenruo


On 2018/11/28 下午3:20, Mikko Merikivi wrote:
> Well, excuse me for thinking it wouldn't since in md-raid it worked.
> https://wiki.archlinux.org/index.php/RAID#RAID_level_comparison
> 
> Anyway, the error message is truly confusing for someone who doesn't
> know about btrfs's implementation. I suppose in md-raid the near
> layout is actually RAID 1 and far2 uses twice as much space.
> https://en.wikipedia.org/wiki/Non-standard_RAID_levels#LINUX-MD-RAID-10
> 
> Well, I guess I'll go with RAID 1, then. Thanks for clearing up the confusion.

You should check mkfs.btrfs(8).
It has a pretty good datasheet for all supported profiles under PROFILES
section, and it mentions min/max devices.

Thanks,
Qu


> ke 28. marrask. 2018 klo 3.14 Qu Wenruo (quwenruo.bt...@gmx.com) kirjoitti:
>>
>>
>>
>> On 2018/11/28 上午5:16, Mikko Merikivi wrote:
>>> I seem unable to convert an existing btrfs device array to RAID 10.
>>> Since it's pretty much RAID 0 and 1 combined, and 5 and 6 are
>>> unstable, it's what I would like to use.
>>>
>>> After I did tried this with 4.19.2-arch1-1-ARCH and btrfs-progs v4.19,
>>> I updated my system and tried btrfs balance again with this system
>>> information:
>>> [mikko@localhost lvdata]$ uname -a
>>> Linux localhost 4.19.4-arch1-1-ARCH #1 SMP PREEMPT Fri Nov 23 09:06:58
>>> UTC 2018 x86_64 GNU/Linux
>>> [mikko@localhost lvdata]$ btrfs --version
>>> btrfs-progs v4.19
>>> [mikko@localhost lvdata]$ sudo btrfs fi show
>>> Label: 'main1'  uuid: c7cbb9c3-8c55-45f1-b03c-48992efe2f11
>>> Total devices 1 FS bytes used 2.90TiB
>>> devid1 size 3.64TiB used 2.91TiB path /dev/mapper/main
>>>
>>> Label: 'red'  uuid: f3c781a8-0f3e-4019-acbf-0b783cf566d0
>>> Total devices 2 FS bytes used 640.00KiB
>>> devid1 size 931.51GiB used 2.03GiB path /dev/mapper/red1
>>> devid2 size 931.51GiB used 2.03GiB path /dev/mapper/red2
>>
>> RAID10 needs at least 4 devices.
>>
>> Thanks,
>> Qu
>>
>>> [mikko@localhost lvdata]$ btrfs fi df /mnt/red/
>>> Data, RAID1: total=1.00GiB, used=512.00KiB
>>> System, RAID1: total=32.00MiB, used=16.00KiB
>>> Metadata, RAID1: total=1.00GiB, used=112.00KiB
>>> GlobalReserve, single: total=16.00MiB, used=0.00B
>>>
>>> ---
>>>
>>> Here are the steps I originally used:
>>>
>>> [mikko@localhost lvdata]$ sudo cryptsetup luksFormat -s 512
>>> --use-random /dev/sdc
>>> [mikko@localhost lvdata]$ sudo cryptsetup luksFormat -s 512
>>> --use-random /dev/sdd
>>> [mikko@localhost lvdata]$ sudo cryptsetup luksOpen /dev/sdc red1
>>> [mikko@localhost lvdata]$ sudo cryptsetup luksOpen /dev/sdd red2
>>> [mikko@localhost lvdata]$ sudo mkfs.btrfs -L red /dev/mapper/red1
>>> btrfs-progs v4.19
>>> See http://btrfs.wiki.kernel.org for more information.
>>>
>>> Label:  red
>>> UUID:   f3c781a8-0f3e-4019-acbf-0b783cf566d0
>>> Node size:  16384
>>> Sector size:4096
>>> Filesystem size:931.51GiB
>>> Block group profiles:
>>>   Data: single8.00MiB
>>>   Metadata: DUP   1.00GiB
>>>   System:   DUP   8.00MiB
>>> SSD detected:   no
>>> Incompat features:  extref, skinny-metadata
>>> Number of devices:  1
>>> Devices:
>>>IDSIZE  PATH
>>> 1   931.51GiB  /dev/mapper/red1
>>>
>>> [mikko@localhost lvdata]$ sudo mount -t btrfs -o
>>> defaults,noatime,nodiratime,autodefrag,compress=lzo /dev/mapper/red1
>>> /mnt/red
>>> [mikko@localhost lvdata]$ sudo btrfs device add /dev/mapper/red2 /mnt/red
>>> [mikko@localhost lvdata]$ sudo btrfs balance start -dconvert=raid10
>>> -mconvert=raid10 /mnt/red
>>> ERROR: error during balancing '/mnt/red': Invalid argument
>>> There may be more info in syslog - try dmesg | tail
>>> code 1
>>>
>>> [mikko@localhost lvdata]$ dmesg | tail
>>> [12026.263243] BTRFS info (device dm-1): disk space caching is enabled
>>> [12026.263244] BTRFS info (device dm-1): has skinny extents
>>> [12026.263245] BTRFS info (device dm-1): flagging fs with big metadata 
>>> feature
>>> [12026.275153] BTRFS info (device dm-1): checking UUID tree
>>> [12195.431766] BTRFS info (device dm-1): enabling auto defrag
>>> [12195.431769] BTRFS info (device dm-1): use lzo compression, level 0
>>> [12195.431770] BTRFS info (device dm-1): disk space caching is enabled
>>> [12195.431771] BTRFS info (device dm-1): has skinny extents
>>> [12205.815941] BTRFS info (device dm-1): disk added /dev/mapper/red2
>>> [12744.788747] BTRFS error (device dm-1): balance: invalid convert
>>> data profile raid10
>>>
>>> Converting to RAID 1 did work but what can I do to make it RAID 10?
>>> With the up-to-date system it still says "invalid convert data profile
>>> raid10".
>>>
>>



signature.asc
Description: OpenPGP digital signature


Re: [PATCH 0/9] Switch defines to enums

2018-11-28 Thread Qu Wenruo


On 2018/11/28 下午9:25, David Sterba wrote:
> On Wed, Nov 28, 2018 at 09:33:50AM +0800, Qu Wenruo wrote:
>> On 2018/11/28 上午3:53, David Sterba wrote:
>>> This is motivated by a merging mistake that happened a few releases ago.
>>> Two patches updated BTRFS_FS_* flags independently to the same value,
>>> git did not see any direct merge conflict. The two values got mixed at
>>> runtime and caused crash.
>>>
>>> Update all #define sequential values, the above merging problem would
>>> not happen as there would be a conflict and the enum value
>>> auto-increment would prevent duplicated values anyway.
>>
>> Just one small question for the bitmap usage.
>>
>> For enum we won't directly know the last number is, my concern is if
>> we're using u16 as bitmap and there is some enum over 15, will we get a
>> warning at compile time or some bug would just sneak into kernel?
> 
> Do you have a concrete example where this would happen? Most bits are
> used in 'long' that should be at least 32. I'm not aware of 16bit bits
> flags.
> 

OK, I'm too paranoid here.

The set_bit() definition needs unsigned long *, and passing a u16 * will
cause compile time warning, so it shouldn't be a problem.

And for enum over 63, reviewers will definitely complain anyway.

Thanks,
Qu



signature.asc
Description: OpenPGP digital signature


Re: [PATCH] Fix typos

2018-11-28 Thread Nikolay Borisov



On 28.11.18 г. 15:24 ч., Brendan Hide wrote:
> 
> 
> On 11/28/18 1:23 PM, Nikolay Borisov wrote:
>>
>>
>> On 28.11.18 г. 13:05 ч., Andrea Gelmini wrote:
>>> Signed-off-by: Andrea Gelmini 
>>> ---
> 
> 
> 
>>>
>>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>>> index bab2f1983c07..babbd75d91d2 100644
>>> --- a/fs/btrfs/inode.c
>>> +++ b/fs/btrfs/inode.c
>>> @@ -104,7 +104,7 @@ static void __endio_write_update_ordered(struct
>>> inode *inode,
>>>     /*
>>>    * Cleanup all submitted ordered extents in specified range to
>>> handle errors
>>> - * from the fill_dellaloc() callback.
>>> + * from the fill_delalloc() callback.
>>
>> This is a pure whitespace fix which is generally frowned upon. What you
>> can do though, is replace 'fill_delalloc callback' with
>> 'btrfs_run_delalloc_range' since the callback is gone already.
>>
>>>    *
>>>    * NOTE: caller must ensure that when an error happens, it can not
>>> call
>>>    * extent_clear_unlock_delalloc() to clear both the bits
>>> EXTENT_DO_ACCOUNTING
>>> @@ -1831,7 +1831,7 @@ void btrfs_clear_delalloc_extent(struct inode
>>> *vfs_inode,
>>
>> 
>>
>>> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
>>> index 410c7e007ba8..d7b6c2b09a0c 100644
>>> --- a/fs/btrfs/ioctl.c
>>> +++ b/fs/btrfs/ioctl.c
>>> @@ -892,7 +892,7 @@ static int create_snapshot(struct btrfs_root
>>> *root, struct inode *dir,
>>>    *  7. If we were asked to remove a directory and victim isn't one
>>> - ENOTDIR.
>>>    *  8. If we were asked to remove a non-directory and victim isn't
>>> one - EISDIR.
>>>    *  9. We can't remove a root or mountpoint.
>>> - * 10. We don't allow removal of NFS sillyrenamed files; it's
>>> handled by
>>> + * 10. We don't allow removal of NFS silly renamed files; it's
>>> handled by
>>>    * nfs_async_unlink().
>>>    */
>>>   @@ -3522,7 +3522,7 @@ static int btrfs_extent_same_range(struct
>>> inode *src, u64 loff, u64 olen,
>>>  false);
>>>   /*
>>>    * If one of the inodes has dirty pages in the respective range or
>>> - * ordered extents, we need to flush dellaloc and wait for all
>>> ordered
>>> + * ordered extents, we need to flush delalloc and wait for all
>>> ordered
>>
>> Just whitespace fix, drop it.
>>
>> 
>>
> 
> If the spelling is changed, surely that is not a whitespace fix?

My bad, I missed it the first time.

> 


Re: [PATCH] Fix typos

2018-11-28 Thread Brendan Hide




On 11/28/18 1:23 PM, Nikolay Borisov wrote:



On 28.11.18 г. 13:05 ч., Andrea Gelmini wrote:

Signed-off-by: Andrea Gelmini 
---






diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bab2f1983c07..babbd75d91d2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -104,7 +104,7 @@ static void __endio_write_update_ordered(struct inode 
*inode,
  
  /*

   * Cleanup all submitted ordered extents in specified range to handle errors
- * from the fill_dellaloc() callback.
+ * from the fill_delalloc() callback.


This is a pure whitespace fix which is generally frowned upon. What you
can do though, is replace 'fill_delalloc callback' with
'btrfs_run_delalloc_range' since the callback is gone already.


   *
   * NOTE: caller must ensure that when an error happens, it can not call
   * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
@@ -1831,7 +1831,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,





diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 410c7e007ba8..d7b6c2b09a0c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -892,7 +892,7 @@ static int create_snapshot(struct btrfs_root *root, struct 
inode *dir,
   *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
   *  8. If we were asked to remove a non-directory and victim isn't one - 
EISDIR.
   *  9. We can't remove a root or mountpoint.
- * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
+ * 10. We don't allow removal of NFS silly renamed files; it's handled by
   * nfs_async_unlink().
   */
  
@@ -3522,7 +3522,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,

   false);
/*
 * If one of the inodes has dirty pages in the respective range or
-* ordered extents, we need to flush dellaloc and wait for all ordered
+* ordered extents, we need to flush delalloc and wait for all ordered


Just whitespace fix, drop it.





If the spelling is changed, surely that is not a whitespace fix?


Re: [PATCH 3/3] btrfs: remove redundant nowait check for buffered_write

2018-11-28 Thread Johannes Thumshirn
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn    SUSE Labs Filesystems
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 2/3] btrfs: cleanup the useless DEFINE_WAIT in cleanup_transaction

2018-11-28 Thread Johannes Thumshirn
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn    SUSE Labs Filesystems
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 9/9] btrfs: drop extra enum initialization where using defaults

2018-11-28 Thread Johannes Thumshirn
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn    SUSE Labs Filesystems
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 8/9] btrfs: switch BTRFS_ORDERED_* to enums

2018-11-28 Thread Johannes Thumshirn
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn    SUSE Labs Filesystems
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 6/9] btrfs: switch EXTENT_FLAG_* to enums

2018-11-28 Thread Johannes Thumshirn
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn    SUSE Labs Filesystems
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 0/9] Switch defines to enums

2018-11-28 Thread David Sterba
On Wed, Nov 28, 2018 at 09:33:50AM +0800, Qu Wenruo wrote:
> On 2018/11/28 上午3:53, David Sterba wrote:
> > This is motivated by a merging mistake that happened a few releases ago.
> > Two patches updated BTRFS_FS_* flags independently to the same value,
> > git did not see any direct merge conflict. The two values got mixed at
> > runtime and caused crash.
> > 
> > Update all #define sequential values, the above merging problem would
> > not happen as there would be a conflict and the enum value
> > auto-increment would prevent duplicated values anyway.
> 
> Just one small question for the bitmap usage.
> 
> For enum we won't directly know what the last number is, my concern is if
> we're using u16 as bitmap and there is some enum over 15, will we get a
> warning at compile time or some bug would just sneak into kernel?

Do you have a concrete example where this would happen? Most bits are
used in 'long' that should be at least 32. I'm not aware of 16bit bits
flags.


Re: [PATCH 7/9] btrfs: switch BTRFS_*_LOCK to enums

2018-11-28 Thread David Sterba
On Tue, Nov 27, 2018 at 04:37:16PM -0800, Omar Sandoval wrote:
> On Tue, Nov 27, 2018 at 08:53:55PM +0100, David Sterba wrote:
> > We can use simple enum for values that are not part of on-disk format:
> > tree lock types.
> > 
> > Signed-off-by: David Sterba 
> > ---
> >  fs/btrfs/locking.h | 10 ++
> >  1 file changed, 6 insertions(+), 4 deletions(-)
> > 
> > diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
> > index 29135def468e..684d0ef4faa4 100644
> > --- a/fs/btrfs/locking.h
> > +++ b/fs/btrfs/locking.h
> > @@ -6,10 +6,12 @@
> >  #ifndef BTRFS_LOCKING_H
> >  #define BTRFS_LOCKING_H
> >  
> > -#define BTRFS_WRITE_LOCK 1
> > -#define BTRFS_READ_LOCK 2
> > -#define BTRFS_WRITE_LOCK_BLOCKING 3
> > -#define BTRFS_READ_LOCK_BLOCKING 4
> > +enum {
> > +   BTRFS_WRITE_LOCK,
> 
> See btrfs_set_path_blocking() and btrfs_release_path(); 0 means no lock,
> so this needs to be BTRFS_WRITE_LOCK = 1. I imagine that lockdep would
> catch this.

Oh right of course, thanks for catching it. I'll drop the patch for now,
0 could be added to the set as BTRFS_NO_LOCK and replaced in the code
which is beyond what this series does.


Re: [PATCH 5/9] btrfs: switch EXTENT_BUFFER_* to enums

2018-11-28 Thread Johannes Thumshirn
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn    SUSE Labs Filesystems
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 4/9] btrfs: switch BTRFS_ROOT_* to enums

2018-11-28 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn    SUSE Labs Filesystems
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 2/9] btrfs: switch BTRFS_BLOCK_RSV_* to enums

2018-11-28 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn    SUSE Labs
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH 1/9] btrfs: switch BTRFS_FS_STATE_* to enums

2018-11-28 Thread Johannes Thumshirn
Looks good,
Reviewed-by: Johannes Thumshirn 
-- 
Johannes Thumshirn    SUSE Labs
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH] Fix typos

2018-11-28 Thread Nikolay Borisov



On 28.11.18 г. 13:05 ч., Andrea Gelmini wrote:
> Signed-off-by: Andrea Gelmini 
> ---
> 
> Stupid fixes. Made on 4.20-rc4, and ported on linux-next (next-20181128).
> 
> 
>  fs/btrfs/backref.c |  4 ++--
>  fs/btrfs/check-integrity.c |  2 +-
>  fs/btrfs/compression.c |  4 ++--
>  fs/btrfs/ctree.c   |  4 ++--
>  fs/btrfs/dev-replace.c |  2 +-
>  fs/btrfs/disk-io.c |  4 ++--
>  fs/btrfs/extent-tree.c | 28 ++--
>  fs/btrfs/extent_io.c   |  4 ++--
>  fs/btrfs/extent_io.h   |  2 +-
>  fs/btrfs/extent_map.c  |  2 +-
>  fs/btrfs/file.c|  6 +++---
>  fs/btrfs/free-space-tree.c |  2 +-
>  fs/btrfs/inode.c   | 10 +-
>  fs/btrfs/ioctl.c   |  4 ++--
>  fs/btrfs/lzo.c |  2 +-
>  fs/btrfs/qgroup.c  | 14 +++---
>  fs/btrfs/qgroup.h  |  4 ++--
>  fs/btrfs/raid56.c  |  2 +-
>  fs/btrfs/ref-verify.c  |  6 +++---
>  fs/btrfs/relocation.c  |  2 +-
>  fs/btrfs/scrub.c   |  2 +-
>  fs/btrfs/send.c|  4 ++--
>  fs/btrfs/super.c   |  8 
>  fs/btrfs/transaction.c |  4 ++--
>  fs/btrfs/tree-checker.c|  6 +++---
>  fs/btrfs/tree-log.c|  4 ++--
>  fs/btrfs/volumes.c |  8 
>  27 files changed, 72 insertions(+), 72 deletions(-)
> 
> diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
> index 4a15f87dbbb4..78556447e1d5 100644
> --- a/fs/btrfs/backref.c
> +++ b/fs/btrfs/backref.c
> @@ -591,7 +591,7 @@ unode_aux_to_inode_list(struct ulist_node *node)
>  }
>  
>  /*



> @@ -9807,7 +9807,7 @@ void btrfs_dec_block_group_ro(struct 
> btrfs_block_group_cache *cache)
>  }
>  
>  /*
> - * checks to see if its even possible to relocate this block group.
> + * checks to see if it's even possible to relocate this block group.
>   *
>   * @return - false if not enough space can be found for relocation, true
>   * otherwise
> @@ -9872,7 +9872,7 @@ bool btrfs_can_relocate(struct btrfs_fs_info *fs_info, 
> u64 bytenr)
>* ok we don't have enough space, but maybe we have free space on our
>* devices to allocate new chunks for relocation, so loop through our
>* alloc devices and guess if we have enough space.  if this block
> -  * group is going to be restriped, run checks against the target
> +  * group is going to be restripped, run checks against the target

Drop this hunk; here we mean to restripe the changes and not re-strip
them, so 'restriped' is the correct past tense of the verb.

>* profile instead of the current one.
>*/
>   can_reloc = false;
> @@ -10424,7 +10424,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info 
> *info)
>* check for two cases, either we are full, and therefore
>* don't need to bother with the caching work since we won't
>* find any space, or we are empty, and we can just add all
> -  * the space in and be done with it.  This saves us _alot_ of
> +  * the space in and be done with it.  This saves us _a_lot_ of

This should be _a lot_

>* time, particularly in the full case.
>*/
>   if (found_key.offset == btrfs_block_group_used(>item)) {
> @@ -10700,7 +10700,7 @@ int btrfs_remove_block_group(struct 
> btrfs_trans_handle *trans,
>  
>   mutex_lock(>transaction->cache_write_mutex);
>   /*
> -  * make sure our free spache cache IO is done before remove the
> +  * make sure our free space cache IO is done before remove the

I think there should be a 'we' before 'remove'

>* free space inode
>*/
>   spin_lock(>transaction->dirty_bgs_lock);
> @@ -11217,7 +11217,7 @@ static int btrfs_trim_free_extents(struct 
> btrfs_device *device,
>   if (!blk_queue_discard(bdev_get_queue(device->bdev)))
>   return 0;
>  
> - /* Not writeable = nothing to do. */
> + /* Not writable = nothing to do. */

This comment is redundant so could be removed altogether

>   if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, >dev_state))
>   return 0;
>  
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index aef3c9866ff0..1493f0c102ec 100644



> diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
> index e5089087eaa6..13e71efcfe21 100644
> --- a/fs/btrfs/free-space-tree.c
> +++ b/fs/btrfs/free-space-tree.c
> @@ -550,7 +550,7 @@ static void free_space_set_bits(struct 
> btrfs_block_group_cache *block_group,
>  
>  /*
>   * We can't use btrfs_next_item() in modify_free_space_bitmap() because
> - * btrfs_next_leaf() doesn't get the path for 

[PATCH] Fix typos

2018-11-28 Thread Andrea Gelmini
Signed-off-by: Andrea Gelmini 
---

Stupid fixes. Made on 4.20-rc4, and ported on linux-next (next-20181128).


 fs/btrfs/backref.c |  4 ++--
 fs/btrfs/check-integrity.c |  2 +-
 fs/btrfs/compression.c |  4 ++--
 fs/btrfs/ctree.c   |  4 ++--
 fs/btrfs/dev-replace.c |  2 +-
 fs/btrfs/disk-io.c |  4 ++--
 fs/btrfs/extent-tree.c | 28 ++--
 fs/btrfs/extent_io.c   |  4 ++--
 fs/btrfs/extent_io.h   |  2 +-
 fs/btrfs/extent_map.c  |  2 +-
 fs/btrfs/file.c|  6 +++---
 fs/btrfs/free-space-tree.c |  2 +-
 fs/btrfs/inode.c   | 10 +-
 fs/btrfs/ioctl.c   |  4 ++--
 fs/btrfs/lzo.c |  2 +-
 fs/btrfs/qgroup.c  | 14 +++---
 fs/btrfs/qgroup.h  |  4 ++--
 fs/btrfs/raid56.c  |  2 +-
 fs/btrfs/ref-verify.c  |  6 +++---
 fs/btrfs/relocation.c  |  2 +-
 fs/btrfs/scrub.c   |  2 +-
 fs/btrfs/send.c|  4 ++--
 fs/btrfs/super.c   |  8 
 fs/btrfs/transaction.c |  4 ++--
 fs/btrfs/tree-checker.c|  6 +++---
 fs/btrfs/tree-log.c|  4 ++--
 fs/btrfs/volumes.c |  8 
 27 files changed, 72 insertions(+), 72 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 4a15f87dbbb4..78556447e1d5 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -591,7 +591,7 @@ unode_aux_to_inode_list(struct ulist_node *node)
 }
 
 /*
- * We maintain three seperate rbtrees: one for direct refs, one for
+ * We maintain three separate rbtrees: one for direct refs, one for
  * indirect refs which have a key, and one for indirect refs which do not
  * have a key. Each tree does merge on insertion.
  *
@@ -695,7 +695,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info 
*fs_info,
}
 
/*
-* Now it's a direct ref, put it in the the direct tree. We must
+* Now it's a direct ref, put it in the direct tree. We must
 * do this last because the ref could be merged/freed here.
 */
prelim_ref_insert(fs_info, >direct, ref, NULL);
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 781cae168d2a..42b2c5e159f1 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -2327,7 +2327,7 @@ static int btrfsic_check_all_ref_blocks(struct 
btrfsic_state *state,
 * write operations. Therefore it keeps the linkage
 * information for a block until a block is
 * rewritten. This can temporarily cause incorrect
-* and even circular linkage informations. This
+* and even circular linkage information. This
 * causes no harm unless such blocks are referenced
 * by the most recent super block.
 */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 34d50bc5c10d..d8ce66f3e943 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -1202,7 +1202,7 @@ int btrfs_decompress_buf2page(const char *buf, unsigned 
long buf_start,
 /*
  * Shannon Entropy calculation
  *
- * Pure byte distribution analysis fails to determine compressiability of data.
+ * Pure byte distribution analysis fails to determine compressibility of data.
  * Try calculating entropy to estimate the average minimum number of bits
  * needed to encode the sampled data.
  *
@@ -1266,7 +1266,7 @@ static u8 get4bits(u64 num, int shift) {
 
 /*
  * Use 4 bits as radix base
- * Use 16 u32 counters for calculating new possition in buf array
+ * Use 16 u32 counters for calculating new position in buf array
  *
  * @array - array that will be sorted
  * @array_buf - buffer array to store sorting results
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 1ec08efba814..25a58486bf89 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1414,7 +1414,7 @@ static inline int should_cow_block(struct 
btrfs_trans_handle *trans,
 *
 * What is forced COW:
 *when we create snapshot during committing the transaction,
-*after we've finished coping src root, we must COW the shared
+*after we've finished copying src root, we must COW the shared
 *block to ensure the metadata consistency.
 */
if (btrfs_header_generation(buf) == trans->transid &&
@@ -3746,7 +3746,7 @@ static int push_leaf_right(struct btrfs_trans_handle 
*trans, struct btrfs_root
/* Key greater than all keys in the leaf, right neighbor has
 * enough room for it and we're not emptying our leaf to delete
 * it, therefore use right neighbor to insert the new item and
-* no need to touch/dirty our left leaft. */
+* no need to touch/dirty our left leaf. */
btrfs_tree_unlock(left);
free_extent_buffer(left);
path->

Re: [RFC PATCH] btrfs: drop file privileges in btrfs_clone_files

2018-11-28 Thread Filipe Manana
On Wed, Nov 28, 2018 at 9:26 AM Lu Fengqi  wrote:
>
> On Wed, Nov 28, 2018 at 09:48:07AM +0200, Nikolay Borisov wrote:
> >
> >
> >On 28.11.18 г. 9:46 ч., Christoph Hellwig wrote:
> >> On Wed, Nov 28, 2018 at 09:44:59AM +0200, Nikolay Borisov wrote:
> >>>
> >>>
> >>> On 28.11.18 г. 5:07 ч., Lu Fengqi wrote:
>  The generic/513 tells that cloning into a file did not strip security
>  privileges (suid, capabilities) like a regular write would.
> 
>  Signed-off-by: Lu Fengqi 
>  ---
>  The xfs and ocfs2 call generic_remap_file_range_prep to drop file
>  privileges, I'm not sure whether btrfs should do the same thing.
> >>>
> >>> Why do you think btrfs shouldn't do the same thing. Looking at
>
> I'm not sure whether btrfs intentionally doesn't use the generic check for some reason.
>
> >>> remap_file_range_prep it seems that btrfs is missing a ton of checks
> >>> that are useful i.e immutable files/aligned offsets etc.
>
> It is indeed.
>
> In addition, generic_remap_file_range_prep will invoke inode_dio_wait and
> filemap_write_and_wait_range for the source and destination inode/range.
> For the dedupe case, it will call vfs_dedupe_file_range_compare.
>
> I still can't judge whether these operations are welcome by btrfs. I
> will go deep into the code.
>
> >>
> >> Any chance we could move btrfs over to use remap_file_range_prep so that
> >> all file systems share the exact same checks?
>
> In theory we can call generic_remap_file_range_prep in
> btrfs_remap_file_range, which gives us the opportunity to clean up the
> duplicate check code in btrfs_extent_same and btrfs_clone_files.
>
> >
> >I'm not very familiar with the code; Filipe is more familiar, so adding him to CC.
> >But IMO we should do that provided there are no blockers.
> >
> >Filipe, what do you think, is it feasible?
>
> I'm all ears for the suggestions.

There's no reason why it shouldn't be possible to have them called in
btrfs as well.
There's quite a few changes in vfs and generic functions introduced in
4.20 due to reflink/dedupe bugs, probably either at the time,
or when cloning/dedupe stopped being btrfs specific, someone forgot to
make btrfs use those generic vfs helpers.
I'll take a look as well.

>
> --
> Thanks,
> Lu
>
>


-- 
Filipe David Manana,

“Whether you think you can, or you think you can't — you're right.”


Re: [PATCH 1/3] btrfs: remove always true if branch in find_delalloc_range

2018-11-28 Thread Lu Fengqi
On Wed, Nov 28, 2018 at 09:01:42AM +0200, Nikolay Borisov wrote:
>
>
>On 28.11.18 г. 5:21 ч., Lu Fengqi wrote:
>> The @found is always false when it comes to the if branch. Besides, the
>> bool type is more suitable for @found.
>
>Well if you are ranging the type of found variable it also makes sense
>to change the return value of the function to bool as well.

Good catch.

-- 
Thanks,
Lu

>
>> 
>> Signed-off-by: Lu Fengqi 
>> ---
>>  fs/btrfs/extent_io.c | 7 +++
>>  1 file changed, 3 insertions(+), 4 deletions(-)
>> 
>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>> index 582b4b1c41e0..b4ee3399be96 100644
>> --- a/fs/btrfs/extent_io.c
>> +++ b/fs/btrfs/extent_io.c
>> @@ -1461,7 +1461,7 @@ static noinline u64 find_delalloc_range(struct 
>> extent_io_tree *tree,
>>  struct rb_node *node;
>>  struct extent_state *state;
>>  u64 cur_start = *start;
>> -u64 found = 0;
>> +bool found = false;
>>  u64 total_bytes = 0;
>>  
>>  spin_lock(>lock);
>> @@ -1472,8 +1472,7 @@ static noinline u64 find_delalloc_range(struct 
>> extent_io_tree *tree,
>>   */
>>  node = tree_search(tree, cur_start);
>>  if (!node) {
>> -if (!found)
>> -*end = (u64)-1;
>> +*end = (u64)-1;
>>  goto out;
>>  }
>>  
>> @@ -1493,7 +1492,7 @@ static noinline u64 find_delalloc_range(struct 
>> extent_io_tree *tree,
>>  *cached_state = state;
>>  refcount_inc(>refs);
>>  }
>> -found++;
>> +found = true;
>>  *end = state->end;
>>  cur_start = state->end + 1;
>>  node = rb_next(node);
>> 
>
>




Re: [RFC PATCH] btrfs: drop file privileges in btrfs_clone_files

2018-11-28 Thread Lu Fengqi
On Wed, Nov 28, 2018 at 09:48:07AM +0200, Nikolay Borisov wrote:
>
>
>On 28.11.18 г. 9:46 ч., Christoph Hellwig wrote:
>> On Wed, Nov 28, 2018 at 09:44:59AM +0200, Nikolay Borisov wrote:
>>>
>>>
>>> On 28.11.18 г. 5:07 ч., Lu Fengqi wrote:
 The generic/513 tells that cloning into a file did not strip security
 privileges (suid, capabilities) like a regular write would.

 Signed-off-by: Lu Fengqi 
 ---
 The xfs and ocfs2 call generic_remap_file_range_prep to drop file
 privileges, I'm not sure whether btrfs should do the same thing.
>>>
>>> Why do you think btrfs shouldn't do the same thing. Looking at

I'm not sure whether btrfs intentionally doesn't use the generic check for some reason.

>>> remap_file_range_prep it seems that btrfs is missing a ton of checks
>>> that are useful i.e immutable files/aligned offsets etc.

It is indeed.

In addition, generic_remap_file_range_prep will invoke inode_dio_wait and
filemap_write_and_wait_range for the source and destination inode/range.
For the dedupe case, it will call vfs_dedupe_file_range_compare.

I still can't judge whether these operations are welcome by btrfs. I
will go deep into the code.

>> 
>> Any chance we could move btrfs over to use remap_file_range_prep so that
>> all file systems share the exact same checks?

In theory we can call generic_remap_file_range_prep in
btrfs_remap_file_range, which gives us the opportunity to clean up the
duplicate check code in btrfs_extent_same and btrfs_clone_files.

>
>I'm not very familiar with the code; Filipe is more familiar, so adding him to CC.
>But IMO we should do that provided there are no blockers.
>
>Filipe, what do you think, is it feasible?

I'm all ears for the suggestions.

-- 
Thanks,
Lu




Re: [RFC PATCH 01/17] btrfs: priority alloc: prepare of priority aware allocator

2018-11-28 Thread Su Yue




On 11/28/18 4:24 PM, Nikolay Borisov wrote:



On 28.11.18 г. 5:11 ч., Su Yue wrote:

To implement priority aware allocator, this patch:
Introduces struct btrfs_priority_tree which contains block groups
in same level.
Adds member priority to struct btrfs_block_group_cache and pointer
points to the priority tree it's located.

Adds member priority_trees to struct btrfs_space_info to represents
priority trees in different raid types.

Signed-off-by: Su Yue 
---
  fs/btrfs/ctree.h | 24 
  1 file changed, 24 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e62824cae00a..5c4651d8a524 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -437,6 +437,8 @@ struct btrfs_space_info {
struct rw_semaphore groups_sem;
/* for block groups in our same type */
struct list_head block_groups[BTRFS_NR_RAID_TYPES];
+   /* for priority trees in our same type */
+   struct rb_root priority_trees[BTRFS_NR_RAID_TYPES];
wait_queue_head_t wait;
  
  	struct kobject kobj;

@@ -558,6 +560,21 @@ struct btrfs_full_stripe_locks_tree {
struct mutex lock;
  };
  
+/*

+ * Tree to record all block_groups in same priority level.
+ * Only used in priority aware allocator.
+ */
+struct btrfs_priority_tree {
+   /* protected by groups_sem */
+   struct rb_root block_groups;
+   struct rw_semaphore groups_sem;
+
+   /* for different level priority trees in same index*/
+   struct rb_node node;
+
+   int level;


Do you ever expect the level to be a negative number? If not then use
u8/u32 depending on the range of levels you expect.



Indeed, level is not expected to be negative. u8 is more proper.


+};
+
  struct btrfs_block_group_cache {
struct btrfs_key key;
struct btrfs_block_group_item item;
@@ -571,6 +588,8 @@ struct btrfs_block_group_cache {
u64 flags;
u64 cache_generation;
  
+	/* It's used only when priority aware allocator is enabled. */

+   long priority;


What's the range of priorities you are expecting, wouldn't an u8 be
sufficient, that gives us 256 priorities?



The 6th patch introduces three special priorities. That's what I called
dirty codes.

Thanks,
Su


/*
 * If the free space extent count exceeds this number, convert the block
 * group to bitmaps.
@@ -616,6 +635,9 @@ struct btrfs_block_group_cache {
/* for block groups in the same raid type */
struct list_head list;
  
+	/* for block groups in the same priority level */

+   struct rb_node node;
+
/* usage count */
atomic_t count;
  
@@ -670,6 +692,8 @@ struct btrfs_block_group_cache {
  
  	/* Record locked full stripes for RAID5/6 block group */

struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
+
+   struct btrfs_priority_tree *priority_tree;
  };
  
  /* delayed seq elem */










Re: [PATCH v2 3/3] btrfs: document extent mapping assumptions in checksum

2018-11-28 Thread Nikolay Borisov



On 28.11.18 г. 10:54 ч., Johannes Thumshirn wrote:
> Document why map_private_extent_buffer() cannot return '1' (i.e. the map
> spans two pages) for the csum_tree_block() case.
> 
> The current algorithm for detecting a page boundary crossing in
> map_private_extent_buffer() will return a '1' *IFF* the extent buffer's
> offset in the page + the offset passed in by csum_tree_block() and the
> minimal length passed in by csum_tree_block() - 1 are bigger than
> PAGE_SIZE.
> 
> We always pass BTRFS_CSUM_SIZE (32) as offset and a minimal length of 32
> and the current extent buffer allocator always guarantees page aligned
> extents, so the above condition can't be true.
> 
> Signed-off-by: Johannes Thumshirn 

Reviewed-by: Nikolay Borisov 


> 
> ---
> Changes to v1:
> * Changed wording of the commit message according to Noah's suggestion
> ---
>  fs/btrfs/disk-io.c | 6 ++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 4bc270ef29b4..14d355d0cb7a 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -279,6 +279,12 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
>  
>   len = buf->len - offset;
>   while (len > 0) {
> + /*
> +  * Note: we don't need to check for the err == 1 case here, as
> +  * with the given combination of 'start = BTRFS_CSUM_SIZE (32)'
> +  * and 'min_len = 32' and the currently implemented mapping
> +  * algorithm we cannot cross a page boundary.
> +  */
>   err = map_private_extent_buffer(buf, offset, 32,
>   , _start, _len);
>   if (err)
> 


Re: [RFC PATCH 03/17] btrfs: priority alloc: introduce compute_block_group_priority/usage

2018-11-28 Thread Nikolay Borisov



On 28.11.18 г. 5:11 ч., Su Yue wrote:
> Introduce compute_block_group_usage() and compute_block_group_priority().
> And call the latter in btrfs_make_block_group() and
> btrfs_read_block_groups().
> 
> compute_priority_level use ilog2(free) to compute priority level.
> 
> Signed-off-by: Su Yue 
> ---
>  fs/btrfs/extent-tree.c | 60 ++
>  1 file changed, 60 insertions(+)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index d242a1174e50..0f4c5b1e0bcc 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -10091,6 +10091,7 @@ static int check_chunk_block_group_mappings(struct 
> btrfs_fs_info *fs_info)
>   return ret;
>  }
>  
> +static long compute_block_group_priority(struct btrfs_block_group_cache *bg);

That is ugly, just put the function above the first place where they are
going to be used and don't introduce forward declarations for static
functions.

>  int btrfs_read_block_groups(struct btrfs_fs_info *info)
>  {
>   struct btrfs_path *path;
> @@ -10224,6 +10225,8 @@ int btrfs_read_block_groups(struct btrfs_fs_info 
> *info)
>  
>   link_block_group(cache);
>  
> + cache->priority = compute_block_group_priority(cache);
> +
>   set_avail_alloc_bits(info, cache->flags);
>   if (btrfs_chunk_readonly(info, cache->key.objectid)) {
>   inc_block_group_ro(cache, 1);
> @@ -10373,6 +10376,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle 
> *trans, u64 bytes_used,
>  
>   link_block_group(cache);
>  
> + cache->priority = compute_block_group_priority(cache);
> +
>   list_add_tail(>bg_list, >new_bgs);
>  
>   set_avail_alloc_bits(fs_info, type);
> @@ -11190,3 +11195,58 @@ void btrfs_mark_bg_unused(struct 
> btrfs_block_group_cache *bg)
>   }
>   spin_unlock(_info->unused_bgs_lock);
>  }
> +
> +enum btrfs_priority_shift {
> + PRIORITY_USAGE_SHIFT = 0
> +};
> +
> +static inline u8
> +compute_block_group_usage(struct btrfs_block_group_cache *cache)
> +{
> + u64 num_bytes;
> + u8 usage;
> +
> + num_bytes = cache->reserved + cache->bytes_super +
> + btrfs_block_group_used(>item);
> +
> + usage = div_u64(num_bytes, div_factor_fine(cache->key.offset, 1));

Mention somewhere (either as a function description or in the patch
description) that you use the % used.

> +
> + return usage;
> +}
> +
> +static long compute_block_group_priority(struct btrfs_block_group_cache *bg)
> +{
> + u8 usage;
> + long priority = 0;
> +
> + if (btrfs_test_opt(bg->fs_info, PRIORITY_USAGE)) {
> + usage = compute_block_group_usage(bg);
> + priority |= usage << PRIORITY_USAGE_SHIFT;
> + }

Why is priority a signed type and not unsigned, I assume priority can
never be negative? I briefly looked at the other patches and most of the
time the argument passed is indeed na unsigned value.

> +
> + return priority;
> +}
> +
> +static int compute_priority_level(struct btrfs_fs_info *fs_info,
> +   long priority)
> +{
> + int level = 0;
> +
> + if (btrfs_test_opt(fs_info, PRIORITY_USAGE)) {
> + u8 free;
> +
> + WARN_ON(priority < 0);

I think this WARN_ON is redundant provided that the high-level
interfaces are sane and don't allow negative value to trickle down.

> + free = 100 - (priority >> PRIORITY_USAGE_SHIFT);
> +
> + if (free == 0)
> + level = 0;
> + else if (free == 100)
> + level = ilog2(free) + 1;
> + else
> + level = ilog2(free);
> + /* log2(1) == 0, leave level 0 for unused block_groups */
> + level = ilog2(100) + 1 - level;
> + }
> +
> + return level;
> +}
> 


[PATCH v2 3/3] btrfs: document extent mapping assumptions in checksum

2018-11-28 Thread Johannes Thumshirn
Document why map_private_extent_buffer() cannot return '1' (i.e. the map
spans two pages) for the csum_tree_block() case.

The current algorithm for detecting a page boundary crossing in
map_private_extent_buffer() will return a '1' *IFF* the extent buffer's
offset in the page + the offset passed in by csum_tree_block() and the
minimal length passed in by csum_tree_block() - 1 are bigger than
PAGE_SIZE.

We always pass BTRFS_CSUM_SIZE (32) as offset and a minimal length of 32
and the current extent buffer allocator always guarantees page aligned
extents, so the above condition can't be true.

Signed-off-by: Johannes Thumshirn 

---
Changes to v1:
* Changed wording of the commit message according to Noah's suggestion
---
 fs/btrfs/disk-io.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4bc270ef29b4..14d355d0cb7a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -279,6 +279,12 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
 
len = buf->len - offset;
while (len > 0) {
+   /*
+* Note: we don't need to check for the err == 1 case here, as
+* with the given combination of 'start = BTRFS_CSUM_SIZE (32)'
+* and 'min_len = 32' and the currently implemented mapping
+* algorithm we cannot cross a page boundary.
+*/
err = map_private_extent_buffer(buf, offset, 32,
, _start, _len);
if (err)
-- 
2.16.4



[PATCH v2 1/3] btrfs: don't initialize 'offset' in map_private_extent_buffer()

2018-11-28 Thread Johannes Thumshirn
In map_private_extent_buffer() the 'offset' variable is initialized to a
page aligned version of the 'start' parameter.

But later on it is overwritten with either the offset from the extent
buffer's start or 0.

So get rid of the initial initialization.

Signed-off-by: Johannes Thumshirn 
Reviewed-by: Nikolay Borisov 
---
 fs/btrfs/extent_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 582b4b1c41e0..7aafdec49dc3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -5380,7 +5380,7 @@ int map_private_extent_buffer(const struct extent_buffer 
*eb,
  char **map, unsigned long *map_start,
  unsigned long *map_len)
 {
-   size_t offset = start & (PAGE_SIZE - 1);
+   size_t offset;
char *kaddr;
struct page *p;
size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
-- 
2.16.4



[PATCH v2 2/3] btrfs: use offset_in_page for start_offset in map_private_extent_buffer()

2018-11-28 Thread Johannes Thumshirn
In map_private_extent_buffer() use offset_in_page() to initialize
'start_offset' instead of open-coding it.

Signed-off-by: Johannes Thumshirn 
Reviewed-by: Nikolay Borisov 
---
 fs/btrfs/extent_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7aafdec49dc3..85cd3975c680 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -5383,7 +5383,7 @@ int map_private_extent_buffer(const struct extent_buffer 
*eb,
size_t offset;
char *kaddr;
struct page *p;
-   size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+   size_t start_offset = offset_in_page(eb->start);
unsigned long i = (start_offset + start) >> PAGE_SHIFT;
unsigned long end_i = (start_offset + start + min_len - 1) >>
PAGE_SHIFT;
-- 
2.16.4



[PATCH v2 0/3] Misc cosmetic changes for map_private_extent_buffer

2018-11-28 Thread Johannes Thumshirn
While trying to understand the checksum code I came across some oddities
regarding map_private_extent_buffer() and its interaction with
csum_tree_block().

These patches address them but are either purely cosmetic or only add a
comment documenting behaviour.

Changes to v1:
* Add Reviewed-by tags
* Change wording of commit message in patch 3/3

Johannes Thumshirn (3):
  btrfs: don't initialize 'offset' in map_private_extent_buffer()
  btrfs: use offset_in_page for start_offset in
map_private_extent_buffer()
  btrfs: document extent mapping assumptions in checksum

 fs/btrfs/disk-io.c   | 6 ++
 fs/btrfs/extent_io.c | 4 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)

-- 
2.16.4



Re: [PATCH 2/2] btrfs: scrub: fix circular locking dependency warning

2018-11-28 Thread Anand Jain




On 11/26/2018 05:59 PM, Nikolay Borisov wrote:



On 26.11.18 г. 11:07 ч., Anand Jain wrote:

Circular locking dependency check reports warning[1], that's because
the btrfs_scrub_dev() calls the stack #0 below with the
fs_info::scrub_lock held. The test case leading to this warning..

   mkfs.btrfs -fq /dev/sdb && mount /dev/sdb /btrfs
   btrfs scrub start -B /btrfs

In fact we have fs_info::scrub_workers_refcnt to track if the init and
destroy of the scrub workers are needed. So once we have incremented
and decremented the fs_info::scrub_workers_refcnt value in the thread,
its ok to drop the scrub_lock, and then actually do the
btrfs_destroy_workqueue() part. So this patch drops the scrub_lock
before calling btrfs_destroy_workqueue().

[1]
[   76.146826] ==
[   76.147086] WARNING: possible circular locking dependency detected
[   76.147316] 4.20.0-rc3+ #41 Not tainted
[   76.147489] --
[   76.147722] btrfs/4065 is trying to acquire lock:
[   76.147984] 38593bc0 ((wq_completion)"%s-%s""btrfs",
name){+.+.}, at: flush_workqueue+0x70/0x4d0
[   76.148337]
but task is already holding lock:
[   76.148594] 62392ab7 (&fs_info->scrub_lock){+.+.}, at:
btrfs_scrub_dev+0x316/0x5d0 [btrfs]
[   76.148909]
which lock already depends on the new lock.

[   76.149191]
the existing dependency chain (in reverse order) is:
[   76.149446]
-> #3 (&fs_info->scrub_lock){+.+.}:
[   76.149707]btrfs_scrub_dev+0x11f/0x5d0 [btrfs]
[   76.149924]btrfs_ioctl+0x1ac3/0x2d80 [btrfs]
[   76.150216]do_vfs_ioctl+0xa9/0x6d0
[   76.150468]ksys_ioctl+0x60/0x90
[   76.150716]__x64_sys_ioctl+0x16/0x20
[   76.150911]do_syscall_64+0x50/0x180
[   76.151182]entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   76.151469]
-> #2 (&fs_devs->device_list_mutex){+.+.}:
[   76.151851]reada_start_machine_worker+0xca/0x3f0 [btrfs]
[   76.152195]normal_work_helper+0xf0/0x4c0 [btrfs]
[   76.152489]process_one_work+0x1f4/0x520
[   76.152751]worker_thread+0x46/0x3d0
[   76.153715]kthread+0xf8/0x130
[   76.153912]ret_from_fork+0x3a/0x50
[   76.154178]
-> #1 ((work_completion)(&work->normal_work)){+.+.}:
[   76.154575]worker_thread+0x46/0x3d0
[   76.154828]kthread+0xf8/0x130
[   76.155108]ret_from_fork+0x3a/0x50
[   76.155357]
-> #0 ((wq_completion)"%s-%s""btrfs", name){+.+.}:
[   76.155751]flush_workqueue+0x9a/0x4d0
[   76.155911]drain_workqueue+0xca/0x1a0
[   76.156182]destroy_workqueue+0x17/0x230
[   76.156455]btrfs_destroy_workqueue+0x5d/0x1c0 [btrfs]
[   76.156756]scrub_workers_put+0x2e/0x60 [btrfs]
[   76.156931]btrfs_scrub_dev+0x329/0x5d0 [btrfs]
[   76.157219]btrfs_ioctl+0x1ac3/0x2d80 [btrfs]
[   76.157491]do_vfs_ioctl+0xa9/0x6d0
[   76.157742]ksys_ioctl+0x60/0x90
[   76.157910]__x64_sys_ioctl+0x16/0x20
[   76.158177]do_syscall_64+0x50/0x180
[   76.158429]entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   76.158716]
other info that might help us debug this:

[   76.158908] Chain exists of:
   (wq_completion)"%s-%s""btrfs", name --> &fs_devs->device_list_mutex
--> &fs_info->scrub_lock

[   76.159629]  Possible unsafe locking scenario:

[   76.160607]CPU0CPU1
[   76.160934]
[   76.161210]   lock(&fs_info->scrub_lock);
[   76.161458]
lock(&fs_devs->device_list_mutex);
[   76.161805]
lock(&fs_info->scrub_lock);
[   76.161909]   lock((wq_completion)"%s-%s""btrfs", name);
[   76.162201]
  *** DEADLOCK ***

[   76.162627] 2 locks held by btrfs/4065:
[   76.162897]  #0: bef2775b (sb_writers#12){.+.+}, at:
mnt_want_write_file+0x24/0x50
[   76.163335]  #1: 62392ab7 (&fs_info->scrub_lock){+.+.}, at:
btrfs_scrub_dev+0x316/0x5d0 [btrfs]
[   76.163796]
stack backtrace:
[   76.163911] CPU: 1 PID: 4065 Comm: btrfs Not tainted 4.20.0-rc3+ #41
[   76.164228] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS
VirtualBox 12/01/2006
[   76.164646] Call Trace:
[   76.164872]  dump_stack+0x5e/0x8b
[   76.165128]  print_circular_bug.isra.37+0x1f1/0x1fe
[   76.165398]  __lock_acquire+0x14aa/0x1620
[   76.165652]  lock_acquire+0xb0/0x190
[   76.165910]  ? flush_workqueue+0x70/0x4d0
[   76.166175]  flush_workqueue+0x9a/0x4d0
[   76.166420]  ? flush_workqueue+0x70/0x4d0
[   76.166671]  ? drain_workqueue+0x52/0x1a0
[   76.166911]  drain_workqueue+0xca/0x1a0
[   76.167167]  destroy_workqueue+0x17/0x230
[   76.167428]  btrfs_destroy_workqueue+0x5d/0x1c0 [btrfs]
[   76.167720]  scrub_workers_put+0x2e/0x60 [btrfs]
[   76.168233]  btrfs_scrub_dev+0x329/0x5d0 [btrfs]
[   76.168504]  ? __sb_start_write+0x121/0x1b0
[   76.168759]  ? mnt_want_write_file+0x24/0x50
[   76.169654]  btrfs_ioctl+0x1ac3/0x2d80 [btrfs]
[   76.169934]  ? find_held_lock+0x2d/0x90
[   76.170204]  ? find_held_lock+0x2d/0x90
[   76.170450]  

Re: [PATCH 3/3] btrfs: document extent mapping assumptions in checksum

2018-11-28 Thread Johannes Thumshirn
On 27/11/2018 17:36, Nikolay Borisov wrote:
> 
> 
> On 27.11.18 г. 18:00 ч., Johannes Thumshirn wrote:
>> Document why map_private_extent_buffer() cannot return '1' (i.e. the map
>> spans two pages) for the csum_tree_block() case.
>>
>> The current algorithm for detecting a page boundary crossing in
>> map_private_extent_buffer() will return a '1' *IFF* the product of the
> 
> I think the word product must be replaced with 'sum', since product
> implies multiplication :)

*doh* m)

Yes thanks for spotting this.

-- 
Johannes ThumshirnSUSE Labs
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [RFC PATCH 01/17] btrfs: priority alloc: prepare of priority aware allocator

2018-11-28 Thread Nikolay Borisov



On 28.11.18 г. 5:11 ч., Su Yue wrote:
> To implement priority aware allocator, this patch:
> Introduces struct btrfs_priority_tree which contains block groups
> in same level.
> Adds member priority to struct btrfs_block_group_cache and pointer
> points to the priority tree it's located.
> 
> Adds member priority_trees to struct btrfs_space_info to represents
> priority trees in different raid types.
> 
> Signed-off-by: Su Yue 
> ---
>  fs/btrfs/ctree.h | 24 
>  1 file changed, 24 insertions(+)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index e62824cae00a..5c4651d8a524 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -437,6 +437,8 @@ struct btrfs_space_info {
>   struct rw_semaphore groups_sem;
>   /* for block groups in our same type */
>   struct list_head block_groups[BTRFS_NR_RAID_TYPES];
> + /* for priority trees in our same type */
> + struct rb_root priority_trees[BTRFS_NR_RAID_TYPES];
>   wait_queue_head_t wait;
>  
>   struct kobject kobj;
> @@ -558,6 +560,21 @@ struct btrfs_full_stripe_locks_tree {
>   struct mutex lock;
>  };
>  
> +/*
> + * Tree to record all block_groups in same priority level.
> + * Only used in priority aware allocator.
> + */
> +struct btrfs_priority_tree {
> + /* protected by groups_sem */
> + struct rb_root block_groups;
> + struct rw_semaphore groups_sem;
> +
> + /* for different level priority trees in same index */
> + struct rb_node node;
> +
> + int level;

Do you ever expect the level to be a negative number? If not then use
u8/u32 depending on the range of levels you expect.

> +};
> +
>  struct btrfs_block_group_cache {
>   struct btrfs_key key;
>   struct btrfs_block_group_item item;
> @@ -571,6 +588,8 @@ struct btrfs_block_group_cache {
>   u64 flags;
>   u64 cache_generation;
>  
> + /* It's used only when priority aware allocator is enabled. */
> + long priority;

What's the range of priorities you are expecting, wouldn't an u8 be
sufficient, that gives us 256 priorities?

>   /*
>* If the free space extent count exceeds this number, convert the block
>* group to bitmaps.
> @@ -616,6 +635,9 @@ struct btrfs_block_group_cache {
>   /* for block groups in the same raid type */
>   struct list_head list;
>  
> + /* for block groups in the same priority level */
> + struct rb_node node;
> +
>   /* usage count */
>   atomic_t count;
>  
> @@ -670,6 +692,8 @@ struct btrfs_block_group_cache {
>  
>   /* Record locked full stripes for RAID5/6 block group */
>   struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
> +
> + struct btrfs_priority_tree *priority_tree;
>  };
>  
>  /* delayed seq elem */
> 


Re: [PATCH 5/8] btrfs: don't enospc all tickets on flush failure

2018-11-28 Thread Nikolay Borisov



On 27.11.18 г. 21:46 ч., Josef Bacik wrote:
> On Mon, Nov 26, 2018 at 02:25:52PM +0200, Nikolay Borisov wrote:
>>
>>
>> On 21.11.18 г. 21:03 ч., Josef Bacik wrote:
>>> With the introduction of the per-inode block_rsv it became possible to
>>> have really really large reservation requests made because of data
>>> fragmentation.  Since the ticket stuff assumed that we'd always have
>>> relatively small reservation requests it just killed all tickets if we
>>> were unable to satisfy the current request.  However this is generally
>>> not the case anymore.  So fix this logic to instead see if we had a
>>> ticket that we were able to give some reservation to, and if we were
>>> continue the flushing loop again.  Likewise we make the tickets use the
>>> space_info_add_old_bytes() method of returning what reservation they did
>>> receive in hopes that it could satisfy reservations down the line.
>>
>>
>> The logic of the patch can be summarised as follows:
>>
>> If no progress is made for a ticket, then start failing all tickets until
>> the first one that has progress made on its reservation (inclusive). In
>> this case this first ticket will be failed but at least its space will
>> be reused via space_info_add_old_bytes.
>>
>> Frankly this seem really arbitrary.
> 
> It's not though.  The tickets are in order of who requested the reservation.
> Because we will backfill reservations for things like hugely fragmented files 
> or
> large amounts of delayed refs we can have spikes where we're trying to reserve
> 100mb's of metadata space.  We may fill 50mb of that before we run out of 
> space.
> Well so we can't satisfy that reservation, but the small 100k reservations 
> that
> are waiting to be serviced can be satisfied and they can run.  The alternative
> is you get ENOSPC and then you can turn around and touch a file no problem
> because it's a small reservation and there was room for it.  This patch 
> enables
> better behavior for the user.  Thanks,

Well this information needs to be in the changelog since it describes the
situation where this patch is useful.

> 
> Josef
>