[PATCH] btrfs: trivial fix of __btrfs_set_acl error handling

2015-09-08 Thread Sheng Yong
* If the allocation failed, don't try to free it, even though kfree()
  allows freeing a NULL pointer.
* If posix_acl_to_xattr() failed, cleanup the allocation and return
  the error directly.

Signed-off-by: Sheng Yong 
---
 fs/btrfs/acl.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 9a0124a..6d01d09 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -103,18 +103,18 @@ static int __btrfs_set_acl(struct btrfs_trans_handle 
*trans,
if (acl) {
size = posix_acl_xattr_size(acl->a_count);
value = kmalloc(size, GFP_NOFS);
-   if (!value) {
-   ret = -ENOMEM;
-   goto out;
-   }
+   if (!value)
+   return -ENOMEM;
 
ret = posix_acl_to_xattr(_user_ns, acl, value, size);
-   if (ret < 0)
-   goto out;
+   if (ret < 0) {
+   kfree(value);
+   return ret;
+   }
}
 
ret = __btrfs_setxattr(trans, inode, name, value, size, 0);
-out:
+
kfree(value);
 
if (!ret)
-- 
1.8.3.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: trivial fix of __btrfs_set_acl error handling

2015-09-08 Thread Sheng Yong
Hi, Qu

On 9/8/2015 4:50 PM, Qu Wenruo wrote:
> Sheng Yong wrote on 2015/09/08 08:46 +:
>> * If the allocation failed, don't free to free it, even though kfree
>>allows to free a NULL pointer.
>> * If posix_acl_to_xattr() failed, cleanup the allocation and return
>>the error directly.
> So, what's the point?
> For me, I didn't see the pros of the change.
> As kfree() allow NULL pointer, why not use it?
In fact, there are no semantic changes. It's just that when I walked through
the code, I found there is no need to call kfree() there, and it could be cleaned up.
It's fine to keep as it is :)

thanks,
Sheng
> 
> Thanks,
> Qu
>>
>> Signed-off-by: Sheng Yong 
>> ---
>>   fs/btrfs/acl.c | 14 +++---
>>   1 file changed, 7 insertions(+), 7 deletions(-)
>>
>> diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
>> index 9a0124a..6d01d09 100644
>> --- a/fs/btrfs/acl.c
>> +++ b/fs/btrfs/acl.c
>> @@ -103,18 +103,18 @@ static int __btrfs_set_acl(struct btrfs_trans_handle 
>> *trans,
>>   if (acl) {
>>   size = posix_acl_xattr_size(acl->a_count);
>>   value = kmalloc(size, GFP_NOFS);
>> -if (!value) {
>> -ret = -ENOMEM;
>> -goto out;
>> -}
>> +if (!value)
>> +return -ENOMEM;
>>
>>   ret = posix_acl_to_xattr(_user_ns, acl, value, size);
>> -if (ret < 0)
>> -goto out;
>> +if (ret < 0) {
>> +kfree(value);
>> +return ret;
>> +}
>>   }
>>
>>   ret = __btrfs_setxattr(trans, inode, name, value, size, 0);
>> -out:
>> +
>>   kfree(value);
>>
>>   if (!ret)
>>
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 05/19] btrfs: qgroup: Introduce function to reserve data range per inode

2015-09-08 Thread Qu Wenruo
Introduce new function reserve_data_range().
This function will find the non-overlapping ranges and insert them into the
reserve map using the previously introduced functions.

This provides the basis for the later per-inode reserve map implementation.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/qgroup.c | 92 +++
 1 file changed, 92 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index a4e3af4..77a2e07 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2701,6 +2701,98 @@ static int insert_data_ranges(struct 
btrfs_qgroup_data_rsv_map *map,
 }
 
 /*
+ * Check qgroup limit and insert dirty range into reserve_map.
+ *
+ * Must be called with map->lock hold
+ */
+static int reserve_data_range(struct btrfs_root *root,
+ struct btrfs_qgroup_data_rsv_map *map,
+ struct data_rsv_range *tmp,
+ struct ulist *insert_list, u64 start, u64 len)
+{
+   struct data_rsv_range *range;
+   u64 cur_start = 0;
+   u64 cur_len = 0;
+   u64 reserve = 0;
+   int ret = 0;
+
+   range = find_reserve_range(map, start);
+   /* empty tree, insert the whole range */
+   if (!range) {
+   reserve = len;
+   ret = ulist_add(insert_list, start, len, GFP_ATOMIC);
+   if (ret < 0)
+   return ret;
+   goto insert;
+   }
+
+   /* For case range is covering the leading part */
+   if (range->start <= start && range->start + range->len > start)
+   cur_start = range->start + range->len;
+   else
+   cur_start = start;
+
+   /*
+* iterate until the end of the range.
+* Like the following:
+*
+*  ||
+*|//1//|   |2//|   |///3///|   <- exists
+* Then we will need to insert the following
+*  |\\\4\\\|   |\\\5\\\|   |\\\6\\\|
+* And only add qgroup->reserved for rang 4,5,6.
+*/
+   while (cur_start < start + len) {
+   struct rb_node *next_node;
+   u64 next_start;
+
+   if (range->start + range->len <= cur_start) {
+   /*
+* Move to next range if current range is before
+* cur_start
+* e.g range is 1, cur_start is the end of range 1.
+*/
+   next_node = rb_next(>node);
+   if (!next_node) {
+   /*
+* no next range, fill the rest
+* e.g range is 3, cur_start is end of range 3.
+*/
+   cur_len = start + len - cur_start;
+   next_start = start + len;
+   } else {
+   range = rb_entry(next_node,
+struct data_rsv_range, node);
+   cur_len = min(range->start, start + len) -
+ cur_start;
+   next_start = range->start + range->len;
+   }
+   } else {
+   /*
+* current range is already after cur_start
+* e.g range is 2, cur_start is end of range 1.
+*/
+   cur_len = min(range->start, start + len) - cur_start;
+   next_start = range->start + range->len;
+   }
+   reserve += cur_len;
+   ret = ulist_add(insert_list, cur_start, cur_len, GFP_ATOMIC);
+   if (ret < 0)
+   return ret;
+
+   cur_start = next_start;
+   }
+insert:
+   ret = btrfs_qgroup_reserve(root, reserve);
+   if (ret < 0)
+   return ret;
+   /* ranges must be inserted after we are sure it has enough space */
+   ret = insert_data_ranges(map, tmp, insert_list);
+   map->reserved += reserve;
+   return ret;
+}
+
+/*
  * Init data_rsv_map for a given inode.
  *
  * This is needed at write time as quota can be disabled and then enabled
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 00/14] Accurate qgroup reserve framework

2015-09-08 Thread Qu Wenruo

Sorry for the confusing cover letter title.

This patch is no longer RFC now.
It's already a working one, and we're doing stress tests to ensure it's 
completely OK; it seems quite good so far.


To Chris,

I know the timing I sent the patchset is quite awful, as there is only 
less than 1 week for rc1, and the merge window will close soon.


But I still hope there would be a small chance we can merge it into 
early v4.3-rc. Maybe rc2 or rc3?
As the reserve space leaking problem is quite annoying, sometimes even 
making qgroup limit unusable.


If that's not possible, I'm completely OK with that though, as Linus 
won't be happy about that without doubt.


Thanks,
Qu

Qu Wenruo wrote on 2015/09/08 16:56 +0800:

[[BUG]]
One of the most common case to trigger the bug is the following method:
1) Enable quota
2) Limit excl of qgroup 5 to 16M
3) Write [0,2M) of a file inside subvol 5 10 times without sync

EDQUOT will be triggered at about the 8th write.

[[CAUSE]]
The problem is caused by the fact that qgroup will reserve space even
if the data space is already reserved.

In above reproducer, each time we buffered write [0,2M) qgroup will
reserve 2M space, but in fact, at the 1st time, we have already reserved
2M and from then on, we don't need to reserve any data space as we are
only writing [0,2M).

Also, the reserved space will only be freed *ONCE* when its backref is
run at commit_transaction() time.

That's causing the reserved space leaking.

[[FIX]]
The fix is not a simple one, as currently btrfs_qgroup_reserve() follows
the very bad btrfs space allocating principle:
   Allocate as much as you need, even if it's not fully used.

So for accurate qgroup reserve, we introduce a completely new framework
for data and metadata.
1) Per-inode data reserve map
Now, each inode will have a data reserve map, recording which range
of data is already reserved.
If we are writing a range which is already reserved, we won't need to
reserve space again.

Also, for the fact that qgroup is only accounted at commit_trans(),
for data commit into disc and its metadata is also inserted into
current tree, we should free the data reserved range, but still keep
the reserved space until commit_trans().

So delayed_ref_head will have new members to record how much space is
reserved and free them at commit_trans() time.

2) Per-root metadata reserve counter
For metadata(tree block), it's impossible to know how much space it
will use exactly in advance.
And due to the new qgroup accounting framework, the old
free-at-end-trans may lead to exceeding limit.

So we record how much metadata space is reserved for each root, and
free them at commit_trans() time.
This method is not perfect, but thanks to the compared small size of
metadata, it should be quite good.

More detailed info can be found in each commit message and source
comments.

Qu Wenruo (19):
   btrfs: qgroup: New function declaration for new reserve implement
   btrfs: qgroup: Implement data_rsv_map init/free functions
   btrfs: qgroup: Introduce new function to search most left reserve
 range
   btrfs: qgroup: Introduce function to insert non-overlap reserve range
   btrfs: qgroup: Introduce function to reserve data range per inode
   btrfs: qgroup: Introduce btrfs_qgroup_reserve_data function
   btrfs: qgroup: Introduce function to release reserved range
   btrfs: qgroup: Introduce function to release/free reserved data range
   btrfs: delayed_ref: Add new function to record reserved space into
 delayed ref
   btrfs: delayed_ref: release and free qgroup reserved at proper timing
   btrfs: qgroup: Introduce new functions to reserve/free metadata
   btrfs: qgroup: Use new metadata reservation.
   btrfs: extent-tree: Add new verions of btrfs_check_data_free_space
   btrfs: Switch to new check_data_free_space
   btrfs: fallocate: Add support to accurate qgroup reserve
   btrfs: extent-tree: Add new version of btrfs_delalloc_reserve_space
   btrfs: extent-tree: Use new __btrfs_delalloc_reserve_space function
   btrfs: qgroup: Cleanup old inaccurate facilities
   btrfs: qgroup: Add handler for NOCOW and inline

  fs/btrfs/btrfs_inode.h |   6 +
  fs/btrfs/ctree.h   |   8 +-
  fs/btrfs/delayed-ref.c |  29 +++
  fs/btrfs/delayed-ref.h |  14 +
  fs/btrfs/disk-io.c |   1 +
  fs/btrfs/extent-tree.c |  99 +---
  fs/btrfs/file.c| 169 +
  fs/btrfs/inode-map.c   |   2 +-
  fs/btrfs/inode.c   |  51 +++-
  fs/btrfs/ioctl.c   |   3 +-
  fs/btrfs/qgroup.c  | 674 -
  fs/btrfs/qgroup.h  |  18 +-
  fs/btrfs/transaction.c |  34 +--
  fs/btrfs/transaction.h |   1 -
  14 files changed, 979 insertions(+), 130 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 08/19] btrfs: qgroup: Introduce function to release/free reserved data range

2015-09-08 Thread Qu Wenruo
Introduce functions btrfs_qgroup_release/free_data() to release/free
reserved data range.

Release means, just remove the data range from data rsv map, but doesn't
free the reserved space.
This is for normal buffered write case, when data is written into disc
and its metadata is added into tree, its reserved space should still be
kept until commit_trans().
So in that case, we only release dirty range, but keep the reserved
space recorded in some other place until commit_trans().

Free means not only removing the data range, but also freeing the reserved space.
This is used for the cleanup case.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/qgroup.c | 48 
 fs/btrfs/qgroup.h |  2 ++
 2 files changed, 50 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index e24c10d..ba7888f 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2979,6 +2979,54 @@ next:
return 0;
 }
 
+static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
+  int free_reserved)
+{
+   struct data_rsv_range *tmp;
+   struct btrfs_qgroup_data_rsv_map *map;
+   u64 reserved = 0;
+   int ret;
+
+   spin_lock(_I(inode)->qgroup_init_lock);
+   map = BTRFS_I(inode)->qgroup_rsv_map;
+   spin_unlock(_I(inode)->qgroup_init_lock);
+   if (!map)
+   return 0;
+
+   tmp = kmalloc(sizeof(*tmp), GFP_NOFS);
+   if (!tmp)
+   return -ENOMEM;
+   spin_lock(>lock);
+   ret = release_data_range(map, tmp, start, len, );
+   /* release_data_range() won't fail only check if memory is used */
+   if (ret == 0)
+   kfree(tmp);
+   if (free_reserved)
+   btrfs_qgroup_free(BTRFS_I(inode)->root, reserved);
+   spin_unlock(>lock);
+   return 0;
+}
+
+/*
+ * Caller should be truncate/invalidate_page.
+ * As it will release the reserved data.
+ */
+int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len)
+{
+   return __btrfs_qgroup_release_data(inode, start, len, 1);
+}
+
+/*
+ * Caller should be finish_ordered_io
+ * As qgroup accouting happens at commit time, for data written to disk
+ * its reserved space should not be freed until commit.
+ * Or we may beyond the limit.
+ */
+int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
+{
+   return __btrfs_qgroup_release_data(inode, start, len, 0);
+}
+
 /*
  * Init data_rsv_map for a given inode.
  *
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 366b853..8e69dc1 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -88,4 +88,6 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, 
u64 qgroupid,
 int btrfs_qgroup_init_data_rsv_map(struct inode *inode);
 void btrfs_qgroup_free_data_rsv_map(struct inode *inode);
 int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len);
 #endif /* __BTRFS_QGROUP__ */
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 13/19] btrfs: extent-tree: Add new versions of btrfs_check_data_free_space

2015-09-08 Thread Qu Wenruo
Add new function __btrfs_check_data_free_space() to do precise space
reservation.

The new function will replace old btrfs_check_data_free_space(), but
until all the change is done, let's just use the new name.

Also, export internal use function btrfs_alloc_data_chunk_ondemand(), as
now qgroup reserve requires precise bytes, which can only be got in
later loop(like fallocate).
But data space info check and data chunk allocate doesn't need to be
that accurate, and can be called at the beginning.

So export it for later operations.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/ctree.h   |  2 ++
 fs/btrfs/extent-tree.c | 50 +-
 2 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ae86025..c1a0aaf 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3453,6 +3453,8 @@ enum btrfs_reserve_flush_enum {
 };
 
 int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 
write_bytes);
+int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 402415c..61366ca 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3907,11 +3907,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int 
data)
return ret;
 }
 
-/*
- * This will check the space that the inode allocates from to make sure we have
- * enough space for bytes.
- */
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 
write_bytes)
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
 {
struct btrfs_space_info *data_sinfo;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4032,19 +4028,55 @@ commit_trans:
  data_sinfo->flags, bytes, 1);
return -ENOSPC;
}
-   ret = btrfs_qgroup_reserve(root, write_bytes);
-   if (ret)
-   goto out;
data_sinfo->bytes_may_use += bytes;
trace_btrfs_space_reservation(root->fs_info, "space_info",
  data_sinfo->flags, bytes, 1);
-out:
spin_unlock(_sinfo->lock);
 
return ret;
 }
 
 /*
+ * This will check the space that the inode allocates from to make sure we have
+ * enough space for bytes.
+ */
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 
write_bytes)
+{
+   struct btrfs_root *root = BTRFS_I(inode)->root;
+   int ret;
+
+   ret = btrfs_alloc_data_chunk_ondemand(inode, bytes);
+   if (ret < 0)
+   return ret;
+   ret = btrfs_qgroup_reserve(root, write_bytes);
+   return ret;
+}
+
+/*
+ * New check_data_free_space() with ability for precious data reserveation
+ * Will replace old btrfs_check_data_free_space(), but for patch split,
+ * add a new function first and then replace it.
+ */
+int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
+{
+   struct btrfs_root *root = BTRFS_I(inode)->root;
+   int ret;
+
+   /* align the range */
+   len = round_up(start + len, root->sectorsize) -
+ round_down(start, root->sectorsize);
+   start = round_down(start, root->sectorsize);
+
+   ret = btrfs_alloc_data_chunk_ondemand(inode, len);
+   if (ret < 0)
+   return ret;
+
+   /* Use new btrfs_qgroup_reserve_data to reserve precious data space */
+   ret = btrfs_qgroup_reserve_data(inode, start, len);
+   return ret;
+}
+
+/*
  * Called if we need to clear a data reservation for this inode.
  */
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 15/19] btrfs: fallocate: Add support to accurate qgroup reserve

2015-09-08 Thread Qu Wenruo
Now fallocate will do accurate qgroup reserve space check, unlike old
method, which will always reserve the whole length of the range.

With this patch, fallocate will:
1) Iterate the desired range and mark in data rsv map
   Only range which is going to be allocated will be recorded in data
   rsv map and reserve the space.
   For already allocated range (normal/prealloc extent) they will be
   skipped.
   Also, record the marked range into a new list for later use.

2) If 1) succeeded, do real file extent allocate.
   And at file extent allocation time, corresponding range will be
   removed from the range in data rsv map.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/file.c | 147 +---
 1 file changed, 107 insertions(+), 40 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c1eec4f..26e59bc 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2545,17 +2545,61 @@ out_only_mutex:
return err;
 }
 
+/* Helper structure to record which range is already reserved */
+struct falloc_range {
+   struct list_head list;
+   u64 start;
+   u64 len;
+};
+
+/*
+ * Helper function to add falloc range
+ *
+ * Caller should have locked the larger range of extent containing
+ * [start, len)
+ */
+static int add_falloc_range(struct list_head *head, u64 start, u64 len)
+{
+   struct falloc_range *prev = NULL;
+   struct falloc_range *range = NULL;
+
+   if (list_empty(head))
+   goto insert;
+
+   /*
+* As fallocate iterate by bytenr order, we only need to check
+* the last range.
+*/
+   prev = list_entry(head->prev, struct falloc_range, list);
+   if (prev->start + prev->len == start) {
+   prev->len += len;
+   return 0;
+   }
+insert:
+   range = kmalloc(sizeof(*range), GFP_NOFS);
+   if (!range)
+   return -ENOMEM;
+   range->start = start;
+   range->len = len;
+   list_add_tail(>list, head);
+   return 0;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
loff_t offset, loff_t len)
 {
struct inode *inode = file_inode(file);
struct extent_state *cached_state = NULL;
+   struct falloc_range *range;
+   struct falloc_range *tmp;
+   struct list_head reserve_list;
u64 cur_offset;
u64 last_byte;
u64 alloc_start;
u64 alloc_end;
u64 alloc_hint = 0;
u64 locked_end;
+   u64 actual_end = 0;
struct extent_map *em;
int blocksize = BTRFS_I(inode)->root->sectorsize;
int ret;
@@ -2571,10 +2615,11 @@ static long btrfs_fallocate(struct file *file, int mode,
return btrfs_punch_hole(inode, offset, len);
 
/*
-* Make sure we have enough space before we do the
-* allocation.
+* Only trigger disk allocation, don't trigger qgroup reserve
+*
+* For qgroup space, it will be checked later.
 */
-   ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, 
alloc_end - alloc_start);
+   ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start);
if (ret)
return ret;
 
@@ -2583,6 +2628,13 @@ static long btrfs_fallocate(struct file *file, int mode,
if (ret)
goto out;
 
+   /*
+* TODO: Move these two operations after we have checked
+* accurate reserved space, or fallocate can still fail but
+* with page truncated or size expanded.
+*
+* But that's a minor problem and won't do much harm BTW.
+*/
if (alloc_start > inode->i_size) {
ret = btrfs_cont_expand(inode, i_size_read(inode),
alloc_start);
@@ -2641,10 +2693,10 @@ static long btrfs_fallocate(struct file *file, int mode,
}
}
 
+   /* First, check if we exceed the qgroup limit */
+   INIT_LIST_HEAD(_list);
cur_offset = alloc_start;
while (1) {
-   u64 actual_end;
-
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
  alloc_end - cur_offset, 0);
if (IS_ERR_OR_NULL(em)) {
@@ -2657,54 +2709,69 @@ static long btrfs_fallocate(struct file *file, int mode,
last_byte = min(extent_map_end(em), alloc_end);
actual_end = min_t(u64, extent_map_end(em), offset + len);
last_byte = ALIGN(last_byte, blocksize);
-
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
 !test_bit(EXTENT_FLAG_PREALLOC, >flags))) {
-   ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
-   last_byte - cur_offset,
-   1 << 

[PATCH 12/19] btrfs: qgroup: Use new metadata reservation.

2015-09-08 Thread Qu Wenruo
As we have the new metadata reservation functions, use them to replace
the old btrfs_qgroup_reserve() call for metadata.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/extent-tree.c | 14 ++
 fs/btrfs/transaction.c | 34 ++
 fs/btrfs/transaction.h |  1 -
 3 files changed, 12 insertions(+), 37 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 65e60eb..402415c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5343,7 +5343,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root 
*root,
if (root->fs_info->quota_enabled) {
/* One for parent inode, two for dir entries */
num_bytes = 3 * root->nodesize;
-   ret = btrfs_qgroup_reserve(root, num_bytes);
+   ret = btrfs_qgroup_reserve_meta(root, num_bytes);
if (ret)
return ret;
} else {
@@ -5361,10 +5361,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root 
*root,
if (ret == -ENOSPC && use_global_rsv)
ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
 
-   if (ret) {
-   if (*qgroup_reserved)
-   btrfs_qgroup_free(root, *qgroup_reserved);
-   }
+   if (ret && *qgroup_reserved)
+   btrfs_qgroup_free_meta(root, *qgroup_reserved);
 
return ret;
 }
@@ -5525,15 +5523,15 @@ int btrfs_delalloc_reserve_metadata(struct inode 
*inode, u64 num_bytes)
spin_unlock(_I(inode)->lock);
 
if (root->fs_info->quota_enabled) {
-   ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
+   ret = btrfs_qgroup_reserve_meta(root,
+   nr_extents * root->nodesize);
if (ret)
goto out_fail;
}
 
ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
if (unlikely(ret)) {
-   if (root->fs_info->quota_enabled)
-   btrfs_qgroup_free(root, nr_extents * root->nodesize);
+   btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
goto out_fail;
}
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 68ad89e..707e8ea 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -446,13 +446,10 @@ start_transaction(struct btrfs_root *root, u64 num_items, 
unsigned int type,
 * the appropriate flushing if need be.
 */
if (num_items > 0 && root != root->fs_info->chunk_root) {
-   if (root->fs_info->quota_enabled &&
-   is_fstree(root->root_key.objectid)) {
-   qgroup_reserved = num_items * root->nodesize;
-   ret = btrfs_qgroup_reserve(root, qgroup_reserved);
-   if (ret)
-   return ERR_PTR(ret);
-   }
+   qgroup_reserved = num_items * root->nodesize;
+   ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved);
+   if (ret)
+   return ERR_PTR(ret);
 
num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
/*
@@ -521,7 +518,6 @@ again:
h->block_rsv = NULL;
h->orig_rsv = NULL;
h->aborted = 0;
-   h->qgroup_reserved = 0;
h->delayed_ref_elem.seq = 0;
h->type = type;
h->allocating_chunk = false;
@@ -546,7 +542,6 @@ again:
h->bytes_reserved = num_bytes;
h->reloc_reserved = reloc_reserved;
}
-   h->qgroup_reserved = qgroup_reserved;
 
 got_it:
btrfs_record_root_in_trans(h, root);
@@ -564,8 +559,7 @@ alloc_fail:
btrfs_block_rsv_release(root, >fs_info->trans_block_rsv,
num_bytes);
 reserve_fail:
-   if (qgroup_reserved)
-   btrfs_qgroup_free(root, qgroup_reserved);
+   btrfs_qgroup_free_meta(root, qgroup_reserved);
return ERR_PTR(ret);
 }
 
@@ -782,15 +776,6 @@ static int __btrfs_end_transaction(struct 
btrfs_trans_handle *trans,
must_run_delayed_refs = 2;
}
 
-   if (trans->qgroup_reserved) {
-   /*
-* the same root has to be passed here between start_transaction
-* and end_transaction. Subvolume quota depends on this.
-*/
-   btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
-   trans->qgroup_reserved = 0;
-   }
-
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
 
@@ -1205,6 +1190,7 @@ static noinline int commit_fs_roots(struct 
btrfs_trans_handle *trans,
spin_lock(_info->fs_roots_radix_lock);
if (err)
break;
+   btrfs_qgroup_free_meta_all(root);
}
  

[PATCH 14/19] btrfs: Switch to new check_data_free_space

2015-09-08 Thread Qu Wenruo
Use new check_data_free_space for buffered write and inode cache.

For the buffered write case, as a nodatacow write won't increase the quota
account, unlike the old behavior which reserves before checking nocow, now we
check nocow first and then only reserve data if we can't do a nocow write.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/extent-tree.c |  2 +-
 fs/btrfs/file.c| 22 +-
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 61366ca..2e3f19e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3352,7 +3352,7 @@ again:
num_pages *= 16;
num_pages *= PAGE_CACHE_SIZE;
 
-   ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
+   ret = __btrfs_check_data_free_space(inode, 0, num_pages);
if (ret)
goto out_put;
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b823fac..c1eec4f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1510,12 +1510,17 @@ static noinline ssize_t __btrfs_buffered_write(struct 
file *file,
}
 
reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
-   ret = btrfs_check_data_free_space(inode, reserve_bytes, 
write_bytes);
-   if (ret == -ENOSPC &&
-   (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
- BTRFS_INODE_PREALLOC))) {
+
+   if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+BTRFS_INODE_PREALLOC)) {
ret = check_can_nocow(inode, pos, _bytes);
+   if (ret < 0)
+   break;
if (ret > 0) {
+   /*
+* For nodata cow case, no need to reserve
+* data space.
+*/
only_release_metadata = true;
/*
 * our prealloc extent may be smaller than
@@ -1524,15 +1529,14 @@ static noinline ssize_t __btrfs_buffered_write(struct 
file *file,
num_pages = DIV_ROUND_UP(write_bytes + offset,
 PAGE_CACHE_SIZE);
reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
-   ret = 0;
-   } else {
-   ret = -ENOSPC;
+   goto reserve_metadata;
}
}
-
-   if (ret)
+   ret = __btrfs_check_data_free_space(inode, pos, write_bytes);
+   if (ret < 0)
break;
 
+reserve_metadata:
ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
if (ret) {
if (!only_release_metadata)
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: trivial fix of __btrfs_set_acl error handling

2015-09-08 Thread Qu Wenruo

Sheng Yong wrote on 2015/09/08 08:46 +:

* If the allocation failed, don't free to free it, even though kfree
   allows to free a NULL pointer.
* If posix_acl_to_xattr() failed, cleanup the allocation and return
   the error directly.

So, what's the point?
For me, I didn't see the pros of the change.
As kfree() allow NULL pointer, why not use it?

Thanks,
Qu


Signed-off-by: Sheng Yong 
---
  fs/btrfs/acl.c | 14 +++---
  1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 9a0124a..6d01d09 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -103,18 +103,18 @@ static int __btrfs_set_acl(struct btrfs_trans_handle 
*trans,
if (acl) {
size = posix_acl_xattr_size(acl->a_count);
value = kmalloc(size, GFP_NOFS);
-   if (!value) {
-   ret = -ENOMEM;
-   goto out;
-   }
+   if (!value)
+   return -ENOMEM;

ret = posix_acl_to_xattr(_user_ns, acl, value, size);
-   if (ret < 0)
-   goto out;
+   if (ret < 0) {
+   kfree(value);
+   return ret;
+   }
}

ret = __btrfs_setxattr(trans, inode, name, value, size, 0);
-out:
+
kfree(value);

if (!ret)


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 09/19] btrfs: delayed_ref: Add new function to record reserved space into delayed ref

2015-09-08 Thread Qu Wenruo
Add new function btrfs_add_delayed_qgroup_reserve() function to record
how much space is reserved for that extent.

As btrfs only accounts qgroup at run_delayed_refs() time, newly
allocated extents should keep the reserved space until then.

So add needed function with related members to do it.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/delayed-ref.c | 29 +
 fs/btrfs/delayed-ref.h | 14 ++
 2 files changed, 43 insertions(+)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ac3e81d..bd9b63b 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -476,6 +476,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(_ref->ref_list);
head_ref->processing = 0;
head_ref->total_ref_mod = count_mod;
+   head_ref->qgroup_reserved = 0;
+   head_ref->qgroup_ref_root = 0;
 
/* Record qgroup extent info if provided */
if (qrecord) {
@@ -746,6 +748,33 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info 
*fs_info,
return 0;
 }
 
+int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
+struct btrfs_trans_handle *trans,
+u64 ref_root, u64 bytenr, u64 num_bytes)
+{
+   struct btrfs_delayed_ref_root *delayed_refs;
+   struct btrfs_delayed_ref_head *ref_head;
+   int ret = 0;
+
+   if (!fs_info->quota_enabled || !is_fstree(ref_root))
+   return 0;
+
+   delayed_refs = >transaction->delayed_refs;
+
+   spin_lock(_refs->lock);
+   ref_head = find_ref_head(_refs->href_root, bytenr, 0);
+   if (!ref_head) {
+   ret = -ENOENT;
+   goto out;
+   }
+   WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root);
+   ref_head->qgroup_ref_root = ref_root;
+   ref_head->qgroup_reserved = num_bytes;
+out:
+   spin_unlock(_refs->lock);
+   return ret;
+}
+
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 13fb5e6..d4c41e2 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -113,6 +113,17 @@ struct btrfs_delayed_ref_head {
int total_ref_mod;
 
/*
+* For qgroup reserved space freeing.
+*
+* ref_root and reserved will be recorded after
+* BTRFS_ADD_DELAYED_EXTENT is called.
+* And will be used to free reserved qgroup space at
+* run_delayed_refs() time.
+*/
+   u64 qgroup_ref_root;
+   u64 qgroup_reserved;
+
+   /*
 * when a new extent is allocated, it is just reserved in memory
 * The actual extent isn't inserted into the extent allocation tree
 * until the delayed ref is processed.  must_insert_reserved is
@@ -242,6 +253,9 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info 
*fs_info,
   u64 owner, u64 offset, int action,
   struct btrfs_delayed_extent_op *extent_op,
   int no_quota);
+int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
+struct btrfs_trans_handle *trans,
+u64 ref_root, u64 bytenr, u64 num_bytes);
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 11/19] btrfs: qgroup: Introduce new functions to reserve/free metadata

2015-09-08 Thread Qu Wenruo
Introduce new functions btrfs_qgroup_reserve/free_meta() to reserve/free
metadata reserved space.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/ctree.h   |  3 +++
 fs/btrfs/disk-io.c |  1 +
 fs/btrfs/qgroup.c  | 40 
 fs/btrfs/qgroup.h  |  4 
 4 files changed, 48 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 938efe3..ae86025 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1943,6 +1943,9 @@ struct btrfs_root {
int send_in_progress;
struct btrfs_subvolume_writers *subv_writers;
atomic_t will_be_snapshoted;
+
+   /* For qgroup metadata space reserve */
+   atomic_t qgroup_meta_rsv;
 };
 
 struct btrfs_ioctl_defrag_range_args {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0b658d0..704d212 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1259,6 +1259,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, 
u32 stripesize,
atomic_set(>orphan_inodes, 0);
atomic_set(>refs, 1);
atomic_set(>will_be_snapshoted, 0);
+   atomic_set(>qgroup_meta_rsv, 0);
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 5a69a2d..b759e96 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3102,3 +3102,43 @@ void btrfs_qgroup_free_data_rsv_map(struct inode *inode)
kfree(dirty_map);
binode->qgroup_rsv_map = NULL;
 }
+
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes)
+{
+   int ret;
+
+   if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
+   num_bytes == 0)
+   return 0;
+
+   BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+   ret = btrfs_qgroup_reserve(root, num_bytes);
+   if (ret < 0)
+   return ret;
+   atomic_add(num_bytes, >qgroup_meta_rsv);
+   return ret;
+}
+
+void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
+{
+   int reserved;
+
+   if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+   return;
+
+   reserved = atomic_xchg(>qgroup_meta_rsv, 0);
+   if (reserved == 0)
+   return;
+   btrfs_qgroup_free(root, reserved);
+}
+
+void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
+{
+   if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+   return;
+
+   BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+   WARN_ON(atomic_read(>qgroup_meta_rsv) < num_bytes);
+   atomic_sub(num_bytes, >qgroup_meta_rsv);
+   btrfs_qgroup_free(root, num_bytes);
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 49fa15e..2d507c8 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -96,4 +96,8 @@ void btrfs_qgroup_free_data_rsv_map(struct inode *inode);
 int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len);
 int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
 int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len);
+
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes);
+void btrfs_qgroup_free_meta_all(struct btrfs_root *root);
+void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes);
 #endif /* __BTRFS_QGROUP__ */
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 10/19] btrfs: delayed_ref: release and free qgroup reserved at proper timing

2015-09-08 Thread Qu Wenruo
Qgroup reserved space needs to be released from inode dirty map and get
freed at different timing:

1) Release when the metadata is written into tree
After corresponding metadata is written into tree, any newer write will
be COWed(don't include NOCOW case yet).
So we must release its range from the inode dirty range map, or we will
forget to reserve the needed range, causing accounting to exceed the limit.

2) Free reserved bytes when delayed ref is run
When delayed refs are run, qgroup accounting will follow soon and turn
the reserved bytes into rfer/excl numbers.
As run_delayed_refs and qgroup accounting are all done at
commit_transaction() time, we are safe to free reserved space at
run_delayed_refs() time.

With these timing to release/free reserved space, we should be able to
resolve the long existing qgroup reserve space leak problem.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/extent-tree.c |  4 
 fs/btrfs/inode.c   | 10 ++
 fs/btrfs/qgroup.c  |  5 ++---
 fs/btrfs/qgroup.h  |  8 +++-
 4 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5411f0a..65e60eb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2345,6 +2345,10 @@ static int run_one_delayed_ref(struct btrfs_trans_handle 
*trans,
  node->num_bytes);
}
}
+
+   /* Also free its reserved qgroup space */
+   btrfs_qgroup_free_refroot(root->fs_info, head->qgroup_ref_root,
+ head->qgroup_reserved);
return ret;
}
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 61b2c17..1f7cac0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2112,6 +2112,16 @@ static int insert_reserved_file_extent(struct 
btrfs_trans_handle *trans,
ret = btrfs_alloc_reserved_file_extent(trans, root,
root->root_key.objectid,
btrfs_ino(inode), file_pos, );
+   if (ret < 0)
+   goto out;
+   /*
+* Release the reserved range from inode dirty range map, and
+* move it to delayed ref codes, as now accounting only happens at
+* commit_transaction() time.
+*/
+   btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
+   ret = btrfs_add_delayed_qgroup_reserve(root->fs_info, trans,
+   root->objectid, disk_bytenr, ram_bytes);
 out:
btrfs_free_path(path);
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index ba7888f..5a69a2d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2169,14 +2169,13 @@ out:
return ret;
 }
 
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+  u64 ref_root, u64 num_bytes)
 {
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
-   struct btrfs_fs_info *fs_info = root->fs_info;
struct ulist_node *unode;
struct ulist_iterator uiter;
-   u64 ref_root = root->root_key.objectid;
int ret = 0;
 
if (!is_fstree(ref_root))
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 8e69dc1..49fa15e 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -75,7 +75,13 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
 struct btrfs_qgroup_inherit *inherit);
 int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+  u64 ref_root, u64 num_bytes);
+static inline void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+{
+   return btrfs_qgroup_free_refroot(root->fs_info, root->objectid,
+num_bytes);
+}
 
 void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
 
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 17/19] btrfs: extent-tree: Use new __btrfs_delalloc_reserve_space function

2015-09-08 Thread Qu Wenruo
Use new __btrfs_delalloc_reserve_space to reserve space.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/inode-map.c |  2 +-
 fs/btrfs/inode.c | 16 ++--
 fs/btrfs/ioctl.c |  5 +++--
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index d4a582a..ab639d3 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -488,7 +488,7 @@ again:
/* Just to make sure we have enough space */
prealloc += 8 * PAGE_CACHE_SIZE;
 
-   ret = btrfs_delalloc_reserve_space(inode, prealloc);
+   ret = __btrfs_delalloc_reserve_space(inode, 0, prealloc);
if (ret)
goto out_put;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1f7cac0..d70cb26 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1985,7 +1985,8 @@ again:
goto again;
}
 
-   ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+   ret = __btrfs_delalloc_reserve_space(inode, page_start,
+PAGE_CACHE_SIZE);
if (ret) {
mapping_set_error(page->mapping, ret);
end_extent_writepage(page, ret, page_start, page_end);
@@ -4581,7 +4582,8 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, 
loff_t len,
if ((offset & (blocksize - 1)) == 0 &&
(!len || ((len & (blocksize - 1)) == 0)))
goto out;
-   ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+   ret = __btrfs_delalloc_reserve_space(inode,
+   round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE);
if (ret)
goto out;
 
@@ -8373,7 +8375,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct 
iov_iter *iter,
mutex_unlock(>i_mutex);
relock = true;
}
-   ret = btrfs_delalloc_reserve_space(inode, count);
+   ret = __btrfs_delalloc_reserve_space(inode, offset, count);
if (ret)
goto out;
outstanding_extents = div64_u64(count +
@@ -8620,7 +8622,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, 
struct vm_fault *vmf)
u64 page_end;
 
sb_start_pagefault(inode->i_sb);
-   ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+   page_start = page_offset(page);
+   page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+   ret = __btrfs_delalloc_reserve_space(inode, page_start,
+PAGE_CACHE_SIZE);
if (!ret) {
ret = file_update_time(vma->vm_file);
reserved = 1;
@@ -8639,8 +8645,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct 
vm_fault *vmf)
 again:
lock_page(page);
size = i_size_read(inode);
-   page_start = page_offset(page);
-   page_end = page_start + PAGE_CACHE_SIZE - 1;
 
if ((page->mapping != inode->i_mapping) ||
(page_start >= size)) {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0adf542..e0291fc 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1119,8 +1119,9 @@ static int cluster_pages_for_defrag(struct inode *inode,
 
page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
 
-   ret = btrfs_delalloc_reserve_space(inode,
-  page_cnt << PAGE_CACHE_SHIFT);
+   ret = __btrfs_delalloc_reserve_space(inode,
+   start_index << PAGE_CACHE_SHIFT,
+   page_cnt << PAGE_CACHE_SHIFT);
if (ret)
return ret;
i_done = 0;
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 19/19] btrfs: qgroup: Add handler for NOCOW and inline

2015-09-08 Thread Qu Wenruo
For the NOCOW and inline cases, there will be no delayed_ref created for
them, so we should free their reserved data space at the proper
time (finish_ordered_io for NOCOW and cow_file_range_inline for inline).

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/extent-tree.c |  7 ++-
 fs/btrfs/inode.c   | 15 +++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ab1b1a1..ca15bd3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4055,7 +4055,12 @@ int btrfs_check_data_free_space(struct inode *inode, u64 
start, u64 len)
if (ret < 0)
return ret;
 
-   /* Use new btrfs_qgroup_reserve_data to reserve precious data space */
+   /*
+* Use new btrfs_qgroup_reserve_data to reserve precious data space
+*
+* TODO: Find a good method to avoid reserve data space for NOCOW
+* range, but don't impact performance on quota disable case.
+*/
ret = btrfs_qgroup_reserve_data(inode, start, len);
return ret;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8c09197..9b783e6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -310,6 +310,13 @@ static noinline int cow_file_range_inline(struct 
btrfs_root *root,
btrfs_delalloc_release_metadata(inode, end + 1 - start);
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 out:
+   /*
+* Don't forget to free the reserved space, as for inlined extent
+* it won't count as data extent, free them directly here.
+* And at reserve time, it's always aligned to page size, so
+* just free one page here.
+*/
+   btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
btrfs_free_path(path);
btrfs_end_transaction(trans, root);
return ret;
@@ -2831,6 +2838,14 @@ static int btrfs_finish_ordered_io(struct 
btrfs_ordered_extent *ordered_extent)
 
if (test_bit(BTRFS_ORDERED_NOCOW, _extent->flags)) {
BUG_ON(!list_empty(_extent->list)); /* Logic error */
+
+   /*
+* For mwrite(mmap + memset to write) case, we still reserve
+* space for NOCOW range.
+* As NOCOW won't cause a new delayed ref, just free the space
+*/
+   btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
+  ordered_extent->len);
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
if (nolock)
trans = btrfs_join_transaction_nolock(root);
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 18/19] btrfs: qgroup: Cleanup old inaccurate facilities

2015-09-08 Thread Qu Wenruo
Cleanup the old facilities which use old btrfs_qgroup_reserve() function
call, replace them with the newer version, and remove the "__" prefix in
them.

Also, make btrfs_qgroup_reserve/free() functions private, as they are
now only used inside qgroup codes.

Now, the whole btrfs qgroup is switched to use the new reserve facilities.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/ctree.h   |  6 ++
 fs/btrfs/extent-tree.c | 56 --
 fs/btrfs/file.c|  2 +-
 fs/btrfs/inode-map.c   |  2 +-
 fs/btrfs/inode.c   | 12 +--
 fs/btrfs/ioctl.c   |  2 +-
 fs/btrfs/qgroup.c  | 19 ++---
 fs/btrfs/qgroup.h  |  7 ---
 8 files changed, 27 insertions(+), 79 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 12f14fd..8489419 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3452,8 +3452,7 @@ enum btrfs_reserve_flush_enum {
BTRFS_RESERVE_FLUSH_ALL,
 };
 
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 
write_bytes);
-int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
 int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -3471,8 +3470,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root 
*root,
  u64 qgroup_reserved);
 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
-int __btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 07f45b7..ab1b1a1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3352,7 +3352,7 @@ again:
num_pages *= 16;
num_pages *= PAGE_CACHE_SIZE;
 
-   ret = __btrfs_check_data_free_space(inode, 0, num_pages);
+   ret = btrfs_check_data_free_space(inode, 0, num_pages);
if (ret)
goto out_put;
 
@@ -4037,27 +4037,11 @@ commit_trans:
 }
 
 /*
- * This will check the space that the inode allocates from to make sure we have
- * enough space for bytes.
- */
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 
write_bytes)
-{
-   struct btrfs_root *root = BTRFS_I(inode)->root;
-   int ret;
-
-   ret = btrfs_alloc_data_chunk_ondemand(inode, bytes);
-   if (ret < 0)
-   return ret;
-   ret = btrfs_qgroup_reserve(root, write_bytes);
-   return ret;
-}
-
-/*
  * New check_data_free_space() with ability for precise data reservation
  * Will replace old btrfs_check_data_free_space(), but for patch split,
  * add a new function first and then replace it.
  */
-int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
 {
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
@@ -5710,11 +5694,11 @@ void btrfs_delalloc_release_metadata(struct inode 
*inode, u64 num_bytes)
  * Return 0 for success
  * Return <0 for error(-ENOSPC or -EQUOT)
  */
-int __btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
 {
int ret;
 
-   ret = __btrfs_check_data_free_space(inode, start, len);
+   ret = btrfs_check_data_free_space(inode, start, len);
if (ret < 0)
return ret;
ret = btrfs_delalloc_reserve_metadata(inode, len);
@@ -5724,38 +5708,6 @@ int __btrfs_delalloc_reserve_space(struct inode *inode, 
u64 start, u64 len)
 }
 
 /**
- * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
- * @inode: inode we're writing to
- * @num_bytes: the number of bytes we want to allocate
- *
- * This will do the following things
- *
- * o reserve space in the data space info for num_bytes
- * o reserve space in the metadata space info based on number of outstanding
- *   extents and how much csums will be needed
- * o add to the inodes ->delalloc_bytes
- * o add it to the fs_info's delalloc inodes list.
- *
- * This will return 0 for success and -ENOSPC if there is no space left.
- */
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
-{
-   int ret;
-
-   ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
-   if (ret)

[PATCH 16/19] btrfs: extent-tree: Add new version of btrfs_delalloc_reserve_space

2015-09-08 Thread Qu Wenruo
Add new version of btrfs_delalloc_reserve_space() function, which
supports accurate qgroup reserve.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/ctree.h   |  1 +
 fs/btrfs/extent-tree.c | 38 ++
 2 files changed, 39 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c1a0aaf..12f14fd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3472,6 +3472,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root 
*root,
 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
+int __btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2e3f19e..07f45b7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5686,6 +5686,44 @@ void btrfs_delalloc_release_metadata(struct inode 
*inode, u64 num_bytes)
 }
 
 /**
+ * __btrfs_delalloc_reserve_space - reserve data and metadata space for
+ * delalloc
+ * @inode: inode we're writing to
+ * @start: start range we are writing to
+ * @len: how long the range we are writing to
+ *
+ * TODO: This function will finally replace old btrfs_delalloc_reserve_space()
+ *
+ * This will do the following things
+ *
+ * o reserve space in data space info for num bytes
+ *   and reserve precious corresponding qgroup space
+ *   (Done in check_data_free_space)
+ *
+ * o reserve space for metadata space, based on the number of outstanding
+ *   extents and how much csums will be needed
+ *   also reserve metadata space in a per root over-reserve method.
+ * o add to the inodes->delalloc_bytes
+ * o add it to the fs_info's delalloc inodes list.
+ *   (Above 3 all done in delalloc_reserve_metadata)
+ *
+ * Return 0 for success
+ * Return <0 for error(-ENOSPC or -EQUOT)
+ */
+int __btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
+{
+   int ret;
+
+   ret = __btrfs_check_data_free_space(inode, start, len);
+   if (ret < 0)
+   return ret;
+   ret = btrfs_delalloc_reserve_metadata(inode, len);
+   if (ret < 0)
+   btrfs_free_reserved_data_space(inode, len);
+   return ret;
+}
+
+/**
  * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
  * @inode: inode we're writing to
  * @num_bytes: the number of bytes we want to allocate
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 04/19] btrfs: qgroup: Introduce function to insert non-overlap reserve range

2015-09-08 Thread Qu Wenruo
New function insert_data_ranges() will insert non-overlap reserve ranges
into reserve map.

It provides the basis for later qgroup reserve map implement.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/qgroup.c | 124 ++
 1 file changed, 124 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index fc24fc3..a4e3af4 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2577,6 +2577,130 @@ find_reserve_range(struct btrfs_qgroup_data_rsv_map 
*map, u64 start)
 }
 
 /*
+ * Insert one data range
+ * [start,len) here won't overlap with each other.
+ *
+ * Return 0 if range is inserted and tmp is not used.
+ * Return > 0 if range is inserted and tmp is used.
+ * No catchable error case. The only possible error will cause BUG_ON(), as
+ * that would be a logical error.
+ */
+static int insert_data_range(struct btrfs_qgroup_data_rsv_map *map,
+struct data_rsv_range *tmp,
+u64 start, u64 len)
+{
+   struct rb_node **p = >root.rb_node;
+   struct rb_node *parent = NULL;
+   struct rb_node *tmp_node = NULL;
+   struct data_rsv_range *range = NULL;
+   struct data_rsv_range *prev_range = NULL;
+   struct data_rsv_range *next_range = NULL;
+   int prev_merged = 0;
+   int next_merged = 0;
+   int ret = 0;
+
+   while (*p) {
+   parent = *p;
+   range = rb_entry(parent, struct data_rsv_range, node);
+   if (range->start < start)
+   p = &(*p)->rb_right;
+   else if (range->start > start)
+   p = &(*p)->rb_left;
+   else
+   BUG_ON(1);
+   }
+
+   /* Empty tree, goto isolated case */
+   if (!range)
+   goto insert_isolated;
+
+   /* get adjacent ranges */
+   if (range->start < start) {
+   prev_range = range;
+   tmp_node = rb_next(parent);
+   if (tmp)
+   next_range = rb_entry(tmp_node, struct data_rsv_range,
+ node);
+   } else {
+   next_range = range;
+   tmp_node = rb_prev(parent);
+   if (tmp)
+   prev_range = rb_entry(tmp_node, struct data_rsv_range,
+ node);
+   }
+
+   /* try to merge with previous and next ranges */
+   if (prev_range && prev_range->start + prev_range->len == start) {
+   prev_merged = 1;
+   prev_range->len += len;
+   }
+   if (next_range && start + len == next_range->start) {
+   next_merged = 1;
+
+   /*
+* the range can be merged with adjusted two ranges into one,
+* remove the tailing range.
+*/
+   if (prev_merged) {
+   prev_range->len += next_range->len;
+   rb_erase(_range->node, >root);
+   kfree(next_range);
+   } else {
+   next_range->start = start;
+   next_range->len += len;
+   }
+   }
+
+insert_isolated:
+   /* isolated case, need to insert range now */
+   if (!next_merged && !prev_merged) {
+   BUG_ON(!tmp);
+
+   tmp->start = start;
+   tmp->len = len;
+   rb_link_node(>node, parent, p);
+   rb_insert_color(>node, >root);
+   ret = 1;
+   }
+   return ret;
+}
+
+/*
+ * insert reserve range and merge them if possible
+ *
+ * Return 0 if all inserted and tmp not used
+ * Return > 0 if all inserted and tmp used
+ * No catchable error return value.
+ */
+static int insert_data_ranges(struct btrfs_qgroup_data_rsv_map *map,
+ struct data_rsv_range *tmp,
+ struct ulist *insert_list)
+{
+   struct ulist_node *unode;
+   struct ulist_iterator uiter;
+   int tmp_used = 0;
+   int ret = 0;
+
+   ULIST_ITER_INIT();
+   while ((unode = ulist_next(insert_list, ))) {
+   ret = insert_data_range(map, tmp, unode->val, unode->aux);
+
+   /*
+* insert_data_range() won't return an error value,
+* so no need to handle the <0 case.
+*
+* Also tmp should be used at most one time, so clear it to
+* NULL to cooperate with sanity check in insert_data_range().
+*/
+   if (ret > 0) {
+   tmp_used = 1;
+   tmp = NULL;
+   }
+   }
+   return tmp_used;
+}
+
+/*
  * Init data_rsv_map for a given inode.
  *
  * This is needed at write time as quota can be disabled and then enabled
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a 

[PATCH 06/19] btrfs: qgroup: Introduce btrfs_qgroup_reserve_data function

2015-09-08 Thread Qu Wenruo
This new function will do all the hard work to reserve precious space
for a write.

The overall work flow will be the following.

File A already has some dirty pages:

0   4K  8K  12K 16K
|///|   |///|

And then, someone want to write some data into range [4K, 16K).
|<--desired>|

Unlike the old and wrong implementation, which reserved 12K, this function
will only reserve space for the newly dirtied part:
|\\\|   |\\\|
Which only takes 8K reserve space, as other part has already allocated
their own reserve space.

So the final reserve map will be:
|///|

This provides the basis to resolve the long existing qgroup limit bug.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/qgroup.c | 57 +++
 fs/btrfs/qgroup.h |  1 +
 2 files changed, 58 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 77a2e07..337b784 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2793,6 +2793,63 @@ insert:
 }
 
 /*
+ * Make sure the data space for [start, start + len) is reserved.
+ * It will either reserve new space from given qgroup or reuse the already
+ * reserved space.
+ *
+ * Return 0 for successful reserve.
+ * Return <0 for error.
+ *
+ * TODO: to handle nocow case, like NODATACOW or write into prealloc space
+ * along with other mixed case.
+ * Like write 2M, first 1M can be nocowed, but next 1M is on hole and need COW.
+ */
+int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len)
+{
+   struct btrfs_inode *binode = BTRFS_I(inode);
+   struct btrfs_root *root = binode->root;
+   struct btrfs_qgroup_data_rsv_map *reserve_map;
+   struct data_rsv_range *tmp = NULL;
+   struct ulist *insert_list;
+   int ret;
+
+   if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
+   len == 0)
+   return 0;
+
+   if (!binode->qgroup_rsv_map) {
+   ret = btrfs_qgroup_init_data_rsv_map(inode);
+   if (ret < 0)
+   return ret;
+   }
+   reserve_map = binode->qgroup_rsv_map;
+   insert_list = ulist_alloc(GFP_NOFS);
+   if (!insert_list)
+   return -ENOMEM;
+   tmp = kzalloc(sizeof(*tmp), GFP_NOFS);
+   if (!tmp) {
+   ulist_free(insert_list);
+   return -ENOMEM;
+   }
+
+   spin_lock(_map->lock);
+   ret = reserve_data_range(root, reserve_map, tmp, insert_list, start,
+len);
+   /*
+* For the error and already-exists cases, free the tmp memory.
+* For the tmp-used case, set ret to 0, as some careless
+* callers consider >0 an error.
+*/
+   if (ret <= 0)
+   kfree(tmp);
+   else
+   ret = 0;
+   spin_unlock(_map->lock);
+   ulist_free(insert_list);
+   return ret;
+}
+
+/*
  * Init data_rsv_map for a given inode.
  *
  * This is needed at write time as quota can be disabled and then enabled
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index c87b7dc..366b853 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -87,4 +87,5 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, 
u64 qgroupid,
 /* for qgroup reserve */
 int btrfs_qgroup_init_data_rsv_map(struct inode *inode);
 void btrfs_qgroup_free_data_rsv_map(struct inode *inode);
+int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len);
 #endif /* __BTRFS_QGROUP__ */
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 07/19] btrfs: qgroup: Introduce function to release reserved range

2015-09-08 Thread Qu Wenruo
Introduce new function release_data_range() to release reserved ranges.
It will iterate through all existing ranges and remove/shrink them.

Note this function will not free reserved space, as the range can be
released in the following conditions:
1) The dirty range gets written to disk.
   In this case, reserved range will be released but reserved bytes
   will not be freed until the delayed_ref is run.

2) Truncate
   In this case, dirty ranges will be released and reserved bytes will
   also be freed.

So the new function won't free reserved space, but record them into
parameter if called needs.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/qgroup.c | 130 ++
 1 file changed, 130 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 337b784..e24c10d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2849,6 +2849,136 @@ int btrfs_qgroup_reserve_data(struct inode *inode, u64 
start, u64 len)
return ret;
 }
 
+/* Small helper used in release_data_range() to update rsv map */
+static inline void __update_rsv(struct btrfs_qgroup_data_rsv_map *map,
+   u64 *reserved, u64 cur_rsv)
+{
+   if (reserved)
+   *reserved += cur_rsv;
+   if (WARN_ON(map->reserved < cur_rsv))
+   map->reserved = 0;
+   else
+   map->reserved -= cur_rsv;
+}
+
+/*
+ * Release the range [start, start + len) from rsv map.
+ *
+ * The behavior should be much like reserve_data_range().
+ * @tmp: the allocated memory for case which need to split existing
+ *   range into two.
+ * @reserved: the number of bytes that may need to be freed
+ * Return > 0 if 'tmp' memory is used and release range successfully
+ * Return 0 if 'tmp' memory is not used and release range successfully
+ * Return < 0 for error
+ */
+static int release_data_range(struct btrfs_qgroup_data_rsv_map *map,
+ struct data_rsv_range *tmp,
+ u64 start, u64 len, u64 *reserved)
+{
+   struct data_rsv_range *range;
+   u64 cur_rsv = 0;
+   int ret = 0;
+
+   range = find_reserve_range(map, start);
+   /* empty tree, just return */
+   if (!range)
+   return 0;
+   /*
+* For split case
+*  ||
+* ||
+* In this case, we need to insert one new range.
+*/
+   if (range->start < start && range->start + range->len > start + len) {
+   u64 new_start = start + len;
+   u64 new_len = range->start + range->len - start - len;
+
+   cur_rsv = len;
+   if (reserved)
+   *reserved += cur_rsv;
+   map->reserved -= cur_rsv;
+
+   range->len = start - range->start;
+   ret = insert_data_range(map, tmp, new_start, new_len);
+   WARN_ON(ret <= 0);
+   return 1;
+   }
+
+   /*
+* Iterate until the end of the range and release all
+* reserved data from the map.
+* We iterate by existing range, as that makes the code a
+* little cleaner.
+*
+*  |<-desired>|
+* |//1//|  |//2//| |//3//| |//4//|
+*/
+   while (range->start < start + len) {
+   struct rb_node *next = NULL;
+   int range_freed = 0;
+
+   /*
+*  |<---desired>|
+* |///|
+*/
+   if (unlikely(range->start + range->len <= start))
+   goto next;
+
+   /*
+*  ||
+* |///|
+*/
+   if (range->start < start &&
+   range->start + range->len > start) {
+   cur_rsv = range->start + range->len - start;
+
+   range->len = start - range->start;
+   goto next;
+   }
+
+   /*
+*  |<--desired-->|
+*  |/|
+* Including same start/end case, so other case don't need
+* to check start/end equal case and don't need bother
+* deleting range.
+*/
+   if (range->start >= start &&
+   range->start + range->len <= start + len) {
+   cur_rsv = range->len;
+
+   range_freed = 1;
+   next = rb_next(>node);
+   rb_erase(>node, >root);
+   kfree(range);
+   goto next;
+
+   }
+
+   /*
+*  |<--desired-->|
+*|///|
+*/
+   if (range->start < 

[PATCH RFC 00/14] Accurate qgroup reserve framework

2015-09-08 Thread Qu Wenruo
[[BUG]]
One of the most common case to trigger the bug is the following method:
1) Enable quota
2) Limit excl of qgroup 5 to 16M
3) Write [0,2M) of a file inside subvol 5 10 times without sync

EDQUOT will be triggered at about the 8th write.

[[CAUSE]]
The problem is caused by the fact that qgroup will reserve space even
the data space is already reserved.

In above reproducer, each time we buffered write [0,2M) qgroup will
reserve 2M space, but in fact, at the 1st time, we have already reserved
2M and from then on, we don't need to reserve any data space as we are
only writing [0,2M).

Also, the reserved space will only be freed *ONCE* when its backref is
run at commit_transaction() time.

That's causing the reserved space leaking.

[[FIX]]
The fix is not a simple one, as currently btrfs_qgroup_reserve() follow
the very bad btrfs space allocating principle:
  Allocate as much as you need, even if it's not fully used.

So for accurate qgroup reserve, we introduce a completely new framework
for data and metadata.
1) Per-inode data reserve map
   Now, each inode will have a data reserve map, recording which range
   of data is already reserved.
   If we are writing a range which is already reserved, we won't need to
   reserve space again.

   Also, for the fact that qgroup is only accounted at commit_trans(),
   for data commit into disc and its metadata is also inserted into
   current tree, we should free the data reserved range, but still keep
   the reserved space until commit_trans().

   So delayed_ref_head will have new members to record how much space is
   reserved and free them at commit_trans() time.

2) Per-root metadata reserve counter
   For metadata(tree block), it's impossible to know how much space it
   will use exactly in advance.
   And due to the new qgroup accounting framework, the old
   free-at-end-trans may lead to exceeding limit.

   So we record how much metadata space is reserved for each root, and
   free them at commit_trans() time.
   This method is not perfect, but thanks to the compared small size of
   metadata, it should be quite good.

More detailed info can be found in each commit message and in the
source code comments.

Qu Wenruo (19):
  btrfs: qgroup: New function declaration for new reserve implement
  btrfs: qgroup: Implement data_rsv_map init/free functions
  btrfs: qgroup: Introduce new function to search most left reserve
range
  btrfs: qgroup: Introduce function to insert non-overlap reserve range
  btrfs: qgroup: Introduce function to reserve data range per inode
  btrfs: qgroup: Introduce btrfs_qgroup_reserve_data function
  btrfs: qgroup: Introduce function to release reserved range
  btrfs: qgroup: Introduce function to release/free reserved data range
  btrfs: delayed_ref: Add new function to record reserved space into
delayed ref
  btrfs: delayed_ref: release and free qgroup reserved at proper timing
  btrfs: qgroup: Introduce new functions to reserve/free metadata
  btrfs: qgroup: Use new metadata reservation.
  btrfs: extent-tree: Add new verions of btrfs_check_data_free_space
  btrfs: Switch to new check_data_free_space
  btrfs: fallocate: Add support to accurate qgroup reserve
  btrfs: extent-tree: Add new version of btrfs_delalloc_reserve_space
  btrfs: extent-tree: Use new __btrfs_delalloc_reserve_space function
  btrfs: qgroup: Cleanup old inaccurate facilities
  btrfs: qgroup: Add handler for NOCOW and inline

 fs/btrfs/btrfs_inode.h |   6 +
 fs/btrfs/ctree.h   |   8 +-
 fs/btrfs/delayed-ref.c |  29 +++
 fs/btrfs/delayed-ref.h |  14 +
 fs/btrfs/disk-io.c |   1 +
 fs/btrfs/extent-tree.c |  99 +---
 fs/btrfs/file.c| 169 +
 fs/btrfs/inode-map.c   |   2 +-
 fs/btrfs/inode.c   |  51 +++-
 fs/btrfs/ioctl.c   |   3 +-
 fs/btrfs/qgroup.c  | 674 -
 fs/btrfs/qgroup.h  |  18 +-
 fs/btrfs/transaction.c |  34 +--
 fs/btrfs/transaction.h |   1 -
 14 files changed, 979 insertions(+), 130 deletions(-)

-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 01/19] btrfs: qgroup: New function declaration for new reserve implement

2015-09-08 Thread Qu Wenruo
Add new structures and functions for the dirty phase of the new qgroup
reserve implementation, which focuses on avoiding over-reserving: for a
dirty space range that is already reserved, we won't reserve space
again.

This patch adds the needed structure declaration and comments.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/btrfs_inode.h |  4 
 fs/btrfs/qgroup.c  | 58 ++
 fs/btrfs/qgroup.h  |  3 +++
 3 files changed, 65 insertions(+)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 81220b2..e3ece65 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -24,6 +24,7 @@
 #include "extent_io.h"
 #include "ordered-data.h"
 #include "delayed-inode.h"
+#include "qgroup.h"
 
 /*
  * ordered_data_close is set by truncate when a file that used
@@ -195,6 +196,9 @@ struct btrfs_inode {
struct timespec i_otime;
 
struct inode vfs_inode;
+
+   /* qgroup dirty map for data space reserve */
+   struct btrfs_qgroup_data_rsv_map *qgroup_rsv_map;
 };
 
 extern unsigned char btrfs_filetype_table[];
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index e9ace09..561c36d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -91,6 +91,64 @@ struct btrfs_qgroup {
u64 new_refcnt;
 };
 
+/*
+ * Record one range of reserved space.
+ */
+struct data_rsv_range {
+   struct rb_node node;
+   u64 start;
+   u64 len;
+};
+
+/*
+ * Record per inode reserved range.
+ * This is mainly used to resolve reserved space leaking problem.
+ * One of the cause is the mismatch with reserve and free.
+ *
+ * New qgroup will handle reserve in two phase.
+ * 1) Dirty phase.
+ *Pages are just marked dirty, but not written to disk.
+ * 2) Flushed phase
+ *Pages are written to disk, but transaction is not committed yet.
+ *
+ * At Dirty phase, we only need to focus on avoiding over-reserve.
+ *
+ * The idea is like below.
+ * 1) Write [0,8K)
+ * 0   4K  8K  12K 16K
+ * ||
+ * Reserve +8K, total reserved: 8K
+ *
+ * 2) Write [0,4K)
+ * 0   4K  8K  12K 16K
+ * ||
+ * Reserve 0, total reserved 8K
+ *
+ * 3) Write [12K,16K)
+ * 0   4K  8K  12K 16K
+ * ||  |///|
+ * Reserve +4K, total reserved 12K
+ *
+ * 4) Flush [0,8K)
+ * Can happen without commit transaction, like fallocate will trigger the
+ * write.
+ * 0   4K  8K  12K 16K
+ * |///|
+ * Reserve 0, total reserved 12K
+ * As the extent is written to disk, not dirty any longer, the range get
+ * removed.
+ * But as its delayed_refs is not run, its reserved space will not be freed.
+ * And things continue to Flushed phase.
+ *
+ * By this method, we can avoid over-reserve, which will lead to reserved
+ * space leak.
+ */
+struct btrfs_qgroup_data_rsv_map {
+   struct rb_root root;
+   u64 reserved;
+   spinlock_t lock;
+};
+
 static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
   int mod)
 {
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 6387dcf..2f863a4 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -33,6 +33,9 @@ struct btrfs_qgroup_extent_record {
struct ulist *old_roots;
 };
 
+/* For per-inode dirty range reserve */
+struct btrfs_qgroup_data_rsv_map;
+
 int btrfs_quota_enable(struct btrfs_trans_handle *trans,
   struct btrfs_fs_info *fs_info);
 int btrfs_quota_disable(struct btrfs_trans_handle *trans,
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 02/19] btrfs: qgroup: Implement data_rsv_map init/free functions

2015-09-08 Thread Qu Wenruo
New functions btrfs_qgroup_init/free_data_rsv_map() to init/free data
reserve map.

Data reserve map is used to mark which range already holds reserved
space, to avoid current reserved space leak.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/btrfs_inode.h |  2 ++
 fs/btrfs/inode.c   | 10 +++
 fs/btrfs/qgroup.c  | 77 ++
 fs/btrfs/qgroup.h  |  3 ++
 4 files changed, 92 insertions(+)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index e3ece65..27cc338 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -199,6 +199,8 @@ struct btrfs_inode {
 
/* qgroup dirty map for data space reserve */
struct btrfs_qgroup_data_rsv_map *qgroup_rsv_map;
+   /* lock to ensure rsv_map will only be initialized once */
+   spinlock_t qgroup_init_lock;
 };
 
 extern unsigned char btrfs_filetype_table[];
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 37dd8d0..61b2c17 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8939,6 +8939,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(>delalloc_inodes);
RB_CLEAR_NODE(>rb_node);
 
+   /*
+* Init qgroup info to empty, as they will be initialized at write
+* time.
+* This behavior is needed for enable quota later case.
+*/
+   spin_lock_init(>qgroup_init_lock);
+   ei->qgroup_rsv_map = NULL;
+
return inode;
 }
 
@@ -8996,6 +9004,8 @@ void btrfs_destroy_inode(struct inode *inode)
btrfs_put_ordered_extent(ordered);
}
}
+   /* free and check data rsv map */
+   btrfs_qgroup_free_data_rsv_map(inode);
inode_tree_del(inode);
btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
 free:
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 561c36d..cf07c17 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2539,3 +2539,80 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
btrfs_queue_work(fs_info->qgroup_rescan_workers,
 _info->qgroup_rescan_work);
 }
+
+/*
+ * Init data_rsv_map for a given inode.
+ *
+ * This is needed at write time as quota can be disabled and then enabled
+ */
+int btrfs_qgroup_init_data_rsv_map(struct inode *inode)
+{
+   struct btrfs_inode *binode = BTRFS_I(inode);
+   struct btrfs_root *root = binode->root;
+   struct btrfs_qgroup_data_rsv_map *dirty_map;
+
+   if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+   return 0;
+
+   spin_lock(>qgroup_init_lock);
+   /* Quick route for init */
+   if (likely(binode->qgroup_rsv_map))
+   goto out;
+   spin_unlock(>qgroup_init_lock);
+
+   /*
+* Slow allocation route
+*
+* TODO: Use kmem_cache to speedup allocation
+*/
+   dirty_map = kmalloc(sizeof(*dirty_map), GFP_NOFS);
+   if (!dirty_map)
+   return -ENOMEM;
+
+   dirty_map->reserved = 0;
+   dirty_map->root = RB_ROOT;
+   spin_lock_init(_map->lock);
+
+   /* Lock again to ensure no one has already init it before */
+   spin_lock(>qgroup_init_lock);
+   if (binode->qgroup_rsv_map) {
+   spin_unlock(>qgroup_init_lock);
+   kfree(dirty_map);
+   return 0;
+   }
+   binode->qgroup_rsv_map = dirty_map;
+out:
+   spin_unlock(>qgroup_init_lock);
+   return 0;
+}
+
+void btrfs_qgroup_free_data_rsv_map(struct inode *inode)
+{
+   struct btrfs_inode *binode = BTRFS_I(inode);
+   struct btrfs_root *root = binode->root;
+   struct btrfs_qgroup_data_rsv_map *dirty_map = binode->qgroup_rsv_map;
+   struct rb_node *node;
+
+   /*
+* this function is called at inode destroy routine, so no concurrency
+* will happen, no need to get the lock.
+*/
+   if (!dirty_map)
+   return;
+
+   /* insanity check */
+   WARN_ON(!root->fs_info->quota_enabled || !is_fstree(root->objectid));
+
+   btrfs_qgroup_free(root, dirty_map->reserved);
+   spin_lock(_map->lock);
+   while ((node = rb_first(_map->root)) != NULL) {
+   struct data_rsv_range *range;
+
+   range = rb_entry(node, struct data_rsv_range, node);
+   rb_erase(node, _map->root);
+   kfree(range);
+   }
+   spin_unlock(_map->lock);
+   kfree(dirty_map);
+   binode->qgroup_rsv_map = NULL;
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 2f863a4..c87b7dc 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -84,4 +84,7 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, 
u64 qgroupid,
   u64 rfer, u64 excl);
 #endif
 
+/* for qgroup reserve */
+int btrfs_qgroup_init_data_rsv_map(struct inode *inode);
+void btrfs_qgroup_free_data_rsv_map(struct inode *inode);
 #endif /* 

[PATCH 03/19] btrfs: qgroup: Introduce new function to search most left reserve range

2015-09-08 Thread Qu Wenruo
Introduce the new function to search the most left reserve range in a
reserve map.

It provides the basis for later reserve map implement.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/qgroup.c | 36 
 1 file changed, 36 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index cf07c17..fc24fc3 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2541,6 +2541,42 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
 }
 
 /*
+ * Return the nearest left range of given start
+ * No ensure about the range will cover start.
+ */
+static struct data_rsv_range *
+find_reserve_range(struct btrfs_qgroup_data_rsv_map *map, u64 start)
+{
+   struct rb_node **p = >root.rb_node;
+   struct rb_node *parent = NULL;
+   struct rb_node *prev = NULL;
+   struct data_rsv_range *range = NULL;
+
+   while (*p) {
+   parent = *p;
+   range = rb_entry(parent, struct data_rsv_range, node);
+   if (range->start < start)
+   p = &(*p)->rb_right;
+   else if (range->start > start)
+   p = &(*p)->rb_left;
+   else
+   return range;
+   }
+
+   /* empty tree */
+   if (!parent)
+   return NULL;
+   if (range->start <= start)
+   return range;
+
+   prev = rb_prev(parent);
+   /* Already most left one */
+   if (!prev)
+   return range;
+   return rb_entry(prev, struct data_rsv_range, node);
+}
+
+/*
  * Init data_rsv_map for a given inode.
  *
  * This is needed at write time as quota can be disabled and then enabled
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: raid6 + hot spare question

2015-09-08 Thread Hugo Mills
On Tue, Sep 08, 2015 at 01:59:19PM +0200, Peter Keše wrote:
> 
> I'm planning to set up a raid6 array with 4 x 4TB drives.
> Presumably that would result in 8TB of usable space + parity, which
> is about enough for my data (my data is currently 5TB in raid1,
> slowly growing at about 1 TB per year, but I often keep some
> additional backups if space permits).
> 
> However I'd like to be prepared for a disk failure. Because my
> server is not easily accessible and disk replacement times can be
> long, I'm considering the idea of making a 5-drive raid6, thus
> getting 12TB useable space + parity. In this case, the extra 4TB
> drive would serve as some sort of a hot spare.
> 
> My assumption is that if one hard drive fails before the volume is
> more than 8TB full, I can just rebalance and resize the volume from
> 12 TB back to 8 TB essentially going from 5-drive raid6 to 4-drive
> raid6).
> 
> Can anyone confirm my assumption? Can I indeed rebalance from
> 5-drive raid6 to 4-drive raid6 if the volume is not too big?

   Yes, you can, provided, as you say, the data is small enough to fit
into the reduced filesystem.

   Hugo.

-- 
Hugo Mills | "What's so bad about being drunk?"
hugo@... carfax.org.uk | "You ask a glass of water"
http://carfax.org.uk/  | Arthur & Ford
PGP: E2AB1DE4  | The Hitch-Hiker's Guide to the Galaxy


signature.asc
Description: Digital signature


Re: [PATCH] btrfs: trival fix of __btrfs_set_acl error handling

2015-09-08 Thread David Sterba
On Tue, Sep 08, 2015 at 05:02:32PM +0800, Sheng Yong wrote:
> Hi, Qu
> 
> On 9/8/2015 4:50 PM, Qu Wenruo wrote:
> > Sheng Yong wrote on 2015/09/08 08:46 +:
> >> * If the allocation failed, don't free to free it, even though kfree
> >>allows to free a NULL pointer.
> >> * If posix_acl_to_xattr() failed, cleanup the allocation and return
> >>the error directly.
> > So, what's the point?
> > For me, I didn't see the pros of the change.
> > As kfree() allow NULL pointer, why not use it?
> In fact, there is no semantic changes. It's just because when I walk through
> the code, and find there is no need to call kfree(), and could be cleaned up.
> It's fine to keep as it is :)

I agree with Qu. In this case it's not performance critical and conforms
to the widely used pattern of single return point from branches.
However, the acl functions are not consistent in that respect. It's more
a matter of style that gets unified eventually.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


raid6 + hot spare question

2015-09-08 Thread Peter Keše


I'm planning to set up a raid6 array with 4 x 4TB drives.
Presumably that would result in 8TB of usable space + parity, which is 
about enough for my data (my data is currently 5TB in raid1, slowly 
growing at about 1 TB per year, but I often keep some additional backups 
if space permits).


However I'd like to be prepared for a disk failure. Because my server is 
not easily accessible and disk replacement times can be long, I'm 
considering the idea of making a 5-drive raid6, thus getting 12TB 
useable space + parity. In this case, the extra 4TB drive would serve as 
some sort of a hot spare.


My assumption is that if one hard drive fails before the volume is more 
than 8TB full, I can just rebalance and resize the volume from 12 TB 
back to 8 TB (essentially going from 5-drive raid6 to 4-drive raid6).


Can anyone confirm my assumption? Can I indeed rebalance from 5-drive 
raid6 to 4-drive raid6 if the volume is not too big?


Thanks,
Peter
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v9.1 1/3] xfstests: btrfs: add functions to create dm-error device

2015-09-08 Thread Anand Jain
From: Anand Jain 

Controlled EIO from the device is achieved using the dm device.
Helper functions are at common/dmerror.

Broadly steps will include calling _dmerror_init().
_dmerror_init() will use SCRATCH_DEV to create dm linear device and assign
DMERROR_DEV to /dev/mapper/error-test.

When test script is ready to get EIO, the test cases can call
_dmerror_load_table() which then it will load the dm error.
so that reading DMERROR_DEV will cause EIO. After the test case is
complete, cleanup must be done by calling _dmerror_cleanup().

Signed-off-by: Anand Jain 
Reviewed-by: Filipe Manana 
---

v9->v9.1: actually picked up the changes mentioned below in v9

v8->v9:
 Accepts Eryu's review comments viz..
 . check for modloaded as in dm-flakey
 . add _require_block_device and _require_sane_bdev_flush to dmerror_required()

v7->v8:
 . Mainly avoid duplicate lines of code, create reusable functions
  _common_dev_mount_options(), dmerror_mount_options(),
  _dmerror_mount()
 . Update _scratch_mount_option() to use _common_dev_mount_options()

v6->v7:
 rename _init_dmerror() to _dmerror_init()
 remove _scratch_mkfs_dmerror()
 rename _mount_dmerror() to _dmerror_mount()
 rename _cleaup_dmerror() to _dmerror_cleanup()
 rename _load_dmerror_table() to _dmerror_load_table()
 rename BLK_DEV_SIZE to blk_dev_size
 remove _unmount_dmerror there were no consumer of it
 use _fail instead of _fatal in rc/dmerror
 update error log to make crisp sense
 move _require_dmerror() from common/rc to common/dmerror and rename
   it to dmerror_required() so that its consistent with other function
   names with in the file

v5->v6: accepts Eryu's comments, thanks
 . added missing $MKFS_OPTIONS at _scratch_mkfs_dmerror()
 . used $MOUNT_PROG instead of mount at _mount_dmerror()
 . correct typo $UMOUNT_PROG, no S at the end in _unmount_dmerror()

v4->v5: No Change. keep up with the patch set

v3->v4: rebase on latest xfstests code

v2.1->v3: accepts Filipe Manana's review comments, thanks
 . correct if else statement in _require_dm_error()
 . fix indent 
   (a missed Dave comment in v1. looks like I goofed with git cli)

v2->v2.1: fixed missed typo error fixup in the commit.

v1->v2: accepts Dave Chinner's review comments, thanks
 . use SCRATCH_DEV for dmerror backing device
 . remove duplicate check of DM_BLK_DEV in _init_dm_error_dev()
 . remove a wrong check when reading block size in _init_dm_error_dev()
 . remove a wrong check with blockdev --setra in _init_dm_error_dev()
 . remove unnecessary check in _load_dm_error_table()
 . remove unnecessary dmerror device test by using dd

 common/dmerror | 79 ++
 common/rc  | 13 --
 2 files changed, 90 insertions(+), 2 deletions(-)
 create mode 100644 common/dmerror

diff --git a/common/dmerror b/common/dmerror
new file mode 100644
index 000..a81856d
--- /dev/null
+++ b/common/dmerror
@@ -0,0 +1,79 @@
+##/bin/bash
+#
+# Copyright (c) 2015 Oracle.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+#
+# common functions for setting up and tearing down a dmerror device
+
+# this test requires the device mapper error target
+#
+_dmerror_required()
+{
+   _require_command "$DMSETUP_PROG" dmsetup
+
+   _require_block_device $SCRATCH_DEV
+   _require_sane_bdev_flush $SCRATCH_DEV
+
+   modprobe dm-mod >/dev/null 2>&1
+   $DMSETUP_PROG targets | grep error >/dev/null 2>&1
+   [ $? -ne 0 ] && _notrun "This test requires dm error support"
+}
+
+_dmerror_init()
+{
+   local dm_backing_dev=$SCRATCH_DEV
+
+   $DMSETUP_PROG remove error-test > /dev/null 2>&1
+
+   local blk_dev_size=`blockdev --getsz $dm_backing_dev`
+
+   DMERROR_DEV='/dev/mapper/error-test'
+
+   DMLINEAR_TABLE="0 $blk_dev_size linear $dm_backing_dev 0"
+
+   $DMSETUP_PROG create error-test --table "$DMLINEAR_TABLE" || \
+   _fatal "failed to create dm linear device"
+
+   DMERROR_TABLE="0 $blk_dev_size error $dm_backing_dev 0"
+}
+
+_dmerror_mount_options()
+{
+   echo `_common_dev_mount_options $*` $DMERROR_DEV $SCRATCH_MNT
+}
+
+_dmerror_mount()
+{
+   _mount -t $FSTYP `_dmerror_mount_options $*`
+}
+
+_dmerror_cleanup()
+{
+   $UMOUNT_PROG $SCRATCH_MNT > /dev/null 2>&1
+   $DMSETUP_PROG remove error-test > /dev/null 

Re: [PATCH] btrfs-progs: makefile: drop u option from ar invocation

2015-09-08 Thread David Sterba
On Sat, Sep 05, 2015 at 01:06:18AM +0200, Arnd Hannemann wrote:
> In newer distros (ubuntu 15.10, fedora rawhide) the binutils
> ar uses the new D flag per default to build deterministic
> binaries.
> Without this patch the following warning is issued, when
> building btrfs-progs:
> 
> [AR] libbtrfs.a
> /usr/bin/ar: `u' modifier ignored since `D' is the default (see `U')
> 
> For libtrfs.a performance benefit of the u option can be neglected,
> so drop the u option and silence the warning.
> 
> In the future one might want to explicitly add the D option anyway.
> 
> Signed-off-by: Arnd Hannemann 

Applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [btrfs tools] ability to fail a device...

2015-09-08 Thread Ian Kumlien
On 8 September 2015 at 21:43, Ian Kumlien  wrote:
> On 8 September 2015 at 21:34, Hugo Mills  wrote:
>> On Tue, Sep 08, 2015 at 09:18:05PM +0200, Ian Kumlien wrote:
[--8<--]

>>Physically removing it is the way to go (or disabling it using echo
>> offline >/sys/block/sda/device/state). Once you've done that, you can
>> mount the degraded FS with -odegraded, then either add a new device
>> and balance to restore the RAID-1, or balance with
>> -{d,m}convert=single to drop the redundancy to single.
>
> This did not work...

And removing the physical device is not the answer either... until I
did a read-only mount ;)

Didn't expect it to fail with unable to open ctree like that...

[--8<--]
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: mkfs.btrfs cannot find rotational file for SSD detection for a pmem device

2015-09-08 Thread Elliott, Robert (Persistent Memory)

> -Original Message-
> From: Austin S Hemmelgarn [mailto:ahferro...@gmail.com]
> Sent: Tuesday, September 8, 2015 7:56 AM
> Subject: Re: mkfs.btrfs cannot find rotational file for SSD detection for
> a pmem device
> 
> On 2015-09-06 13:51, Elliott, Robert (Persistent Memory) wrote:
...
> > The impact looks limited to the print and causing it to not
> > automatically disable "metadata duplication on a single device."
> This is an issue inherent in the current pmem driver however, it should
> be fixed there and not in mkfs.btrfs, as other filesystems make
> decisions based on this file also, as does the I/O scheduler, and some
> block storage servers.  
> ...

The rotational file does exist, at:
/sys/devices/LNXSYSTM\:00/LNXSYBUS\:00/ACPI0012\:00/ndbus1/region0/namespace0.0/block/pmem0/queue/rotational

One or more functions are having trouble parsing that 108-byte string
... mkfs.btrfs's is_ssd, libblkid's blkid_devno_to_wholedisk, or
libblkid's sysfs_devno_to_wholedisk.  I'm not sure where the
breakdown occurs.

This is reminiscent of an issue that numactl has parsing the path to
get to .../device/numa_node (rather than .../queue/rotational).  It
was confused by not finding "/devices/pci" in a path for a storage
device.

> This gets tricky though because pmem isn't
> technically a block device at the low level, and doesn't use some parts
> of the block layer that most other block devices do.
> 
> On that note however, if the pmem device is backed by actual RAM and not
> flash storage (and most of them are from what I've seen), then the only
> advantage of using single metadata mode over dup is space savings, as
> RAM is not (usually) write limited.

pmem devices will be a mix ranging from flash-backed DRAM to new
technologies like 3D Crosspoint, usually offering high performance
and good wearout characteristics.

The btrfs driver does detect it as SSD after mkfs.btrfs did not:
kernel: BTRFS info (device pmem0): disk space caching is enabled
kernel: BTRFS: has skinny extents
kernel: BTRFS: flagging fs with big metadata feature
kernel: BTRFS: detected SSD devices, enabling SSD mode

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v1 0/8] VFS: In-kernel copy system call

2015-09-08 Thread Pádraig Brady
On 08/09/15 20:10, Andy Lutomirski wrote:
> On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
>  wrote:
>> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
>>> I see copy_file_range() is a reflink() on BTRFS?
>>> That's a bit surprising, as it avoids the copy completely.
>>> cp(1) for example considered doing a BTRFS clone by default,
>>> but didn't due to expectations that users actually wanted
>>> the data duplicated on disk for resilience reasons,
>>> and for performance reasons so that write latencies were
>>> restricted to the copy operation, rather than being
>>> introduced at usage time as the dest file is CoW'd.
>>>
>>> If reflink() is a possibility for copy_file_range()
>>> then could it be done optionally with a flag?
>>
>> The idea is that filesystems get to choose how to handle copies in the 
>> default case.  BTRFS could do a reflink, but NFS could do a server side copy 
>> instead.  I can change the default behavior to only do a data copy (unless 
>> the reflink flag is specified) instead, if that is desirable.
>>
>> What does everybody think?
> 
> I think the best you could do is to have a hint asking politely for
> the data to be deep-copied.  After all, some filesystems reserve the
> right to transparently deduplicate.
> 
> Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
> advantage to deep copying unless you actually want two copies for
> locality reasons.

Agreed. The relink and server side copy are separate things.
There's no advantage to not doing a server side copy,
but as mentioned there may be advantages to doing deep copies on BTRFS
(another reason not previous mentioned in this thread, would be
to avoid ENOSPC errors at some time in the future).

So having control over the deep copy seems useful.
It's debatable whether ALLOW_REFLINK should be on/off by default
for copy_file_range().  I'd be inclined to have such a setting off by default,
but cp(1) at least will work with whatever is chosen.

thanks,
Pádraig.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [btrfs tools] ability to fail a device...

2015-09-08 Thread Chris Murphy
On Tue, Sep 8, 2015 at 2:00 PM, Ian Kumlien  wrote:
> On 8 September 2015 at 21:55, Ian Kumlien  wrote:
>> On 8 September 2015 at 21:43, Ian Kumlien  wrote:
>>> On 8 September 2015 at 21:34, Hugo Mills  wrote:
 On Tue, Sep 08, 2015 at 09:18:05PM +0200, Ian Kumlien wrote:
>> [--8<--]
>>
Physically removing it is the way to go (or disabling it using echo
 offline >/sys/block/sda/device/state). Once you've done that, you can
 mount the degraded FS with -odegraded, then either add a new device
 and balance to restore the RAID-1, or balance with
 -{d,m}convert=single to drop the redundancy to single.
>>>
>>> This did not work...
>>
>> And removing the physical device is not the answer either... until i
>> did a read only mount ;)
>>
>> Didn't expect it to fail with unable to open ctree like that...
>
> Someone thought they were done too early, only one disk => read only
> mount. But, readonly mount => no balance.
>
> I think something is wrong
>
> btrfs balance start -dconvert=single -mconvert=single /mnt/disk/
> ERROR: error during balancing '/mnt/disk/' - Read-only file system
>
> btrfs dev delete missing /mnt/disk/
> ERROR: error removing the device 'missing' - Read-only file system
>
> Any mount without ro becomes:
> [  507.236652] BTRFS info (device sda2): allowing degraded mounts
> [  507.236655] BTRFS info (device sda2): disk space caching is enabled
> [  507.325365] BTRFS: bdev (null) errs: wr 2036894, rd 2031380, flush
> 705, corrupt 0, gen 0
> [  510.983321] BTRFS: too many missing devices, writeable mount is not allowed
> [  511.006241] BTRFS: open_ctree failed
>
> And one of them has to give! ;)


You've run into this:
https://bugzilla.kernel.org/show_bug.cgi?id=92641




-- 
Chris Murphy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [btrfs tools] ability to fail a device...

2015-09-08 Thread Ian Kumlien
On 8 September 2015 at 22:17, Chris Murphy  wrote:
> On Tue, Sep 8, 2015 at 2:13 PM, Ian Kumlien  wrote:
>> On 8 September 2015 at 22:08, Chris Murphy  wrote:
>>> On Tue, Sep 8, 2015 at 2:00 PM, Ian Kumlien  wrote:
>>
>> [--8<--]
>>
 Someone thought they were done too early, only one disk => read only
 mount. But, readonly mount => no balance.

 I think something is wrong

 btrfs balance start -dconvert=single -mconvert=single /mnt/disk/
 ERROR: error during balancing '/mnt/disk/' - Read-only file system

 btrfs dev delete missing /mnt/disk/
 ERROR: error removing the device 'missing' - Read-only file system

 Any mount without ro becomes:
 [  507.236652] BTRFS info (device sda2): allowing degraded mounts
 [  507.236655] BTRFS info (device sda2): disk space caching is enabled
 [  507.325365] BTRFS: bdev (null) errs: wr 2036894, rd 2031380, flush
 705, corrupt 0, gen 0
 [  510.983321] BTRFS: too many missing devices, writeable mount is not 
 allowed
 [  511.006241] BTRFS: open_ctree failed

 And one of them has to give! ;)
>>>
>>>
>>> You've run into this:
>>> https://bugzilla.kernel.org/show_bug.cgi?id=92641
>>
>> Ah, I thought it might not be known - I'm currently copying the files
>> since a read only mount is "good enough" for that
>>
>> -o degraded should allow readwrite *IF* the data can be written to
>> My question is also, would this keep me from "adding devices"?
>> I mean, it did seem like a catch 22 earlier, but that would really
>> make a mess of things...
>
> It is not possible to add a device to an ro filesystem, so effectively
> the fs read-writeability is broken in this case.

Wow, now that's quite a bug!

> --
> Chris Murphy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [btrfs tools] ability to fail a device...

2015-09-08 Thread Ian Kumlien
On 8 September 2015 at 21:34, Hugo Mills  wrote:
> On Tue, Sep 08, 2015 at 09:18:05PM +0200, Ian Kumlien wrote:
>> Hi,
>>
>> Currently i have a raid1 configuration on two disks where one of them
>> is failing.
>>
>> But since:
>> btrfs fi df /mnt/disk/
>> Data, RAID1: total=858.00GiB, used=638.16GiB
>> Data, single: total=1.00GiB, used=256.00KiB
>> System, RAID1: total=32.00MiB, used=132.00KiB
>> Metadata, RAID1: total=4.00GiB, used=1.21GiB
>> GlobalReserve, single: total=412.00MiB, used=0.00B
>>
>> There should be no problem in failing one disk... Or so i thought!
>>
>> btrfs dev delete /dev/sdb2 /mnt/disk/
>> ERROR: error removing the device '/dev/sdb2' - unable to go below two
>> devices on raid1
>
>dev delete is more like a reshaping operation in mdadm: it tries to
> remove a device safely whilst retaining all of the redundancy
> guarantees. You can't go down to one device with RAID-1 and still keep
> the redundancy.
>
>dev delete is really for managed device removal under non-failure
> conditions, not for error recovery.
>
>> And i can't issue rebalance either since it will tell me about errors
>> until the failing disk dies.
>>
>> What's even more interesting is that i can't mount just the working
>> disk - ie if the other disk
>> *has* failed and is inaccessible... though, i haven't tried physically
>> removing it...
>
>Physically removing it is the way to go (or disabling it using echo
> offline >/sys/block/sda/device/state). Once you've done that, you can
> mount the degraded FS with -odegraded, then either add a new device
> and balance to restore the RAID-1, or balance with
> -{d,m}convert=single to drop the redundancy to single.

This did not work...

[ 1742.368079] BTRFS info (device sda2): The free space cache file
(280385028096) is invalid. skip it
[ 1789.052403] BTRFS: open /dev/sdb2 failed
[ 1789.064629] BTRFS info (device sda2): allowing degraded mounts
[ 1789.064632] BTRFS info (device sda2): disk space caching is enabled
[ 1789.092286] BTRFS: bdev /dev/sdb2 errs: wr 2036894, rd 2031380,
flush 705, corrupt 0, gen 0
[ 1792.625275] BTRFS: too many missing devices, writeable mount is not allowed
[ 1792.644407] BTRFS: open_ctree failed

>> mdadm has fail and remove, I assume for this reason - perhaps it's
>> something that should be added?
>
>I think there should be a btrfs dev drop, which is the fail-like
> operation: tell the FS that a device is useless, and should be dropped
> from the array, so the FS doesn't keep trying to write to it. That's
> not implemented yet, though.

Damn it =)

>Hugo.
>
> --
> Hugo Mills | Alert status mauve ocelot: Slight chance of
> hugo@... carfax.org.uk | brimstone. Be prepared to make a nice cup of tea.
> http://carfax.org.uk/  |
> PGP: E2AB1DE4  |
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v1 0/8] VFS: In-kernel copy system call

2015-09-08 Thread Andy Lutomirski
On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
 wrote:
> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
>> I see copy_file_range() is a reflink() on BTRFS?
>> That's a bit surprising, as it avoids the copy completely.
>> cp(1) for example considered doing a BTRFS clone by default,
>> but didn't due to expectations that users actually wanted
>> the data duplicated on disk for resilience reasons,
>> and for performance reasons so that write latencies were
>> restricted to the copy operation, rather than being
>> introduced at usage time as the dest file is CoW'd.
>>
>> If reflink() is a possibility for copy_file_range()
>> then could it be done optionally with a flag?
>
> The idea is that filesystems get to choose how to handle copies in the 
> default case.  BTRFS could do a reflink, but NFS could do a server side copy 
> instead.  I can change the default behavior to only do a data copy (unless 
> the reflink flag is specified) instead, if that is desirable.
>
> What does everybody think?

I think the best you could do is to have a hint asking politely for
the data to be deep-copied.  After all, some filesystems reserve the
right to transparently deduplicate.

Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
advantage to deep copying unless you actually want two copies for
locality reasons.

--Andy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[btrfs tools] ability to fail a device...

2015-09-08 Thread Ian Kumlien
Hi,

Currently i have a raid1 configuration on two disks where one of them
is failing.

But since:
btrfs fi df /mnt/disk/
Data, RAID1: total=858.00GiB, used=638.16GiB
Data, single: total=1.00GiB, used=256.00KiB
System, RAID1: total=32.00MiB, used=132.00KiB
Metadata, RAID1: total=4.00GiB, used=1.21GiB
GlobalReserve, single: total=412.00MiB, used=0.00B

There should be no problem in failing one disk... Or so i thought!

btrfs dev delete /dev/sdb2 /mnt/disk/
ERROR: error removing the device '/dev/sdb2' - unable to go below two
devices on raid1

And i can't issue rebalance either since it will tell me about errors
until the failing disk dies.

What's even more interesting is that i can't mount just the working
disk - ie if the other disk
*has* failed and is inaccessible... though, i haven't tried physically
removing it...

mdadm has fail and remove, I assume for this reason - perhaps it's
something that should be added?

uname -r
4.2.0

btrfs --version
btrfs-progs v4.1.2
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [btrfs tools] ability to fail a device...

2015-09-08 Thread Hugo Mills
On Tue, Sep 08, 2015 at 09:18:05PM +0200, Ian Kumlien wrote:
> Hi,
> 
> Currently i have a raid1 configuration on two disks where one of them
> is failing.
> 
> But since:
> btrfs fi df /mnt/disk/
> Data, RAID1: total=858.00GiB, used=638.16GiB
> Data, single: total=1.00GiB, used=256.00KiB
> System, RAID1: total=32.00MiB, used=132.00KiB
> Metadata, RAID1: total=4.00GiB, used=1.21GiB
> GlobalReserve, single: total=412.00MiB, used=0.00B
> 
> There should be no problem in failing one disk... Or so i thought!
> 
> btrfs dev delete /dev/sdb2 /mnt/disk/
> ERROR: error removing the device '/dev/sdb2' - unable to go below two
> devices on raid1

   dev delete is more like a reshaping operation in mdadm: it tries to
remove a device safely whilst retaining all of the redundancy
guarantees. You can't go down to one device with RAID-1 and still keep
the redundancy.

   dev delete is really for managed device removal under non-failure
conditions, not for error recovery.

> And i can't issue rebalance either since it will tell me about errors
> until the failing disk dies.
> 
> What's even more interesting is that i can't mount just the working
> disk - ie if the other disk
> *has* failed and is inaccessible... though, i haven't tried physically
> removing it...

   Physically removing it is the way to go (or disabling it using echo
offline >/sys/block/sda/device/state). Once you've done that, you can
mount the degraded FS with -odegraded, then either add a new device
and balance to restore the RAID-1, or balance with
-{d,m}convert=single to drop the redundancy to single.

> mdadm has fail and remove, I assume for this reason - perhaps it's
> something that should be added?

   I think there should be a btrfs dev drop, which is the fail-like
operation: tell the FS that a device is useless, and should be dropped
from the array, so the FS doesn't keep trying to write to it. That's
not implemented yet, though.

   Hugo.

-- 
Hugo Mills | Alert status mauve ocelot: Slight chance of
hugo@... carfax.org.uk | brimstone. Be prepared to make a nice cup of tea.
http://carfax.org.uk/  |
PGP: E2AB1DE4  |


signature.asc
Description: Digital signature


Re: [btrfs tools] ability to fail a device...

2015-09-08 Thread Ian Kumlien
On 8 September 2015 at 21:55, Ian Kumlien  wrote:
> On 8 September 2015 at 21:43, Ian Kumlien  wrote:
>> On 8 September 2015 at 21:34, Hugo Mills  wrote:
>>> On Tue, Sep 08, 2015 at 09:18:05PM +0200, Ian Kumlien wrote:
> [--8<--]
>
>>>Physically removing it is the way to go (or disabling it using echo
>>> offline >/sys/block/sda/device/state). Once you've done that, you can
>>> mount the degraded FS with -odegraded, then either add a new device
>>> and balance to restore the RAID-1, or balance with
>>> -{d,m}convert=single to drop the redundancy to single.
>>
>> This did not work...
>
> And removing the physical device is not the answer either... until i
> did a read only mount ;)
>
> Didn't expect it to fail with unable to open ctree like that...

Someone thought they were done too early, only one disk => read only
mount. But, readonly mount => no balance.

I think something is wrong

btrfs balance start -dconvert=single -mconvert=single /mnt/disk/
ERROR: error during balancing '/mnt/disk/' - Read-only file system

btrfs dev delete missing /mnt/disk/
ERROR: error removing the device 'missing' - Read-only file system

Any mount without ro becomes:
[  507.236652] BTRFS info (device sda2): allowing degraded mounts
[  507.236655] BTRFS info (device sda2): disk space caching is enabled
[  507.325365] BTRFS: bdev (null) errs: wr 2036894, rd 2031380, flush
705, corrupt 0, gen 0
[  510.983321] BTRFS: too many missing devices, writeable mount is not allowed
[  511.006241] BTRFS: open_ctree failed

And one of them has to give! ;)

> [--8<--]
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [btrfs tools] ability to fail a device...

2015-09-08 Thread Ian Kumlien
On 8 September 2015 at 22:08, Chris Murphy  wrote:
> On Tue, Sep 8, 2015 at 2:00 PM, Ian Kumlien  wrote:

[--8<--]

>> Someone thought they were done too early, only one disk => read only
>> mount. But, readonly mount => no balance.
>>
>> I think something is wrong
>>
>> btrfs balance start -dconvert=single -mconvert=single /mnt/disk/
>> ERROR: error during balancing '/mnt/disk/' - Read-only file system
>>
>> btrfs dev delete missing /mnt/disk/
>> ERROR: error removing the device 'missing' - Read-only file system
>>
>> Any mount without ro becomes:
>> [  507.236652] BTRFS info (device sda2): allowing degraded mounts
>> [  507.236655] BTRFS info (device sda2): disk space caching is enabled
>> [  507.325365] BTRFS: bdev (null) errs: wr 2036894, rd 2031380, flush
>> 705, corrupt 0, gen 0
>> [  510.983321] BTRFS: too many missing devices, writeable mount is not 
>> allowed
>> [  511.006241] BTRFS: open_ctree failed
>>
>> And one of them has to give! ;)
>
>
> You've run into this:
> https://bugzilla.kernel.org/show_bug.cgi?id=92641

Ah, I thought it might not be known - I'm currently copying the files
since a read only mount is "good enough" for that

-o degraded should allow readwrite *IF* the data can be written to
My question is also, would this keep me from "adding devices"?
I mean, it did seem like a catch 22 earlier, but that would really
make a mess of things...

> --
> Chris Murphy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [btrfs tools] ability to fail a device...

2015-09-08 Thread Chris Murphy
On Tue, Sep 8, 2015 at 2:13 PM, Ian Kumlien  wrote:
> On 8 September 2015 at 22:08, Chris Murphy  wrote:
>> On Tue, Sep 8, 2015 at 2:00 PM, Ian Kumlien  wrote:
>
> [--8<--]
>
>>> Someone thought they were done too early, only one disk => read only
>>> mount. But, readonly mount => no balance.
>>>
>>> I think something is wrong
>>>
>>> btrfs balance start -dconvert=single -mconvert=single /mnt/disk/
>>> ERROR: error during balancing '/mnt/disk/' - Read-only file system
>>>
>>> btrfs dev delete missing /mnt/disk/
>>> ERROR: error removing the device 'missing' - Read-only file system
>>>
>>> Any mount without ro becomes:
>>> [  507.236652] BTRFS info (device sda2): allowing degraded mounts
>>> [  507.236655] BTRFS info (device sda2): disk space caching is enabled
>>> [  507.325365] BTRFS: bdev (null) errs: wr 2036894, rd 2031380, flush
>>> 705, corrupt 0, gen 0
>>> [  510.983321] BTRFS: too many missing devices, writeable mount is not 
>>> allowed
>>> [  511.006241] BTRFS: open_ctree failed
>>>
>>> And one of them has to give! ;)
>>
>>
>> You've run into this:
>> https://bugzilla.kernel.org/show_bug.cgi?id=92641
>
> Ah, I thought it might not be known - I'm currently copying the files
> since a read only mount is "good enough" for that
>
> -o degraded should allow readwrite *IF* the data can be written to
> My question is also, would this keep me from "adding devices"?
> I mean, it did seem like a catch 22 earlier, but that would really
> make a mess of things...

It is not possible to add a device to an ro filesystem, so effectively
the fs read-writeability is broken in this case.

-- 
Chris Murphy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [btrfs tools] ability to fail a device...

2015-09-08 Thread Hugo Mills
On Tue, Sep 08, 2015 at 02:17:55PM -0600, Chris Murphy wrote:
> On Tue, Sep 8, 2015 at 2:13 PM, Ian Kumlien  wrote:
> > On 8 September 2015 at 22:08, Chris Murphy  wrote:
> >> On Tue, Sep 8, 2015 at 2:00 PM, Ian Kumlien  wrote:
> >
> > [--8<--]
> >
> >>> Someone thought they were done too early, only one disk => read only
> >>> mount. But, readonly mount => no balance.
> >>>
> >>> I think something is wrong
> >>>
> >>> btrfs balance start -dconvert=single -mconvert=single /mnt/disk/
> >>> ERROR: error during balancing '/mnt/disk/' - Read-only file system
> >>>
> >>> btrfs dev delete missing /mnt/disk/
> >>> ERROR: error removing the device 'missing' - Read-only file system
> >>>
> >>> Any mount without ro becomes:
> >>> [  507.236652] BTRFS info (device sda2): allowing degraded mounts
> >>> [  507.236655] BTRFS info (device sda2): disk space caching is enabled
> >>> [  507.325365] BTRFS: bdev (null) errs: wr 2036894, rd 2031380, flush
> >>> 705, corrupt 0, gen 0
> >>> [  510.983321] BTRFS: too many missing devices, writeable mount is not 
> >>> allowed
> >>> [  511.006241] BTRFS: open_ctree failed
> >>>
> >>> And one of them has to give! ;)
> >>
> >>
> >> You've run into this:
> >> https://bugzilla.kernel.org/show_bug.cgi?id=92641
> >
> > Ah, I thought it might not be known - I'm currently copying the files
> > since a read only mount is "good enough" for that
> >
> > -o degraded should allow readwrite *IF* the data can be written to
> > My question is also, would this keep me from "adding devices"?
> > I mean, it did seem like a catch 22 earlier, but that would really
> > make a mess of things...
> 
> It is not possible to add a device to an ro filesystem, so effectively
> the fs read-writeability is broken in this case.

   I thought this particular issue had already been dealt with in 4.2?
(i.e. you can still mount an FS RW if it's degraded, but there are
still some single chunks on it).

   Ian: If you can still mount the FS read/write with both devices in
it, then it might be worth trying to balance away the problematic
single chunks with:

btrfs bal start -dprofiles=single -mprofiles=single /mountpoint

   Then unmount, pull the dead drive, and remount -odegraded.

   Hugo.

-- 
Hugo Mills | The early bird gets the worm, but the second mouse
hugo@... carfax.org.uk | gets the cheese.
http://carfax.org.uk/  |
PGP: E2AB1DE4  |


signature.asc
Description: Digital signature


Re: [btrfs tools] ability to fail a device...

2015-09-08 Thread Ian Kumlien
On 8 September 2015 at 22:28, Hugo Mills  wrote:
> On Tue, Sep 08, 2015 at 02:17:55PM -0600, Chris Murphy wrote:
>> On Tue, Sep 8, 2015 at 2:13 PM, Ian Kumlien  wrote:
>> > On 8 September 2015 at 22:08, Chris Murphy  wrote:
>> >> On Tue, Sep 8, 2015 at 2:00 PM, Ian Kumlien  wrote:

[--8<--]

>> > -o degraded should allow readwrite *IF* the data can be written to
>> > My question is also, would this keep me from "adding devices"?
>> > I mean, it did seem like a catch 22 earlier, but that would really
>> > make a mess of things...
>>
>> It is not possible to add a device to an ro filesystem, so effectively
>> the fs read-writeability is broken in this case.
>
>I thought this particular issue had already been dealt with in 4.2?
> (i.e. you can still mount an FS RW if it's degraded, but there are
> still some single chunks on it).

Single chunks are only on sda - not on sdb...

There should be no problem...

>Ian: If you can still mount the FS read/write with both devices in
> it, then it might be worth trying to balance away the problematic
> single chunks with:
>
> btrfs bal start -dprofiles=single -mprofiles=single /mountpoint
>
>Then unmount, pull the dead drive, and remount -odegraded.

It never completes, too many errors and eventually the disk disappears
until the machine is turned off and on again... (normal disk reset
doesn't work)
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [btrfs tools] ability to fail a device...

2015-09-08 Thread Hugo Mills
On Tue, Sep 08, 2015 at 10:33:54PM +0200, Ian Kumlien wrote:
> On 8 September 2015 at 22:28, Hugo Mills  wrote:
> > On Tue, Sep 08, 2015 at 02:17:55PM -0600, Chris Murphy wrote:
> >> On Tue, Sep 8, 2015 at 2:13 PM, Ian Kumlien  wrote:
> >> > On 8 September 2015 at 22:08, Chris Murphy  
> >> > wrote:
> >> >> On Tue, Sep 8, 2015 at 2:00 PM, Ian Kumlien  
> >> >> wrote:
> 
> [--8<--]
> 
> >> > -o degraded should allow readwrite *IF* the data can be written to
> >> > My question is also, would this keep me from "adding devices"?
> >> > I mean, it did seem like a catch 22 earlier, but that would really
> >> > make a mess of things...
> >>
> >> It is not possible to add a device to an ro filesystem, so effectively
> >> the fs read-writeability is broken in this case.
> >
> >I thought this particular issue had already been dealt with in 4.2?
> > (i.e. you can still mount an FS RW if it's degraded, but there are
> > still some single chunks on it).
> 
> Single chunks are only on sda - not on sdb...
> 
> There should be no problem...

   The check is more primitive than that at the moment, sadly. It just
checks that the number of missing devices is smaller than or equal to
the acceptable device loss for each RAID profile present on the FS.

> >Ian: If you can still mount the FS read/write with both devices in
> > it, then it might be worth trying to balance away the problematic
> > single chunks with:
> >
> > btrfs bal start -dprofiles=single -mprofiles=single /mountpoint
> >
> >Then unmount, pull the dead drive, and remount -odegraded.
> 
> It never completes, too many errors and eventually the disk disappears
> until the machine is turned off and on again... (normal disk reset
> doesn't work)

   The profiles= parameters should limit the balance to just the three
single chunks, and will remove them (because they're empty). It
shouldn't hit the metadata too hard, even if it's raising lots of
errors.

   Hugo.

-- 
Hugo Mills | The early bird gets the worm, but the second mouse
hugo@... carfax.org.uk | gets the cheese.
http://carfax.org.uk/  |
PGP: E2AB1DE4  |


signature.asc
Description: Digital signature


[PATCH] btrfs: use a single if() statement for one outcome in get_block_rsv()

2015-09-08 Thread Alexandru Moise
Rather than have three separate if() statements for the same outcome
we should just OR them together in the same if() statement.

Signed-off-by: Alexandru Moise <00moses.alexande...@gmail.com>
---
 fs/btrfs/extent-tree.c | 10 +++---
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5411f0a..e8f2c15 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4890,13 +4890,9 @@ static struct btrfs_block_rsv *get_block_rsv(
 {
struct btrfs_block_rsv *block_rsv = NULL;
 
-   if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
-   block_rsv = trans->block_rsv;
-
-   if (root == root->fs_info->csum_root && trans->adding_csums)
-   block_rsv = trans->block_rsv;
-
-   if (root == root->fs_info->uuid_root)
+   if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+   (root == root->fs_info->csum_root && trans->adding_csums) ||
+(root == root->fs_info->uuid_root))
block_rsv = trans->block_rsv;
 
if (!block_rsv)
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v1 0/8] VFS: In-kernel copy system call

2015-09-08 Thread Anna Schumaker
On 09/08/2015 04:45 PM, Darrick J. Wong wrote:
> On Tue, Sep 08, 2015 at 11:08:03AM -0400, Anna Schumaker wrote:
>> On 09/05/2015 04:33 AM, Al Viro wrote:
>>> On Fri, Sep 04, 2015 at 04:25:27PM -0600, Andreas Dilger wrote:
>>>
 This is a bit of a surprising result, since in my testing in the
 past, copy_{to/from}_user() is a major consumer of CPU time (50%
 of a CPU core at 1GB/s).  What backing filesystem did you test on?
>>>
>>> While we are at it, was cp(1) using read(2)/write(2) loop or was it using
>>> something else (sendfile(2), for example)?
>>
>> cp uses a read / write loop, and has some heuristics for guessing an optimum 
>> buffer size.
> 
> ..but afaict cp doesn't fsync at the end, which means it's possible that
> the destination file's blocks are still delalloc and nothing's been flushed
> to disk yet.  What happens if you time (cp /tmp/a /tmp/b ; sync) ?

That's already how I was using cp :).  The example program in my man page also 
doesn't fsync at the end, so the extra sync at the end is needed for both.

Anna

> 
> 2048M / 1.667s = ~1200MB/s.
> 
> --D
> 
>>
>> Anna
>>
>>>
>>

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v1 9/8] copy_file_range.2: New page documenting copy_file_range()

2015-09-08 Thread Darrick J. Wong
On Tue, Sep 08, 2015 at 11:04:03AM -0400, Anna Schumaker wrote:
> On 09/04/2015 05:38 PM, Darrick J. Wong wrote:
> > On Fri, Sep 04, 2015 at 04:17:03PM -0400, Anna Schumaker wrote:
> >> copy_file_range() is a new system call for copying ranges of data
> >> completely in the kernel.  This gives filesystems an opportunity to
> >> implement some kind of "copy acceleration", such as reflinks or
> >> server-side-copy (in the case of NFS).
> >>
> >> Signed-off-by: Anna Schumaker 
> >> ---
> >>  man2/copy_file_range.2 | 168 
> >> +
> >>  1 file changed, 168 insertions(+)
> >>  create mode 100644 man2/copy_file_range.2
> >>
> >> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
> >> new file mode 100644
> >> index 000..4a4cb73
> >> --- /dev/null
> >> +++ b/man2/copy_file_range.2
> >> @@ -0,0 +1,168 @@
> >> +.\"This manpage is Copyright (C) 2015 Anna Schumaker 
> >> 
> >> +.TH COPY 2 2015-8-31 "Linux" "Linux Programmer's Manual"
> >> +.SH NAME
> >> +copy_file_range \- Copy a range of data from one file to another
> >> +.SH SYNOPSIS
> >> +.nf
> >> +.B #include 
> >> +.B #include 
> >> +.B #include 
> >> +
> >> +.BI "ssize_t syscall(__NR_copy_file_range, int " fd_in ", loff_t * " 
> >> off_in ",
> >> +.BI "int " fd_out ", loff_t * " off_out ", size_t " len ",
> >> +.BI "unsigned int " flags );
> >> +.fi
> >> +.SH DESCRIPTION
> >> +The
> >> +.BR copy_file_range ()
> >> +system call performs an in-kernel copy between two file descriptors
> >> +without all that tedious mucking about in userspace.
> > 
> > ;)
> > 
> >> +It copies up to
> >> +.I len
> >> +bytes of data from file descriptor
> >> +.I fd_in
> >> +to file descriptor
> >> +.I fd_out
> >> +at
> >> +.IR off_out .
> >> +The file descriptors must not refer to the same file.
> > 
> > Why?  btrfs (and XFS) reflink can handle the case of a file sharing blocks
> > with itself.
> 
> I've never really thought about it... Zach had that in his initial
> submission, so mentioned it in the man page.  Should I remove that bit?

Yes, please!

I could be wrong, but I think btrfs only started supporting files that share
blocks with themselves relatively recently(?)

I'm not sure why zab added this; was hoping he'd speak up. ;)

> 
> > 
> >> +
> >> +The following semantics apply for
> >> +.IR fd_in ,
> >> +and similar statements apply to
> >> +.IR off_out :
> >> +.IP * 3
> >> +If
> >> +.I off_in
> >> +is NULL, then bytes are read from
> >> +.I fd_in
> >> +starting from the current file offset and the current
> >> +file offset is adjusted appropriately.
> >> +.IP *
> >> +If
> >> +.I off_in
> >> +is not NULL, then
> >> +.I off_in
> >> +must point to a buffer that specifies the starting
> >> +offset where bytes from
> >> +.I fd_in
> >> +will be read.  The current file offset of
> >> +.I fd_in
> >> +is not changed, but
> >> +.I off_in
> >> +is adjusted appropriately.
> >> +.PP
> >> +The default behavior of
> >> +.BR copy_file_range ()
> >> +is filesystem specific, and might result in creating a
> >> +copy-on-write reflink.
> >> +In the event that a given filesystem does not implement
> >> +any form of copy acceleration, the kernel will perform
> >> +a deep copy of the requested range by reading bytes from
> > 
> > I wonder if it's wise to allow deep copies -- what happens if len == 1T?
> > Will this syscall just block for a really long time?
> 
> We use rw_verify_area(), (similar to read and write) so we won't allow a
> value of len that long.  I can mention this in an updated version of this man
> page!

Ok.  I guess MAX_RW_COUNT limits us to about 4G at once, which for a splice
copy is probably reasonable.

The reason why I asked about len == 1T specifically is that I can (with
somewhat long delays) reflink about 260 million extents at a time on XFS,
which is about 1TB.  Given that locks get held for the duration, it's probably
not a bad thing to limit userspace to 4G at a time.

(But hey, it's fun to stress-test once in a while. :))

--D

> 
> 
> > 
> >> +.I fd_in
> >> +and writing them to
> >> +.IR fd_out .
> > 
> > "...if COPY_REFLINK is not set in flags."
> 
> Sure.
> 
> > 
> >> +
> >> +Currently, Linux only supports the following flag:
> >> +.TP 1.9i
> >> +.B COPY_REFLINK
> >> +Only perform the copy if the filesystem can do it as a reflink.
> >> +Do not fall back on performing a deep copy.
> >> +.SH RETURN VALUE
> >> +Upon successful completion,
> >> +.BR copy_file_range ()
> >> +will return the number of bytes copied between files.
> >> +This could be less than the length originally requested.
> >> +
> >> +On error,
> >> +.BR copy_file_range ()
> >> +returns \-1 and
> >> +.I errno
> >> +is set to indicate the error.
> >> +.SH ERRORS
> >> +.TP
> >> +.B EBADF
> >> +One or more file descriptors are not valid,
> >> +or do not have proper read-write mode.
> > 
> > "or fd_out is not opened for writing"?
> 
> I'll add that.
> 
> > 
> 

Re: [PATCH v1 0/8] VFS: In-kernel copy system call

2015-09-08 Thread Darrick J. Wong
On Tue, Sep 08, 2015 at 11:08:03AM -0400, Anna Schumaker wrote:
> On 09/05/2015 04:33 AM, Al Viro wrote:
> > On Fri, Sep 04, 2015 at 04:25:27PM -0600, Andreas Dilger wrote:
> > 
> >> This is a bit of a surprising result, since in my testing in the
> >> past, copy_{to/from}_user() is a major consumer of CPU time (50%
> >> of a CPU core at 1GB/s).  What backing filesystem did you test on?
> > 
> > While we are at it, was cp(1) using read(2)/write(2) loop or was it using
> > something else (sendfile(2), for example)?
> 
> cp uses a read / write loop, and has some heuristics for guessing an optimum 
> buffer size.

..but afaict cp doesn't fsync at the end, which means it's possible that
the destination file's blocks are still delalloc and nothing's been flushed
to disk yet.  What happens if you time (cp /tmp/a /tmp/b ; sync) ?

2048M / 1.667s = ~1200MB/s.

--D

> 
> Anna
> 
> > 
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v1 0/8] VFS: In-kernel copy system call

2015-09-08 Thread Darrick J. Wong
On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
> On 08/09/15 20:10, Andy Lutomirski wrote:
> > On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
> >  wrote:
> >> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
> >>> I see copy_file_range() is a reflink() on BTRFS?
> >>> That's a bit surprising, as it avoids the copy completely.
> >>> cp(1) for example considered doing a BTRFS clone by default,
> >>> but didn't due to expectations that users actually wanted
> >>> the data duplicated on disk for resilience reasons,
> >>> and for performance reasons so that write latencies were
> >>> restricted to the copy operation, rather than being
> >>> introduced at usage time as the dest file is CoW'd.
> >>>
> >>> If reflink() is a possibility for copy_file_range()
> >>> then could it be done optionally with a flag?
> >>
> >> The idea is that filesystems get to choose how to handle copies in the
> >> default case.  BTRFS could do a reflink, but NFS could do a server side

Eww, different default behaviors depending on the filesystem. :)

> >> copy instead.  I can change the default behavior to only do a data copy
> >> (unless the reflink flag is specified) instead, if that is desirable.
> >>
> >> What does everybody think?
> > 
> > I think the best you could do is to have a hint asking politely for
> > the data to be deep-copied.  After all, some filesystems reserve the
> > right to transparently deduplicate.
> > 
> > Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
> > advantage to deep copying unless you actually want two copies for
> > locality reasons.
> 
> Agreed. The relink and server side copy are separate things.
> There's no advantage to not doing a server side copy,
> but as mentioned there may be advantages to doing deep copies on BTRFS
> (another reason not previously mentioned in this thread, would be
> to avoid ENOSPC errors at some time in the future).
> 
> So having control over the deep copy seems useful.
> It's debatable whether ALLOW_REFLINK should be on/off by default
> for copy_file_range().  I'd be inclined to have such a setting off by default,
> but cp(1) at least will work with whatever is chosen.

So far it looks like people are interested in at least these "make data appear
in this other place" filesystem operations:

1. reflink
2. reflink, but only if the contents are the same (dedupe)
3. regular copy
4. regular copy, but make the hardware do it for us
5. regular copy, but require a second copy on the media (no-dedupe)
6. regular copy, but don't CoW (eatmyothercopies) (joke)

(Please add whatever ops I missed.)

I think I can see a case for letting (4) fall back to (3) since (4) is an
optimization of (3).

However, I particularly don't like the idea of (1) falling back to (3-5).
Either the kernel can satisfy a request or it can't, but let's not just
assume that we should transmogrify one type of request into another.  Userspace
should decide if a reflink failure should turn into one of the copy variants,
depending on whether the user wants to spread allocation costs over rewrites or
pay it all up front.  Also, if we allow reflink to fall back to copy, how do
programs find out what actually took place?  Or do we simply not allow them to
find out?

Also, programs that expect reflink either to finish or fail quickly might be
surprised if it's possible for reflink to take a longer time than usual and
with the side effect that a deep(er) copy was made.

I guess if someone asks for both (1) and (3) we can do the fallback in the
kernel, like how we handle it right now.

--D

> 
> thanks,
> Pádraig.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v1 0/8] VFS: In-kernel copy system call

2015-09-08 Thread Darrick J. Wong
On Tue, Sep 08, 2015 at 02:45:39PM -0700, Andy Lutomirski wrote:
> On Tue, Sep 8, 2015 at 2:29 PM, Darrick J. Wong  
> wrote:
> > On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
> >> On 08/09/15 20:10, Andy Lutomirski wrote:
> >> > On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
> >> >  wrote:
> >> >> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
> >> >>> I see copy_file_range() is a reflink() on BTRFS?
> >> >>> That's a bit surprising, as it avoids the copy completely.
> >> >>> cp(1) for example considered doing a BTRFS clone by default,
> >> >>> but didn't due to expectations that users actually wanted
> >> >>> the data duplicated on disk for resilience reasons,
> >> >>> and for performance reasons so that write latencies were
> >> >>> restricted to the copy operation, rather than being
> >> >>> introduced at usage time as the dest file is CoW'd.
> >> >>>
> >> >>> If reflink() is a possibility for copy_file_range()
> >> >>> then could it be done optionally with a flag?
> >> >>
> >> >> The idea is that filesystems get to choose how to handle copies in the
> >> >> default case.  BTRFS could do a reflink, but NFS could do a server side
> >
> > Eww, different default behaviors depending on the filesystem. :)
> >
> >> >> copy instead.  I can change the default behavior to only do a data copy
> >> >> (unless the reflink flag is specified) instead, if that is desirable.
> >> >>
> >> >> What does everybody think?
> >> >
> >> > I think the best you could do is to have a hint asking politely for
> >> > the data to be deep-copied.  After all, some filesystems reserve the
> >> > right to transparently deduplicate.
> >> >
> >> > Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
> >> > advantage to deep copying unless you actually want two copies for
> >> > locality reasons.
> >>
> >> Agreed. The relink and server side copy are separate things.
> >> There's no advantage to not doing a server side copy,
> >> but as mentioned there may be advantages to doing deep copies on BTRFS
> >> (another reason not previous mentioned in this thread, would be
> >> to avoid ENOSPC errors at some time in the future).
> >>
> >> So having control over the deep copy seems useful.
> >> It's debatable whether ALLOW_REFLINK should be on/off by default
> >> for copy_file_range().  I'd be inclined to have such a setting off by 
> >> default,
> >> but cp(1) at least will work with whatever is chosen.
> >
> > So far it looks like people are interested in at least these "make data 
> > appear
> > in this other place" filesystem operations:
> >
> > 1. reflink
> > 2. reflink, but only if the contents are the same (dedupe)
> 
> What I meant by this was: if you ask for "regular copy", you may end
> up with a reflink anyway.  Anyway, how can you reflink a range and
> have the contents *not* be the same?

reflink forcibly remaps fd_dest's range to fd_src's range.  If they didn't
match before, they will afterwards.

dedupe remaps fd_dest's range to fd_src's range only if they match, of course.

Perhaps I should have said "...if the contents are the same before the call"?

> 
> > 3. regular copy
> > 4. regular copy, but make the hardware do it for us
> > 5. regular copy, but require a second copy on the media (no-dedupe)
> 
> If this comes from me, I have no desire to ever use this as a flag.

I meant (5) as a "disable auto-dedupe for this operation" flag, not as
a "reallocate all the shared blocks now" op...

> If someone wants to use chattr or some new operation to say "make this
> range of this file belong just to me for purpose of optimizing future
> writes", then sure, go for it, with the understanding that there are
> plenty of filesystems for which that doesn't even make sense.

"Unshare these blocks" sounds more like something fallocate could do.

So far in my XFS reflink playground, it seems that using the defrag tool to
un-cow a file makes most sense.  AFAICT the XFS and ext4 defraggers copy a
fragmented file's data to a second file and use a 'swap extents' operation,
after which the donor file is unlinked.

Hey, if this syscall turns into a more generic "do something involving two
(fd:off:len) (fd:off:len) tuples" call, I guess we could throw in "swap
extents" as a 7th operation, to refactor the ioctls.  

> 
> > 6. regular copy, but don't CoW (eatmyothercopies) (joke)
> >
> > (Please add whatever ops I missed.)
> >
> > I think I can see a case for letting (4) fall back to (3) since (4) is an
> > optimization of (3).
> >
> > However, I particularly don't like the idea of (1) falling back to (3-5).
> > Either the kernel can satisfy a request or it can't, but let's not just
> > assume that we should transmogrify one type of request into another.  
> > Userspace
> > should decide if a reflink failure should turn into one of the copy 
> > variants,
> > depending on whether the user wants to spread allocation costs over 
> > rewrites or
> > pay it all up front. 

Re: [PATCH v1 0/8] VFS: In-kernel copy system call

2015-09-08 Thread Andy Lutomirski
On Tue, Sep 8, 2015 at 3:39 PM, Darrick J. Wong  wrote:
> On Tue, Sep 08, 2015 at 02:45:39PM -0700, Andy Lutomirski wrote:
>> On Tue, Sep 8, 2015 at 2:29 PM, Darrick J. Wong  
>> wrote:
>> > On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
>> >> On 08/09/15 20:10, Andy Lutomirski wrote:
>> >> > On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
>> >> >  wrote:
>> >> >> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
>> >> >>> I see copy_file_range() is a reflink() on BTRFS?
>> >> >>> That's a bit surprising, as it avoids the copy completely.
>> >> >>> cp(1) for example considered doing a BTRFS clone by default,
>> >> >>> but didn't due to expectations that users actually wanted
>> >> >>> the data duplicated on disk for resilience reasons,
>> >> >>> and for performance reasons so that write latencies were
>> >> >>> restricted to the copy operation, rather than being
>> >> >>> introduced at usage time as the dest file is CoW'd.
>> >> >>>
>> >> >>> If reflink() is a possibility for copy_file_range()
>> >> >>> then could it be done optionally with a flag?
>> >> >>
>> >> >> The idea is that filesystems get to choose how to handle copies in the
>> >> >> default case.  BTRFS could do a reflink, but NFS could do a server side
>> >
>> > Eww, different default behaviors depending on the filesystem. :)
>> >
>> >> >> copy instead.  I can change the default behavior to only do a data copy
>> >> >> (unless the reflink flag is specified) instead, if that is desirable.
>> >> >>
>> >> >> What does everybody think?
>> >> >
>> >> > I think the best you could do is to have a hint asking politely for
>> >> > the data to be deep-copied.  After all, some filesystems reserve the
>> >> > right to transparently deduplicate.
>> >> >
>> >> > Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
>> >> > advantage to deep copying unless you actually want two copies for
>> >> > locality reasons.
>> >>
>> >> Agreed. The relink and server side copy are separate things.
>> >> There's no advantage to not doing a server side copy,
>> >> but as mentioned there may be advantages to doing deep copies on BTRFS
>> >> (another reason not previous mentioned in this thread, would be
>> >> to avoid ENOSPC errors at some time in the future).
>> >>
>> >> So having control over the deep copy seems useful.
>> >> It's debatable whether ALLOW_REFLINK should be on/off by default
>> >> for copy_file_range().  I'd be inclined to have such a setting off by 
>> >> default,
>> >> but cp(1) at least will work with whatever is chosen.
>> >
>> > So far it looks like people are interested in at least these "make data 
>> > appear
>> > in this other place" filesystem operations:
>> >
>> > 1. reflink
>> > 2. reflink, but only if the contents are the same (dedupe)
>>
>> What I meant by this was: if you ask for "regular copy", you may end
>> up with a reflink anyway.  Anyway, how can you reflink a range and
>> have the contents *not* be the same?
>
> reflink forcibly remaps fd_dest's range to fd_src's range.  If they didn't
> match before, they will afterwards.
>
> dedupe remaps fd_dest's range to fd_src's range only if they match, of course.
>
> Perhaps I should have said "...if the contents are the same before the call"?
>

Oh, I see.

Can we have a clean way to figure out whether two file ranges are the
same in a way that allows false negatives?  I.e. return 1 if the
ranges are reflinks of each other and 0 if not?  Pretty please?  I've
implemented that in the past on btrfs by syncing the ranges and then
comparing FIEMAP output, but that's hideous.

>>
>> > 3. regular copy
>> > 4. regular copy, but make the hardware do it for us
>> > 5. regular copy, but require a second copy on the media (no-dedupe)
>>
>> If this comes from me, I have no desire to ever use this as a flag.
>
> I meant (5) as a "disable auto-dedupe for this operation" flag, not as
> a "reallocate all the shared blocks now" op...

Hmm, interesting.  What effect does it have on systems that do
deferred auto-dedupe?

>>
>> I think we should focus on what the actual legit use cases might be.
>> Certainly we want to support a mode that's "reflink or fail".  We
>> could have these flags:
>>
>> COPY_FILE_RANGE_ALLOW_REFLINK
>> COPY_FILE_RANGE_ALLOW_COPY
>>
>> Setting neither gets -EINVAL.  Setting both works as is.  Setting just
>> ALLOW_REFLINK will fail if a reflink can't be supported.  Setting just
>> ALLOW_COPY will make a best-effort attempt not to reflink but
>> expressly permits reflinking in cases where either (a) plain old
>> write(2) might also result in a reflink or (b) there is no advantage
>> to not reflinking.
>
> I don't agree with having a 'copy' flag that can reflink when we also have a
> 'reflink' flag.  I guess I just don't like having a flag with different
> meanings depending on context.
>
> Users should be able to get the default behavior by passing '0' for flags, so
> provide 

Re: [PATCH 01/19] btrfs: qgroup: New function declaration for new reserve implement

2015-09-08 Thread Tsutomu Itoh
Hi, Qu,

On 2015/09/08 17:56, Qu Wenruo wrote:
> Add new structures and functions for new qgroup reserve implement dirty
> phase.
> Which will focus on avoiding over-reserve as in that case, which means
> for already reserved dirty space range, we won't reserve space again.
> 
> This patch adds the needed structure declaration and comments.
> 
> Signed-off-by: Qu Wenruo 
> ---
>   fs/btrfs/btrfs_inode.h |  4 
>   fs/btrfs/qgroup.c  | 58 
> ++
>   fs/btrfs/qgroup.h  |  3 +++
>   3 files changed, 65 insertions(+)
> 
> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> index 81220b2..e3ece65 100644
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -24,6 +24,7 @@
>   #include "extent_io.h"
>   #include "ordered-data.h"
>   #include "delayed-inode.h"
> +#include "qgroup.h"
>   
>   /*
>* ordered_data_close is set by truncate when a file that used
> @@ -195,6 +196,9 @@ struct btrfs_inode {
>   struct timespec i_otime;
>   
>   struct inode vfs_inode;
> +
> + /* qgroup dirty map for data space reserve */
> + struct btrfs_qgroup_data_rsv_map *qgroup_rsv_map;
>   };
>   
>   extern unsigned char btrfs_filetype_table[];
> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> index e9ace09..561c36d 100644
> --- a/fs/btrfs/qgroup.c
> +++ b/fs/btrfs/qgroup.c
> @@ -91,6 +91,64 @@ struct btrfs_qgroup {
>   u64 new_refcnt;
>   };
>   
> +/*
> + * Record one range of reserved space.
> + */
> +struct data_rsv_range {
> + struct rb_node node;
> + u64 start;
> + u64 len;
> +};
> +
> +/*
> + * Record per inode reserved range.
> + * This is mainly used to resolve reserved space leaking problem.
> + * One of the cause is the mismatch with reserve and free.
> + *
> + * New qgroup will handle reserve in two phase.
> + * 1) Dirty phase.
> + *Pages are just marked dirty, but not written to disk.
> + * 2) Flushed phase
> + *Pages are written to disk, but transaction is not committed yet.
> + *

> + * At Diryt phase, we only need to focus on avoiding over-reserve.

 dirty

> + *
> + * The idea is like below.
> + * 1) Write [0,8K)
> + * 0 4K  8K  12K 16K
> + * ||
> + * Reserve +8K, total reserved: 8K
> + *
> + * 2) Write [0,4K)
> + * 0 4K  8K  12K 16K
> + * ||
> + * Reserve 0, total reserved 8K
> + *
> + * 3) Write [12K,16K)
> + * 0 4K  8K  12K 16K

> + * |||///|
> + * Reserve +4K, tocal reserved 12K

   total

> + *
> + * 4) Flush [0,8K)
> + * Can happen without commit transaction, like fallocate will trigger the
> + * write.
> + * 0 4K  8K  12K 16K
> + *   |///|

> + * Reserve 0, tocal reserved 12K

 total

Thanks,
Tsutomu

> + * As the extent is written to disk, not dirty any longer, the range get
> + * removed.
> + * But as its delayed_refs is not run, its reserved space will not be freed.
> + * And things continue to Flushed phase.
> + *
> + * By this method, we can avoid over-reserve, which will lead to reserved
> + * space leak.
> + */
> +struct btrfs_qgroup_data_rsv_map {
> + struct rb_root root;
> + u64 reserved;
> + spinlock_t lock;
> +};
> +
>   static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
>  int mod)
>   {
> diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
> index 6387dcf..2f863a4 100644
> --- a/fs/btrfs/qgroup.h
> +++ b/fs/btrfs/qgroup.h
> @@ -33,6 +33,9 @@ struct btrfs_qgroup_extent_record {
>   struct ulist *old_roots;
>   };
>   
> +/* For per-inode dirty range reserve */
> +struct btrfs_qgroup_data_rsv_map;
> +
>   int btrfs_quota_enable(struct btrfs_trans_handle *trans,
>  struct btrfs_fs_info *fs_info);
>   int btrfs_quota_disable(struct btrfs_trans_handle *trans,
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v1 9/8] copy_file_range.2: New page documenting copy_file_range()

2015-09-08 Thread Anna Schumaker
On 09/04/2015 05:38 PM, Darrick J. Wong wrote:
> On Fri, Sep 04, 2015 at 04:17:03PM -0400, Anna Schumaker wrote:
>> copy_file_range() is a new system call for copying ranges of data
>> completely in the kernel.  This gives filesystems an opportunity to
>> implement some kind of "copy acceleration", such as reflinks or
>> server-side-copy (in the case of NFS).
>>
>> Signed-off-by: Anna Schumaker 
>> ---
>>  man2/copy_file_range.2 | 168 
>> +
>>  1 file changed, 168 insertions(+)
>>  create mode 100644 man2/copy_file_range.2
>>
>> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
>> new file mode 100644
>> index 000..4a4cb73
>> --- /dev/null
>> +++ b/man2/copy_file_range.2
>> @@ -0,0 +1,168 @@
>> +.\"This manpage is Copyright (C) 2015 Anna Schumaker 
>> 
>> +.TH COPY 2 2015-8-31 "Linux" "Linux Programmer's Manual"
>> +.SH NAME
>> +copy_file_range \- Copy a range of data from one file to another
>> +.SH SYNOPSIS
>> +.nf
>> +.B #include 
>> +.B #include 
>> +.B #include 
>> +
>> +.BI "ssize_t syscall(__NR_copy_file_range, int " fd_in ", loff_t * " off_in 
>> ",
>> +.BI "int " fd_out ", loff_t * " off_out ", size_t " len ",
>> +.BI "unsigned int " flags );
>> +.fi
>> +.SH DESCRIPTION
>> +The
>> +.BR copy_file_range ()
>> +system call performs an in-kernel copy between two file descriptors
>> +without all that tedious mucking about in userspace.
> 
> ;)
> 
>> +It copies up to
>> +.I len
>> +bytes of data from file descriptor
>> +.I fd_in
>> +to file descriptor
>> +.I fd_out
>> +at
>> +.IR off_out .
>> +The file descriptors must not refer to the same file.
> 
> Why?  btrfs (and XFS) reflink can handle the case of a file sharing blocks
> with itself.

I've never really thought about it... Zach had that in his initial submission, 
so I mentioned it in the man page.  Should I remove that bit?


> 
>> +
>> +The following semantics apply for
>> +.IR fd_in ,
>> +and similar statements apply to
>> +.IR off_out :
>> +.IP * 3
>> +If
>> +.I off_in
>> +is NULL, then bytes are read from
>> +.I fd_in
>> +starting from the current file offset and the current
>> +file offset is adjusted appropriately.
>> +.IP *
>> +If
>> +.I off_in
>> +is not NULL, then
>> +.I off_in
>> +must point to a buffer that specifies the starting
>> +offset where bytes from
>> +.I fd_in
>> +will be read.  The current file offset of
>> +.I fd_in
>> +is not changed, but
>> +.I off_in
>> +is adjusted appropriately.
>> +.PP
>> +The default behavior of
>> +.BR copy_file_range ()
>> +is filesystem specific, and might result in creating a
>> +copy-on-write reflink.
>> +In the event that a given filesystem does not implement
>> +any form of copy acceleration, the kernel will perform
>> +a deep copy of the requested range by reading bytes from
> 
> I wonder if it's wise to allow deep copies -- what happens if len == 1T?
> Will this syscall just block for a really long time?

We use rw_verify_area(), (similar to read and write) so we won't allow a value 
of len that long.  I can mention this in an updated version of this man page!


> 
>> +.I fd_in
>> +and writing them to
>> +.IR fd_out .
> 
> "...if COPY_REFLINK is not set in flags."

Sure.

> 
>> +
>> +Currently, Linux only supports the following flag:
>> +.TP 1.9i
>> +.B COPY_REFLINK
>> +Only perform the copy if the filesystem can do it as a reflink.
>> +Do not fall back on performing a deep copy.
>> +.SH RETURN VALUE
>> +Upon successful completion,
>> +.BR copy_file_range ()
>> +will return the number of bytes copied between files.
>> +This could be less than the length originally requested.
>> +
>> +On error,
>> +.BR copy_file_range ()
>> +returns \-1 and
>> +.I errno
>> +is set to indicate the error.
>> +.SH ERRORS
>> +.TP
>> +.B EBADF
>> +One or more file descriptors are not valid,
>> +or do not have proper read-write mode.
> 
> "or fd_out is not opened for writing"?

I'll add that.

> 
>> +.TP
>> +.B EINVAL
>> +Requested range extends beyond the end of the file;
>> +.I flags
>> +argument is set to an invalid value.
>> +.TP
>> +.B EOPNOTSUPP
>> +.B COPY_REFLINK
>> +was specified in
>> +.IR flags ,
>> +but the target filesystem does not support reflinks.
>> +.TP
>> +.B EXDEV
>> +Target filesystem doesn't support cross-filesystem copies.
>> +.SH VERSIONS
> 
> Perhaps this ought to list a few more errors (EIO, ENOSPC, ENOSYS, EPERM...)
> that can be returned?  (I was looking at the fallocate manpage.)

Okay.  I'll poke around for what else could be returned!

Thanks,
Anna

> 
> --D
> 
>> +The
>> +.BR copy_file_range ()
>> +system call first appeared in Linux 4.3.
>> +.SH CONFORMING TO
>> +The
>> +.BR copy_file_range ()
>> +system call is a nonstandard Linux extension.
>> +.SH EXAMPLE
>> +.nf
>> +
>> +#define _GNU_SOURCE
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +
>> +
>> +int main(int argc, char **argv)
>> 

Re: [PATCH v1 0/8] VFS: In-kernel copy system call

2015-09-08 Thread Pádraig Brady
On 04/09/15 21:16, Anna Schumaker wrote:
> Copy system calls came up during Plumbers a couple of weeks ago, because
> several filesystems (including NFS and XFS) are currently working on copy
> acceleration implementations.  We haven't heard from Zach Brown in a while,
> so I volunteered to push his patches upstream so individual filesystems
> don't need to keep writing their own ioctls.

Just mentioning that this is just pertaining to the data, not the metadata.
Providing metadata copying facilities would be _very_ useful, as
most file system specific details relate to the metadata, and having
VFS operations for that would avoid the plethora of details in each userspace 
tool,
and theoretically support translations between disparate metadata.

> The first three patches are a simple reposting of Zach's patches from several
> months ago, with one minor error code fix.  The remaining patches add in a
> fallback mechanism when filesystems don't provide a copy function.  This is
> especially useful when doing a server-side copy on NFS (using the new COPY
> operation in NFS v4.2).  This fallback can be disabled by passing the flag
> COPY_REFLINK to the system call.

I see copy_file_range() is a reflink() on BTRFS?
That's a bit surprising, as it avoids the copy completely.
cp(1) for example considered doing a BTRFS clone by default,
but didn't due to expectations that users actually wanted
the data duplicated on disk for resilience reasons,
and for performance reasons so that write latencies were
restricted to the copy operation, rather than being
introduced at usage time as the dest file is CoW'd.

If reflink() is a possibility for copy_file_range()
then could it be done optionally with a flag?

thanks,
Pádraig
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 10/19] btrfs: delayed_ref: release and free qgroup reserved at proper timing

2015-09-08 Thread Qu Wenruo



Tsutomu Itoh wrote on 2015/09/09 10:21 +0900:

Hi, Qu,

On 2015/09/08 18:08, Qu Wenruo wrote:

Qgroup reserved space needs to be released from inode dirty map and get
freed at different timing:

1) Release when the metadata is written into tree
After corresponding metadata is written into tree, any newer write will
be COWed(don't include NOCOW case yet).
So we must release its range from the inode dirty range map, or we will
forget to reserve the needed range, causing accounting to exceed the limit.

2) Free reserved bytes when delayed ref is run
When delayed refs are run, qgroup accounting will follow soon and turn
the reserved bytes into rfer/excl numbers.
As run_delayed_refs and qgroup accounting are all done at
commit_transaction() time, we are safe to free reserved space in
run_delayed_ref time().

With these timing to release/free reserved space, we should be able to
resolve the long existing qgroup reserve space leak problem.

Signed-off-by: Qu Wenruo 
---
   fs/btrfs/extent-tree.c |  4 
   fs/btrfs/inode.c   | 10 ++
   fs/btrfs/qgroup.c  |  5 ++---
   fs/btrfs/qgroup.h  |  8 +++-
   4 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5411f0a..65e60eb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2345,6 +2345,10 @@ static int run_one_delayed_ref(struct btrfs_trans_handle 
*trans,
  node->num_bytes);
}
}
+
+   /* Also free its reserved qgroup space */
+   btrfs_qgroup_free_refroot(root->fs_info, head->qgroup_ref_root,
+ head->qgroup_reserved);
return ret;
}

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 61b2c17..1f7cac0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2112,6 +2112,16 @@ static int insert_reserved_file_extent(struct 
btrfs_trans_handle *trans,
ret = btrfs_alloc_reserved_file_extent(trans, root,
root->root_key.objectid,
btrfs_ino(inode), file_pos, );
+   if (ret < 0)
+   goto out;
+   /*
+* Release the reserved range from inode dirty range map, and
+* move it to delayed ref codes, as now accounting only happens at
+* commit_transaction() time.
+*/
+   btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
+   ret = btrfs_add_delayed_qgroup_reserve(root->fs_info, trans,
+   root->objectid, disk_bytenr, ram_bytes);
   out:
btrfs_free_path(path);

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index ba7888f..5a69a2d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2169,14 +2169,13 @@ out:
return ret;
   }

-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+  u64 ref_root, u64 num_bytes)
   {
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
-   struct btrfs_fs_info *fs_info = root->fs_info;
struct ulist_node *unode;
struct ulist_iterator uiter;
-   u64 ref_root = root->root_key.objectid;
int ret = 0;

if (!is_fstree(ref_root))
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 8e69dc1..49fa15e 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -75,7 +75,13 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
 struct btrfs_qgroup_inherit *inherit);
   int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+  u64 ref_root, u64 num_bytes);
+static inline void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+{
+   return btrfs_qgroup_free_refroot(root->fs_info, root->objectid,
+num_bytes);


Is 'return' necessary?

Thanks,
Tsutomu



Hi Itoh-san

Unnecessary yet, but should provide some extensibility for later 
modification.


For example, if later qgroup_free() function need to return value, then
it will be quite easy to modify.

But I'm OK to remove the return as it's not used now.

Also thanks for the spell check, I'll update them in the next pull 
request directly, without patch bombing the mail list.


Thanks,
Qu

+}

   void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);





--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" 

Re: [PATCH 3/3] btrfs-progs: tests: Introduce misc-tests/008-leaf-accross-stripes

2015-09-08 Thread Qu Wenruo



Zhao Lei wrote on 2015/09/04 21:23 +0800:

To check whether btrfs-convert creates a bad filesystem with a
leaf across stripes.

It is happened in progs version <=v4.1.2, and fixed by patch titled:
btrfs: convert: Avoid allocating metadata extent crossing stripe boundary
which was merged in v4.2.

Notice that this testcase cannot report the error in old versions of
btrfs-progs, because "btrfs check" can't check this type of error
in those versions, but we have another testcase in fsck-tests to
check whether "btrfs check" supports this check.

So, using the above 2 testcases together can detect the leaf-across-stripes
bug in all versions.

Signed-off-by: Zhao Lei 
---
  tests/misc-tests/008-leaf-accross-stripes/test.sh | 24 +++
  1 file changed, 24 insertions(+)
  create mode 100755 tests/misc-tests/008-leaf-accross-stripes/test.sh

diff --git a/tests/misc-tests/008-leaf-accross-stripes/test.sh 
b/tests/misc-tests/008-leaf-accross-stripes/test.sh
new file mode 100755
index 000..4801dce
--- /dev/null
+++ b/tests/misc-tests/008-leaf-accross-stripes/test.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# test btrfs subvolume run normally with more than one subvolume
+#
+# - btrfs subvolume must not loop indefinetelly
+# - btrfs subvolume return 0 in normal case
+
+source $TOP/tests/common
+
+check_prereq btrfs-convert
+check_prereq btrfs
+
+# In my test, it happened in 514M~560M, 737M~769M, 929M~917M,
+# and HAVE_ERROR=((size + 1) / 2) % 2 if size >= 970
+#
+SIZE_FROM=514
+SIZE_END=560
+A_PRIME_NUM=17
+for ((size = SIZE_FROM; size <= SIZE_END; size += A_PRIME_NUM)); do
+   run_check truncate -s "$size"M "$IMAGE"
+   run_check mkfs.ext4 -F "$IMAGE"
+   run_check $TOP/btrfs-convert "$IMAGE"
+   $TOP/btrfs check "$IMAGE" 2>&1 | grep "crossing stripe boundary" &&
+   _fail "leaf accross stripes in btrfs-convert"
+done


Maybe I'm wrong, but even with your script, I still can't reproduce the bug.

After some search in git log, I found that, from 4.1 btrfs-convert won't 
allow mixed block group since the following commit:

commit c9b73b702be71bbc1a3a815a745e6e4e57eadffc
Author: David Sterba 
Date:   Mon Mar 23 19:31:31 2015 +0100

btrfs-progs: convert: allow to set filesystem features

So the new test may not be necessary and won't reproduce the bug.

Thanks,
Qu
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 08/19] btrfs: qgroup: Introduce function to release/free reserved data range

2015-09-08 Thread Tsutomu Itoh
Hi, Qu,

On 2015/09/08 18:08, Qu Wenruo wrote:
> Introduce functions btrfs_qgroup_release/free_data() to release/free
> reserved data range.
> 
> Release means, just remove the data range from data rsv map, but doesn't
> free the reserved space.
> This is for normal buffered write case, when data is written into disc
> and its metadata is added into tree, its reserved space should still be
> kept until commit_trans().
> So in that case, we only release dirty range, but keep the reserved
> space recorded some other place until commit_tran().
> 
> Free means not only remove data range, but also free reserved space.
> This is used for case for cleanup.
> 
> Signed-off-by: Qu Wenruo 
> ---
>   fs/btrfs/qgroup.c | 48 
>   fs/btrfs/qgroup.h |  2 ++
>   2 files changed, 50 insertions(+)
> 
> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> index e24c10d..ba7888f 100644
> --- a/fs/btrfs/qgroup.c
> +++ b/fs/btrfs/qgroup.c
> @@ -2979,6 +2979,54 @@ next:
>   return 0;
>   }
>   
> +static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 
> len,
> +int free_reserved)
> +{
> + struct data_rsv_range *tmp;
> + struct btrfs_qgroup_data_rsv_map *map;
> + u64 reserved = 0;
> + int ret;
> +
> + spin_lock(_I(inode)->qgroup_init_lock);
> + map = BTRFS_I(inode)->qgroup_rsv_map;
> + spin_unlock(_I(inode)->qgroup_init_lock);
> + if (!map)
> + return 0;
> +
> + tmp = kmalloc(sizeof(*tmp), GFP_NOFS);
> + if (!tmp)
> + return -ENOMEM;
> + spin_lock(>lock);
> + ret = release_data_range(map, tmp, start, len, );
> + /* release_data_range() won't fail only check if memory is used */
> + if (ret == 0)
> + kfree(tmp);
> + if (free_reserved)
> + btrfs_qgroup_free(BTRFS_I(inode)->root, reserved);
> + spin_unlock(>lock);
> + return 0;
> +}
> +
> +/*
> + * Caller should be truncate/invalidate_page.
> + * As it will release the reserved data.
> + */
> +int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len)
> +{
> + return __btrfs_qgroup_release_data(inode, start, len, 1);
> +}
> +
> +/*
> + * Caller should be finish_ordered_io

> + * As qgroup accouting happens at commit time, for data written to disk

accounting

Thanks,
Tsutomu

> + * its reserved space should not be freed until commit.
> + * Or we may beyond the limit.
> + */
> +int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
> +{
> + return __btrfs_qgroup_release_data(inode, start, len, 0);
> +}
> +
>   /*
>* Init data_rsv_map for a given inode.
>*
> diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
> index 366b853..8e69dc1 100644
> --- a/fs/btrfs/qgroup.h
> +++ b/fs/btrfs/qgroup.h
> @@ -88,4 +88,6 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info 
> *fs_info, u64 qgroupid,
>   int btrfs_qgroup_init_data_rsv_map(struct inode *inode);
>   void btrfs_qgroup_free_data_rsv_map(struct inode *inode);
>   int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len);
> +int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
> +int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len);
>   #endif /* __BTRFS_QGROUP__ */
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v1 0/8] VFS: In-kernel copy system call

2015-09-08 Thread Darrick J. Wong
On Tue, Sep 08, 2015 at 04:08:43PM -0700, Andy Lutomirski wrote:
> On Tue, Sep 8, 2015 at 3:39 PM, Darrick J. Wong  
> wrote:
> > On Tue, Sep 08, 2015 at 02:45:39PM -0700, Andy Lutomirski wrote:
> >> On Tue, Sep 8, 2015 at 2:29 PM, Darrick J. Wong  
> >> wrote:
> >> > On Tue, Sep 08, 2015 at 09:03:09PM +0100, Pádraig Brady wrote:
> >> >> On 08/09/15 20:10, Andy Lutomirski wrote:
> >> >> > On Tue, Sep 8, 2015 at 11:23 AM, Anna Schumaker
> >> >> >  wrote:
> >> >> >> On 09/08/2015 11:21 AM, Pádraig Brady wrote:
> >> >> >>> I see copy_file_range() is a reflink() on BTRFS?
> >> >> >>> That's a bit surprising, as it avoids the copy completely.
> >> >> >>> cp(1) for example considered doing a BTRFS clone by default,
> >> >> >>> but didn't due to expectations that users actually wanted
> >> >> >>> the data duplicated on disk for resilience reasons,
> >> >> >>> and for performance reasons so that write latencies were
> >> >> >>> restricted to the copy operation, rather than being
> >> >> >>> introduced at usage time as the dest file is CoW'd.
> >> >> >>>
> >> >> >>> If reflink() is a possibility for copy_file_range()
> >> >> >>> then could it be done optionally with a flag?
> >> >> >>
> >> >> >> The idea is that filesystems get to choose how to handle copies in 
> >> >> >> the
> >> >> >> default case.  BTRFS could do a reflink, but NFS could do a server 
> >> >> >> side
> >> >
> >> > Eww, different default behaviors depending on the filesystem. :)
> >> >
> >> >> >> copy instead.  I can change the default behavior to only do a data 
> >> >> >> copy
> >> >> >> (unless the reflink flag is specified) instead, if that is desirable.
> >> >> >>
> >> >> >> What does everybody think?
> >> >> >
> >> >> > I think the best you could do is to have a hint asking politely for
> >> >> > the data to be deep-copied.  After all, some filesystems reserve the
> >> >> > right to transparently deduplicate.
> >> >> >
> >> >> > Also, on a true COW filesystem (e.g. btrfs sometimes), there may be no
> >> >> > advantage to deep copying unless you actually want two copies for
> >> >> > locality reasons.
> >> >>
> >> >> Agreed. The relink and server side copy are separate things.
> >> >> There's no advantage to not doing a server side copy,
> >> >> but as mentioned there may be advantages to doing deep copies on BTRFS
> >> >> (another reason not previous mentioned in this thread, would be
> >> >> to avoid ENOSPC errors at some time in the future).
> >> >>
> >> >> So having control over the deep copy seems useful.
> >> >> It's debatable whether ALLOW_REFLINK should be on/off by default
> >> >> for copy_file_range().  I'd be inclined to have such a setting off by 
> >> >> default,
> >> >> but cp(1) at least will work with whatever is chosen.
> >> >
> >> > So far it looks like people are interested in at least these "make data 
> >> > appear
> >> > in this other place" filesystem operations:
> >> >
> >> > 1. reflink
> >> > 2. reflink, but only if the contents are the same (dedupe)
> >>
> >> What I meant by this was: if you ask for "regular copy", you may end
> >> up with a reflink anyway.  Anyway, how can you reflink a range and
> >> have the contents *not* be the same?
> >
> > reflink forcibly remaps fd_dest's range to fd_src's range.  If they didn't
> > match before, they will afterwards.
> >
> > dedupe remaps fd_dest's range to fd_src's range only if they match, of 
> > course.
> >
> > Perhaps I should have said "...if the contents are the same before the 
> > call"?
> >
> 
> Oh, I see.
> 
> Can we have a clean way to figure out whether two file ranges are the
> same in a way that allows false negatives?  I.e. return 1 if the
> ranges are reflinks of each other and 0 if not?  Pretty please?  I've
> implemented that in the past on btrfs by syncing the ranges and then
> comparing FIEMAP output, but that's hideous.

Another mode for this call... :)

> >>
> >> > 3. regular copy
> >> > 4. regular copy, but make the hardware do it for us
> >> > 5. regular copy, but require a second copy on the media (no-dedupe)
> >>
> >> If this comes from me, I have no desire to ever use this as a flag.
> >
> > I meant (5) as a "disable auto-dedupe for this operation" flag, not as
> > a "reallocate all the shared blocks now" op...
> 
> Hmm, interesting.  What effect does it have on systems that do
> deferred auto-dedupe?

If it's a userspace deferred auto-dedupe, then hopefully the program
coordinates with the dedupe program.

Otherwise, it's only effective with a dedupe that runs in the write-path.

> >>
> >> I think we should focus on what the actual legit use cases might be.
> >> Certainly we want to support a mode that's "reflink or fail".  We
> >> could have these flags:
> >>
> >> COPY_FILE_RANGE_ALLOW_REFLINK
> >> COPY_FILE_RANGE_ALLOW_COPY
> >>
> >> Setting neither gets -EINVAL.  Setting both works as is.  Setting just
> >> ALLOW_REFLINK will fail if a reflink can't be supported.  

Re: [PATCH 10/19] btrfs: delayed_ref: release and free qgroup reserved at proper timing

2015-09-08 Thread Tsutomu Itoh
Hi, Qu,

On 2015/09/08 18:08, Qu Wenruo wrote:
> Qgroup reserved space needs to be released from inode dirty map and get
> freed at different timing:
> 
> 1) Release when the metadata is written into tree
> After corresponding metadata is written into tree, any newer write will
> be COWed(don't include NOCOW case yet).
> So we must release its range from inode dirty range map, or we will
> forget to reserve needed range, causing accounting exceeding the limit.
> 
> 2) Free reserved bytes when delayed ref is run
> When delayed refs are run, qgroup accounting will follow soon and turn
> the reserved bytes into rfer/excl numbers.
> As run_delayed_refs and qgroup accounting are all done at
> commit_transaction() time, we are safe to free reserved space in
> run_delayed_ref time().
> 
> With these timing to release/free reserved space, we should be able to
> resolve the long existing qgroup reserve space leak problem.
> 
> Signed-off-by: Qu Wenruo 
> ---
>   fs/btrfs/extent-tree.c |  4 
>   fs/btrfs/inode.c   | 10 ++
>   fs/btrfs/qgroup.c  |  5 ++---
>   fs/btrfs/qgroup.h  |  8 +++-
>   4 files changed, 23 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 5411f0a..65e60eb 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -2345,6 +2345,10 @@ static int run_one_delayed_ref(struct 
> btrfs_trans_handle *trans,
> node->num_bytes);
>   }
>   }
> +
> + /* Also free its reserved qgroup space */
> + btrfs_qgroup_free_refroot(root->fs_info, head->qgroup_ref_root,
> +   head->qgroup_reserved);
>   return ret;
>   }
>   
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 61b2c17..1f7cac0 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -2112,6 +2112,16 @@ static int insert_reserved_file_extent(struct 
> btrfs_trans_handle *trans,
>   ret = btrfs_alloc_reserved_file_extent(trans, root,
>   root->root_key.objectid,
>   btrfs_ino(inode), file_pos, );
> + if (ret < 0)
> + goto out;
> + /*
> +  * Release the reserved range from inode dirty range map, and
> +  * move it to delayed ref codes, as now accounting only happens at
> +  * commit_transaction() time.
> +  */
> + btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
> + ret = btrfs_add_delayed_qgroup_reserve(root->fs_info, trans,
> + root->objectid, disk_bytenr, ram_bytes);
>   out:
>   btrfs_free_path(path);
>   
> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> index ba7888f..5a69a2d 100644
> --- a/fs/btrfs/qgroup.c
> +++ b/fs/btrfs/qgroup.c
> @@ -2169,14 +2169,13 @@ out:
>   return ret;
>   }
>   
> -void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
> +void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
> +u64 ref_root, u64 num_bytes)
>   {
>   struct btrfs_root *quota_root;
>   struct btrfs_qgroup *qgroup;
> - struct btrfs_fs_info *fs_info = root->fs_info;
>   struct ulist_node *unode;
>   struct ulist_iterator uiter;
> - u64 ref_root = root->root_key.objectid;
>   int ret = 0;
>   
>   if (!is_fstree(ref_root))
> diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
> index 8e69dc1..49fa15e 100644
> --- a/fs/btrfs/qgroup.h
> +++ b/fs/btrfs/qgroup.h
> @@ -75,7 +75,13 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
>struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
>struct btrfs_qgroup_inherit *inherit);
>   int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
> -void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
> +void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
> +u64 ref_root, u64 num_bytes);
> +static inline void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
> +{
> + return btrfs_qgroup_free_refroot(root->fs_info, root->objectid,
> +  num_bytes);

Is 'return' necessary?

Thanks,
Tsutomu

> +}
>   
>   void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
>   
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 04/19] btrfs: qgroup: Introduce function to insert non-overlap reserve range

2015-09-08 Thread Tsutomu Itoh
Hi, Qu,

On 2015/09/08 18:01, Qu Wenruo wrote:
> New function insert_data_ranges() will insert non-overlap reserve ranges
> into reserve map.
> 
> It provides the basis for later qgroup reserve map implement.
> 
> Signed-off-by: Qu Wenruo 
> ---
>   fs/btrfs/qgroup.c | 124 
> ++
>   1 file changed, 124 insertions(+)
> 
> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> index fc24fc3..a4e3af4 100644
> --- a/fs/btrfs/qgroup.c
> +++ b/fs/btrfs/qgroup.c
> @@ -2577,6 +2577,130 @@ find_reserve_range(struct btrfs_qgroup_data_rsv_map 
> *map, u64 start)
>   }
>   
>   /*
> + * Insert one data range

> + * [start,len) here won't overflap with each other.

 overlap

Thanks,
Tsutomu

> + *
> + * Return 0 if range is inserted and tmp is not used.
> + * Return > 0 if range is inserted and tmp is used.
> + * No catchable error case. Only possible error will cause BUG_ON() as
> + * that's logical error.
> + */
> +static int insert_data_range(struct btrfs_qgroup_data_rsv_map *map,
> +  struct data_rsv_range *tmp,
> +  u64 start, u64 len)
> +{
> + struct rb_node **p = >root.rb_node;
> + struct rb_node *parent = NULL;
> + struct rb_node *tmp_node = NULL;
> + struct data_rsv_range *range = NULL;
> + struct data_rsv_range *prev_range = NULL;
> + struct data_rsv_range *next_range = NULL;
> + int prev_merged = 0;
> + int next_merged = 0;
> + int ret = 0;
> +
> + while (*p) {
> + parent = *p;
> + range = rb_entry(parent, struct data_rsv_range, node);
> + if (range->start < start)
> + p = &(*p)->rb_right;
> + else if (range->start > start)
> + p = &(*p)->rb_left;
> + else
> + BUG_ON(1);
> + }
> +
> + /* Empty tree, goto isolated case */
> + if (!range)
> + goto insert_isolated;
> +
> + /* get adjusted ranges */
> + if (range->start < start) {
> + prev_range = range;
> + tmp_node = rb_next(parent);
> + if (tmp)
> + next_range = rb_entry(tmp_node, struct data_rsv_range,
> +   node);
> + } else {
> + next_range = range;
> + tmp_node = rb_prev(parent);
> + if (tmp)
> + prev_range = rb_entry(tmp_node, struct data_rsv_range,
> +   node);
> + }
> +
> + /* try to merge with previous and next ranges */
> + if (prev_range && prev_range->start + prev_range->len == start) {
> + prev_merged = 1;
> + prev_range->len += len;
> + }
> + if (next_range && start + len == next_range->start) {
> + next_merged = 1;
> +
> + /*
> +  * the range can be merged with adjusted two ranges into one,
> +  * remove the tailing range.
> +  */
> + if (prev_merged) {
> + prev_range->len += next_range->len;
> + rb_erase(_range->node, >root);
> + kfree(next_range);
> + } else {
> + next_range->start = start;
> + next_range->len += len;
> + }
> + }
> +
> +insert_isolated:
> + /* isolated case, need to insert range now */
> + if (!next_merged && !prev_merged) {
> + BUG_ON(!tmp);
> +
> + tmp->start = start;
> + tmp->len = len;
> + rb_link_node(>node, parent, p);
> + rb_insert_color(>node, >root);
> + ret = 1;
> + }
> + return ret;
> +}
> +
> +/*
> + * insert reserve range and merge them if possible
> + *
> + * Return 0 if all inserted and tmp not used
> + * Return > 0 if all inserted and tmp used
> + * No catchable error return value.
> + */
> +static int insert_data_ranges(struct btrfs_qgroup_data_rsv_map *map,
> +   struct data_rsv_range *tmp,
> +   struct ulist *insert_list)
> +{
> + struct ulist_node *unode;
> + struct ulist_iterator uiter;
> + int tmp_used = 0;
> + int ret = 0;
> +
> + ULIST_ITER_INIT();
> + while ((unode = ulist_next(insert_list, ))) {
> + ret = insert_data_range(map, tmp, unode->val, unode->aux);
> +
> + /*
> +  * insert_data_range() won't return error return value,
> +  * no need to hanle <0 case.
> +  *
> +  * Also tmp should be used at most one time, so clear it to
> +  * NULL to cooperate with sanity check in insert_data_range().
> +  */
> + if (ret > 0) {
> + tmp_used = 1;
> + tmp = NULL;
> + }
> + }
> + return tmp_used;
> +}
> +
> +/*
>* 

Re: Btrfs progs release 4.1

2015-09-08 Thread Qu Wenruo

Hi David,

Sorry for the late reply, but I noticed something interesting.

David Sterba wrote on 2015/06/22 17:00 +0200:

Hi,

btrfs-progs 4.1 have been released (in time with kernel 4.1). Unusual load of
changes.

Fixed since rc1:
   - uuid rewrite prints the correct original UUID
   - map-logical updated
   - fi show size units
   - typos

* bugfixes
   - fsck.btrfs: no bash-isms
   - bugzilla 97171: invalid memory access (with tests)
   - receive:
 - cloning works with --chroot
 - capabilities not lost
   - mkfs: do not try to register bare file images
   - option --help accepted by the standalone utilities

* enhancements
   - corrupt block: ability to remove csums
   - mkfs:
 - warn if metadata redundancy is lower than for data
 - options to make the output quiet (only errors)
 - mixed case names of raid profiles accepted
 - rework the output:
   - more comprehensive, 'key: value' format
   - subvol:
 - show:
   - print received uuid
   - update the output
   - new options to specify size units
 - sync:
   - grab all deleted ids and print them as they're removed,
previous implementation only checked if there are any
to be deleted - change in command semantics
   - scrub: print timestamps in days HMS format
   - receive:
 - can specify mount point, do not rely on /proc
 - can work inside subvolumes
   - send:
 - new option to send stream without data (NO_FILE_DATA)
   - convert:
 - specify incompat features on the new fs
   - qgroup:
 - show: distinguish no limits and 0 limit value
 - limit: ability to clear the limit
   - help for 'btrfs' is shorter, 1st level command overview
   - debug tree: print key names according to their C name

* new
   - rescue zero-log
   - btrfstune:
 - rewrite uuid on a filesystem image
 - new option to turn on NO_HOLES incompat feature

* deprecated
   - standalone btrfs-zero-log

* other
   - testing framework updates
 - uuid rewrite test
 - btrfstune feature setting test
 - zero-log tests
 - more testing image formats
   - manual page updates
   - ioctl.h synced with current kernel uapi version
   - convert: preparatory works for more filesystems (reiserfs pending)
   - use static buffers for path handling where possible
   - add new helpers for send uilts that check memory allocations,
 switch all users, deprecate old helpers
   - Makefile: fix build dependency generation
   - map-logical: make it work again

Tarballs: https://www.kernel.org/pub/linux/kernel/people/kdave/btrfs-progs/
Git: git://git.kernel.org/pub/scm/linux/kernel/git/kdave/btrfs-progs.git

Shortlog:

Anand Jain (2):
   btrfs-progs: add info about list-all to the help
   btrfs-progs: use function is_block_device() instead

Dimitri John Ledkov (1):
   btrfs-progs: fsck.btrfs: Fix bashism and bad getopts processing

Dongsheng Yang (4):
   btrfs-progs: qgroup: show 'none' when we did not limit it on this qgroup
   btrfs-progs: qgroup: allow user to clear some limitation on qgroup.
   btrfs-progs: qgroup limit: error out if input value is negative
   btrfs-progs: qgroup limit: add a check for invalid input of 'T/G/M/K'

Emil Karlson (1):
   btrfs-progs: use openat for process_clone in receive

Goffredo Baroncelli (4):
   btrfs-progs: add strdup in btrfs_add_to_fsid() to track the device path
   btrfs-progs: return the fsid from make_btrfs()
   btrfs-progs: mkfs: track sizes of created block groups
   btrfs-progs: mkfs: print the summary

Jeff Mahoney (8):
   btrfs-progs: convert: clean up blk_iterate_data handling wrt 
record_file_blocks
   btrfs-progs: convert: remove unused fs argument from block_iterate_proc
   btrfs-progs: convert: remove unused inode_key in copy_single_inode
   btrfs-progs: convert: rename ext2_root to image_root
   btrfs-progs: compat: define DIV_ROUND_UP if not already defined
   btrfs-progs: convert: fix typo in btrfs_insert_dir_item call
   btrfs-progs: convert: factor out adding dirent into convert_insert_dirent
   btrfs-progs: convert: factor out block iteration callback

Josef Bacik (3):
   Btrfs-progs: corrupt-block: add the ability to remove csums
   btrfs-progs: specify mountpoint for recieve
   btrfs-progs: make receive work inside of subvolumes

Qu Wenruo (13):
   btrfs-progs: Enhance read_tree_block to avoid memory corruption
   btrfs-progs: btrfstune: rework change_uuid
   btrfs-progs: btrfstune: add ability to restore unfinished fsid change
   btrfs-progs: btrfstune: add '-U' and '-u' option to change fsid
   btrfs-progs: Documentation: uuid change
   btrfs-progs: btrfstune: fix a bug which makes unfinished fsid change 
unrecoverable
   btrfs-progs: export read_extent_data function
   btrfs-progs: map-logical: introduce map_one_extent function
   Btrfs-progs: map-logical: introduce print_mapping_info function
   

Re: [PATCH 13/19] btrfs: extent-tree: Add new verions of btrfs_check_data_free_space

2015-09-08 Thread Tsutomu Itoh
Hi, Qu,

On 2015/09/08 18:22, Qu Wenruo wrote:
> Add new function __btrfs_check_data_free_space() to do precious space
> reservation.
> 
> The new function will replace old btrfs_check_data_free_space(), but
> until all the change is done, let's just use the new name.
> 
> Also, export internal use function btrfs_alloc_data_chunk_ondemand(), as
> now qgroup reserve requires precious bytes, which can only be got in
> later loop(like fallocate).
> But data space info check and data chunk allocate doesn't need to be
> that accurate, and can be called at the beginning.
> 
> So export it for later operations.
> 
> Signed-off-by: Qu Wenruo 
> ---
>   fs/btrfs/ctree.h   |  2 ++
>   fs/btrfs/extent-tree.c | 50 
> +-
>   2 files changed, 43 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index ae86025..c1a0aaf 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -3453,6 +3453,8 @@ enum btrfs_reserve_flush_enum {
>   };
>   
>   int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 
> write_bytes);
> +int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
> +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
>   void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
>   void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
>   struct btrfs_root *root);
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 402415c..61366ca 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -3907,11 +3907,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, 
> int data)
>   return ret;
>   }
>   
> -/*
> - * This will check the space that the inode allocates from to make sure we 
> have
> - * enough space for bytes.
> - */
> -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 
> write_bytes)
> +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
>   {
>   struct btrfs_space_info *data_sinfo;
>   struct btrfs_root *root = BTRFS_I(inode)->root;
> @@ -4032,19 +4028,55 @@ commit_trans:
> data_sinfo->flags, bytes, 1);
>   return -ENOSPC;
>   }
> - ret = btrfs_qgroup_reserve(root, write_bytes);
> - if (ret)
> - goto out;
>   data_sinfo->bytes_may_use += bytes;
>   trace_btrfs_space_reservation(root->fs_info, "space_info",
> data_sinfo->flags, bytes, 1);
> -out:
>   spin_unlock(_sinfo->lock);
>   
>   return ret;
>   }
>   
>   /*
> + * This will check the space that the inode allocates from to make sure we 
> have
> + * enough space for bytes.
> + */
> +int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 
> write_bytes)
> +{
> + struct btrfs_root *root = BTRFS_I(inode)->root;
> + int ret;
> +
> + ret = btrfs_alloc_data_chunk_ondemand(inode, bytes);
> + if (ret < 0)
> + return ret;
> + ret = btrfs_qgroup_reserve(root, write_bytes);
> + return ret;
> +}
> +
> +/*

> + * New check_data_free_space() with ability for precious data reserveation

 reservation

Thanks,
Tsutomu

> + * Will replace old btrfs_check_data_free_space(), but for patch split,
> + * add a new function first and then replace it.
> + */
> +int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
> +{
> + struct btrfs_root *root = BTRFS_I(inode)->root;
> + int ret;
> +
> + /* align the range */
> + len = round_up(start + len, root->sectorsize) -
> +   round_down(start, root->sectorsize);
> + start = round_down(start, root->sectorsize);
> +
> + ret = btrfs_alloc_data_chunk_ondemand(inode, len);
> + if (ret < 0)
> + return ret;
> +
> + /* Use new btrfs_qgroup_reserve_data to reserve precious data space */
> + ret = btrfs_qgroup_reserve_data(inode, start, len);
> + return ret;
> +}
> +
> +/*
>* Called if we need to clear a data reservation for this inode.
>*/
>   void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [btrfs tools] ability to fail a device...

2015-09-08 Thread Anand Jain



On 09/09/2015 03:34 AM, Hugo Mills wrote:

On Tue, Sep 08, 2015 at 09:18:05PM +0200, Ian Kumlien wrote:

Hi,

Currently i have a raid1 configuration on two disks where one of them
is failing.

But since:
btrfs fi df /mnt/disk/
Data, RAID1: total=858.00GiB, used=638.16GiB
Data, single: total=1.00GiB, used=256.00KiB
System, RAID1: total=32.00MiB, used=132.00KiB
Metadata, RAID1: total=4.00GiB, used=1.21GiB
GlobalReserve, single: total=412.00MiB, used=0.00B

There should be no problem in failing one disk... Or so i thought!

btrfs dev delete /dev/sdb2 /mnt/disk/
ERROR: error removing the device '/dev/sdb2' - unable to go below two
devices on raid1


dev delete is more like a reshaping operation in mdadm: it tries to
remove a device safely whilst retaining all of the redundancy
guarantees. You can't go down to one device with RAID-1 and still keep
the redundancy.

dev delete is really for managed device removal under non-failure
conditions, not for error recovery.


And i can't issue rebalance either since it will tell me about errors
until the failing disk dies.

Whats even more interesting is that i can't mount just the working
disk - ie if the other disk
*has* failed and is inaccessible... though, i haven't tried physically
removing it...


Physically removing it is the way to go (or disabling it using echo
offline >/sys/block/sda/device/state). Once you've done that, you can
mount the degraded FS with -odegraded, then either add a new device
and balance to restore the RAID-1, or balance with
-{d,m}convert=single to drop the redundancy to single.


 It's like you _must_ add a disk in this context, otherwise the volume 
will become unmountable in the next mount cycle. The below-mentioned 
patch has more details.




mdam has fail and remove, I assume for this reason - perhaps it's
something that should be added?


I think there should be a btrfs dev drop, which is the fail-like
operation: tell the FS that a device is useless, and should be dropped
from the array, so the FS doesn't keep trying to write to it. That's
not implemented yet, though.



 There is a patch set to handle this..
'Btrfs: introduce function to handle device offline'

Thanks, Anand


Hugo.


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 18/19] btrfs: qgroup: Cleanup old inaccurate facilities

2015-09-08 Thread Tsutomu Itoh
Hi, Qu,

On 2015/09/08 18:25, Qu Wenruo wrote:
> Cleanup the old facilities which use old btrfs_qgroup_reserve() function
> call, replace them with the newer version, and remove the "__" prefix in
> them.
> 
> Also, make btrfs_qgroup_reserve/free() functions private, as they are
> now only used inside qgroup codes.
> 
> Now, the whole btrfs qgroup is switched to use the new reserve facilities.
> 
> Signed-off-by: Qu Wenruo 
> ---
>   fs/btrfs/ctree.h   |  6 ++
>   fs/btrfs/extent-tree.c | 56 
> --
>   fs/btrfs/file.c|  2 +-
>   fs/btrfs/inode-map.c   |  2 +-
>   fs/btrfs/inode.c   | 12 +--
>   fs/btrfs/ioctl.c   |  2 +-
>   fs/btrfs/qgroup.c  | 19 ++---
>   fs/btrfs/qgroup.h  |  7 ---
>   8 files changed, 27 insertions(+), 79 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 12f14fd..8489419 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -3452,8 +3452,7 @@ enum btrfs_reserve_flush_enum {
>   BTRFS_RESERVE_FLUSH_ALL,
>   };
>   
> -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 
> write_bytes);
> -int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
> +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
>   int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
>   void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
>   void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
> @@ -3471,8 +3470,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root 
> *root,
> u64 qgroup_reserved);
>   int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
>   void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
> -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
> -int __btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
> +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
>   void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
>   void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
>   struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 07f45b7..ab1b1a1 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -3352,7 +3352,7 @@ again:
>   num_pages *= 16;
>   num_pages *= PAGE_CACHE_SIZE;
>   
> - ret = __btrfs_check_data_free_space(inode, 0, num_pages);
> + ret = btrfs_check_data_free_space(inode, 0, num_pages);
>   if (ret)
>   goto out_put;
>   
> @@ -4037,27 +4037,11 @@ commit_trans:
>   }
>   
>   /*
> - * This will check the space that the inode allocates from to make sure we 
> have
> - * enough space for bytes.
> - */
> -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 
> write_bytes)
> -{
> - struct btrfs_root *root = BTRFS_I(inode)->root;
> - int ret;
> -
> - ret = btrfs_alloc_data_chunk_ondemand(inode, bytes);
> - if (ret < 0)
> - return ret;
> - ret = btrfs_qgroup_reserve(root, write_bytes);
> - return ret;
> -}
> -
> -/*
>* New check_data_free_space() with ability for precious data reserveation
>* Will replace old btrfs_check_data_free_space(), but for patch split,
>* add a new function first and then replace it.
>*/
> -int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
> +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
>   {
>   struct btrfs_root *root = BTRFS_I(inode)->root;
>   int ret;
> @@ -5710,11 +5694,11 @@ void btrfs_delalloc_release_metadata(struct inode 
> *inode, u64 num_bytes)
>* Return 0 for success
>* Return <0 for error(-ENOSPC or -EQUOT)
>*/
> -int __btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
> +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
>   {
>   int ret;
>   
> - ret = __btrfs_check_data_free_space(inode, start, len);
> + ret = btrfs_check_data_free_space(inode, start, len);
>   if (ret < 0)
>   return ret;
>   ret = btrfs_delalloc_reserve_metadata(inode, len);
> @@ -5724,38 +5708,6 @@ int __btrfs_delalloc_reserve_space(struct inode 
> *inode, u64 start, u64 len)
>   }
>   
>   /**
> - * btrfs_delalloc_reserve_space - reserve data and metadata space for 
> delalloc
> - * @inode: inode we're writing to
> - * @num_bytes: the number of bytes we want to allocate
> - *
> - * This will do the following things
> - *
> - * o reserve space in the data space info for num_bytes
> - * o reserve space in the metadata space info based on number of outstanding
> - *   extents and how much csums will be needed
> - * o add to the inodes ->delalloc_bytes
> - * o add it to the 

Re: Btrfs progs release 4.1

2015-09-08 Thread Qu Wenruo



Qu Wenruo wrote on 2015/09/09 09:34 +0800:

Hi David,

Sorry for the late reply, but I noticed something interesting.

David Sterba wrote on 2015/06/22 17:00 +0200:

Hi,

btrfs-progs 4.1 has been released (in time with kernel 4.1). An unusual
load of
changes.

Fixed since rc1:
   - uuid rewrite prints the correct original UUID
   - map-logical updated
   - fi show size units
   - typos

* bugfixes
   - fsck.btrfs: no bash-isms
   - bugzilla 97171: invalid memory access (with tests)
   - receive:
 - cloning works with --chroot
 - capabilities not lost
   - mkfs: do not try to register bare file images
   - option --help accepted by the standalone utilities

* enhancements
   - corrupt block: ability to remove csums
   - mkfs:
 - warn if metadata redundancy is lower than for data
 - options to make the output quiet (only errors)
 - mixed case names of raid profiles accepted
 - rework the output:
   - more comprehensive, 'key: value' format
   - subvol:
 - show:
   - print received uuid
   - update the output
   - new options to specify size units
 - sync:
   - grab all deleted ids and print them as they're removed,
previous implementation only checked if there are any
to be deleted - change in command semantics
   - scrub: print timestamps in days HMS format
   - receive:
 - can specify mount point, do not rely on /proc
 - can work inside subvolumes
   - send:
 - new option to send stream without data (NO_FILE_DATA)
   - convert:
 - specify incompat features on the new fs
   - qgroup:
 - show: distinguish no limits and 0 limit value
 - limit: ability to clear the limit
   - help for 'btrfs' is shorter, 1st level command overview
   - debug tree: print key names according to their C name

* new
   - rescue zero-log
   - btrfstune:
 - rewrite uuid on a filesystem image
 - new option to turn on NO_HOLES incompat feature

* deprecated
   - standalone btrfs-zero-log

* other
   - testing framework updates
 - uuid rewrite test
 - btrfstune feature setting test
 - zero-log tests
 - more testing image formats
   - manual page updates
   - ioctl.h synced with current kernel uapi version
   - convert: preparatory works for more filesystems (reiserfs pending)
   - use static buffers for path handling where possible
   - add new helpers for send utils that check memory allocations,
 switch all users, deprecate old helpers
   - Makefile: fix build dependency generation
   - map-logical: make it work again

Tarballs:
https://www.kernel.org/pub/linux/kernel/people/kdave/btrfs-progs/
Git: git://git.kernel.org/pub/scm/linux/kernel/git/kdave/btrfs-progs.git

Shortlog:

Anand Jain (2):
   btrfs-progs: add info about list-all to the help
   btrfs-progs: use function is_block_device() instead

Dimitri John Ledkov (1):
   btrfs-progs: fsck.btrfs: Fix bashism and bad getopts processing

Dongsheng Yang (4):
   btrfs-progs: qgroup: show 'none' when we did not limit it on
this qgroup
   btrfs-progs: qgroup: allow user to clear some limitation on
qgroup.
   btrfs-progs: qgroup limit: error out if input value is negative
   btrfs-progs: qgroup limit: add a check for invalid input of
'T/G/M/K'

Emil Karlson (1):
   btrfs-progs: use openat for process_clone in receive

Goffredo Baroncelli (4):
   btrfs-progs: add strdup in btrfs_add_to_fsid() to track the
device path
   btrfs-progs: return the fsid from make_btrfs()
   btrfs-progs: mkfs: track sizes of created block groups
   btrfs-progs: mkfs: print the summary

Jeff Mahoney (8):
   btrfs-progs: convert: clean up blk_iterate_data handling wrt
record_file_blocks
   btrfs-progs: convert: remove unused fs argument from
block_iterate_proc
   btrfs-progs: convert: remove unused inode_key in copy_single_inode
   btrfs-progs: convert: rename ext2_root to image_root
   btrfs-progs: compat: define DIV_ROUND_UP if not already defined
   btrfs-progs: convert: fix typo in btrfs_insert_dir_item call
   btrfs-progs: convert: factor out adding dirent into
convert_insert_dirent
   btrfs-progs: convert: factor out block iteration callback

Josef Bacik (3):
   Btrfs-progs: corrupt-block: add the ability to remove csums
   btrfs-progs: specify mountpoint for recieve
   btrfs-progs: make receive work inside of subvolumes

Qu Wenruo (13):
   btrfs-progs: Enhance read_tree_block to avoid memory corruption
   btrfs-progs: btrfstune: rework change_uuid
   btrfs-progs: btrfstune: add ability to restore unfinished fsid
change
   btrfs-progs: btrfstune: add '-U' and '-u' option to change fsid
   btrfs-progs: Documentation: uuid change
   btrfs-progs: btrfstune: fix a bug which makes unfinished fsid
change unrecoverable
   btrfs-progs: export read_extent_data function
   btrfs-progs: map-logical: introduce map_one_extent function
   Btrfs-progs: map-logical: introduce 

Re: [PATCH v1 9/8] copy_file_range.2: New page documenting copy_file_range()

2015-09-08 Thread Anna Schumaker
On 09/04/2015 06:31 PM, Andreas Dilger wrote:
> On Sep 4, 2015, at 3:38 PM, Darrick J. Wong  wrote:
>>
>> On Fri, Sep 04, 2015 at 04:17:03PM -0400, Anna Schumaker wrote:
>>> copy_file_range() is a new system call for copying ranges of data
>>> completely in the kernel.  This gives filesystems an opportunity to
>>> implement some kind of "copy acceleration", such as reflinks or
>>> server-side-copy (in the case of NFS).
>>>
>>> Signed-off-by: Anna Schumaker 
>>> ---
>>> man2/copy_file_range.2 | 168 
>>> +
>>> 1 file changed, 168 insertions(+)
>>> create mode 100644 man2/copy_file_range.2
>>>
>>> diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
>>> new file mode 100644
>>> index 000..4a4cb73
>>> --- /dev/null
>>> +++ b/man2/copy_file_range.2
>>> @@ -0,0 +1,168 @@
>>> +.\"This manpage is Copyright (C) 2015 Anna Schumaker 
>>> 
>>> +.TH COPY 2 2015-8-31 "Linux" "Linux Programmer's Manual"
>>> +.SH NAME
>>> +copy_file_range \- Copy a range of data from one file to another
>>> +.SH SYNOPSIS
>>> +.nf
>>> +.B #include 
>>> +.B #include 
>>> +.B #include 
>>> +
>>> +.BI "ssize_t syscall(__NR_copy_file_range, int " fd_in ", loff_t * " 
>>> off_in ",
>>> +.BI "int " fd_out ", loff_t * " off_out ", size_t " len ",
>>> +.BI "unsigned int " flags );
>>> +.fi
>>> +.SH DESCRIPTION
>>> +The
>>> +.BR copy_file_range ()
>>> +system call performs an in-kernel copy between two file descriptors
>>> +without all that tedious mucking about in userspace.
>>
>> ;)
>>
>>> +It copies up to
>>> +.I len
>>> +bytes of data from file descriptor
>>> +.I fd_in
>>> +to file descriptor
>>> +.I fd_out
>>> +at
>>> +.IR off_out .
>>> +The file descriptors must not refer to the same file.
>>
>> Why?  btrfs (and XFS) reflink can handle the case of a file sharing blocks
>> with itself.
>>
>>> +
>>> +The following semantics apply for
>>> +.IR fd_in ,
>>> +and similar statements apply to
>>> +.IR off_out :
>>> +.IP * 3
>>> +If
>>> +.I off_in
>>> +is NULL, then bytes are read from
>>> +.I fd_in
>>> +starting from the current file offset and the current
>>> +file offset is adjusted appropriately.
>>> +.IP *
>>> +If
>>> +.I off_in
>>> +is not NULL, then
>>> +.I off_in
>>> +must point to a buffer that specifies the starting
>>> +offset where bytes from
>>> +.I fd_in
>>> +will be read.  The current file offset of
>>> +.I fd_in
>>> +is not changed, but
>>> +.I off_in
>>> +is adjusted appropriately.
>>> +.PP
>>> +The default behavior of
>>> +.BR copy_file_range ()
>>> +is filesystem specific, and might result in creating a
>>> +copy-on-write reflink.
>>> +In the event that a given filesystem does not implement
>>> +any form of copy acceleration, the kernel will perform
>>> +a deep copy of the requested range by reading bytes from
>>
>> I wonder if it's wise to allow deep copies -- what happens if
>> len == 1T? Will this syscall just block for a really long time?
> 
> It should be interruptible, and return the length of the number of
> bytes copied so far, just like read() and write().  That allows
> the caller to continue where it left off, or abort and delete the
> target file, or whatever it wants to do.

We already return the number of bytes copied so far, so I'll look into making 
it interruptable!

Thanks,
Anna

> 
> Cheers, Andreas
> 
>>> +.I fd_in
>>> +and writing them to
>>> +.IR fd_out .
>>
>> "...if COPY_REFLINK is not set in flags."
>>
>>> +
>>> +Currently, Linux only supports the following flag:
>>> +.TP 1.9i
>>> +.B COPY_REFLINK
>>> +Only perform the copy if the filesystem can do it as a reflink.
>>> +Do not fall back on performing a deep copy.
>>> +.SH RETURN VALUE
>>> +Upon successful completion,
>>> +.BR copy_file_range ()
>>> +will return the number of bytes copied between files.
>>> +This could be less than the length originally requested.
>>> +
>>> +On error,
>>> +.BR copy_file_range ()
>>> +returns \-1 and
>>> +.I errno
>>> +is set to indicate the error.
>>> +.SH ERRORS
>>> +.TP
>>> +.B EBADF
>>> +One or more file descriptors are not valid,
>>> +or do not have proper read-write mode.
>>
>> "or fd_out is not opened for writing"?
>>
>>> +.TP
>>> +.B EINVAL
>>> +Requested range extends beyond the end of the file;
>>> +.I flags
>>> +argument is set to an invalid value.
>>> +.TP
>>> +.B EOPNOTSUPP
>>> +.B COPY_REFLINK
>>> +was specified in
>>> +.IR flags ,
>>> +but the target filesystem does not support reflinks.
>>> +.TP
>>> +.B EXDEV
>>> +Target filesystem doesn't support cross-filesystem copies.
>>> +.SH VERSIONS
>>
>> Perhaps this ought to list a few more errors (EIO, ENOSPC, ENOSYS, EPERM...)
>> that can be returned?  (I was looking at the fallocate manpage.)
>>
>> --D
>>
>>> +The
>>> +.BR copy_file_range ()
>>> +system call first appeared in Linux 4.3.
>>> +.SH CONFORMING TO
>>> +The
>>> +.BR copy_file_range ()
>>> +system call is a nonstandard Linux extension.

Re: [PATCH v1 0/8] VFS: In-kernel copy system call

2015-09-08 Thread Anna Schumaker
On 09/04/2015 06:25 PM, Andreas Dilger wrote:
> On Sep 4, 2015, at 2:16 PM, Anna Schumaker  wrote:
>>
>> Copy system calls came up during Plumbers a couple of weeks ago,
>> because several filesystems (including NFS and XFS) are currently
>> working on copy acceleration implementations.  We haven't heard from
>> Zach Brown in a while, so I volunteered to push his patches upstream
>> so individual filesystems don't need to keep writing their own ioctls.
>>
>> The first three patches are a simple reposting of Zach's patches
>> from several months ago, with one minor error code fix.  The remaining
>> patches add in a fallback mechanism when filesystems don't provide a
>> copy function.  This is especially useful when doing a server-side
>> copy on NFS (using the new COPY operation in NFS v4.2).  This fallback
>> can be disabled by passing the flag COPY_REFLINK to the system call.
>>
>> The last patch is a man page patch documenting this new system call,
>> including an example program.
>>
>> I tested the fallback option by using /dev/urandom to generate files
>> of varying sizes and copying them.  I compared the time to copy
>> against that of `cp` just to see if there is a noticable difference.
>> I found that runtimes are roughly the same, but in-kernel copy tends
>> to use less of the cpu.  Values in the tables below are averages
>> across multiple trials.
>>
>>
>> /usr/bin/cp |   512 MB  |   1024 MB |   1536 MB |   2048 MB
>> -|---|---|---|---
>>   user  |   0.00s   |   0.00s   |   0.00s   |   0.00s
>> system  |   0.32s   |   0.52s   |   1.04s   |   1.04s
>>cpu  | 73%   | 69%   | 62%   | 62%
>>  total  |   0.446   |   0.757   |   1.197   |   1.667
>>
>>
>>   VFS copy  |   512 MB  |   1024 MB |   1536 MB |   2048 MB
>> -|---|---|---|---
>>   user  |   0.00s   |   0.00s   |   0.00s   |  0.00s
>> system  |   0.33s   |   0.49s   |   0.76s   |  0.99s
>>cpu  | 77%   | 62%   | 60%   |59%
>>  total  |   0.422   |   0.777   |   1.267   |  1.655
>>
>>
>> Questions?  Comments?  Thoughts?
> 
> This is a bit of a surprising result, since in my testing in the
> past, copy_{to/from}_user() is a major consumer of CPU time (50%
> of a CPU core at 1GB/s).  What backing filesystem did you test on?

I tested using XFS against two KVM guests.  Maybe something there is adding the 
extra cpu cycles?

Anna

> 
> In theory, the VFS copy routines should save at least 50% of the
> CPU usage since it only needs to make one copy (src->dest) instead
> of two (kernel->user, user->kernel).  Ideally it wouldn't make any
> data copies at all and just pass page references from the source
> to the target.
> 
> Cheers, Andreas
>>
>> Anna
>>
>>
>> Anna Schumaker (5):
>>  btrfs: Add mountpoint checking during btrfs_copy_file_range
>>  vfs: Remove copy_file_range mountpoint checks
>>  vfs: Copy should check len after file open mode
>>  vfs: Copy should use file_out rather than file_in
>>  vfs: Fall back on splice if no copy function defined
>>
>> Zach Brown (3):
>>  vfs: add copy_file_range syscall and vfs helper
>>  x86: add sys_copy_file_range to syscall tables
>>  btrfs: add .copy_file_range file operation
>>
>> arch/x86/entry/syscalls/syscall_32.tbl |   1 +
>> arch/x86/entry/syscalls/syscall_64.tbl |   1 +
>> fs/btrfs/ctree.h   |   3 +
>> fs/btrfs/file.c|   1 +
>> fs/btrfs/ioctl.c   |  95 ++--
>> fs/read_write.c| 132 
>> +
>> include/linux/copy.h   |   6 ++
>> include/linux/fs.h |   3 +
>> include/uapi/asm-generic/unistd.h  |   4 +-
>> include/uapi/linux/Kbuild  |   1 +
>> include/uapi/linux/copy.h  |   6 ++
>> kernel/sys_ni.c|   1 +
>> 12 files changed, 214 insertions(+), 40 deletions(-)
>> create mode 100644 include/linux/copy.h
>> create mode 100644 include/uapi/linux/copy.h
>>
>> -- 
>> 2.5.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
>> the body of a message to majord...@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
> Cheers, Andreas
> 
> 
> 
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/3] btrfs-progs: Avoid uninitialized data in output of btrfs-convert

2015-09-08 Thread Zhao Lei
The sequence, transid and reserved fields of the inode were written to disk
with uninitialized values; this patch fixes it.

Signed-off-by: Zhao Lei 
---
 btrfs-convert.c | 4 
 ctree.h | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/btrfs-convert.c b/btrfs-convert.c
index f4fc650..934f4dc 100644
--- a/btrfs-convert.c
+++ b/btrfs-convert.c
@@ -1016,6 +1016,8 @@ static int copy_inode_item(struct btrfs_inode_item *dst,
   struct ext2_inode *src, u32 blocksize)
 {
btrfs_set_stack_inode_generation(dst, 1);
+   btrfs_set_stack_inode_sequence(dst, 0);
+   btrfs_set_stack_inode_transid(dst, 1);
btrfs_set_stack_inode_size(dst, src->i_size);
btrfs_set_stack_inode_nbytes(dst, 0);
btrfs_set_stack_inode_block_group(dst, 0);
@@ -1052,6 +1054,8 @@ static int copy_inode_item(struct btrfs_inode_item *dst,
new_decode_dev(src->i_block[1]));
}
}
+   memset(>reserved, 0, sizeof(dst->reserved));
+
return 0;
 }
 
diff --git a/ctree.h b/ctree.h
index 2061e1e..c57f9ca 100644
--- a/ctree.h
+++ b/ctree.h
@@ -1424,6 +1424,8 @@ BTRFS_SETGET_STACK_FUNCS(stack_inode_generation,
 struct btrfs_inode_item, generation, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence,
 struct btrfs_inode_item, sequence, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_transid,
+struct btrfs_inode_item, transid, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_inode_size,
 struct btrfs_inode_item, size, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes,
-- 
1.8.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] btrfs-progs: fix set_get typo of btrfs_inode_item->sequence

2015-09-08 Thread Zhao Lei
s/generation/sequence
for BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, ...)

Signed-off-by: Zhao Lei 
---
 ctree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ctree.h b/ctree.h
index bcad2b9..2061e1e 100644
--- a/ctree.h
+++ b/ctree.h
@@ -1423,7 +1423,7 @@ BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, 
flags, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_inode_generation,
 struct btrfs_inode_item, generation, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence,
-struct btrfs_inode_item, generation, 64);
+struct btrfs_inode_item, sequence, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_inode_size,
 struct btrfs_inode_item, size, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes,
-- 
1.8.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/3] btrfs-progs: add ext2fs_close_inode_scan to copy_inodes

2015-09-08 Thread Zhao Lei
We need to use ext2fs_close_inode_scan() to release the resources
obtained from ext2fs_open_inode_scan() in copy_inodes().

Signed-off-by: Zhao Lei 
---
 btrfs-convert.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/btrfs-convert.c b/btrfs-convert.c
index 934f4dc..e7f3c9e 100644
--- a/btrfs-convert.c
+++ b/btrfs-convert.c
@@ -1186,6 +1186,7 @@ static int copy_inodes(struct btrfs_root *root, 
ext2_filsys ext2_fs,
}
ret = btrfs_commit_transaction(trans, root);
BUG_ON(ret);
+   ext2fs_close_inode_scan(ext2_scan);
 
return ret;
 }
-- 
1.8.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC 00/14] Accurate qgroup reserve framework

2015-09-08 Thread Qu Wenruo
[[BUG]]
One of the most common cases to trigger the bug is the following method:
1) Enable quota
2) Limit excl of qgroup 5 to 16M
3) Write [0,2M) of a file inside subvol 5 10 times without sync

EQUOT will be triggered at about the 8th write.

[[CAUSE]]
The problem is caused by the fact that qgroup will reserve space even
if the data space is already reserved.

In the above reproducer, each time we buffered write [0,2M) qgroup will
reserve 2M of space, but in fact, at the 1st time, we have already reserved
2M and from then on, we don't need to reserve any data space as we are
only writing [0,2M).

Also, the reserved space will only be freed *ONCE* when its backref is
run at commit_transaction() time.

That's causing the reserved space leaking.

[[FIX]]
The fix is not a simple one, as currently btrfs_qgroup_reserve() follow
the very bad btrfs space allocating principle:
  Allocate as much as you needed, even it's not fully used.

So for accurate qgroup reserve, we introduce a completely new framework
for data and metadata.
1) Per-inode data reserve map
   Now, each inode will have a data reserve map, recording which range
   of data is already reserved.
   If we are writing a range which is already reserved, we won't need to
   reserve space again.

   Also, due to the fact that qgroup is only accounted at commit_trans(),
   once the data is committed to disk and its metadata is also inserted into
   the current tree, we should free the data reserved range, but still keep
   the reserved space until commit_trans().

   So delayed_ref_head will have new members to record how much space is
   reserved and free them at commit_trans() time.

2) Per-root metadata reserve counter
   For metadata(tree block), it's impossible to know how much space it
   will use exactly in advance.
   And due to the new qgroup accounting framework, the old
   free-at-end-trans may lead to exceeding the limit.

   So we record how much metadata space is reserved for each root, and
   free them at commit_trans() time.
   This method is not perfect, but thanks to the comparatively small size of
   metadata, it should be quite good.

More detailed info can be found in each commit message and source
commend.

Qu Wenruo (19):
  btrfs: qgroup: New function declaration for new reserve implement
  btrfs: qgroup: Implement data_rsv_map init/free functions
  btrfs: qgroup: Introduce new function to search most left reserve
range
  btrfs: qgroup: Introduce function to insert non-overlap reserve range
  btrfs: qgroup: Introduce function to reserve data range per inode
  btrfs: qgroup: Introduce btrfs_qgroup_reserve_data function
  btrfs: qgroup: Introduce function to release reserved range
  btrfs: qgroup: Introduce function to release/free reserved data range
  btrfs: delayed_ref: Add new function to record reserved space into
delayed ref
  btrfs: delayed_ref: release and free qgroup reserved at proper timing
  btrfs: qgroup: Introduce new functions to reserve/free metadata
  btrfs: qgroup: Use new metadata reservation.
  btrfs: extent-tree: Add new verions of btrfs_check_data_free_space
  btrfs: Switch to new check_data_free_space
  btrfs: fallocate: Add support to accurate qgroup reserve
  btrfs: extent-tree: Add new version of btrfs_delalloc_reserve_space
  btrfs: extent-tree: Use new __btrfs_delalloc_reserve_space function
  btrfs: qgroup: Cleanup old inaccurate facilities
  btrfs: qgroup: Add handler for NOCOW and inline

 fs/btrfs/btrfs_inode.h |   6 +
 fs/btrfs/ctree.h   |   8 +-
 fs/btrfs/delayed-ref.c |  29 +++
 fs/btrfs/delayed-ref.h |  14 +
 fs/btrfs/disk-io.c |   1 +
 fs/btrfs/extent-tree.c |  99 +---
 fs/btrfs/file.c| 169 +
 fs/btrfs/inode-map.c   |   2 +-
 fs/btrfs/inode.c   |  51 +++-
 fs/btrfs/ioctl.c   |   3 +-
 fs/btrfs/qgroup.c  | 674 -
 fs/btrfs/qgroup.h  |  18 +-
 fs/btrfs/transaction.c |  34 +--
 fs/btrfs/transaction.h |   1 -
 14 files changed, 979 insertions(+), 130 deletions(-)

-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 17/19] btrfs: extent-tree: Use new __btrfs_delalloc_reserve_space function

2015-09-08 Thread Qu Wenruo
Use new __btrfs_delalloc_reserve_space to reserve space.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/inode-map.c |  2 +-
 fs/btrfs/inode.c | 16 ++--
 fs/btrfs/ioctl.c |  5 +++--
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index d4a582a..ab639d3 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -488,7 +488,7 @@ again:
/* Just to make sure we have enough space */
prealloc += 8 * PAGE_CACHE_SIZE;
 
-   ret = btrfs_delalloc_reserve_space(inode, prealloc);
+   ret = __btrfs_delalloc_reserve_space(inode, 0, prealloc);
if (ret)
goto out_put;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1f7cac0..d70cb26 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1985,7 +1985,8 @@ again:
goto again;
}
 
-   ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+   ret = __btrfs_delalloc_reserve_space(inode, page_start,
+PAGE_CACHE_SIZE);
if (ret) {
mapping_set_error(page->mapping, ret);
end_extent_writepage(page, ret, page_start, page_end);
@@ -4581,7 +4582,8 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, 
loff_t len,
if ((offset & (blocksize - 1)) == 0 &&
(!len || ((len & (blocksize - 1)) == 0)))
goto out;
-   ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+   ret = __btrfs_delalloc_reserve_space(inode,
+   round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE);
if (ret)
goto out;
 
@@ -8373,7 +8375,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct 
iov_iter *iter,
mutex_unlock(>i_mutex);
relock = true;
}
-   ret = btrfs_delalloc_reserve_space(inode, count);
+   ret = __btrfs_delalloc_reserve_space(inode, offset, count);
if (ret)
goto out;
outstanding_extents = div64_u64(count +
@@ -8620,7 +8622,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, 
struct vm_fault *vmf)
u64 page_end;
 
sb_start_pagefault(inode->i_sb);
-   ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+   page_start = page_offset(page);
+   page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+   ret = __btrfs_delalloc_reserve_space(inode, page_start,
+PAGE_CACHE_SIZE);
if (!ret) {
ret = file_update_time(vma->vm_file);
reserved = 1;
@@ -8639,8 +8645,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct 
vm_fault *vmf)
 again:
lock_page(page);
size = i_size_read(inode);
-   page_start = page_offset(page);
-   page_end = page_start + PAGE_CACHE_SIZE - 1;
 
if ((page->mapping != inode->i_mapping) ||
(page_start >= size)) {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0adf542..e0291fc 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1119,8 +1119,9 @@ static int cluster_pages_for_defrag(struct inode *inode,
 
page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
 
-   ret = btrfs_delalloc_reserve_space(inode,
-  page_cnt << PAGE_CACHE_SHIFT);
+   ret = __btrfs_delalloc_reserve_space(inode,
+   start_index << PAGE_CACHE_SHIFT,
+   page_cnt << PAGE_CACHE_SHIFT);
if (ret)
return ret;
i_done = 0;
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 16/19] btrfs: extent-tree: Add new version of btrfs_delalloc_reserve_space

2015-09-08 Thread Qu Wenruo
Add new version of btrfs_delalloc_reserve_space() function, which
supports accurate qgroup reserve.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/ctree.h   |  1 +
 fs/btrfs/extent-tree.c | 38 ++
 2 files changed, 39 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c1a0aaf..12f14fd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3472,6 +3472,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root 
*root,
 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
+int __btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2e3f19e..07f45b7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5686,6 +5686,44 @@ void btrfs_delalloc_release_metadata(struct inode 
*inode, u64 num_bytes)
 }
 
 /**
+ * __btrfs_delalloc_reserve_space - reserve data and metadata space for
+ * delalloc
+ * @inode: inode we're writing to
+ * @start: start range we are writing to
+ * @len: how long the range we are writing to
+ *
+ * TODO: This function will finally replace old btrfs_delalloc_reserve_space()
+ *
+ * This will do the following things
+ *
+ * o reserve space in data space info for num bytes
+ *   and reserve precious corresponding qgroup space
+ *   (Done in check_data_free_space)
+ *
+ * o reserve space for metadata space, based on the number of outstanding
+ *   extents and how much csums will be needed
+ *   also reserve metadata space in a per root over-reserve method.
+ * o add to the inodes->delalloc_bytes
+ * o add it to the fs_info's delalloc inodes list.
+ *   (Above 3 all done in delalloc_reserve_metadata)
+ *
+ * Return 0 for success
+ * Return <0 for error(-ENOSPC or -EQUOT)
+ */
+int __btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
+{
+   int ret;
+
+   ret = __btrfs_check_data_free_space(inode, start, len);
+   if (ret < 0)
+   return ret;
+   ret = btrfs_delalloc_reserve_metadata(inode, len);
+   if (ret < 0)
+   btrfs_free_reserved_data_space(inode, len);
+   return ret;
+}
+
+/**
  * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
  * @inode: inode we're writing to
  * @num_bytes: the number of bytes we want to allocate
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 14/19] btrfs: Switch to new check_data_free_space

2015-09-08 Thread Qu Wenruo
Use new check_data_free_space for buffered write and inode cache.

For the buffered write case, a nodatacow write won't increase the quota
account. So unlike the old behavior, which reserves before checking nocow,
we now check nocow first and only reserve data if we can't do a nocow write.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/extent-tree.c |  2 +-
 fs/btrfs/file.c| 22 +-
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 61366ca..2e3f19e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3352,7 +3352,7 @@ again:
num_pages *= 16;
num_pages *= PAGE_CACHE_SIZE;
 
-   ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
+   ret = __btrfs_check_data_free_space(inode, 0, num_pages);
if (ret)
goto out_put;
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b823fac..c1eec4f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1510,12 +1510,17 @@ static noinline ssize_t __btrfs_buffered_write(struct 
file *file,
}
 
reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
-   ret = btrfs_check_data_free_space(inode, reserve_bytes, 
write_bytes);
-   if (ret == -ENOSPC &&
-   (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
- BTRFS_INODE_PREALLOC))) {
+
+   if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+BTRFS_INODE_PREALLOC)) {
ret = check_can_nocow(inode, pos, _bytes);
+   if (ret < 0)
+   break;
if (ret > 0) {
+   /*
+* For nodata cow case, no need to reserve
+* data space.
+*/
only_release_metadata = true;
/*
 * our prealloc extent may be smaller than
@@ -1524,15 +1529,14 @@ static noinline ssize_t __btrfs_buffered_write(struct 
file *file,
num_pages = DIV_ROUND_UP(write_bytes + offset,
 PAGE_CACHE_SIZE);
reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
-   ret = 0;
-   } else {
-   ret = -ENOSPC;
+   goto reserve_metadata;
}
}
-
-   if (ret)
+   ret = __btrfs_check_data_free_space(inode, pos, write_bytes);
+   if (ret < 0)
break;
 
+reserve_metadata:
ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
if (ret) {
if (!only_release_metadata)
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 11/19] btrfs: qgroup: Introduce new functions to reserve/free metadata

2015-09-08 Thread Qu Wenruo
Introduce new functions btrfs_qgroup_reserve/free_meta() to reserve/free
metadata reserved space.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/ctree.h   |  3 +++
 fs/btrfs/disk-io.c |  1 +
 fs/btrfs/qgroup.c  | 40 
 fs/btrfs/qgroup.h  |  4 
 4 files changed, 48 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 938efe3..ae86025 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1943,6 +1943,9 @@ struct btrfs_root {
int send_in_progress;
struct btrfs_subvolume_writers *subv_writers;
atomic_t will_be_snapshoted;
+
+   /* For qgroup metadata space reserve */
+   atomic_t qgroup_meta_rsv;
 };
 
 struct btrfs_ioctl_defrag_range_args {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0b658d0..704d212 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1259,6 +1259,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, 
u32 stripesize,
atomic_set(>orphan_inodes, 0);
atomic_set(>refs, 1);
atomic_set(>will_be_snapshoted, 0);
+   atomic_set(>qgroup_meta_rsv, 0);
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 5a69a2d..b759e96 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3102,3 +3102,43 @@ void btrfs_qgroup_free_data_rsv_map(struct inode *inode)
kfree(dirty_map);
binode->qgroup_rsv_map = NULL;
 }
+
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes)
+{
+   int ret;
+
+   if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
+   num_bytes == 0)
+   return 0;
+
+   BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+   ret = btrfs_qgroup_reserve(root, num_bytes);
+   if (ret < 0)
+   return ret;
+   atomic_add(num_bytes, >qgroup_meta_rsv);
+   return ret;
+}
+
+void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
+{
+   int reserved;
+
+   if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+   return;
+
+   reserved = atomic_xchg(>qgroup_meta_rsv, 0);
+   if (reserved == 0)
+   return;
+   btrfs_qgroup_free(root, reserved);
+}
+
+void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
+{
+   if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+   return;
+
+   BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+   WARN_ON(atomic_read(>qgroup_meta_rsv) < num_bytes);
+   atomic_sub(num_bytes, >qgroup_meta_rsv);
+   btrfs_qgroup_free(root, num_bytes);
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 49fa15e..2d507c8 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -96,4 +96,8 @@ void btrfs_qgroup_free_data_rsv_map(struct inode *inode);
 int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len);
 int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
 int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len);
+
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes);
+void btrfs_qgroup_free_meta_all(struct btrfs_root *root);
+void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes);
 #endif /* __BTRFS_QGROUP__ */
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 13/19] btrfs: extent-tree: Add new version of btrfs_check_data_free_space

2015-09-08 Thread Qu Wenruo
Add a new function, __btrfs_check_data_free_space(), to do precise space
reservation.

The new function will replace old btrfs_check_data_free_space(), but
until all the changes are done, let's just use the new name.

Also, export the internal-use function btrfs_alloc_data_chunk_ondemand(),
as qgroup reserve now requires precise byte counts, which can only be
determined in a later loop (like fallocate).
But data space info check and data chunk allocate doesn't need to be
that accurate, and can be called at the beginning.

So export it for later operations.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/ctree.h   |  2 ++
 fs/btrfs/extent-tree.c | 50 +-
 2 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ae86025..c1a0aaf 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3453,6 +3453,8 @@ enum btrfs_reserve_flush_enum {
 };
 
 int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 
write_bytes);
+int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 402415c..61366ca 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3907,11 +3907,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int 
data)
return ret;
 }
 
-/*
- * This will check the space that the inode allocates from to make sure we have
- * enough space for bytes.
- */
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 
write_bytes)
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
 {
struct btrfs_space_info *data_sinfo;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4032,19 +4028,55 @@ commit_trans:
  data_sinfo->flags, bytes, 1);
return -ENOSPC;
}
-   ret = btrfs_qgroup_reserve(root, write_bytes);
-   if (ret)
-   goto out;
data_sinfo->bytes_may_use += bytes;
trace_btrfs_space_reservation(root->fs_info, "space_info",
  data_sinfo->flags, bytes, 1);
-out:
spin_unlock(_sinfo->lock);
 
return ret;
 }
 
 /*
+ * This will check the space that the inode allocates from to make sure we have
+ * enough space for bytes.
+ */
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 
write_bytes)
+{
+   struct btrfs_root *root = BTRFS_I(inode)->root;
+   int ret;
+
+   ret = btrfs_alloc_data_chunk_ondemand(inode, bytes);
+   if (ret < 0)
+   return ret;
+   ret = btrfs_qgroup_reserve(root, write_bytes);
+   return ret;
+}
+
+/*
+ * New check_data_free_space() with ability for precious data reserveation
+ * Will replace old btrfs_check_data_free_space(), but for patch split,
+ * add a new function first and then replace it.
+ */
+int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
+{
+   struct btrfs_root *root = BTRFS_I(inode)->root;
+   int ret;
+
+   /* align the range */
+   len = round_up(start + len, root->sectorsize) -
+ round_down(start, root->sectorsize);
+   start = round_down(start, root->sectorsize);
+
+   ret = btrfs_alloc_data_chunk_ondemand(inode, len);
+   if (ret < 0)
+   return ret;
+
+   /* Use new btrfs_qgroup_reserve_data to reserve precious data space */
+   ret = btrfs_qgroup_reserve_data(inode, start, len);
+   return ret;
+}
+
+/*
  * Called if we need to clear a data reservation for this inode.
  */
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 12/19] btrfs: qgroup: Use new metadata reservation.

2015-09-08 Thread Qu Wenruo
As we have the new metadata reservation functions, use them to replace
the old btrfs_qgroup_reserve() call for metadata.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/extent-tree.c | 14 ++
 fs/btrfs/transaction.c | 34 ++
 fs/btrfs/transaction.h |  1 -
 3 files changed, 12 insertions(+), 37 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 65e60eb..402415c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5343,7 +5343,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root 
*root,
if (root->fs_info->quota_enabled) {
/* One for parent inode, two for dir entries */
num_bytes = 3 * root->nodesize;
-   ret = btrfs_qgroup_reserve(root, num_bytes);
+   ret = btrfs_qgroup_reserve_meta(root, num_bytes);
if (ret)
return ret;
} else {
@@ -5361,10 +5361,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root 
*root,
if (ret == -ENOSPC && use_global_rsv)
ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
 
-   if (ret) {
-   if (*qgroup_reserved)
-   btrfs_qgroup_free(root, *qgroup_reserved);
-   }
+   if (ret && *qgroup_reserved)
+   btrfs_qgroup_free_meta(root, *qgroup_reserved);
 
return ret;
 }
@@ -5525,15 +5523,15 @@ int btrfs_delalloc_reserve_metadata(struct inode 
*inode, u64 num_bytes)
spin_unlock(_I(inode)->lock);
 
if (root->fs_info->quota_enabled) {
-   ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
+   ret = btrfs_qgroup_reserve_meta(root,
+   nr_extents * root->nodesize);
if (ret)
goto out_fail;
}
 
ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
if (unlikely(ret)) {
-   if (root->fs_info->quota_enabled)
-   btrfs_qgroup_free(root, nr_extents * root->nodesize);
+   btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
goto out_fail;
}
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 68ad89e..707e8ea 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -446,13 +446,10 @@ start_transaction(struct btrfs_root *root, u64 num_items, 
unsigned int type,
 * the appropriate flushing if need be.
 */
if (num_items > 0 && root != root->fs_info->chunk_root) {
-   if (root->fs_info->quota_enabled &&
-   is_fstree(root->root_key.objectid)) {
-   qgroup_reserved = num_items * root->nodesize;
-   ret = btrfs_qgroup_reserve(root, qgroup_reserved);
-   if (ret)
-   return ERR_PTR(ret);
-   }
+   qgroup_reserved = num_items * root->nodesize;
+   ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved);
+   if (ret)
+   return ERR_PTR(ret);
 
num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
/*
@@ -521,7 +518,6 @@ again:
h->block_rsv = NULL;
h->orig_rsv = NULL;
h->aborted = 0;
-   h->qgroup_reserved = 0;
h->delayed_ref_elem.seq = 0;
h->type = type;
h->allocating_chunk = false;
@@ -546,7 +542,6 @@ again:
h->bytes_reserved = num_bytes;
h->reloc_reserved = reloc_reserved;
}
-   h->qgroup_reserved = qgroup_reserved;
 
 got_it:
btrfs_record_root_in_trans(h, root);
@@ -564,8 +559,7 @@ alloc_fail:
btrfs_block_rsv_release(root, >fs_info->trans_block_rsv,
num_bytes);
 reserve_fail:
-   if (qgroup_reserved)
-   btrfs_qgroup_free(root, qgroup_reserved);
+   btrfs_qgroup_free_meta(root, qgroup_reserved);
return ERR_PTR(ret);
 }
 
@@ -782,15 +776,6 @@ static int __btrfs_end_transaction(struct 
btrfs_trans_handle *trans,
must_run_delayed_refs = 2;
}
 
-   if (trans->qgroup_reserved) {
-   /*
-* the same root has to be passed here between start_transaction
-* and end_transaction. Subvolume quota depends on this.
-*/
-   btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
-   trans->qgroup_reserved = 0;
-   }
-
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
 
@@ -1205,6 +1190,7 @@ static noinline int commit_fs_roots(struct 
btrfs_trans_handle *trans,
spin_lock(_info->fs_roots_radix_lock);
if (err)
break;
+   btrfs_qgroup_free_meta_all(root);
}
  

[PATCH 10/19] btrfs: delayed_ref: release and free qgroup reserved at proper timing

2015-09-08 Thread Qu Wenruo
Qgroup reserved space needs to be released from inode dirty map and get
freed at different timing:

1) Release when the metadata is written into tree
After the corresponding metadata is written into the tree, any newer write
will be COWed (the NOCOW case is not included yet).
So we must release its range from the inode dirty range map, or we will
forget to reserve the needed range, causing the accounting to exceed the limit.

2) Free reserved bytes when delayed ref is run
When delayed refs are run, qgroup accounting will follow soon and turn
the reserved bytes into rfer/excl numbers.
As run_delayed_refs and qgroup accounting are both done at
commit_transaction() time, it is safe to free the reserved space at
run_delayed_refs() time.

With these timing to release/free reserved space, we should be able to
resolve the long existing qgroup reserve space leak problem.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/extent-tree.c |  4 
 fs/btrfs/inode.c   | 10 ++
 fs/btrfs/qgroup.c  |  5 ++---
 fs/btrfs/qgroup.h  |  8 +++-
 4 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5411f0a..65e60eb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2345,6 +2345,10 @@ static int run_one_delayed_ref(struct btrfs_trans_handle 
*trans,
  node->num_bytes);
}
}
+
+   /* Also free its reserved qgroup space */
+   btrfs_qgroup_free_refroot(root->fs_info, head->qgroup_ref_root,
+ head->qgroup_reserved);
return ret;
}
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 61b2c17..1f7cac0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2112,6 +2112,16 @@ static int insert_reserved_file_extent(struct 
btrfs_trans_handle *trans,
ret = btrfs_alloc_reserved_file_extent(trans, root,
root->root_key.objectid,
btrfs_ino(inode), file_pos, );
+   if (ret < 0)
+   goto out;
+   /*
+* Release the reserved range from inode dirty range map, and
+* move it to delayed ref codes, as now accounting only happens at
+* commit_transaction() time.
+*/
+   btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
+   ret = btrfs_add_delayed_qgroup_reserve(root->fs_info, trans,
+   root->objectid, disk_bytenr, ram_bytes);
 out:
btrfs_free_path(path);
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index ba7888f..5a69a2d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2169,14 +2169,13 @@ out:
return ret;
 }
 
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+  u64 ref_root, u64 num_bytes)
 {
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
-   struct btrfs_fs_info *fs_info = root->fs_info;
struct ulist_node *unode;
struct ulist_iterator uiter;
-   u64 ref_root = root->root_key.objectid;
int ret = 0;
 
if (!is_fstree(ref_root))
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 8e69dc1..49fa15e 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -75,7 +75,13 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
 struct btrfs_qgroup_inherit *inherit);
 int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+  u64 ref_root, u64 num_bytes);
+static inline void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+{
+   return btrfs_qgroup_free_refroot(root->fs_info, root->objectid,
+num_bytes);
+}
 
 void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
 
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 07/19] btrfs: qgroup: Introduce function to release reserved range

2015-09-08 Thread Qu Wenruo
Introduce new function release_data_range() to release reserved ranges.
It will iterate through all existing ranges and remove/shrink them.

Note this function will not free reserved space, as the range can be
released in the following conditions:
1) The dirty range gets written to disk.
   In this case, reserved range will be released but reserved bytes
   will not be freed until the delayed_ref is run.

2) Truncate
   In this case, dirty ranges will be released and reserved bytes will
   also be freed.

So the new function won't free reserved space, but records the freed
bytes into a parameter if the caller needs them.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/qgroup.c | 130 ++
 1 file changed, 130 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 337b784..e24c10d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2849,6 +2849,136 @@ int btrfs_qgroup_reserve_data(struct inode *inode, u64 
start, u64 len)
return ret;
 }
 
+/* Small helper used in release_data_range() to update rsv map */
+static inline void __update_rsv(struct btrfs_qgroup_data_rsv_map *map,
+   u64 *reserved, u64 cur_rsv)
+{
+   if (reserved)
+   *reserved += cur_rsv;
+   if (WARN_ON(map->reserved < cur_rsv))
+   map->reserved = 0;
+   else
+   map->reserved -= cur_rsv;
+}
+
+/*
+ * Release the range [start, start + len) from rsv map.
+ *
+ * The behavior should be much like reserve_data_range().
+ * @tmp: the allocated memory for case which need to split existing
+ *   range into two.
+ * @reserved: the number of bytes that may need to free
+ * Return > 0 if 'tmp' memory is used and release range successfully
+ * Return 0 if 'tmp' memory is not used and release range successfully
+ * Return < 0 for error
+ */
+static int release_data_range(struct btrfs_qgroup_data_rsv_map *map,
+ struct data_rsv_range *tmp,
+ u64 start, u64 len, u64 *reserved)
+{
+   struct data_rsv_range *range;
+   u64 cur_rsv = 0;
+   int ret = 0;
+
+   range = find_reserve_range(map, start);
+   /* empty tree, just return */
+   if (!range)
+   return 0;
+   /*
+* For split case
+*  ||
+* ||
+* In this case, we need to insert one new range.
+*/
+   if (range->start < start && range->start + range->len > start + len) {
+   u64 new_start = start + len;
+   u64 new_len = range->start + range->len - start - len;
+
+   cur_rsv = len;
+   if (reserved)
+   *reserved += cur_rsv;
+   map->reserved -= cur_rsv;
+
+   range->len = start - range->start;
+   ret = insert_data_range(map, tmp, new_start, new_len);
+   WARN_ON(ret <= 0);
+   return 1;
+   }
+
+   /*
+* Iterate until the end of the range and free release all
+* reserved data from map.
+* We iterate by existing range, as that will makes codes a
+* little more clean.
+*
+*  |<-desired>|
+* |//1//|  |//2//| |//3//| |//4//|
+*/
+   while (range->start < start + len) {
+   struct rb_node *next = NULL;
+   int range_freed = 0;
+
+   /*
+*  |<---desired>|
+* |///|
+*/
+   if (unlikely(range->start + range->len <= start))
+   goto next;
+
+   /*
+*  ||
+* |///|
+*/
+   if (range->start < start &&
+   range->start + range->len > start) {
+   cur_rsv = range->start + range->len - start;
+
+   range->len = start - range->start;
+   goto next;
+   }
+
+   /*
+*  |<--desired-->|
+*  |/|
+* Including same start/end case, so other case don't need
+* to check start/end equal case and don't need bother
+* deleting range.
+*/
+   if (range->start >= start &&
+   range->start + range->len <= start + len) {
+   cur_rsv = range->len;
+
+   range_freed = 1;
+   next = rb_next(>node);
+   rb_erase(>node, >root);
+   kfree(range);
+   goto next;
+
+   }
+
+   /*
+*  |<--desired-->|
+*|///|
+*/
+   if (range->start < 

[PATCH 09/19] btrfs: delayed_ref: Add new function to record reserved space into delayed ref

2015-09-08 Thread Qu Wenruo
Add a new function, btrfs_add_delayed_qgroup_reserve(), to record
how much space is reserved for that extent.

As btrfs only accounts qgroup at run_delayed_refs() time, so newly
allocated extent should keep the reserved space until then.

So add needed function with related members to do it.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/delayed-ref.c | 29 +
 fs/btrfs/delayed-ref.h | 14 ++
 2 files changed, 43 insertions(+)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ac3e81d..bd9b63b 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -476,6 +476,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(_ref->ref_list);
head_ref->processing = 0;
head_ref->total_ref_mod = count_mod;
+   head_ref->qgroup_reserved = 0;
+   head_ref->qgroup_ref_root = 0;
 
/* Record qgroup extent info if provided */
if (qrecord) {
@@ -746,6 +748,33 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info 
*fs_info,
return 0;
 }
 
+int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
+struct btrfs_trans_handle *trans,
+u64 ref_root, u64 bytenr, u64 num_bytes)
+{
+   struct btrfs_delayed_ref_root *delayed_refs;
+   struct btrfs_delayed_ref_head *ref_head;
+   int ret = 0;
+
+   if (!fs_info->quota_enabled || !is_fstree(ref_root))
+   return 0;
+
+   delayed_refs = >transaction->delayed_refs;
+
+   spin_lock(_refs->lock);
+   ref_head = find_ref_head(_refs->href_root, bytenr, 0);
+   if (!ref_head) {
+   ret = -ENOENT;
+   goto out;
+   }
+   WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root);
+   ref_head->qgroup_ref_root = ref_root;
+   ref_head->qgroup_reserved = num_bytes;
+out:
+   spin_unlock(_refs->lock);
+   return ret;
+}
+
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 13fb5e6..d4c41e2 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -113,6 +113,17 @@ struct btrfs_delayed_ref_head {
int total_ref_mod;
 
/*
+* For qgroup reserved space freeing.
+*
+* ref_root and reserved will be recorded after
+* BTRFS_ADD_DELAYED_EXTENT is called.
+* And will be used to free reserved qgroup space at
+* run_delayed_refs() time.
+*/
+   u64 qgroup_ref_root;
+   u64 qgroup_reserved;
+
+   /*
 * when a new extent is allocated, it is just reserved in memory
 * The actual extent isn't inserted into the extent allocation tree
 * until the delayed ref is processed.  must_insert_reserved is
@@ -242,6 +253,9 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info 
*fs_info,
   u64 owner, u64 offset, int action,
   struct btrfs_delayed_extent_op *extent_op,
   int no_quota);
+int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
+struct btrfs_trans_handle *trans,
+u64 ref_root, u64 bytenr, u64 num_bytes);
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 02/19] btrfs: qgroup: Implement data_rsv_map init/free functions

2015-09-08 Thread Qu Wenruo
New functions btrfs_qgroup_init/free_data_rsv_map() to init/free data
reserve map.

Data reserve map is used to mark which range already holds reserved
space, to avoid current reserved space leak.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/btrfs_inode.h |  2 ++
 fs/btrfs/inode.c   | 10 +++
 fs/btrfs/qgroup.c  | 77 ++
 fs/btrfs/qgroup.h  |  3 ++
 4 files changed, 92 insertions(+)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index e3ece65..27cc338 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -199,6 +199,8 @@ struct btrfs_inode {
 
/* qgroup dirty map for data space reserve */
struct btrfs_qgroup_data_rsv_map *qgroup_rsv_map;
+   /* lock to ensure rsv_map will only be initialized once */
+   spinlock_t qgroup_init_lock;
 };
 
 extern unsigned char btrfs_filetype_table[];
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 37dd8d0..61b2c17 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8939,6 +8939,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(>delalloc_inodes);
RB_CLEAR_NODE(>rb_node);
 
+   /*
+* Init qgroup info to empty, as they will be initialized at write
+* time.
+* This behavior is needed for enable quota later case.
+*/
+   spin_lock_init(>qgroup_init_lock);
+   ei->qgroup_rsv_map = NULL;
+
return inode;
 }
 
@@ -8996,6 +9004,8 @@ void btrfs_destroy_inode(struct inode *inode)
btrfs_put_ordered_extent(ordered);
}
}
+   /* free and check data rsv map */
+   btrfs_qgroup_free_data_rsv_map(inode);
inode_tree_del(inode);
btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
 free:
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 561c36d..cf07c17 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2539,3 +2539,80 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
btrfs_queue_work(fs_info->qgroup_rescan_workers,
 _info->qgroup_rescan_work);
 }
+
+/*
+ * Init data_rsv_map for a given inode.
+ *
+ * This is needed at write time as quota can be disabled and then enabled
+ */
+int btrfs_qgroup_init_data_rsv_map(struct inode *inode)
+{
+   struct btrfs_inode *binode = BTRFS_I(inode);
+   struct btrfs_root *root = binode->root;
+   struct btrfs_qgroup_data_rsv_map *dirty_map;
+
+   if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+   return 0;
+
+   spin_lock(>qgroup_init_lock);
+   /* Quick route for init */
+   if (likely(binode->qgroup_rsv_map))
+   goto out;
+   spin_unlock(>qgroup_init_lock);
+
+   /*
+* Slow allocation route
+*
+* TODO: Use kmem_cache to speedup allocation
+*/
+   dirty_map = kmalloc(sizeof(*dirty_map), GFP_NOFS);
+   if (!dirty_map)
+   return -ENOMEM;
+
+   dirty_map->reserved = 0;
+   dirty_map->root = RB_ROOT;
+   spin_lock_init(_map->lock);
+
+   /* Lock again to ensure no one has already init it before */
+   spin_lock(>qgroup_init_lock);
+   if (binode->qgroup_rsv_map) {
+   spin_unlock(>qgroup_init_lock);
+   kfree(dirty_map);
+   return 0;
+   }
+   binode->qgroup_rsv_map = dirty_map;
+out:
+   spin_unlock(>qgroup_init_lock);
+   return 0;
+}
+
+void btrfs_qgroup_free_data_rsv_map(struct inode *inode)
+{
+   struct btrfs_inode *binode = BTRFS_I(inode);
+   struct btrfs_root *root = binode->root;
+   struct btrfs_qgroup_data_rsv_map *dirty_map = binode->qgroup_rsv_map;
+   struct rb_node *node;
+
+   /*
+* this function is called at inode destroy routine, so no concurrency
+* will happen, no need to get the lock.
+*/
+   if (!dirty_map)
+   return;
+
+   /* insanity check */
+   WARN_ON(!root->fs_info->quota_enabled || !is_fstree(root->objectid));
+
+   btrfs_qgroup_free(root, dirty_map->reserved);
+   spin_lock(_map->lock);
+   while ((node = rb_first(_map->root)) != NULL) {
+   struct data_rsv_range *range;
+
+   range = rb_entry(node, struct data_rsv_range, node);
+   rb_erase(node, _map->root);
+   kfree(range);
+   }
+   spin_unlock(_map->lock);
+   kfree(dirty_map);
+   binode->qgroup_rsv_map = NULL;
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 2f863a4..c87b7dc 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -84,4 +84,7 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, 
u64 qgroupid,
   u64 rfer, u64 excl);
 #endif
 
+/* for qgroup reserve */
+int btrfs_qgroup_init_data_rsv_map(struct inode *inode);
+void btrfs_qgroup_free_data_rsv_map(struct inode *inode);
 #endif /* 

[PATCH 18/19] btrfs: qgroup: Cleanup old inaccurate facilities

2015-09-08 Thread Qu Wenruo
Clean up the old facilities that use the old btrfs_qgroup_reserve()
call, replace them with the newer version, and remove the "__" prefix
from them.

Also, make btrfs_qgroup_reserve/free() functions private, as they are
now only used inside qgroup codes.

Now, the whole btrfs qgroup is switched to use the new reserve facilities.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/ctree.h   |  6 ++
 fs/btrfs/extent-tree.c | 56 --
 fs/btrfs/file.c|  2 +-
 fs/btrfs/inode-map.c   |  2 +-
 fs/btrfs/inode.c   | 12 +--
 fs/btrfs/ioctl.c   |  2 +-
 fs/btrfs/qgroup.c  | 19 ++---
 fs/btrfs/qgroup.h  |  7 ---
 8 files changed, 27 insertions(+), 79 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 12f14fd..8489419 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3452,8 +3452,7 @@ enum btrfs_reserve_flush_enum {
BTRFS_RESERVE_FLUSH_ALL,
 };
 
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 
write_bytes);
-int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
 int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -3471,8 +3470,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root 
*root,
  u64 qgroup_reserved);
 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
-int __btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 07f45b7..ab1b1a1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3352,7 +3352,7 @@ again:
num_pages *= 16;
num_pages *= PAGE_CACHE_SIZE;
 
-   ret = __btrfs_check_data_free_space(inode, 0, num_pages);
+   ret = btrfs_check_data_free_space(inode, 0, num_pages);
if (ret)
goto out_put;
 
@@ -4037,27 +4037,11 @@ commit_trans:
 }
 
 /*
- * This will check the space that the inode allocates from to make sure we have
- * enough space for bytes.
- */
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 
write_bytes)
-{
-   struct btrfs_root *root = BTRFS_I(inode)->root;
-   int ret;
-
-   ret = btrfs_alloc_data_chunk_ondemand(inode, bytes);
-   if (ret < 0)
-   return ret;
-   ret = btrfs_qgroup_reserve(root, write_bytes);
-   return ret;
-}
-
-/*
  * New check_data_free_space() with ability for precious data reserveation
  * Will replace old btrfs_check_data_free_space(), but for patch split,
  * add a new function first and then replace it.
  */
-int __btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
 {
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
@@ -5710,11 +5694,11 @@ void btrfs_delalloc_release_metadata(struct inode 
*inode, u64 num_bytes)
  * Return 0 for success
  * Return <0 for error(-ENOSPC or -EQUOT)
  */
-int __btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
 {
int ret;
 
-   ret = __btrfs_check_data_free_space(inode, start, len);
+   ret = btrfs_check_data_free_space(inode, start, len);
if (ret < 0)
return ret;
ret = btrfs_delalloc_reserve_metadata(inode, len);
@@ -5724,38 +5708,6 @@ int __btrfs_delalloc_reserve_space(struct inode *inode, 
u64 start, u64 len)
 }
 
 /**
- * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
- * @inode: inode we're writing to
- * @num_bytes: the number of bytes we want to allocate
- *
- * This will do the following things
- *
- * o reserve space in the data space info for num_bytes
- * o reserve space in the metadata space info based on number of outstanding
- *   extents and how much csums will be needed
- * o add to the inodes ->delalloc_bytes
- * o add it to the fs_info's delalloc inodes list.
- *
- * This will return 0 for success and -ENOSPC if there is no space left.
- */
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
-{
-   int ret;
-
-   ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
-   if (ret)

[PATCH 05/19] btrfs: qgroup: Introduce function to reserve data range per inode

2015-09-08 Thread Qu Wenruo
Introduce new function reserve_data_range().
This function will find the non-overlapping range and insert it into reserve
map using previously introduced functions.

This provides the basis for later per inode reserve map implement.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/qgroup.c | 92 +++
 1 file changed, 92 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index a4e3af4..77a2e07 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2701,6 +2701,98 @@ static int insert_data_ranges(struct 
btrfs_qgroup_data_rsv_map *map,
 }
 
 /*
+ * Check qgroup limit and insert dirty range into reserve_map.
+ *
+ * Must be called with map->lock hold
+ */
+static int reserve_data_range(struct btrfs_root *root,
+ struct btrfs_qgroup_data_rsv_map *map,
+ struct data_rsv_range *tmp,
+ struct ulist *insert_list, u64 start, u64 len)
+{
+   struct data_rsv_range *range;
+   u64 cur_start = 0;
+   u64 cur_len = 0;
+   u64 reserve = 0;
+   int ret = 0;
+
+   range = find_reserve_range(map, start);
+   /* empty tree, insert the whole range */
+   if (!range) {
+   reserve = len;
+   ret = ulist_add(insert_list, start, len, GFP_ATOMIC);
+   if (ret < 0)
+   return ret;
+   goto insert;
+   }
+
+   /* For case range is covering the leading part */
+   if (range->start <= start && range->start + range->len > start)
+   cur_start = range->start + range->len;
+   else
+   cur_start = start;
+
+   /*
+* iterate until the end of the range.
+* Like the following:
+*
+*  ||
+*|//1//|   |2//|   |///3///|   <- exists
+* Then we will need to insert the following
+*  |\\\4\\\|   |\\\5\\\|   |\\\6\\\|
+* And only add qgroup->reserved for rang 4,5,6.
+*/
+   while (cur_start < start + len) {
+   struct rb_node *next_node;
+   u64 next_start;
+
+   if (range->start + range->len <= cur_start) {
+   /*
+* Move to next range if current range is before
+* cur_start
+* e.g range is 1, cur_start is the end of range 1.
+*/
+   next_node = rb_next(>node);
+   if (!next_node) {
+   /*
+* no next range, fill the rest
+* e.g range is 3, cur_start is end of range 3.
+*/
+   cur_len = start + len - cur_start;
+   next_start = start + len;
+   } else {
+   range = rb_entry(next_node,
+struct data_rsv_range, node);
+   cur_len = min(range->start, start + len) -
+ cur_start;
+   next_start = range->start + range->len;
+   }
+   } else {
+   /*
+* current range is already after cur_start
+* e.g range is 2, cur_start is end of range 1.
+*/
+   cur_len = min(range->start, start + len) - cur_start;
+   next_start = range->start + range->len;
+   }
+   reserve += cur_len;
+   ret = ulist_add(insert_list, cur_start, cur_len, GFP_ATOMIC);
+   if (ret < 0)
+   return ret;
+
+   cur_start = next_start;
+   }
+insert:
+   ret = btrfs_qgroup_reserve(root, reserve);
+   if (ret < 0)
+   return ret;
+   /* ranges must be inserted after we are sure it has enough space */
+   ret = insert_data_ranges(map, tmp, insert_list);
+   map->reserved += reserve;
+   return ret;
+}
+
+/*
  * Init data_rsv_map for a given inode.
  *
  * This is needed at write time as quota can be disabled and then enabled
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 04/19] btrfs: qgroup: Introduce function to insert non-overlap reserve range

2015-09-08 Thread Qu Wenruo
New function insert_data_ranges() will insert non-overlap reserve ranges
into reserve map.

It provides the basis for later qgroup reserve map implement.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/qgroup.c | 124 ++
 1 file changed, 124 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index fc24fc3..a4e3af4 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2577,6 +2577,130 @@ find_reserve_range(struct btrfs_qgroup_data_rsv_map 
*map, u64 start)
 }
 
 /*
+ * Insert one data range
+ * [start,len) here won't overlap with each other.
+ *
+ * Return 0 if range is inserted and tmp is not used.
+ * Return > 0 if range is inserted and tmp is used.
+ * No catchable error case. Only possible error will cause BUG_ON() as
+ * that's logical error.
+ */
+static int insert_data_range(struct btrfs_qgroup_data_rsv_map *map,
+struct data_rsv_range *tmp,
+u64 start, u64 len)
+{
+   struct rb_node **p = >root.rb_node;
+   struct rb_node *parent = NULL;
+   struct rb_node *tmp_node = NULL;
+   struct data_rsv_range *range = NULL;
+   struct data_rsv_range *prev_range = NULL;
+   struct data_rsv_range *next_range = NULL;
+   int prev_merged = 0;
+   int next_merged = 0;
+   int ret = 0;
+
+   while (*p) {
+   parent = *p;
+   range = rb_entry(parent, struct data_rsv_range, node);
+   if (range->start < start)
+   p = &(*p)->rb_right;
+   else if (range->start > start)
+   p = &(*p)->rb_left;
+   else
+   BUG_ON(1);
+   }
+
+   /* Empty tree, goto isolated case */
+   if (!range)
+   goto insert_isolated;
+
+   /* get adjusted ranges */
+   if (range->start < start) {
+   prev_range = range;
+   tmp_node = rb_next(parent);
+   if (tmp)
+   next_range = rb_entry(tmp_node, struct data_rsv_range,
+ node);
+   } else {
+   next_range = range;
+   tmp_node = rb_prev(parent);
+   if (tmp)
+   prev_range = rb_entry(tmp_node, struct data_rsv_range,
+ node);
+   }
+
+   /* try to merge with previous and next ranges */
+   if (prev_range && prev_range->start + prev_range->len == start) {
+   prev_merged = 1;
+   prev_range->len += len;
+   }
+   if (next_range && start + len == next_range->start) {
+   next_merged = 1;
+
+   /*
+* the range can be merged with adjusted two ranges into one,
+* remove the tailing range.
+*/
+   if (prev_merged) {
+   prev_range->len += next_range->len;
+   rb_erase(_range->node, >root);
+   kfree(next_range);
+   } else {
+   next_range->start = start;
+   next_range->len += len;
+   }
+   }
+
+insert_isolated:
+   /* isolated case, need to insert range now */
+   if (!next_merged && !prev_merged) {
+   BUG_ON(!tmp);
+
+   tmp->start = start;
+   tmp->len = len;
+   rb_link_node(>node, parent, p);
+   rb_insert_color(>node, >root);
+   ret = 1;
+   }
+   return ret;
+}
+
+/*
+ * insert reserve range and merge them if possible
+ *
+ * Return 0 if all inserted and tmp not used
+ * Return > 0 if all inserted and tmp used
+ * No catchable error return value.
+ */
+static int insert_data_ranges(struct btrfs_qgroup_data_rsv_map *map,
+ struct data_rsv_range *tmp,
+ struct ulist *insert_list)
+{
+   struct ulist_node *unode;
+   struct ulist_iterator uiter;
+   int tmp_used = 0;
+   int ret = 0;
+
+   ULIST_ITER_INIT();
+   while ((unode = ulist_next(insert_list, ))) {
+   ret = insert_data_range(map, tmp, unode->val, unode->aux);
+
+   /*
+* insert_data_range() won't return error return value,
+* no need to handle <0 case.
+*
+* Also tmp should be used at most one time, so clear it to
+* NULL to cooperate with sanity check in insert_data_range().
+*/
+   if (ret > 0) {
+   tmp_used = 1;
+   tmp = NULL;
+   }
+   }
+   return tmp_used;
+}
+
+/*
  * Init data_rsv_map for a given inode.
  *
  * This is needed at write time as quota can be disabled and then enabled
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a 

[PATCH 15/19] btrfs: fallocate: Add support to accurate qgroup reserve

2015-09-08 Thread Qu Wenruo
Now fallocate will do accurate qgroup reserve space check, unlike old
method, which will always reserve the whole length of the range.

With this patch, fallocate will:
1) Iterate the desired range and mark in data rsv map
   Only range which is going to be allocated will be recorded in data
   rsv map and reserve the space.
   For already allocated range (normal/prealloc extent) they will be
   skipped.
   Also, record the marked range into a new list for later use.

2) If 1) succeeded, do real file extent allocate.
   And at file extent allocation time, corresponding range will be
   removed from the range in data rsv map.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/file.c | 147 +---
 1 file changed, 107 insertions(+), 40 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c1eec4f..26e59bc 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2545,17 +2545,61 @@ out_only_mutex:
return err;
 }
 
+/* Helper structure to record which range is already reserved */
+struct falloc_range {
+   struct list_head list;
+   u64 start;
+   u64 len;
+};
+
+/*
+ * Helper function to add falloc range
+ *
+ * Caller should have locked the larger range of extent containing
+ * [start, len)
+ */
+static int add_falloc_range(struct list_head *head, u64 start, u64 len)
+{
+   struct falloc_range *prev = NULL;
+   struct falloc_range *range = NULL;
+
+   if (list_empty(head))
+   goto insert;
+
+   /*
+* As fallocate iterate by bytenr order, we only need to check
+* the last range.
+*/
+   prev = list_entry(head->prev, struct falloc_range, list);
+   if (prev->start + prev->len == start) {
+   prev->len += len;
+   return 0;
+   }
+insert:
+   range = kmalloc(sizeof(*range), GFP_NOFS);
+   if (!range)
+   return -ENOMEM;
+   range->start = start;
+   range->len = len;
+   list_add_tail(>list, head);
+   return 0;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
loff_t offset, loff_t len)
 {
struct inode *inode = file_inode(file);
struct extent_state *cached_state = NULL;
+   struct falloc_range *range;
+   struct falloc_range *tmp;
+   struct list_head reserve_list;
u64 cur_offset;
u64 last_byte;
u64 alloc_start;
u64 alloc_end;
u64 alloc_hint = 0;
u64 locked_end;
+   u64 actual_end = 0;
struct extent_map *em;
int blocksize = BTRFS_I(inode)->root->sectorsize;
int ret;
@@ -2571,10 +2615,11 @@ static long btrfs_fallocate(struct file *file, int mode,
return btrfs_punch_hole(inode, offset, len);
 
/*
-* Make sure we have enough space before we do the
-* allocation.
+* Only trigger disk allocation, don't trigger qgroup reserve
+*
+* For qgroup space, it will be checked later.
 */
-   ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, 
alloc_end - alloc_start);
+   ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start);
if (ret)
return ret;
 
@@ -2583,6 +2628,13 @@ static long btrfs_fallocate(struct file *file, int mode,
if (ret)
goto out;
 
+   /*
+* TODO: Move these two operations after we have checked
+* accurate reserved space, or fallocate can still fail but
+* with page truncated or size expanded.
+*
+* But that's a minor problem and won't do much harm BTW.
+*/
if (alloc_start > inode->i_size) {
ret = btrfs_cont_expand(inode, i_size_read(inode),
alloc_start);
@@ -2641,10 +2693,10 @@ static long btrfs_fallocate(struct file *file, int mode,
}
}
 
+   /* First, check if we exceed the qgroup limit */
+   INIT_LIST_HEAD(_list);
cur_offset = alloc_start;
while (1) {
-   u64 actual_end;
-
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
  alloc_end - cur_offset, 0);
if (IS_ERR_OR_NULL(em)) {
@@ -2657,54 +2709,69 @@ static long btrfs_fallocate(struct file *file, int mode,
last_byte = min(extent_map_end(em), alloc_end);
actual_end = min_t(u64, extent_map_end(em), offset + len);
last_byte = ALIGN(last_byte, blocksize);
-
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
 !test_bit(EXTENT_FLAG_PREALLOC, >flags))) {
-   ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
-   last_byte - cur_offset,
-   1 << 

[PATCH 19/19] btrfs: qgroup: Add handler for NOCOW and inline

2015-09-08 Thread Qu Wenruo
For NOCOW and inline case, there will be no delayed_ref created for
them, so we should free their reserved data space at proper
time(finish_ordered_io for NOCOW and cow_file_inline for inline).

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/extent-tree.c |  7 ++-
 fs/btrfs/inode.c   | 15 +++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ab1b1a1..ca15bd3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4055,7 +4055,12 @@ int btrfs_check_data_free_space(struct inode *inode, u64 
start, u64 len)
if (ret < 0)
return ret;
 
-   /* Use new btrfs_qgroup_reserve_data to reserve precious data space */
+   /*
+* Use new btrfs_qgroup_reserve_data to reserve precious data space
+*
+* TODO: Find a good method to avoid reserve data space for NOCOW
+* range, but don't impact performance on quota disable case.
+*/
ret = btrfs_qgroup_reserve_data(inode, start, len);
return ret;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8c09197..9b783e6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -310,6 +310,13 @@ static noinline int cow_file_range_inline(struct 
btrfs_root *root,
btrfs_delalloc_release_metadata(inode, end + 1 - start);
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 out:
+   /*
+* Don't forget to free the reserved space, as for inlined extent
+* it won't count as data extent, free them directly here.
+* And at reserve time, it's always aligned to page size, so
+* just free one page here.
+*/
+   btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
btrfs_free_path(path);
btrfs_end_transaction(trans, root);
return ret;
@@ -2831,6 +2838,14 @@ static int btrfs_finish_ordered_io(struct 
btrfs_ordered_extent *ordered_extent)
 
if (test_bit(BTRFS_ORDERED_NOCOW, _extent->flags)) {
BUG_ON(!list_empty(_extent->list)); /* Logic error */
+
+   /*
+* For mwrite(mmap + memset to write) case, we still reserve
+* space for NOCOW range.
+* As NOCOW won't cause a new delayed ref, just free the space
+*/
+   btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
+  ordered_extent->len);
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
if (nolock)
trans = btrfs_join_transaction_nolock(root);
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 03/19] btrfs: qgroup: Introduce new function to search most left reserve range

2015-09-08 Thread Qu Wenruo
Introduce the new function to search the most left reserve range in a
reserve map.

It provides the basis for later reserve map implement.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/qgroup.c | 36 
 1 file changed, 36 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index cf07c17..fc24fc3 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2541,6 +2541,42 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
 }
 
 /*
+ * Return the nearest left range of given start
+ * There is no guarantee that the range will cover start.
+ */
+static struct data_rsv_range *
+find_reserve_range(struct btrfs_qgroup_data_rsv_map *map, u64 start)
+{
+   struct rb_node **p = >root.rb_node;
+   struct rb_node *parent = NULL;
+   struct rb_node *prev = NULL;
+   struct data_rsv_range *range = NULL;
+
+   while (*p) {
+   parent = *p;
+   range = rb_entry(parent, struct data_rsv_range, node);
+   if (range->start < start)
+   p = &(*p)->rb_right;
+   else if (range->start > start)
+   p = &(*p)->rb_left;
+   else
+   return range;
+   }
+
+   /* empty tree */
+   if (!parent)
+   return NULL;
+   if (range->start <= start)
+   return range;
+
+   prev = rb_prev(parent);
+   /* Already most left one */
+   if (!prev)
+   return range;
+   return rb_entry(prev, struct data_rsv_range, node);
+}
+
+/*
  * Init data_rsv_map for a given inode.
  *
  * This is needed at write time as quota can be disabled and then enabled
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 08/19] btrfs: qgroup: Introduce function to release/free reserved data range

2015-09-08 Thread Qu Wenruo
Introduce functions btrfs_qgroup_release/free_data() to release/free
reserved data range.

Release means, just remove the data range from data rsv map, but doesn't
free the reserved space.
This is for normal buffered write case, when data is written into disk
and its metadata is added into tree, its reserved space should still be
kept until commit_trans().
So in that case, we only release dirty range, but keep the reserved
space recorded somewhere else until commit_trans().

Free means not only remove data range, but also free reserved space.
This is used for case for cleanup.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/qgroup.c | 48 
 fs/btrfs/qgroup.h |  2 ++
 2 files changed, 50 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index e24c10d..ba7888f 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2979,6 +2979,54 @@ next:
return 0;
 }
 
+static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
+  int free_reserved)
+{
+   struct data_rsv_range *tmp;
+   struct btrfs_qgroup_data_rsv_map *map;
+   u64 reserved = 0;
+   int ret;
+
+   spin_lock(_I(inode)->qgroup_init_lock);
+   map = BTRFS_I(inode)->qgroup_rsv_map;
+   spin_unlock(_I(inode)->qgroup_init_lock);
+   if (!map)
+   return 0;
+
+   tmp = kmalloc(sizeof(*tmp), GFP_NOFS);
+   if (!tmp)
+   return -ENOMEM;
+   spin_lock(>lock);
+   ret = release_data_range(map, tmp, start, len, );
+   /* release_data_range() won't fail only check if memory is used */
+   if (ret == 0)
+   kfree(tmp);
+   if (free_reserved)
+   btrfs_qgroup_free(BTRFS_I(inode)->root, reserved);
+   spin_unlock(>lock);
+   return 0;
+}
+
+/*
+ * Caller should be truncate/invalidate_page.
+ * As it will release the reserved data.
+ */
+int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len)
+{
+   return __btrfs_qgroup_release_data(inode, start, len, 1);
+}
+
+/*
+ * Caller should be finish_ordered_io
+ * As qgroup accounting happens at commit time, for data written to disk
+ * its reserved space should not be freed until commit.
+ * Or we may beyond the limit.
+ */
+int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
+{
+   return __btrfs_qgroup_release_data(inode, start, len, 0);
+}
+
 /*
  * Init data_rsv_map for a given inode.
  *
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 366b853..8e69dc1 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -88,4 +88,6 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, 
u64 qgroupid,
 int btrfs_qgroup_init_data_rsv_map(struct inode *inode);
 void btrfs_qgroup_free_data_rsv_map(struct inode *inode);
 int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len);
 #endif /* __BTRFS_QGROUP__ */
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 01/19] btrfs: qgroup: New function declaration for new reserve implement

2015-09-08 Thread Qu Wenruo
Add new structures and functions for new qgroup reserve implement dirty
phase.
Which will focus on avoiding over-reserve as in that case, which means
for already reserved dirty space range, we won't reserve space again.

This patch adds the needed structure declaration and comments.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/btrfs_inode.h |  4 
 fs/btrfs/qgroup.c  | 58 ++
 fs/btrfs/qgroup.h  |  3 +++
 3 files changed, 65 insertions(+)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 81220b2..e3ece65 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -24,6 +24,7 @@
 #include "extent_io.h"
 #include "ordered-data.h"
 #include "delayed-inode.h"
+#include "qgroup.h"
 
 /*
  * ordered_data_close is set by truncate when a file that used
@@ -195,6 +196,9 @@ struct btrfs_inode {
struct timespec i_otime;
 
struct inode vfs_inode;
+
+   /* qgroup dirty map for data space reserve */
+   struct btrfs_qgroup_data_rsv_map *qgroup_rsv_map;
 };
 
 extern unsigned char btrfs_filetype_table[];
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index e9ace09..561c36d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -91,6 +91,64 @@ struct btrfs_qgroup {
u64 new_refcnt;
 };
 
+/*
+ * Record one range of reserved space.
+ */
+struct data_rsv_range {
+   struct rb_node node;
+   u64 start;
+   u64 len;
+};
+
+/*
+ * Record per inode reserved range.
+ * This is mainly used to resolve reserved space leaking problem.
+ * One of the cause is the mismatch with reserve and free.
+ *
+ * New qgroup will handle reserve in two phase.
+ * 1) Dirty phase.
+ *Pages are just marked dirty, but not written to disk.
+ * 2) Flushed phase
+ *Pages are written to disk, but transaction is not committed yet.
+ *
+ * At Dirty phase, we only need to focus on avoiding over-reserve.
+ *
+ * The idea is like below.
+ * 1) Write [0,8K)
+ * 0   4K  8K  12K 16K
+ * ||
+ * Reserve +8K, total reserved: 8K
+ *
+ * 2) Write [0,4K)
+ * 0   4K  8K  12K 16K
+ * ||
+ * Reserve 0, total reserved 8K
+ *
+ * 3) Write [12K,16K)
+ * 0   4K  8K  12K 16K
+ * ||  |///|
+ * Reserve +4K, total reserved 12K
+ *
+ * 4) Flush [0,8K)
+ * Can happen without commit transaction, like fallocate will trigger the
+ * write.
+ * 0   4K  8K  12K 16K
+ * |///|
+ * Reserve 0, total reserved 12K
+ * As the extent is written to disk, not dirty any longer, the range get
+ * removed.
+ * But as its delayed_refs is not run, its reserved space will not be freed.
+ * And things continue to Flushed phase.
+ *
+ * By this method, we can avoid over-reserve, which will lead to reserved
+ * space leak.
+ */
+struct btrfs_qgroup_data_rsv_map {
+   struct rb_root root;
+   u64 reserved;
+   spinlock_t lock;
+};
+
 static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
   int mod)
 {
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 6387dcf..2f863a4 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -33,6 +33,9 @@ struct btrfs_qgroup_extent_record {
struct ulist *old_roots;
 };
 
+/* For per-inode dirty range reserve */
+struct btrfs_qgroup_data_rsv_map;
+
 int btrfs_quota_enable(struct btrfs_trans_handle *trans,
   struct btrfs_fs_info *fs_info);
 int btrfs_quota_disable(struct btrfs_trans_handle *trans,
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 06/19] btrfs: qgroup: Introduce btrfs_qgroup_reserve_data function

2015-09-08 Thread Qu Wenruo
This new function will do all the hard work to reserve precious space
for a write.

The overall work flow will be the following.

File A already has some dirty pages:

0   4K  8K  12K 16K
|///|   |///|

And then, someone want to write some data into range [4K, 16K).
|<--desired>|

Unlike the old and wrong implement, which reserve 12K, this function
will only reserve space for newly dirty part:
|\\\|   |\\\|
Which only takes 8K reserve space, as the other parts have already allocated
their own reserve space.

So the final reserve map will be:
|///|

This provides the basis to resolve the long existing qgroup limit bug.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/qgroup.c | 57 +++
 fs/btrfs/qgroup.h |  1 +
 2 files changed, 58 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 77a2e07..337b784 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2793,6 +2793,63 @@ insert:
 }
 
 /*
+ * Make sure the data space for [start, start + len) is reserved.
+ * It will either reserve new space from given qgroup or reuse the already
+ * reserved space.
+ *
+ * Return 0 for successful reserve.
+ * Return <0 for error.
+ *
+ * TODO: to handle nocow case, like NODATACOW or write into prealloc space
+ * along with other mixed case.
+ * Like write 2M, first 1M can be nocowed, but next 1M is on hole and need COW.
+ */
+int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len)
+{
+   struct btrfs_inode *binode = BTRFS_I(inode);
+   struct btrfs_root *root = binode->root;
+   struct btrfs_qgroup_data_rsv_map *reserve_map;
+   struct data_rsv_range *tmp = NULL;
+   struct ulist *insert_list;
+   int ret;
+
+   if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
+   len == 0)
+   return 0;
+
+   if (!binode->qgroup_rsv_map) {
+   ret = btrfs_qgroup_init_data_rsv_map(inode);
+   if (ret < 0)
+   return ret;
+   }
+   reserve_map = binode->qgroup_rsv_map;
+   insert_list = ulist_alloc(GFP_NOFS);
+   if (!insert_list)
+   return -ENOMEM;
+   tmp = kzalloc(sizeof(*tmp), GFP_NOFS);
+   if (!tmp) {
+   ulist_free(insert_list);
+   return -ENOMEM;
+   }
+
+   spin_lock(_map->lock);
+   ret = reserve_data_range(root, reserve_map, tmp, insert_list, start,
+len);
+   /*
+* For error and already exists case, free tmp memory.
+* For tmp used case, set ret to 0, as some careless
+* caller consider >0 as error.
+*/
+   if (ret <= 0)
+   kfree(tmp);
+   else
+   ret = 0;
+   spin_unlock(_map->lock);
+   ulist_free(insert_list);
+   return ret;
+}
+
+/*
  * Init data_rsv_map for a given inode.
  *
  * This is needed at write time as quota can be disabled and then enabled
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index c87b7dc..366b853 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -87,4 +87,5 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, 
u64 qgroupid,
 /* for qgroup reserve */
 int btrfs_qgroup_init_data_rsv_map(struct inode *inode);
 void btrfs_qgroup_free_data_rsv_map(struct inode *inode);
+int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len);
 #endif /* __BTRFS_QGROUP__ */
-- 
2.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html