[PATCH 2/3] btrfs: Unexport get_block_group_index()

2018-01-29 Thread Qu Wenruo
That function is only used inside extent-tree.c.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/ctree.h   | 1 -
 fs/btrfs/extent-tree.c | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 13c260b525a1..27249240fa3e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2629,7 +2629,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
 u64 bytenr);
 void btrfs_get_block_group(struct btrfs_block_group_cache *cache);
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
-int get_block_group_index(struct btrfs_block_group_cache *cache);
 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
 struct btrfs_root *root,
 u64 parent, u64 root_objectid,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e9c31b567a9c..6e1128aa29d6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7346,7 +7346,8 @@ wait_block_group_cache_done(struct 
btrfs_block_group_cache *cache)
return ret;
 }
 
-int get_block_group_index(struct btrfs_block_group_cache *cache)
+static enum btrfs_raid_types
+get_block_group_index(struct btrfs_block_group_cache *cache)
 {
return btrfs_bg_flags_to_raid_index(cache->flags);
 }
-- 
2.16.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] btrfs: Refactor __get_raid_index() to btrfs_bg_flags_to_raid_index()

2018-01-29 Thread Qu Wenruo
Function __get_raid_index() is used to convert block group flags into
raid index, which can be used to get various info directly from
btrfs_raid_array[].

Refactor this function a little:

1) Rename to btrfs_bg_flags_to_raid_index()
   Double underline prefix is normally for internal functions, while the
   function is used by both extent-tree and volumes.

   Although the name is a little longer, but it should explain its usage
   quite well.

2) Move it to volumes.h and make it static inline
   Just several if-else branches, really no need to define it as a normal
   function.

   This also makes later code re-use between kernel and btrfs-progs
   easier.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/extent-tree.c | 26 --
 fs/btrfs/volumes.c |  2 +-
 fs/btrfs/volumes.h | 18 ++
 3 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2f4328511ac8..e9c31b567a9c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7346,27 +7346,9 @@ wait_block_group_cache_done(struct 
btrfs_block_group_cache *cache)
return ret;
 }
 
-int __get_raid_index(u64 flags)
-{
-   if (flags & BTRFS_BLOCK_GROUP_RAID10)
-   return BTRFS_RAID_RAID10;
-   else if (flags & BTRFS_BLOCK_GROUP_RAID1)
-   return BTRFS_RAID_RAID1;
-   else if (flags & BTRFS_BLOCK_GROUP_DUP)
-   return BTRFS_RAID_DUP;
-   else if (flags & BTRFS_BLOCK_GROUP_RAID0)
-   return BTRFS_RAID_RAID0;
-   else if (flags & BTRFS_BLOCK_GROUP_RAID5)
-   return BTRFS_RAID_RAID5;
-   else if (flags & BTRFS_BLOCK_GROUP_RAID6)
-   return BTRFS_RAID_RAID6;
-
-   return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
-}
-
 int get_block_group_index(struct btrfs_block_group_cache *cache)
 {
-   return __get_raid_index(cache->flags);
+   return btrfs_bg_flags_to_raid_index(cache->flags);
 }
 
 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
@@ -7483,7 +7465,7 @@ static noinline int find_free_extent(struct btrfs_fs_info 
*fs_info,
u64 empty_cluster = 0;
struct btrfs_space_info *space_info;
int loop = 0;
-   int index = __get_raid_index(flags);
+   int index = btrfs_bg_flags_to_raid_index(flags);
bool failed_cluster_refill = false;
bool failed_alloc = false;
bool use_cluster = true;
@@ -7579,7 +7561,7 @@ static noinline int find_free_extent(struct btrfs_fs_info 
*fs_info,
}
 search:
have_caching_bg = false;
-   if (index == 0 || index == __get_raid_index(flags))
+   if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags))
full_search = true;
down_read(_info->groups_sem);
list_for_each_entry(block_group, _info->block_groups[index],
@@ -9643,7 +9625,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 
bytenr)
 */
target = get_restripe_target(fs_info, block_group->flags);
if (target) {
-   index = __get_raid_index(extended_to_chunk(target));
+   index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
} else {
/*
 * this is just a balance, so if we were marked as full
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a25684287501..d818b1f9c625 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4625,7 +4625,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle 
*trans,
if (list_empty(_devices->alloc_list))
return -ENOSPC;
 
-   index = __get_raid_index(type);
+   index = btrfs_bg_flags_to_raid_index(type);
 
sub_stripes = btrfs_raid_array[index].sub_stripes;
dev_stripes = btrfs_raid_array[index].dev_stripes;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ff15208344a7..413e07e44b42 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -533,6 +533,24 @@ static inline void btrfs_dev_stat_reset(struct 
btrfs_device *dev,
btrfs_dev_stat_set(dev, index, 0);
 }
 
+static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
+{
+   if (flags & BTRFS_BLOCK_GROUP_RAID10)
+   return BTRFS_RAID_RAID10;
+   else if (flags & BTRFS_BLOCK_GROUP_RAID1)
+   return BTRFS_RAID_RAID1;
+   else if (flags & BTRFS_BLOCK_GROUP_DUP)
+   return BTRFS_RAID_DUP;
+   else if (flags & BTRFS_BLOCK_GROUP_RAID0)
+   return BTRFS_RAID_RAID0;
+   else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+   return BTRFS_RAID_RAID5;
+   else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+   return BTRFS_RAID_RAID6;
+
+   return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
+}
+
 void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
 void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info,
  

[PATCH 3/3] btrfs: Refactor parameter of BTRFS_MAX_DEVS() from root to fs_info

2018-01-29 Thread Qu Wenruo
Signed-off-by: Qu Wenruo 
---
 fs/btrfs/volumes.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d818b1f9c625..215e85e22c8e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4581,7 +4581,7 @@ static void check_raid56_incompat_flag(struct 
btrfs_fs_info *info, u64 type)
btrfs_set_fs_incompat(info, RAID56);
 }
 
-#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r->fs_info)\
+#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info)   \
- sizeof(struct btrfs_chunk))   \
/ sizeof(struct btrfs_stripe) + 1)
 
@@ -4638,7 +4638,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle 
*trans,
max_stripe_size = SZ_1G;
max_chunk_size = 10 * max_stripe_size;
if (!devs_max)
-   devs_max = BTRFS_MAX_DEVS(info->chunk_root);
+   devs_max = BTRFS_MAX_DEVS(info);
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
/* for larger filesystems, use larger metadata chunks */
if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
@@ -4647,7 +4647,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle 
*trans,
max_stripe_size = SZ_256M;
max_chunk_size = max_stripe_size;
if (!devs_max)
-   devs_max = BTRFS_MAX_DEVS(info->chunk_root);
+   devs_max = BTRFS_MAX_DEVS(info);
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
max_stripe_size = SZ_32M;
max_chunk_size = 2 * max_stripe_size;
-- 
2.16.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 7/7] btrfs-progs: Cleanup use of root in leaf_data_end

2018-01-29 Thread Qu Wenruo


On 2018年01月26日 15:26, Gu Jinxiang wrote:
> In function leaf_data_end, root is just used to get fs_info,
> so change the parameter of this function from btrfs_root to
> btrfs_fs_info.
> And also make it consistent with kernel.
> 
> Signed-off-by: Gu Jinxiang 
> ---
>  ctree.c | 32 +---
>  1 file changed, 17 insertions(+), 15 deletions(-)
> 
> diff --git a/ctree.c b/ctree.c
> index 11d207e7..2417483d 100644
> --- a/ctree.c
> +++ b/ctree.c
> @@ -410,12 +410,12 @@ static int btrfs_comp_keys(struct btrfs_disk_key *disk, 
> struct btrfs_key *k2)
>   * this returns the address of the start of the last item,
>   * which is the stop of the leaf data stack
>   */
> -static inline unsigned int leaf_data_end(struct btrfs_root *root,
> -  struct extent_buffer *leaf)
> +static inline unsigned int leaf_data_end(const struct btrfs_fs_info *fs_info,
> +  const struct extent_buffer *leaf)

Here leaf is const, which is fine.

>  {
>   u32 nr = btrfs_header_nritems(leaf);
>   if (nr == 0)
> - return BTRFS_LEAF_DATA_SIZE(root->fs_info);
> + return BTRFS_LEAF_DATA_SIZE(fs_info);
>   return btrfs_item_offset_nr(leaf, nr - 1);

But btrfs_item_offset_nr() doesn't have const prefix for leaf.
Leaving the following warning:

passing argument 1 of ‘btrfs_item_offset_nr’ discards ‘const’ qualifier
from pointer target type [-Wdiscarded-qualifiers]
  return btrfs_item_offset_nr(leaf, nr - 1);
  ^~~~

Thanks,
Qu
>  }
>  
> @@ -1737,10 +1737,10 @@ static int push_leaf_right(struct btrfs_trans_handle 
> *trans, struct btrfs_root
>   right_nritems = btrfs_header_nritems(right);
>  
>   push_space = btrfs_item_end_nr(left, left_nritems - push_items);
> - push_space -= leaf_data_end(root, left);
> + push_space -= leaf_data_end(fs_info, left);
>  
>   /* make room in the right data area */
> - data_end = leaf_data_end(root, right);
> + data_end = leaf_data_end(fs_info, right);
>   memmove_extent_buffer(right,
> btrfs_leaf_data(right) + data_end - push_space,
> btrfs_leaf_data(right) + data_end,
> @@ -1749,7 +1749,7 @@ static int push_leaf_right(struct btrfs_trans_handle 
> *trans, struct btrfs_root
>   /* copy from the left data area */
>   copy_extent_buffer(right, left, btrfs_leaf_data(right) +
>BTRFS_LEAF_DATA_SIZE(root->fs_info) - push_space,
> -  btrfs_leaf_data(left) + leaf_data_end(root, left),
> +  btrfs_leaf_data(left) + leaf_data_end(fs_info, left),
>push_space);
>  
>   memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
> @@ -1887,7 +1887,7 @@ static int push_leaf_left(struct btrfs_trans_handle 
> *trans, struct btrfs_root
>btrfs_item_offset_nr(right, push_items -1);
>  
>   copy_extent_buffer(left, right, btrfs_leaf_data(left) +
> -  leaf_data_end(root, left) - push_space,
> +  leaf_data_end(fs_info, left) - push_space,
>btrfs_leaf_data(right) +
>btrfs_item_offset_nr(right, push_items - 1),
>push_space);
> @@ -1914,12 +1914,13 @@ static int push_leaf_left(struct btrfs_trans_handle 
> *trans, struct btrfs_root
>  
>   if (push_items < right_nritems) {
>   push_space = btrfs_item_offset_nr(right, push_items - 1) -
> -   leaf_data_end(root, right);
> +   leaf_data_end(fs_info, right);
>   memmove_extent_buffer(right, btrfs_leaf_data(right) +
> BTRFS_LEAF_DATA_SIZE(root->fs_info) -
> push_space,
> btrfs_leaf_data(right) +
> -   leaf_data_end(root, right), push_space);
> +   leaf_data_end(fs_info, right),
> +   push_space);
>  
>   memmove_extent_buffer(right, btrfs_item_nr_offset(0),
> btrfs_item_nr_offset(push_items),
> @@ -1978,7 +1979,8 @@ static noinline int copy_for_split(struct 
> btrfs_trans_handle *trans,
>  
>   nritems = nritems - mid;
>   btrfs_set_header_nritems(right, nritems);
> - data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
> + data_copy_size = btrfs_item_end_nr(l, mid) -
> + leaf_data_end(root->fs_info, l);
>  
>   copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
>  btrfs_item_nr_offset(mid),
> @@ -1988,7 +1990,7 @@ static noinline int copy_for_split(struct 
> btrfs_trans_handle *trans,
>btrfs_leaf_data(right) +
>BTRFS_LEAF_DATA_SIZE(root->fs_info) -
>

Re: [PATCH 04/26] libbtrfsutil: add btrfs_util_is_subvolume() and btrfs_util_subvolume_id()

2018-01-29 Thread Nikolay Borisov


On 29.01.2018 23:43, Omar Sandoval wrote:
> On Mon, Jan 29, 2018 at 12:24:26PM +0200, Nikolay Borisov wrote:
>> On 26.01.2018 20:40, Omar Sandoval wrote:
> [snip]
>>> +/*
>>> + * This intentionally duplicates btrfs_util_f_is_subvolume() instead of 
>>> opening
>>> + * a file descriptor and calling it, because fstat() and fstatfs() don't 
>>> accept
>>> + * file descriptors opened with O_PATH on old kernels (before v3.6 and 
>>> before
>>> + * v3.12, respectively), but stat() and statfs() can be called on a path 
>>> that
>>> + * the user doesn't have read or write permissions to.
>>> + */
>>> +__attribute__((visibility("default")))
>>
>> Why do we need to explicitly set the attribute visibility to default,
>> isn't it implicitly default already?
> 
> Ah, I forgot to add -fvisibility=hidden to the build rule when I ported
> this to the btrfs-progs Makefile, that's why that's there. I'll add
> -fvisibility=hidden to the Makefile.

Right, it could be a good idea to hide the visibility attribute behind
an eloquent macro i.e. (PUBLIC|LIBRARY)_FUNC or some such.

> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] btrfs: add read_mirror_policy parameter devid

2018-01-29 Thread Anand Jain
Adds the mount option:
  mount -o read_mirror_policy=

To set the devid of the device which should be used for read. That
means all the normal reads will go to that particular device only.

This also helps testing and gives a better control for the test
scripts including mount context reads.

Signed-off-by: Anand Jain 
---
 fs/btrfs/super.c   | 21 +
 fs/btrfs/volumes.c | 10 ++
 fs/btrfs/volumes.h |  2 ++
 3 files changed, 33 insertions(+)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index dfe6b3c67df3..d3aad87e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -847,6 +847,27 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char 
*options,
BTRFS_READ_MIRROR_BY_PID;
break;
}
+
+   intarg = 0;
+   if (match_int([0], ) == 0) {
+   struct btrfs_device *device;
+
+   device = btrfs_find_device(info, intarg,
+  NULL, NULL);
+   if (!device) {
+   btrfs_err(info,
+ "read_mirror_policy: invalid devid 
%d",
+ intarg);
+   ret = -EINVAL;
+   goto out;
+   }
+   info->read_mirror_policy =
+   BTRFS_READ_MIRROR_BY_DEV;
+   set_bit(BTRFS_DEV_STATE_READ_MIRROR,
+   >dev_state);
+   break;
+   }
+
ret = -EINVAL;
goto out;
case Opt_err:
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 39ba59832f38..478623e6e074 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5270,6 +5270,16 @@ static int find_live_mirror(struct btrfs_fs_info 
*fs_info,
num = map->num_stripes;
 
switch(fs_info->read_mirror_policy) {
+   case BTRFS_READ_MIRROR_BY_DEV:
+   optimal = first;
+   if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
+>stripes[optimal].dev->dev_state))
+   break;
+   if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
+>stripes[++optimal].dev->dev_state))
+   break;
+   optimal = first;
+   break;
case BTRFS_READ_MIRROR_DEFAULT:
case BTRFS_READ_MIRROR_BY_PID:
default:
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 78f35d299a61..7281f55dea05 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -50,6 +50,7 @@ struct btrfs_pending_bios {
 enum btrfs_read_mirror_type {
BTRFS_READ_MIRROR_DEFAULT,
BTRFS_READ_MIRROR_BY_PID,
+   BTRFS_READ_MIRROR_BY_DEV,
 };
 
 #define BTRFS_DEV_STATE_WRITEABLE  (0)
@@ -57,6 +58,7 @@ enum btrfs_read_mirror_type {
 #define BTRFS_DEV_STATE_MISSING(2)
 #define BTRFS_DEV_STATE_REPLACE_TGT(3)
 #define BTRFS_DEV_STATE_FLUSH_SENT (4)
+#define BTRFS_DEV_STATE_READ_MIRROR(5)
 
 struct btrfs_device {
struct list_head dev_list;
-- 
2.7.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] btrfs: add mount option read_mirror_policy

2018-01-29 Thread Anand Jain
In case of RAID1 and RAID10 devices are mirror-ed, a read IO can
pick any device for reading. This choice of picking a device for
reading should be configurable. In short not one policy would
satisfy all types of workload and configs.

So before we add more policies, this patch-set makes existing
$pid policy configurable from the mount option.

For example..
  mount -o read_mirror_policy=pid (which is also default)

Signed-off-by: Anand Jain 
---
 fs/btrfs/ctree.h   |  2 ++
 fs/btrfs/super.c   | 10 ++
 fs/btrfs/volumes.c |  8 +++-
 fs/btrfs/volumes.h |  5 +
 4 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1a462ab85c49..4759e988b0df 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1100,6 +1100,8 @@ struct btrfs_fs_info {
spinlock_t ref_verify_lock;
struct rb_root block_tree;
 #endif
+   /* Policy to balance read across mirrored devices */
+   int read_mirror_policy;
 };
 
 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 367ecbf477b9..dfe6b3c67df3 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -329,6 +329,7 @@ enum {
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
Opt_ref_verify,
 #endif
+   Opt_read_mirror_policy,
Opt_err,
 };
 
@@ -393,6 +394,7 @@ static const match_table_t tokens = {
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
{Opt_ref_verify, "ref_verify"},
 #endif
+   {Opt_read_mirror_policy, "read_mirror_policy=%s"},
{Opt_err, NULL},
 };
 
@@ -839,6 +841,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char 
*options,
btrfs_set_opt(info->mount_opt, REF_VERIFY);
break;
 #endif
+   case Opt_read_mirror_policy:
+   if (strcmp(args[0].from, "pid") == 0) {
+   info->read_mirror_policy =
+   BTRFS_READ_MIRROR_BY_PID;
+   break;
+   }
+   ret = -EINVAL;
+   goto out;
case Opt_err:
btrfs_info(info, "unrecognized mount option '%s'", p);
ret = -EINVAL;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a61715677b67..39ba59832f38 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5269,7 +5269,13 @@ static int find_live_mirror(struct btrfs_fs_info 
*fs_info,
else
num = map->num_stripes;
 
-   optimal = first + current->pid % num;
+   switch(fs_info->read_mirror_policy) {
+   case BTRFS_READ_MIRROR_DEFAULT:
+   case BTRFS_READ_MIRROR_BY_PID:
+   default:
+   optimal = first + current->pid % num;
+   break;
+   }
 
if (dev_replace_is_ongoing &&
fs_info->dev_replace.cont_reading_from_srcdev_mode ==
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 28c28eeadff3..78f35d299a61 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -47,6 +47,11 @@ struct btrfs_pending_bios {
 #define btrfs_device_data_ordered_init(device) do { } while (0)
 #endif
 
+enum btrfs_read_mirror_type {
+   BTRFS_READ_MIRROR_DEFAULT,
+   BTRFS_READ_MIRROR_BY_PID,
+};
+
 #define BTRFS_DEV_STATE_WRITEABLE  (0)
 #define BTRFS_DEV_STATE_IN_FS_METADATA (1)
 #define BTRFS_DEV_STATE_MISSING(2)
-- 
2.7.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] btrfs: drop optimal argument from find_live_mirror()

2018-01-29 Thread Anand Jain
Drop optimal argument from the function find_live_mirror()
as we can deduce it in the function itself.

Signed-off-by: Anand Jain 
---
 fs/btrfs/volumes.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9c9d987838c2..a61715677b67 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5253,10 +5253,11 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info 
*fs_info, u64 logical, u64 len)
 
 static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct map_lookup *map, int first,
-   int optimal, int dev_replace_is_ongoing)
+   int dev_replace_is_ongoing)
 {
int i;
int num;
+   int optimal;
int tolerance;
struct btrfs_device *srcdev;
 
@@ -5268,6 +5269,8 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
else
num = map->num_stripes;
 
+   optimal = first + current->pid % num;
+
if (dev_replace_is_ongoing &&
fs_info->dev_replace.cont_reading_from_srcdev_mode ==
 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
@@ -5821,7 +5824,6 @@ static int __btrfs_map_block(struct btrfs_fs_info 
*fs_info,
stripe_index = mirror_num - 1;
else {
stripe_index = find_live_mirror(fs_info, map, 0,
-   current->pid % map->num_stripes,
dev_replace_is_ongoing);
mirror_num = stripe_index + 1;
}
@@ -5849,8 +5851,6 @@ static int __btrfs_map_block(struct btrfs_fs_info 
*fs_info,
int old_stripe_index = stripe_index;
stripe_index = find_live_mirror(fs_info, map,
  stripe_index,
- stripe_index +
- current->pid % map->sub_stripes,
  dev_replace_is_ongoing);
mirror_num = stripe_index - old_stripe_index + 1;
}
-- 
2.7.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/2] Policy to balance read across mirrored devices

2018-01-29 Thread Anand Jain
In case of RAID1 and RAID10 devices are mirror-ed, a read IO can
pick any device for reading. This choice of picking a device for
reading should be configurable. In short not one policy would
satisfy all types of workload and configs.

So before we add more policies, this patch-set makes existing
$pid policy configurable from the mount option and adds a devid
based read_mirror device policy. And keeps $pid based policy as
the default option for now. So this mount option helps to try out
different read mirror load balances.

Further we can add more policies on top of it, for example..

  mount -o read_mirror_policy=pid (current, default) [1]
  mount -o read_mirror_policy= [2]
  mount -o read_mirror_policy=lba (illustration only) [3]
  mount -o read_mirror_policy=ssd (illustration only) [4]
  mount -o read_mirror_policy=io (illustration only) [5]
  mount -o read_mirror_policy= (illustration only) [6]

[1]
 Current PID based read mirror balance.

[2]
 Set the devid of the device which should be used for read. That
 means all the normal read will go to that particular device only.
 This also helps testing and gives a better control for the test
 scripts including mount context reads.

[3]
 In case of SAN storage, some storage prefers that host access the
 LUN based on the LBA so that there won't be duplications of
 caching on the storage.

[4]
 In case of mix of SSD and HD we may want to use SSD as the primary
 read device.

[5]
 If storage caching is not the bottleneck but the transport layer
 is then read load should be tuned based on the IO load.

[6]
 Or a combination of any of above as per the priority.
 Timofey should consider to base his patch on top of this.
Btrfs: enchanse raid1/10 balance heuristic

This patch set is on top of the preparatory patch set:
  [PATCH 0/2] Preparatory to add read_mirror mount option


Anand Jain (2):
  btrfs: add mount option read_mirror_policy
  btrfs: add read_mirror_policy parameter devid

 fs/btrfs/ctree.h   |  2 ++
 fs/btrfs/super.c   | 31 +++
 fs/btrfs/volumes.c | 18 +-
 fs/btrfs/volumes.h |  7 +++
 4 files changed, 57 insertions(+), 1 deletion(-)

-- 
2.7.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/2] Preparatory to add read_mirror mount option

2018-01-29 Thread Anand Jain
Adds cleanups to find_live_mirror(), so that we can add more
policy on how the read mirror device should be found.

Anand Jain (2):
  btrfs: drop num argument from find_live_mirror()
  btrfs: drop optimal argument from find_live_mirror()

 fs/btrfs/volumes.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

-- 
2.7.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] btrfs: drop num argument from find_live_mirror()

2018-01-29 Thread Anand Jain
Obtain the stripes info from the map directly and so no need
to pass it as an argument.

Signed-off-by: Anand Jain 
---
 fs/btrfs/volumes.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f7147740b68e..9c9d987838c2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5252,13 +5252,22 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info 
*fs_info, u64 logical, u64 len)
 }
 
 static int find_live_mirror(struct btrfs_fs_info *fs_info,
-   struct map_lookup *map, int first, int num,
+   struct map_lookup *map, int first,
int optimal, int dev_replace_is_ongoing)
 {
int i;
+   int num;
int tolerance;
struct btrfs_device *srcdev;
 
+   ASSERT((map->type &
+(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)));
+
+   if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+   num = map->sub_stripes;
+   else
+   num = map->num_stripes;
+
if (dev_replace_is_ongoing &&
fs_info->dev_replace.cont_reading_from_srcdev_mode ==
 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
@@ -5812,7 +5821,6 @@ static int __btrfs_map_block(struct btrfs_fs_info 
*fs_info,
stripe_index = mirror_num - 1;
else {
stripe_index = find_live_mirror(fs_info, map, 0,
-   map->num_stripes,
current->pid % map->num_stripes,
dev_replace_is_ongoing);
mirror_num = stripe_index + 1;
@@ -5841,7 +5849,7 @@ static int __btrfs_map_block(struct btrfs_fs_info 
*fs_info,
int old_stripe_index = stripe_index;
stripe_index = find_live_mirror(fs_info, map,
  stripe_index,
- map->sub_stripes, stripe_index +
+ stripe_index +
  current->pid % map->sub_stripes,
  dev_replace_is_ongoing);
mirror_num = stripe_index - old_stripe_index + 1;
-- 
2.7.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: degraded permanent mount option

2018-01-29 Thread Chris Murphy
On Mon, Jan 29, 2018 at 1:54 AM, Tomasz Pala  wrote:
> On Sun, Jan 28, 2018 at 17:00:46 -0700, Chris Murphy wrote:
>
>> systemd can't possibly need to know more information than a person
>> does in the exact same situation in order to do the right thing. No
>> human would wait 10 minutes, let alone literally the heat death of the
>> planet for "all devices have appeared" but systemd will. And it does
>
> We're already repeating - systemd waits for THE btrfs-compound-device,
> not ALL the block-devices. Just like it 'waits' for someone to plug USB
> pendrive in.

Btrfs is orthogonal to systemd's willingness to wait forever while
making no progress. It doesn't matter what it is, it shouldn't wait
forever.

It occurs to me there are such systemd service units specifically for
waiting for example

systemd-networkd-wait-online.service, systemd-networkd-wait-online -
Wait for network to
   come online

 chrony-wait.service - Wait for chrony to synchronize system clock

NetworkManager has a version of this. I don't see why there can't be a
wait for Btrfs to normally mount, just simply try to mount, it fails,
wait 10, try again, wait 10 try again. And then fail the unit so we
end up at a prompt. Or some people can optionally ask for a mount -o
degraded instead of a fail, and then if that also doesn't work, the
unit fails. Of course service units can have such conditionals rather
than waiting forever.





-- 
Chris Murphy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: btrfs check: backref lost, mismatch with its hash -- can't repair

2018-01-29 Thread Qu Wenruo


On 2018年01月30日 02:16, ^m'e wrote:
> Thanks!
> 
> Got these
> 
>   # ./btrfs.static inspect dump-super -fFa /dev/sdb3 |grep
> backup_tree_root: | sort -u
> backup_tree_root:180410073088gen: 463765level: 1
> backup_tree_root:180415758336gen: 463766level: 1
> backup_tree_root:180416364544gen: 463767level: 1
> backup_tree_root:4194304gen: 463764level: 1
> 
> but, nada: all have transid failures...

That's why I call it "small chance"

> 
> The backup snapshots are OK as per original check.

Then you should be OK to restore.

Thanks,
Qu

> 
> 
> On Mon, Jan 29, 2018 at 3:09 PM, Qu Wenruo  wrote:
>>
>>
>> On 2018年01月29日 22:49, ^m'e wrote:
>>> On Mon, Jan 29, 2018 at 2:04 PM, Qu Wenruo  wrote:


 On 2018年01月29日 21:58, ^m'e wrote:
> Thanks for the advice, Qu!
>
> I used the system for a while, did some package upgrades -- writing in
> the suspect corrupted area. Then tried a btrfs-send to my backup vol,
> and it failed miserably with a nice kernel oops.
>
> So I went for a lowmem repair:
> 
> # ./btrfsck.static check --repair --mode=lowmem /dev/sdb3 2>&1 | tee
> /mnt/custom/rescue/btrfs-recovery/btrfs-repair.BTR-POOL.1.log
> WARNING: low-memory mode repair support is only partial
> Fixed 0 roots.
> checking extents
> checking free space cache
> checking fs roots
> ERROR: failed to add inode 28891726 as orphan item root 257
> ERROR: root 257 INODE[28891726] is orphan item

 At least I need dig the kernel code further to determine if the orphan
 inode handling in btrfs-progs is correct or not.

 So there won't be more dirty fix soon.

 Hopefully you could get some good backup and restore the system.

 At least the problem is limited to a very small range, and it's
 something we could handle easily.

 Thanks for all your report,
 Qu


>>>
>>> Right.
>>>
>>> Meanwhile, could you please suggest the best course of action? btrfs
>>> rescue or restore?
>>> I have snapshots of my two subvols (rootfs, home -- now fs-checking
>>> them  just in case...)
>>
>> Don't run --repair any more.
>> It seems to make the case worse.
>>
>> While the RW mount with orphan cleanup abort seems to screwed up the
>> filesystem.
>>
>> In this case, it's pretty hard to recover, but still has small chance.
>>
>> Use btrfs inspec dump-super to get backup roots:
>>
>> # btrfs inspect dump-super -fFa  |grep backup_tree_root: | sort
>> | uniq
>>
>> And try all the 4 numbers in the following commands:
>>
>> # btrfs check --tree-root  
>>
>> To see if is there any good one without transid error.
>>
>> Thanks,
>> Qu
>>
>>>
>>> Cheers,
>>>
>>>   Marco
>>>
>>
> 
> 
> 



signature.asc
Description: OpenPGP digital signature


Re: [PATCH 02/26] Add libbtrfsutil

2018-01-29 Thread Omar Sandoval
On Mon, Jan 29, 2018 at 10:16:40AM +0800, Qu Wenruo wrote:
> 
> 
> On 2018年01月27日 02:40, Omar Sandoval wrote:
> > From: Omar Sandoval 
> > 
> > Currently, users wishing to manage Btrfs filesystems programatically
> > have to shell out to btrfs-progs and parse the output. This isn't ideal.
> > The goal of libbtrfsutil is to provide a library version of as many of
> > the operations of btrfs-progs as possible and to migrate btrfs-progs to
> > use it.
> > 
> > Rather than simply refactoring the existing btrfs-progs code, the code
> > has to be written from scratch for a couple of reasons:
> > 
> > * A lot of the btrfs-progs code was not designed with a nice library API
> >   in mind in terms of reusability, naming, and error reporting.
> > * libbtrfsutil is licensed under the LGPL, whereas btrfs-progs is under
> >   the GPL, which makes it dubious to directly copy or move the code.
> > 
> > Eventually, most of the low-level btrfs-progs code should either live in
> > libbtrfsutil or the shared kernel/userspace filesystem code, and
> > btrfs-progs will just be the CLI wrapper.
> > 
> > This first commit just includes the build system changes, license,
> > README, and error reporting helper.
> > 
> > Signed-off-by: Omar Sandoval 
> > ---
> >  .gitignore  |   2 +
> >  Makefile|  47 +--
> >  libbtrfsutil/COPYING| 674 
> > 
> >  libbtrfsutil/COPYING.LESSER | 165 +++
> >  libbtrfsutil/README.md  |  32 +++
> >  libbtrfsutil/btrfsutil.h|  72 +
> >  libbtrfsutil/errors.c   |  55 
> >  7 files changed, 1031 insertions(+), 16 deletions(-)
> >  create mode 100644 libbtrfsutil/COPYING
> >  create mode 100644 libbtrfsutil/COPYING.LESSER
> >  create mode 100644 libbtrfsutil/README.md
> >  create mode 100644 libbtrfsutil/btrfsutil.h
> >  create mode 100644 libbtrfsutil/errors.c
> > 

[snip]

> > diff --git a/libbtrfsutil/btrfsutil.h b/libbtrfsutil/btrfsutil.h
> > new file mode 100644
> > index ..fe1091ca
> > --- /dev/null
> > +++ b/libbtrfsutil/btrfsutil.h
> > @@ -0,0 +1,72 @@
> > +/*
> > + * Copyright (C) 2018 Facebook
> > + *
> > + * This file is part of libbtrfsutil.
> > + *
> > + * libbtrfsutil is free software: you can redistribute it and/or modify
> > + * it under the terms of the GNU Lesser General Public License as 
> > published by
> > + * the Free Software Foundation, either version 3 of the License, or
> > + * (at your option) any later version.
> > + *
> > + * libbtrfsutil is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public License
> > + * along with libbtrfsutil.  If not, see .
> > + */
> > +
> > +#ifndef BTRFS_UTIL_H
> > +#define BTRFS_UTIL_H
> > +
> > +#ifdef __cplusplus
> > +extern "C" {
> > +#endif
> > +
> > +/**
> > + * enum btrfs_util_error - libbtrfsutil error codes.
> > + *
> > + * All functions in libbtrfsutil that can return an error return this type 
> > and
> > + * set errno.
> > + */
> 
> I totally understand that libbtrfsutils needs extra error numbers, but I
> didn't see similar practice, would you mind to give some existing
> example of such >0 error usage in other projects?
> (Just curious)

Sure, pthreads returns 0 on success, positive errnos on failure. libcurl
also uses positive error codes 
(https://curl.haxx.se/libcurl/c/libcurl-errors.html).
Those are just the first two I checked.

> Normally people would expect error values < 0 to indicate errors, just
> like glibc system call wrappers, which always return -1 to indicate errors.
> 
> > +enum btrfs_util_error {
> > +   BTRFS_UTIL_OK,
> > +   BTRFS_UTIL_ERROR_STOP_ITERATION,
> > +   BTRFS_UTIL_ERROR_NO_MEMORY,
> 
> Not sure if this is duplicated with -ENOMEM errno.
> 
> From my understanding, these extra numbers should be used to indicate
> extra error not definied in generic errno.h.
> 
> For NOT_BTRFS and NOT_SUBVOLUME they makes sense, but for NO_MEMORY, I'm
> really not sure.

So sometimes we return an errno, sometimes an enum btrfs_util_error? And
then we have to make sure to avoid collisions between the enum values
and errno values? That seems clunky.

Thanks for taking a look.

> > +   BTRFS_UTIL_ERROR_INVALID_ARGUMENT,
> > +   BTRFS_UTIL_ERROR_NOT_BTRFS,
> > +   BTRFS_UTIL_ERROR_NOT_SUBVOLUME,
> > +   BTRFS_UTIL_ERROR_SUBVOLUME_NOT_FOUND,
> > +   BTRFS_UTIL_ERROR_OPEN_FAILED,
> > +   BTRFS_UTIL_ERROR_RMDIR_FAILED,
> > +   BTRFS_UTIL_ERROR_UNLINK_FAILED,
> > +   BTRFS_UTIL_ERROR_STAT_FAILED,
> > +   BTRFS_UTIL_ERROR_STATFS_FAILED,
> > +   BTRFS_UTIL_ERROR_SEARCH_FAILED,
> > +   BTRFS_UTIL_ERROR_INO_LOOKUP_FAILED,
> > +   BTRFS_UTIL_ERROR_SUBVOL_GETFLAGS_FAILED,
> > +   

Re: [PATCH] btrfs: fix err_cast.cocci warnings

2018-01-29 Thread Anand Jain



On 01/30/2018 02:40 AM, David Sterba wrote:

On Mon, Jan 29, 2018 at 02:50:10AM +0800, kbuild test robot wrote:

From: Fengguang Wu 

fs/btrfs/volumes.c:742:10-17: WARNING: ERR_CAST can be used with fs_devices


  Use ERR_CAST inlined function instead of ERR_PTR(PTR_ERR(...))

Generated by: scripts/coccinelle/api/err_cast.cocci

Fixes: bf155c98d312 ("btrfs: get device pointer from device_list_add()")
CC: Anand Jain 
Signed-off-by: Fengguang Wu 


Thanks for the fix, I'll fold it to the patch.


 Oh. I just saw this email thread, err my filters. sorry.
 Thanks for the fix.

Anand


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: degraded permanent mount option

2018-01-29 Thread waxhead



Austin S. Hemmelgarn wrote:

On 2018-01-29 12:58, Andrei Borzenkov wrote:

29.01.2018 14:24, Adam Borowski пишет:
...


So any event (the user's request) has already happened.  A rc system, of
which systemd is one, knows whether we reached the "want root 
filesystem" or
"want secondary filesystems" stage.  Once you're there, you can issue 
the

mount() call and let the kernel do the work.

It is a btrfs choice to not expose compound device as separate one 
(like

every other device manager does)


Btrfs is not a device manager, it's a filesystem.

it is a btrfs drawback that doesn't provice anything else except for 
this

IOCTL with it's logic


How can it provide you with something it doesn't yet have?  If you 
want the

information, call mount().  And as others in this thread have mentioned,
what, pray tell, would you want to know "would a mount succeed?" for 
if you

don't want to mount?

it is a btrfs drawback that there is nothing to push assembling into 
"OK,

going degraded" state


The way to do so is to timeout, then retry with -o degraded.



That's possible way to solve it. This likely requires support from
mount.btrfs (or btrfs.ko) to return proper indication that filesystem is
incomplete so caller can decide whether to retry or to try degraded 
mount.
We already do so in the accepted standard manner.  If the mount fails 
because of a missing device, you get a very specific message in the 
kernel log about it, as is the case for most other common errors (for 
uncommon ones you usually just get a generic open_ctree error).  This is 
really the only option too, as the mount() syscall (which the mount 
command calls) returns only 0 on success or -1 and an appropriate errno 
value on failure, and we can't exactly go about creating a half dozen 
new error numbers just for this (well, technically we could, but I very 
much doubt that they would be accepted upstream, which defeats the 
purpose).


Or may be mount.btrfs should implement this logic internally. This would
really be the most simple way to make it acceptable to the other side by
not needing to accept anything :)
And would also be another layering violation which would require a 
proliferation of extra mount options to control the mount command itself 
and adjust the timeout handling.


This has been done before with mount.nfs, but for slightly different 
reasons (primarily to allow nested NFS mounts, since the local directory 
that the filesystem is being mounted on not being present is treated 
like a mount timeout), and it had near zero control.  It works there 
because they push the complicated policy decisions to userspace (namely, 
there is no support for retrying with different options or trying a 
different server).


I just felt like commenting a bit on this from a regular users point of 
view.


Remember that at some point BTRFS will probably be the default 
filesystem for the average penguin.
BTRFS big selling point is redundance and a guarantee that whatever you 
write is the same that you will read sometime later.


Many users will probably build their BTRFS system on a redundant array 
of storage devices. As long as there are sufficient (not necessarily 
all) storage devices present they expect their system to come up and 
work. If the system is not able to come up in a fully operative state it 
must at least be able to limp until the issue is fixed.


Starting a argument about what init system is the most sane or most 
shiny is not helping. The truth is that systemd is not going away 
sometime soon and one might as well try to become friends if nothing 
else for the sake of having things working which should be a common goal 
regardless of the religion.


I personally think the degraded mount option is a mistake as this 
assumes that a lightly degraded system is not able to work which is false.
If the system can mount to some working state then it should mount 
regardless if it is fully operative or not. If the array is in a bad 
state you need to learn about it by issuing a command or something. The 
same goes for a MD array (and yes, I am aware of the block layer vs 
filesystem thing here).

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [GIT PULL] inode->i_version rework for v4.16

2018-01-29 Thread Linus Torvalds
On Mon, Jan 29, 2018 at 1:50 PM, Linus Torvalds
 wrote:
>
> Hmm. I have pulled this, but it is really really broken in one place,
> to the degree that I always went "no, I won't pull this garbage".

always=almost.

I'd blame auto-correct, but I'm not on the phone.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [GIT PULL] inode->i_version rework for v4.16

2018-01-29 Thread Linus Torvalds
On Mon, Jan 29, 2018 at 4:26 AM, Jeff Layton  wrote:
>
> This pile of patches is a rework of the inode->i_version field. We have
> traditionally incremented that field on every inode data or metadata
> change. Typically this increment needs to be logged on disk even when
> nothing else has changed, which is rather expensive.

Hmm. I have pulled this, but it is really really broken in one place,
to the degree that I always went "no, I won't pull this garbage".

But the breakage is potential, not actual, and can be fixed trivially,
so I'll let it slide - but I do require it to be fixed. And I require
people to *think* about it.

So what's to horribly horribly wrong?

The inode_cmp_iversion{+raw}() functions are pure and utter crap.

Why?

You say that they return 0/negative/positive, but they do so in a
completely broken manner. They return that ternary value as the
sequence number difference in a 's64', which means that if you
actually care about that ternary value, and do the *sane* thing that
the kernel-doc of the function implies is the right thing, you would
do

int cmp = inode_cmp_iversion(inode, old);

if (cmp < 0 ...

and as a result you get code that looks sane, but that doesn't
actually *WORK* right.

To make it even worse, it will actually work in practice by accident
in 99.9% of all cases, so now you have

 (a) subtly buggy code
 (b) that looks fine
 (c) and that works in testing

which is just about the worst possible case for any code. The
interface is simply garbage that encourages bugs.

And the bug wouldn't be in the user, the bug would be in this code you
just sent me. The interface is simply wrong.

So this absolutely needs to be fixed. I see two fixes:

 - just return a boolean. That's all that any current user actually
wants, so the ternary value seems pointless.

 - make it return an 'int', and not just any int, but -1/0/1. That way
there is no worry about uses, and if somebody *really* cares about the
ternary value, they can now use a "switch" statement to get it
(alternatively, make it return an enum, but whatever).

That "ternary" function that has 18446744069414584320 incorrect return
values really is unacceptable.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 04/26] libbtrfsutil: add btrfs_util_is_subvolume() and btrfs_util_subvolume_id()

2018-01-29 Thread Omar Sandoval
On Mon, Jan 29, 2018 at 12:24:26PM +0200, Nikolay Borisov wrote:
> On 26.01.2018 20:40, Omar Sandoval wrote:
[snip]
> > +/*
> > + * This intentionally duplicates btrfs_util_f_is_subvolume() instead of 
> > opening
> > + * a file descriptor and calling it, because fstat() and fstatfs() don't 
> > accept
> > + * file descriptors opened with O_PATH on old kernels (before v3.6 and 
> > before
> > + * v3.12, respectively), but stat() and statfs() can be called on a path 
> > that
> > + * the user doesn't have read or write permissions to.
> > + */
> > +__attribute__((visibility("default")))
> 
> Why do we need to explicitly set the attribute visibility to default,
> isn't it implicitly default already?

Ah, I forgot to add -fvisibility=hidden to the build rule when I ported
this to the btrfs-progs Makefile, that's why that's there. I'll add
-fvisibility=hidden to the Makefile.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[GIT PULL] Btrfs for 4.16

2018-01-29 Thread David Sterba
Hi,

the btrfs updates for this cycle are mostly cleanups with a few raid56 bugfixes
and some feature additions. Please pull, thanks.


Features or user visible changes:

- fallocate: implement zero range mode

- avoid losing data raid profile when deleting a device

- tree item checker: more checks for directory items and xattrs

Notable fixes:

- raid56 recovery: don't use cached stripes, that could be potentially changed
  and a later RMW or recovery would lead to corruptions or failures

- let raid56 try harder to rebuild damaged data, reading from all stripes if
  necessary

- fix scrub to repair raid56 in a similar way as in the case above

Other:

- cleanups: device freeing, removed some call indirections, redundant
  bio_put/_get, unused parameters, refactorings and renames

- RCU list traversal fixups

- simplify mount callchain, remove recursing back when mounting a subvolume

- plug for fsync, may improve bio merging on multiple devices

- compression heurisic: replace heap sort with radix sort, gains some
  performance

- add extent map selftests, buffered write vs dio



The following changes since commit 0c5b9b5d9adbad4b60491f9ba0d2af38904bb4b9:

  Linux 4.15-rc9 (2018-01-21 13:51:26 -0800)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git for-4.16-tag

for you to fetch changes up to 3acbcbfc8f06d4ade2aab2ebba0a2542a05ce90c:

  btrfs: drop devid as device_list_add() arg (2018-01-29 19:31:16 +0100)


Anand Jain (33):
  btrfs: clean up btrfs_dev_stat_inc usage
  btrfs: move volume_mutex into the btrfs_rm_device()
  btrfs: rename btrfs_add_device to btrfs_add_dev_item
  btrfs: set fs_devices->seed directly
  btrfs: move check for device generation to the last
  btrfs: factor __btrfs_open_devices() to create btrfs_open_one_device()
  btrfs: drop btrfs_device::can_discard to query directly
  btrfs: add helper for device path or missing
  btrfs: cleanup device states define BTRFS_DEV_STATE_WRITEABLE
  btrfs: cleanup device states define BTRFS_DEV_STATE_IN_FS_METADATA
  btrfs: cleanup device states define BTRFS_DEV_STATE_MISSING
  btrfs: cleanup device states define BTRFS_DEV_STATE_REPLACE_TGT
  btrfs: cleanup device states define BTRFS_DEV_STATE_FLUSH_SENT
  btrfs: put btrfs_ioctl_vol_args_v2 related defines together
  btrfs: factor btrfs_check_rw_degradable() to check given device
  btrfs: remove check for BTRFS_FS_STATE_ERROR which we just set
  btrfS: collapse btrfs_handle_error() into __btrfs_handle_fs_error()
  btrfs: rename btrfs_device::scrub_device to scrub_ctx
  btrfs: simplify mutex unlocking code in btrfs_commit_transaction
  btrfs: minor style cleanups in btrfs_scan_one_device
  btrfs: define SUPER_FLAG_METADUMP_V2
  btrfs: add support for SUPER_FLAG_CHANGING_FSID
  btrfs: fail mount when sb flag is not in BTRFS_SUPER_FLAG_SUPP
  btrfs: cleanup btrfs_free_stale_device() usage
  btrfs: no need to check for btrfs_fs_devices::seeding
  btrfs: make btrfs_free_stale_device() to iterate all stales
  btrfs: make btrfs_free_stale_devices() argument optional
  btrfs: rename btrfs_free_stale_devices() arg to skip_dev
  btrfs: make btrfs_free_stale_devices() to match the path
  btrfs: move pr_info into device_list_add
  btrfs: set the total_devices in device_list_add()
  btrfs: get device pointer from device_list_add()
  btrfs: drop devid as device_list_add() arg

Arnd Bergmann (1):
  btrfs: tree-checker: use %zu format string for size_t

Colin Ian King (1):
  btrfs: make function update_share_count static

David Sterba (46):
  btrfs: rename device free rcu helper to free_device_rcu
  btrfs: introduce free_device helper
  btrfs: use free_device where opencoded
  btrfs: simplify exit paths in btrfs_init_new_device
  btrfs: document device locking
  btrfs: simplify btrfs_close_bdev
  btrfs: switch to RCU for device traversal in btrfs_ioctl_dev_info
  btrfs: switch to RCU for device traversal in btrfs_ioctl_fs_info
  btrfs: use non-RCU list traversal in write_all_supers callees
  btrfs: prepare to drop gfp mask parameter from clear_extent_bit
  btrfs: sink gfp parameter to clear_extent_bit
  btrfs: sink gfp parameter to clear_extent_uptodate
  btrfs: use GFP_KERNEL in btrfs_alloc_inode
  btrfs: sink get_extent parameter to extent_writepages
  btrfs: sink get_extent parameter to extent_write_locked_range
  btrfs: sink get_extent parameter to extent_write_full_page
  btrfs: drop get_extent from extent_page_data
  btrfs: sink get_extent parameter to extent_fiemap
  btrfs: sink get_extent parameter to get_extent_skip_holes
  btrfs: sink get_extent parameter to extent_readpages
  btrfs: sink get_extent 

Re: [PATCH] btrfs: btrfs_evict_inode must clear all inodes

2018-01-29 Thread Jeff Mahoney
On 1/29/18 2:58 PM, Liu Bo wrote:
> On Mon, Jan 29, 2018 at 11:46:28AM -0500, Jeff Mahoney wrote:
>> btrfs_evict_inode must clear all inodes or we'll hit a BUG_ON in evict().
>>
>> Fixes: 3d48d9810de (btrfs: Handle uninitialised inode eviction)
>> Cc: Nikolay Borisov 
>> Cc:  # v4.8+
>> Signed-off-by: Jeff Mahoney 
>> ---
>>  fs/btrfs/inode.c |1 +
>>  1 file changed, 1 insertion(+)
>>
>> --- a/fs/btrfs/inode.c
>> +++ b/fs/btrfs/inode.c
>> @@ -5282,6 +5282,7 @@ void btrfs_evict_inode(struct inode *ino
>>  trace_btrfs_inode_evict(inode);
>>  
>>  if (!root) {
>> +clear_inode(inode);
>>  kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
> 
> I had a patch for this, and also kmem_cache_free() is not supposed to
> be called here, but in ->destroy_inode().

Yep, that too.

Thanks,

-Jeff

-- 
Jeff Mahoney
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: btrfs_evict_inode must clear all inodes

2018-01-29 Thread Liu Bo
On Mon, Jan 29, 2018 at 11:46:28AM -0500, Jeff Mahoney wrote:
> btrfs_evict_inode must clear all inodes or we'll hit a BUG_ON in evict().
> 
> Fixes: 3d48d9810de (btrfs: Handle uninitialised inode eviction)
> Cc: Nikolay Borisov 
> Cc:  # v4.8+
> Signed-off-by: Jeff Mahoney 
> ---
>  fs/btrfs/inode.c |1 +
>  1 file changed, 1 insertion(+)
> 
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -5282,6 +5282,7 @@ void btrfs_evict_inode(struct inode *ino
>   trace_btrfs_inode_evict(inode);
>  
>   if (!root) {
> + clear_inode(inode);
>   kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));

I had a patch for this, and also kmem_cache_free() is not supposed to
be called here, but in ->destroy_inode().

Thanks,

-liubo
>   return;
>   }
> 
> -- 
> Jeff Mahoney
> SUSE Labs
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: degraded permanent mount option

2018-01-29 Thread Austin S. Hemmelgarn

On 2018-01-29 12:58, Andrei Borzenkov wrote:

29.01.2018 14:24, Adam Borowski пишет:
...


So any event (the user's request) has already happened.  A rc system, of
which systemd is one, knows whether we reached the "want root filesystem" or
"want secondary filesystems" stage.  Once you're there, you can issue the
mount() call and let the kernel do the work.


It is a btrfs choice to not expose compound device as separate one (like
every other device manager does)


Btrfs is not a device manager, it's a filesystem.


it is a btrfs drawback that doesn't provice anything else except for this
IOCTL with it's logic


How can it provide you with something it doesn't yet have?  If you want the
information, call mount().  And as others in this thread have mentioned,
what, pray tell, would you want to know "would a mount succeed?" for if you
don't want to mount?


it is a btrfs drawback that there is nothing to push assembling into "OK,
going degraded" state


The way to do so is to timeout, then retry with -o degraded.



That's possible way to solve it. This likely requires support from
mount.btrfs (or btrfs.ko) to return proper indication that filesystem is
incomplete so caller can decide whether to retry or to try degraded mount.
We already do so in the accepted standard manner.  If the mount fails 
because of a missing device, you get a very specific message in the 
kernel log about it, as is the case for most other common errors (for 
uncommon ones you usually just get a generic open_ctree error).  This is 
really the only option too, as the mount() syscall (which the mount 
command calls) returns only 0 on success or -1 and an appropriate errno 
value on failure, and we can't exactly go about creating a half dozen 
new error numbers just for this (well, technically we could, but I very 
much doubt that they would be accepted upstream, which defeats the purpose).


Or may be mount.btrfs should implement this logic internally. This would
really be the most simple way to make it acceptable to the other side by
not needing to accept anything :)
And would also be another layering violation which would require a 
proliferation of extra mount options to control the mount command itself 
and adjust the timeout handling.


This has been done before with mount.nfs, but for slightly different 
reasons (primarily to allow nested NFS mounts, since the local directory 
that the filesystem is being mounted on not being present is treated 
like a mount timeout), and it had near zero control.  It works there 
because they push the complicated policy decisions to userspace (namely, 
there is no support for retrying with different options or trying a 
different server).


With what you're proposing for BTRFS however, _everything_ is a 
complicated decision, namely:
1. Do you retry at all?  During boot, the answer should usually be yes, 
but during normal system operation it should normally be no (because we 
should be letting the user handle issues at that point).
2. How long should you wait before you retry?  There is no right answer 
here that will work in all cases (I've seen systems which take multiple 
minutes for devices to become available on boot), especially considering 
those of us who would rather have things fail early.
3. If the retry fails, do you retry again?  How many times before it 
just outright fails?  This is going to be system specific policy.  On 
systems where devices may take a while to come online, the answer is 
probably yes and some reasonably large number, while on systems where 
devices are known to reliably be online immediately, it makes no sense 
to retry more than once or twice.
4. If you are going to retry, should you try a degraded mount?  Again, 
this is going to be system specific policy (regular users would probably 
want this to be a yes, while people who care about data integrity over 
availability would likely want it to be a no).
5. Assuming you do retry with the degraded mount, how many times should 
a normal mount fail before things go degraded?  This ties in with 3 and 
has the same arguments about variability I gave there.
6. How many times do you try a degraded mount before just giving up? 
Again, similar variability to 3.
7. Should each attempt try first a regular mount and then a degraded 
one, or do you try just normal a couple times and then switch to 
degraded, or even start out trying normal and then start alternating? 
Any of those patterns has valid arguments both for and against it, so 
this again needs to be user configurable policy.


Altogether, that's a total of 7 policy decisions that should be user 
configurable.  Having a config file other than /etc/fstab for the mount 
command should probably be avoided for sanity reasons (again, BTRFS is a 
filesystem, not a volume manager), so they would all have to be handled 
through mount options.  The kernel will additionally have to understand 
that those options need to be ignored (things do try to mount 
filesystems 

Re: [PATCH] btrfs: fix err_cast.cocci warnings

2018-01-29 Thread David Sterba
On Mon, Jan 29, 2018 at 02:50:10AM +0800, kbuild test robot wrote:
> From: Fengguang Wu 
> 
> fs/btrfs/volumes.c:742:10-17: WARNING: ERR_CAST can be used with fs_devices
> 
> 
>  Use ERR_CAST inlined function instead of ERR_PTR(PTR_ERR(...))
> 
> Generated by: scripts/coccinelle/api/err_cast.cocci
> 
> Fixes: bf155c98d312 ("btrfs: get device pointer from device_list_add()")
> CC: Anand Jain 
> Signed-off-by: Fengguang Wu 

Thanks for the fix, I'll fold it to the patch.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: btrfs check: backref lost, mismatch with its hash -- can't repair

2018-01-29 Thread ^m'e
Thanks!

Got these

  # ./btrfs.static inspect dump-super -fFa /dev/sdb3 |grep
backup_tree_root: | sort -u
backup_tree_root:180410073088gen: 463765level: 1
backup_tree_root:180415758336gen: 463766level: 1
backup_tree_root:180416364544gen: 463767level: 1
backup_tree_root:4194304gen: 463764level: 1

but, nada: all have transid failures...

The backup snapshots are OK as per original check.


On Mon, Jan 29, 2018 at 3:09 PM, Qu Wenruo  wrote:
>
>
> On 2018年01月29日 22:49, ^m'e wrote:
>> On Mon, Jan 29, 2018 at 2:04 PM, Qu Wenruo  wrote:
>>>
>>>
>>> On 2018年01月29日 21:58, ^m'e wrote:
 Thanks for the advice, Qu!

 I used the system for a while, did some package upgrades -- writing in
 the suspect corrupted area. Then tried a btrfs-send to my backup vol,
 and it failed miserably with a nice kernel oops.

 So I went for a lowmem repair:
 
 # ./btrfsck.static check --repair --mode=lowmem /dev/sdb3 2>&1 | tee
 /mnt/custom/rescue/btrfs-recovery/btrfs-repair.BTR-POOL.1.log
 WARNING: low-memory mode repair support is only partial
 Fixed 0 roots.
 checking extents
 checking free space cache
 checking fs roots
 ERROR: failed to add inode 28891726 as orphan item root 257
 ERROR: root 257 INODE[28891726] is orphan item
>>>
>>> At least I need dig the kernel code further to determine if the orphan
>>> inode handling in btrfs-progs is correct or not.
>>>
>>> So there won't be more dirty fix soon.
>>>
>>> Hopefully you could get some good backup and restore the system.
>>>
>>> At least the problem is limited to a very small range, and it's
>>> something we could handle easily.
>>>
>>> Thanks for all your report,
>>> Qu
>>>
>>>
>>
>> Right.
>>
>> Meanwhile, could you please suggest the best course of action? btrfs
>> rescue or restore?
>> I have snapshots of my two subvols (rootfs, home -- now fs-checking
>> them  just in case...)
>
> Don't run --repair any more.
> It seems to make the case worse.
>
> While the RW mount with orphan cleanup abort seems to screwed up the
> filesystem.
>
> In this case, it's pretty hard to recover, but still has small chance.
>
> Use btrfs inspec dump-super to get backup roots:
>
> # btrfs inspect dump-super -fFa  |grep backup_tree_root: | sort
> | uniq
>
> And try all the 4 numbers in the following commands:
>
> # btrfs check --tree-root  
>
> To see if is there any good one without transid error.
>
> Thanks,
> Qu
>
>>
>> Cheers,
>>
>>   Marco
>>
>



-- 
^m'e
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: degraded permanent mount option

2018-01-29 Thread Andrei Borzenkov
29.01.2018 14:24, Adam Borowski пишет:
...
> 
> So any event (the user's request) has already happened.  A rc system, of
> which systemd is one, knows whether we reached the "want root filesystem" or
> "want secondary filesystems" stage.  Once you're there, you can issue the
> mount() call and let the kernel do the work.
> 
>> It is a btrfs choice to not expose compound device as separate one (like
>> every other device manager does)
> 
> Btrfs is not a device manager, it's a filesystem.
> 
>> it is a btrfs drawback that doesn't provice anything else except for this
>> IOCTL with it's logic
> 
> How can it provide you with something it doesn't yet have?  If you want the
> information, call mount().  And as others in this thread have mentioned,
> what, pray tell, would you want to know "would a mount succeed?" for if you
> don't want to mount?
> 
>> it is a btrfs drawback that there is nothing to push assembling into "OK,
>> going degraded" state
> 
> The way to do so is to timeout, then retry with -o degraded.
> 

That's possible way to solve it. This likely requires support from
mount.btrfs (or btrfs.ko) to return proper indication that filesystem is
incomplete so caller can decide whether to retry or to try degraded mount.

Or may be mount.btrfs should implement this logic internally. This would
really be the most simple way to make it acceptable to the other side by
not needing to accept anything :)
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs: btrfs_evict_inode must clear all inodes

2018-01-29 Thread Jeff Mahoney
btrfs_evict_inode must clear all inodes or we'll hit a BUG_ON in evict().

Fixes: 3d48d9810de (btrfs: Handle uninitialised inode eviction)
Cc: Nikolay Borisov 
Cc:  # v4.8+
Signed-off-by: Jeff Mahoney 
---
 fs/btrfs/inode.c |1 +
 1 file changed, 1 insertion(+)

--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5282,6 +5282,7 @@ void btrfs_evict_inode(struct inode *ino
trace_btrfs_inode_evict(inode);
 
if (!root) {
+   clear_inode(inode);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
return;
}

-- 
Jeff Mahoney
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: btrfs check: backref lost, mismatch with its hash -- can't repair

2018-01-29 Thread Qu Wenruo


On 2018年01月29日 23:08, ^m'e wrote:
> Lowmem check on my backup pool reports dozens of 'backref lost' on
> extents... Excerpt:
> --
> # ./btrfsck.static check --mode=lowmem /dev/sda1
> checking extents
> ERROR: data extent[33866182656 4096] backref lost
> ERROR: data extent[37102219264 114688] backref lost
> ERROR: data extent[37090353152 45056] backref lost
> ERROR: data extent[37193342976 114688] backref lost
> ERROR: data extent[50151686144 53248] backref lost
> ERROR: data extent[49782943744 126976] backref lost
> ERROR: data extent[49718861824 77824] backref lost
> ERROR: data extent[33853538304 4096] backref lost
> ERROR: data extent[41170333696 692224] backref lost
> ERROR: data extent[37550239744 4096] backref lost
> ERROR: data extent[33866182656 4096] backref lost
> ERROR: data extent[37102219264 114688] backref lost
> ...
> 
> ERROR: data extent[37193342976 114688] backref lost
> ERROR: data extent[50151686144 53248] backref lost
> ERROR: data extent[49782943744 126976] backref lost
> ERROR: data extent[49718861824 77824] backref lost
> ERROR: data extent[33853538304 4096] backref lost
> ERROR: data extent[41170333696 692224] backref lost
> ERROR: data extent[37550239744 4096] backref lost
> ERROR: data extent[44122832896 8192] backref lost
> ERROR: data extent[45874237440 40960] backref lost
> ERROR: errors found in extent allocation tree or chunk allocation
> checking free space cache
> block group 112772251648 has wrong amount of free space
> failed to load free space cache for block group 112772251648
> Wanted offset 127268339712, found 127268323328
> Wanted offset 127268339712, found 127268323328
> cache appears valid but isn't 127267766272
> block group 133173346304 has wrong amount of free space
> failed to load free space cache for block group 133173346304
> Wanted offset 142837039104, found 142837022720
> Wanted offset 142837039104, found 142837022720
> cache appears valid but isn't 142837022720
> ERROR: errors found in free space cache
> Checking filesystem on /dev/sda1
> UUID: 854e1bf5-7a98-4bcb-b971-0d9f2ac9452a
> found 200684883968 bytes used, error(s) found
> total csum bytes: 186712500
> total tree bytes: 40191000576
> total fs tree bytes: 39574110208
> total extent tree bytes: 359038976
> btree space waste bytes: 7605597010
> file data blocks allocated: 946099404800
>  referenced 1091731312640
> --
> 
> Is that related to flawed snapshots of the ailing rootfs vol?

For that specific problem, lowmem is not that reliable.
Use original mode to make sure.

Thanks,
Qu

> Is it safe to try to rollback from it?
> 
> Cheers,
> 
>   M.
> 
> 
> On Mon, Jan 29, 2018 at 2:49 PM, ^m'e  wrote:
>> On Mon, Jan 29, 2018 at 2:04 PM, Qu Wenruo  wrote:
>>>
>>>
>>> On 2018年01月29日 21:58, ^m'e wrote:
 Thanks for the advice, Qu!

 I used the system for a while, did some package upgrades -- writing in
 the suspect corrupted area. Then tried a btrfs-send to my backup vol,
 and it failed miserably with a nice kernel oops.

 So I went for a lowmem repair:
 
 # ./btrfsck.static check --repair --mode=lowmem /dev/sdb3 2>&1 | tee
 /mnt/custom/rescue/btrfs-recovery/btrfs-repair.BTR-POOL.1.log
 WARNING: low-memory mode repair support is only partial
 Fixed 0 roots.
 checking extents
 checking free space cache
 checking fs roots
 ERROR: failed to add inode 28891726 as orphan item root 257
 ERROR: root 257 INODE[28891726] is orphan item
>>>
>>> At least I need dig the kernel code further to determine if the orphan
>>> inode handling in btrfs-progs is correct or not.
>>>
>>> So there won't be more dirty fix soon.
>>>
>>> Hopefully you could get some good backup and restore the system.
>>>
>>> At least the problem is limited to a very small range, and it's
>>> something we could handle easily.
>>>
>>> Thanks for all your report,
>>> Qu
>>>
>>>
>>
>> Right.
>>
>> Meanwhile, could you please suggest the best course of action? btrfs
>> rescue or restore?
>> I have snapshots of my two subvols (rootfs, home -- now fs-checking
>> them  just in case...)
>>
>> Cheers,
>>
>>   Marco
> 
> 
> 



signature.asc
Description: OpenPGP digital signature


Re: btrfs check: backref lost, mismatch with its hash -- can't repair

2018-01-29 Thread ^m'e
Lowmem check on my backup pool reports dozens of 'backref lost' on
extents... Excerpt:
--
# ./btrfsck.static check --mode=lowmem /dev/sda1
checking extents
ERROR: data extent[33866182656 4096] backref lost
ERROR: data extent[37102219264 114688] backref lost
ERROR: data extent[37090353152 45056] backref lost
ERROR: data extent[37193342976 114688] backref lost
ERROR: data extent[50151686144 53248] backref lost
ERROR: data extent[49782943744 126976] backref lost
ERROR: data extent[49718861824 77824] backref lost
ERROR: data extent[33853538304 4096] backref lost
ERROR: data extent[41170333696 692224] backref lost
ERROR: data extent[37550239744 4096] backref lost
ERROR: data extent[33866182656 4096] backref lost
ERROR: data extent[37102219264 114688] backref lost
...

ERROR: data extent[37193342976 114688] backref lost
ERROR: data extent[50151686144 53248] backref lost
ERROR: data extent[49782943744 126976] backref lost
ERROR: data extent[49718861824 77824] backref lost
ERROR: data extent[33853538304 4096] backref lost
ERROR: data extent[41170333696 692224] backref lost
ERROR: data extent[37550239744 4096] backref lost
ERROR: data extent[44122832896 8192] backref lost
ERROR: data extent[45874237440 40960] backref lost
ERROR: errors found in extent allocation tree or chunk allocation
checking free space cache
block group 112772251648 has wrong amount of free space
failed to load free space cache for block group 112772251648
Wanted offset 127268339712, found 127268323328
Wanted offset 127268339712, found 127268323328
cache appears valid but isn't 127267766272
block group 133173346304 has wrong amount of free space
failed to load free space cache for block group 133173346304
Wanted offset 142837039104, found 142837022720
Wanted offset 142837039104, found 142837022720
cache appears valid but isn't 142837022720
ERROR: errors found in free space cache
Checking filesystem on /dev/sda1
UUID: 854e1bf5-7a98-4bcb-b971-0d9f2ac9452a
found 200684883968 bytes used, error(s) found
total csum bytes: 186712500
total tree bytes: 40191000576
total fs tree bytes: 39574110208
total extent tree bytes: 359038976
btree space waste bytes: 7605597010
file data blocks allocated: 946099404800
 referenced 1091731312640
--

Is that related to flawed snapshots of the ailing rootfs vol?
Is it safe to try to rollback from it?

Cheers,

  M.


On Mon, Jan 29, 2018 at 2:49 PM, ^m'e  wrote:
> On Mon, Jan 29, 2018 at 2:04 PM, Qu Wenruo  wrote:
>>
>>
>> On 2018年01月29日 21:58, ^m'e wrote:
>>> Thanks for the advice, Qu!
>>>
>>> I used the system for a while, did some package upgrades -- writing in
>>> the suspect corrupted area. Then tried a btrfs-send to my backup vol,
>>> and it failed miserably with a nice kernel oops.
>>>
>>> So I went for a lowmem repair:
>>> 
>>> # ./btrfsck.static check --repair --mode=lowmem /dev/sdb3 2>&1 | tee
>>> /mnt/custom/rescue/btrfs-recovery/btrfs-repair.BTR-POOL.1.log
>>> WARNING: low-memory mode repair support is only partial
>>> Fixed 0 roots.
>>> checking extents
>>> checking free space cache
>>> checking fs roots
>>> ERROR: failed to add inode 28891726 as orphan item root 257
>>> ERROR: root 257 INODE[28891726] is orphan item
>>
>> At least I need dig the kernel code further to determine if the orphan
>> inode handling in btrfs-progs is correct or not.
>>
>> So there won't be more dirty fix soon.
>>
>> Hopefully you could get some good backup and restore the system.
>>
>> At least the problem is limited to a very small range, and it's
>> something we could handle easily.
>>
>> Thanks for all your report,
>> Qu
>>
>>
>
> Right.
>
> Meanwhile, could you please suggest the best course of action? btrfs
> rescue or restore?
> I have snapshots of my two subvols (rootfs, home -- now fs-checking
> them  just in case...)
>
> Cheers,
>
>   Marco



-- 
^m'e
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: btrfs check: backref lost, mismatch with its hash -- can't repair

2018-01-29 Thread Qu Wenruo


On 2018年01月29日 22:49, ^m'e wrote:
> On Mon, Jan 29, 2018 at 2:04 PM, Qu Wenruo  wrote:
>>
>>
>> On 2018年01月29日 21:58, ^m'e wrote:
>>> Thanks for the advice, Qu!
>>>
>>> I used the system for a while, did some package upgrades -- writing in
>>> the suspect corrupted area. Then tried a btrfs-send to my backup vol,
>>> and it failed miserably with a nice kernel oops.
>>>
>>> So I went for a lowmem repair:
>>> 
>>> # ./btrfsck.static check --repair --mode=lowmem /dev/sdb3 2>&1 | tee
>>> /mnt/custom/rescue/btrfs-recovery/btrfs-repair.BTR-POOL.1.log
>>> WARNING: low-memory mode repair support is only partial
>>> Fixed 0 roots.
>>> checking extents
>>> checking free space cache
>>> checking fs roots
>>> ERROR: failed to add inode 28891726 as orphan item root 257
>>> ERROR: root 257 INODE[28891726] is orphan item
>>
>> At least I need dig the kernel code further to determine if the orphan
>> inode handling in btrfs-progs is correct or not.
>>
>> So there won't be more dirty fix soon.
>>
>> Hopefully you could get some good backup and restore the system.
>>
>> At least the problem is limited to a very small range, and it's
>> something we could handle easily.
>>
>> Thanks for all your report,
>> Qu
>>
>>
> 
> Right.
> 
> Meanwhile, could you please suggest the best course of action? btrfs
> rescue or restore?
> I have snapshots of my two subvols (rootfs, home -- now fs-checking
> them  just in case...)

Don't run --repair any more.
It seems to make the case worse.

While the RW mount with orphan cleanup abort seems to screwed up the
filesystem.

In this case, it's pretty hard to recover, but still has small chance.

Use btrfs inspec dump-super to get backup roots:

# btrfs inspect dump-super -fFa  |grep backup_tree_root: | sort
| uniq

And try all the 4 numbers in the following commands:

# btrfs check --tree-root  

To see if is there any good one without transid error.

Thanks,
Qu

> 
> Cheers,
> 
>   Marco
> 



signature.asc
Description: OpenPGP digital signature


Re: btrfs check: backref lost, mismatch with its hash -- can't repair

2018-01-29 Thread ^m'e
On Mon, Jan 29, 2018 at 2:04 PM, Qu Wenruo  wrote:
>
>
> On 2018年01月29日 21:58, ^m'e wrote:
>> Thanks for the advice, Qu!
>>
>> I used the system for a while, did some package upgrades -- writing in
>> the suspect corrupted area. Then tried a btrfs-send to my backup vol,
>> and it failed miserably with a nice kernel oops.
>>
>> So I went for a lowmem repair:
>> 
>> # ./btrfsck.static check --repair --mode=lowmem /dev/sdb3 2>&1 | tee
>> /mnt/custom/rescue/btrfs-recovery/btrfs-repair.BTR-POOL.1.log
>> WARNING: low-memory mode repair support is only partial
>> Fixed 0 roots.
>> checking extents
>> checking free space cache
>> checking fs roots
>> ERROR: failed to add inode 28891726 as orphan item root 257
>> ERROR: root 257 INODE[28891726] is orphan item
>
> At least I need dig the kernel code further to determine if the orphan
> inode handling in btrfs-progs is correct or not.
>
> So there won't be more dirty fix soon.
>
> Hopefully you could get some good backup and restore the system.
>
> At least the problem is limited to a very small range, and it's
> something we could handle easily.
>
> Thanks for all your report,
> Qu
>
>

Right.

Meanwhile, could you please suggest the best course of action? btrfs
rescue or restore?
I have snapshots of my two subvols (rootfs, home -- now fs-checking
them  just in case...)

Cheers,

  Marco
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3] btrfs: Ignore errors from btrfs_qgroup_trace_extent_post

2018-01-29 Thread Qu Wenruo


On 2018年01月29日 21:53, Nikolay Borisov wrote:
> Running generic/019 with qgroups on the scratch device enabled is
> almost guaranteed to trigger the BUG_ON in btrfs_free_tree_block. It's
> supposed to trigger only on -ENOMEM, in reality, however, it's possible
> to get -EIO from btrfs_qgroup_trace_extent_post. This function just
> finds the roots of the extent being tracked and sets the qrecord->old_roots
> list. If this operation fails nothing critical happens except the
> quota accounting can be considered wrong. In such case just set the
> INCONSISTENT flag for the quota and print a warning, rather than
> killing off the system. Additionally, it's possible to trigger a BUG_ON
> in btrfs_truncate_inode_items as well.
> 
> Signed-off-by: Nikolay Borisov 

Reviewed-by: Qu Wenruo 

Thanks,
Qu

> ---
> V3: 
>  * Incorporate feedback from Qu and make btrfs_qgroup_trace_extent_post always
>  return 0 since higher layers don't really handle errors properly.
> 
> V2: 
>  * Always print a warning if btrfs_qgroup_trace_extent_post fails 
>   * Set quota inconsistent flag if btrfs_qgroup_trace_extent_post fails
>  fs/btrfs/delayed-ref.c | 3 ++-
>  fs/btrfs/qgroup.c  | 7 +--
>  2 files changed, 7 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
> index a1a40cf382e3..7ab5e0128f0c 100644
> --- a/fs/btrfs/delayed-ref.c
> +++ b/fs/btrfs/delayed-ref.c
> @@ -821,7 +821,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info 
> *fs_info,
>   spin_unlock(_refs->lock);
>  
>   if (qrecord_inserted)
> - return btrfs_qgroup_trace_extent_post(fs_info, record);
> + btrfs_qgroup_trace_extent_post(fs_info, record);
> +
>   return 0;
>  
>  free_head_ref:
> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> index 9e61dd624f7b..ebde770ba498 100644
> --- a/fs/btrfs/qgroup.c
> +++ b/fs/btrfs/qgroup.c
> @@ -1442,8 +1442,11 @@ int btrfs_qgroup_trace_extent_post(struct 
> btrfs_fs_info *fs_info,
>   int ret;
>  
>   ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, _root, false);
> - if (ret < 0)
> - return ret;
> + if (ret < 0) {
> + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
> + btrfs_warn(fs_info, "Error accounting new delayed refs extent 
> (err code: %d). Quota inconsistent", ret);
> + return 0;
> + }
>  
>   /*
>* Here we don't need to get the lock of
> 



signature.asc
Description: OpenPGP digital signature


Re: btrfs check: backref lost, mismatch with its hash -- can't repair

2018-01-29 Thread Qu Wenruo


On 2018年01月29日 21:58, ^m'e wrote:
> Thanks for the advice, Qu!
> 
> I used the system for a while, did some package upgrades -- writing in
> the suspect corrupted area. Then tried a btrfs-send to my backup vol,
> and it failed miserably with a nice kernel oops.
> 
> So I went for a lowmem repair:
> 
> # ./btrfsck.static check --repair --mode=lowmem /dev/sdb3 2>&1 | tee
> /mnt/custom/rescue/btrfs-recovery/btrfs-repair.BTR-POOL.1.log
> WARNING: low-memory mode repair support is only partial
> Fixed 0 roots.
> checking extents
> checking free space cache
> checking fs roots
> ERROR: failed to add inode 28891726 as orphan item root 257
> ERROR: root 257 INODE[28891726] is orphan item

At least I need dig the kernel code further to determine if the orphan
inode handling in btrfs-progs is correct or not.

So there won't be more dirty fix soon.

Hopefully you could get some good backup and restore the system.

At least the problem is limited to a very small range, and it's
something we could handle easily.

Thanks for all your report,
Qu

> ERROR: failed to add inode 28892766 as orphan item root 257
> ERROR: root 257 INODE[28892766] is orphan item
> ERROR: failed to add inode 28892768 as orphan item root 257
> ERROR: root 257 INODE[28892768] is orphan item
> ERROR: failed to add inode 28893017 as orphan item root 257
> ERROR: root 257 INODE[28893017] is orphan item
> ERROR: failed to add inode 28893132 as orphan item root 257
> ERROR: root 257 INODE[28893132] is orphan item
> ERROR: failed to add inode 28893724 as orphan item root 257
> ERROR: root 257 INODE[28893724] is orphan item
> ERROR: failed to add inode 28893744 as orphan item root 257
> ERROR: root 257 INODE[28893744] is orphan item
> ERROR: failed to add inode 28893858 as orphan item root 257
> ERROR: root 257 INODE[28893858] is orphan item
> ERROR: failed to add inode 28893860 as orphan item root 257
> ERROR: root 257 INODE[28893860] is orphan item
> ERROR: failed to add inode 28894007 as orphan item root 257
> ERROR: root 257 INODE[28894007] is orphan item
> ERROR: failed to add inode 28894011 as orphan item root 257
> ERROR: root 257 INODE[28894011] is orphan item
> ERROR: failed to add inode 28894084 as orphan item root 257
> ERROR: root 257 INODE[28894084] is orphan item
> ERROR: failed to add inode 28894145 as orphan item root 257
> ERROR: root 257 INODE[28894145] is orphan item
> ERROR: failed to add inode 28894223 as orphan item root 257
> ERROR: root 257 INODE[28894223] is orphan item
> ERROR: failed to add inode 28894249 as orphan item root 257
> ERROR: root 257 INODE[28894249] is orphan item
> ERROR: failed to add inode 28895101 as orphan item root 257
> ERROR: root 257 INODE[28895101] is orphan item
> ERROR: failed to add inode 44565472 as orphan item root 257
> ERROR: root 257 INODE[44565472] is orphan item
> ERROR: failed to add inode 44565473 as orphan item root 257
> ERROR: root 257 INODE[44565473] is orphan item
> ERROR: failed to add inode 44565474 as orphan item root 257
> ERROR: root 257 INODE[44565474] is orphan item
> ERROR: failed to add inode 44565475 as orphan item root 257
> ERROR: root 257 INODE[44565475] is orphan item
> ERROR: failed to add inode 44565477 as orphan item root 257
> ERROR: root 257 INODE[44565477] is orphan item
> ERROR: failed to add inode 44565478 as orphan item root 257
> ERROR: root 257 INODE[44565478] is orphan item
> ERROR: failed to add inode 44565486 as orphan item root 257
> ERROR: root 257 INODE[44565486] is orphan item
> ERROR: failed to add inode 44565487 as orphan item root 257
> ERROR: root 257 INODE[44565487] is orphan item
> ERROR: failed to add inode 44565489 as orphan item root 257
> ERROR: root 257 INODE[44565489] is orphan item
> ERROR: failed to add inode 44773502 as orphan item root 257
> ERROR: root 257 INODE[44773502] is orphan item
> ERROR: failed to add inode 44773506 as orphan item root 257
> ERROR: root 257 INODE[44773506] is orphan item
> ERROR: failed to add inode 44773507 as orphan item root 257
> ERROR: root 257 INODE[44773507] is orphan item
> ERROR: failed to add inode 44773508 as orphan item root 257
> ERROR: root 257 INODE[44773508] is orphan item
> ERROR: failed to add inode 44825557 as orphan item root 257
> ERROR: root 257 INODE[44825557] is orphan item
> ERROR: failed to add inode 47300728 as orphan item root 257
> ERROR: root 257 INODE[47300728] is orphan item
> ERROR: failed to add inode 47300730 as orphan item root 257
> ERROR: root 257 INODE[47300730] is orphan item
> ERROR: failed to add inode 47300732 as orphan item root 257
> ERROR: root 257 INODE[47300732] is orphan item
> ERROR: failed to add inode 47300733 as orphan item root 257
> ERROR: root 257 INODE[47300733] is orphan item
> ERROR: failed to add inode 47333260 as orphan item root 257
> ERROR: root 257 INODE[47333260] is orphan item
> ERROR: failed to add inode 47347678 as orphan item root 257
> ERROR: root 257 

Re: btrfs check: backref lost, mismatch with its hash -- can't repair

2018-01-29 Thread ^m'e
Thanks for the advice, Qu!

I used the system for a while, did some package upgrades -- writing in
the suspect corrupted area. Then tried a btrfs-send to my backup vol,
and it failed miserably with a nice kernel oops.

So I went for a lowmem repair:

# ./btrfsck.static check --repair --mode=lowmem /dev/sdb3 2>&1 | tee
/mnt/custom/rescue/btrfs-recovery/btrfs-repair.BTR-POOL.1.log
WARNING: low-memory mode repair support is only partial
Fixed 0 roots.
checking extents
checking free space cache
checking fs roots
ERROR: failed to add inode 28891726 as orphan item root 257
ERROR: root 257 INODE[28891726] is orphan item
ERROR: failed to add inode 28892766 as orphan item root 257
ERROR: root 257 INODE[28892766] is orphan item
ERROR: failed to add inode 28892768 as orphan item root 257
ERROR: root 257 INODE[28892768] is orphan item
ERROR: failed to add inode 28893017 as orphan item root 257
ERROR: root 257 INODE[28893017] is orphan item
ERROR: failed to add inode 28893132 as orphan item root 257
ERROR: root 257 INODE[28893132] is orphan item
ERROR: failed to add inode 28893724 as orphan item root 257
ERROR: root 257 INODE[28893724] is orphan item
ERROR: failed to add inode 28893744 as orphan item root 257
ERROR: root 257 INODE[28893744] is orphan item
ERROR: failed to add inode 28893858 as orphan item root 257
ERROR: root 257 INODE[28893858] is orphan item
ERROR: failed to add inode 28893860 as orphan item root 257
ERROR: root 257 INODE[28893860] is orphan item
ERROR: failed to add inode 28894007 as orphan item root 257
ERROR: root 257 INODE[28894007] is orphan item
ERROR: failed to add inode 28894011 as orphan item root 257
ERROR: root 257 INODE[28894011] is orphan item
ERROR: failed to add inode 28894084 as orphan item root 257
ERROR: root 257 INODE[28894084] is orphan item
ERROR: failed to add inode 28894145 as orphan item root 257
ERROR: root 257 INODE[28894145] is orphan item
ERROR: failed to add inode 28894223 as orphan item root 257
ERROR: root 257 INODE[28894223] is orphan item
ERROR: failed to add inode 28894249 as orphan item root 257
ERROR: root 257 INODE[28894249] is orphan item
ERROR: failed to add inode 28895101 as orphan item root 257
ERROR: root 257 INODE[28895101] is orphan item
ERROR: failed to add inode 44565472 as orphan item root 257
ERROR: root 257 INODE[44565472] is orphan item
ERROR: failed to add inode 44565473 as orphan item root 257
ERROR: root 257 INODE[44565473] is orphan item
ERROR: failed to add inode 44565474 as orphan item root 257
ERROR: root 257 INODE[44565474] is orphan item
ERROR: failed to add inode 44565475 as orphan item root 257
ERROR: root 257 INODE[44565475] is orphan item
ERROR: failed to add inode 44565477 as orphan item root 257
ERROR: root 257 INODE[44565477] is orphan item
ERROR: failed to add inode 44565478 as orphan item root 257
ERROR: root 257 INODE[44565478] is orphan item
ERROR: failed to add inode 44565486 as orphan item root 257
ERROR: root 257 INODE[44565486] is orphan item
ERROR: failed to add inode 44565487 as orphan item root 257
ERROR: root 257 INODE[44565487] is orphan item
ERROR: failed to add inode 44565489 as orphan item root 257
ERROR: root 257 INODE[44565489] is orphan item
ERROR: failed to add inode 44773502 as orphan item root 257
ERROR: root 257 INODE[44773502] is orphan item
ERROR: failed to add inode 44773506 as orphan item root 257
ERROR: root 257 INODE[44773506] is orphan item
ERROR: failed to add inode 44773507 as orphan item root 257
ERROR: root 257 INODE[44773507] is orphan item
ERROR: failed to add inode 44773508 as orphan item root 257
ERROR: root 257 INODE[44773508] is orphan item
ERROR: failed to add inode 44825557 as orphan item root 257
ERROR: root 257 INODE[44825557] is orphan item
ERROR: failed to add inode 47300728 as orphan item root 257
ERROR: root 257 INODE[47300728] is orphan item
ERROR: failed to add inode 47300730 as orphan item root 257
ERROR: root 257 INODE[47300730] is orphan item
ERROR: failed to add inode 47300732 as orphan item root 257
ERROR: root 257 INODE[47300732] is orphan item
ERROR: failed to add inode 47300733 as orphan item root 257
ERROR: root 257 INODE[47300733] is orphan item
ERROR: failed to add inode 47333260 as orphan item root 257
ERROR: root 257 INODE[47333260] is orphan item
ERROR: failed to add inode 47347678 as orphan item root 257
ERROR: root 257 INODE[47347678] is orphan item
ERROR: failed to add inode 47499572 as orphan item root 257
ERROR: root 257 INODE[47499572] is orphan item
enabling repair mode
Checking filesystem on /dev/sdb3
UUID: de1723e2-150c-4448-bb36-be14d7d96093
No device size related problem found
cache and super generation don't match, space cache will be invalidated
Can't find file name for inode 47353519, use 47353519 instead
Moving file '47353519' to 'lost+found' dir since it has no valid backref
Fixed nlink of inode 257 root 47353519 name 47353519 filetype 2
Can't find file name for inode 

[PATCH v3] btrfs: Ignore errors from btrfs_qgroup_trace_extent_post

2018-01-29 Thread Nikolay Borisov
Running generic/019 with qgroups on the scratch device enabled is
almost guaranteed to trigger the BUG_ON in btrfs_free_tree_block. It's
supposed to trigger only on -ENOMEM, in reality, however, it's possible
to get -EIO from btrfs_qgroup_trace_extent_post. This function just
finds the roots of the extent being tracked and sets the qrecord->old_roots
list. If this operation fails nothing critical happens except the
quota accounting can be considered wrong. In such case just set the
INCONSISTENT flag for the quota and print a warning, rather than
killing off the system. Additionally, it's possible to trigger a BUG_ON
in btrfs_truncate_inode_items as well.

Signed-off-by: Nikolay Borisov 
---
V3: 
 * Incorporate feedback from Qu and make btrfs_qgroup_trace_extent_post always
 return 0 since higher layers don't really handle errors properly.

V2: 
 * Always print a warning if btrfs_qgroup_trace_extent_post fails 
  * Set quota inconsistent flag if btrfs_qgroup_trace_extent_post fails
 fs/btrfs/delayed-ref.c | 3 ++-
 fs/btrfs/qgroup.c  | 7 +--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index a1a40cf382e3..7ab5e0128f0c 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -821,7 +821,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info 
*fs_info,
spin_unlock(_refs->lock);
 
if (qrecord_inserted)
-   return btrfs_qgroup_trace_extent_post(fs_info, record);
+   btrfs_qgroup_trace_extent_post(fs_info, record);
+
return 0;
 
 free_head_ref:
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 9e61dd624f7b..ebde770ba498 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1442,8 +1442,11 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info 
*fs_info,
int ret;
 
ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, _root, false);
-   if (ret < 0)
-   return ret;
+   if (ret < 0) {
+   fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+   btrfs_warn(fs_info, "Error accounting new delayed refs extent 
(err code: %d). Quota inconsistent", ret);
+   return 0;
+   }
 
/*
 * Here we don't need to get the lock of
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] btrfs: Gracefully handle errors in btrfs_qgroup_trace_extent_post

2018-01-29 Thread Nikolay Borisov


On 29.01.2018 13:57, Qu Wenruo wrote:
> 
> 
> On 2018年01月29日 19:21, Nikolay Borisov wrote:
>>
>>
>> On 29.01.2018 13:09, Qu Wenruo wrote:
>>>
>>>
>>> On 2018年01月29日 15:44, Nikolay Borisov wrote:
 Running generic/019 with qgroups on the scratch device enabled is
 almost guaranteed to trigger the BUG_ON in btrfs_free_tree_block. It's
 supposed to trigger only on -ENOMEM, in reality, however, it's possible
 to get -EIO from btrfs_qgroup_trace_extent_post. This function just
 finds the roots of the extent being tracked and sets the qrecord->old_roots
 list. If this operation fails nothing critical happens except the
 quota accounting can be considered wrong. In such case just set the
 INCONSISTENT flag for the quota and print a warning.

 Signed-off-by: Nikolay Borisov 
 ---

 V2: 
  * Always print a warning if btrfs_qgroup_trace_extent_post fails 
  * Set quota inconsistent flag if btrfs_qgroup_trace_extent_post fails

  fs/btrfs/delayed-ref.c | 7 +--
  fs/btrfs/qgroup.c  | 6 --
  2 files changed, 9 insertions(+), 4 deletions(-)

 diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
 index a1a40cf382e3..5b2789a28a13 100644
 --- a/fs/btrfs/delayed-ref.c
 +++ b/fs/btrfs/delayed-ref.c
 @@ -820,8 +820,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info 
 *fs_info,
 num_bytes, parent, ref_root, level, action);
spin_unlock(_refs->lock);
  
 -  if (qrecord_inserted)
 -  return btrfs_qgroup_trace_extent_post(fs_info, record);
 +  if (qrecord_inserted) {
 +  int ret = btrfs_qgroup_trace_extent_post(fs_info, record);
 +  if (ret < 0)
 +  btrfs_warn(fs_info, "Error accounting new delayed refs 
 extent (err code: %d). Quota inconsistent", ret);
>>>
>>> Sorry that I didn't point it out in previous review, there are 2 callers
>>> in delayed-ref.c using btrfs_qgroup_trace_extent_post().
>>>
>>> One is the one you're fixing, and the other one is
>>> btrfs_add_delayed_data_ref().
>>
>> Yes, but the callers of btrfs_add_delayed_data_ref seem to be expecting
>> error values and actually handling them.
> 
> Not exactly.
> 
> A quick search leads to extra unhandled btrfs_add_delayed_data_ref().
> 
> walk_down_proc()
> |- btrfs_dec_ref()
>|- __btrfs_mod_ref()
>   |- btrfs_free_extent()
>  |- btrfs_add_delayed_data_ref()
> |- btrfs_qgroup_trace_extent_post()
> 
> And this leads to another BUG_ON().
> 


And I just hit : 

[ 4289.716628] [ cut here ]
[ 4289.716631] kernel BUG at fs/btrfs/inode.c:4661!
[ 4289.716872] invalid opcode:  [#1] SMP KASAN PTI
[ 4289.717029] Modules linked in:
[ 4289.717158] CPU: 5 PID: 2815 Comm: btrfs-cleaner Tainted: GB   W
4.15.0-rc9-nbor #421
[ 4289.717408] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 4289.717680] RIP: 0010:btrfs_truncate_inode_items+0x1016/0x1e00
[ 4289.717850] RSP: 0018:8801047d7ae0 EFLAGS: 00010282
[ 4289.718009] RAX: fffb RBX: 006c RCX: 8800ad0faf00
[ 4289.718197] RDX: ed00208faf3f RSI: 0001 RDI: 0246
[ 4289.718381] RBP: 88002fcd7840 R08: 0406 R09: 001c3000
[ 4289.718565] R10: 8801047d7410 R11: 1100208fae58 R12: dc00
[ 4289.718748] R13: 8800adb42000 R14: 001c3000 R15: 8800b7eaa700
[ 4289.718932] FS:  () GS:88010ef4() 
knlGS:
[ 4289.719161] CS:  0010 DS:  ES:  CR0: 80050033
[ 4289.719364] CR2: 7f5b2bb86008 CR3: 6ea4 CR4: 06a0
[ 4289.719574] Call Trace:
[ 4289.719700]  ? btrfs_rmdir+0x610/0x610
[ 4289.719839]  ? _raw_spin_unlock+0x24/0x30
[ 4289.719983]  ? join_transaction+0x268/0xce0
[ 4289.720139]  ? start_transaction+0x193/0xf00
[ 4289.720378]  ? btrfs_block_rsv_refill+0x9e/0xb0
[ 4289.720628]  btrfs_evict_inode+0xb89/0x1070
[ 4289.720808]  ? btrfs_setattr+0x750/0x750
[ 4289.720949]  evict+0x20f/0x570
[ 4289.721075]  btrfs_run_delayed_iputs+0x122/0x220
[ 4289.721248]  ? mutex_trylock+0x152/0x1b0
[ 4289.721387]  cleaner_kthread+0x2e5/0x400
[ 4289.721527]  ? btrfs_destroy_pinned_extent+0x140/0x140
[ 4289.721680]  ? btrfs_destroy_pinned_extent+0x140/0x140
[ 4289.721835]  ? btrfs_destroy_pinned_extent+0x140/0x140
[ 4289.721989]  kthread+0x2d4/0x3d0
[ 4289.722115]  ? kthread_create_on_node+0xb0/0xb0
[ 4289.722258]  ret_from_fork+0x3a/0x50
[ 4289.722390] Code: eb 8b 9c 24 e0 00 00 00 48 8b ac 24 d8 00 00 00 65 ff 0d 
3e 63 4e 7e e9 9d fc ff ff 48 8b 7c 24 40 e8 df c5 0e 00 e9 bc f2 ff ff <0f> 0b 
48 8b 7c 24 78 e8 1e b6 96 ff e9 7d f3 ff ff e8 14 b6 96 
[ 4289.722850] RIP: btrfs_truncate_inode_items+0x1016/0x1e00 RSP: 
8801047d7ae0
[ 4289.723101] ---[ end trace bb97e3f06308a096 ]---

Which is: 

 ret = 

Re: degraded permanent mount option

2018-01-29 Thread Austin S. Hemmelgarn

On 2018-01-27 17:42, Tomasz Pala wrote:

On Sat, Jan 27, 2018 at 14:26:41 +0100, Adam Borowski wrote:


It's quite obvious who's the culprit: every single remaining rc system
manages to mount degraded btrfs without problems.  They just don't try to
outsmart the kernel.


Yes. They are stupid enough to fail miserably with any more complicated
setups, like stacking volume managers, crypto layer, network attached
storage etc.
I think you mean any setup that isn't sensibly layered.  BCP for over a 
decade has been to put multipathing at the bottom, then crypto, then 
software RAID, than LVM, and then whatever filesystem you're using. 
Multipathing has to be the bottom layer for a given node because it 
interacts directly with hardware topology which gets obscured by the 
other layers.  Crypto essentially has to be next, otherwise you leak 
info about the storage stack.  Swapping LVM and software RAID ends up 
giving you a setup which is difficult for most people to understand and 
therefore is hard to reliably maintain.


Other init systems enforce things being this way because it maintains 
people's sanity, not because they have significant difficulty doing 
things differently (and in fact, it is _trivial_ to change the ordering 
in some of them, OpenRC on Gentoo for example quite literally requires 
exactly N-1 lines to change in each of N files when re-ordering N 
layers), provided each layer occurs exactly once for a given device and 
the relative ordering is the same on all devices.  And you know what? 
Given my own experience with systemd, it has exactly the same constraint 
on relative ordering.  I've tried to run split setups with LVM and 
dm-crypt where one device had dm-crypt as the bottom layer and the other 
had it as the top layer, and things locked up during boot on _every_ 
generalized init system I tried.



Recently I've started mdadm on top of bunch of LVM volumes, with others
using btrfs and others prepared for crypto. And you know what? systemd
assembled everything just fine.

So with argument just like yours:

It's quite obvious who's the culprit: every single remaining filesystem
manages to mount under systemd without problems. They just expose
informations about their state.
No, they don't (except ZFS).  There is no 'state' to expose for anything 
but BTRFS (and ZFS) except possibly if the filesystem needs checked or 
not.  You're conflating filesystems and volume management.


The alternative way of putting what you just said is:
Every single remaining filesystem manages to mount under systemd without 
problems, because it doesn't try to treat them as a block layer.



This is not a systemd issue, but apparently btrfs design choice to allow
using any single component device name also as volume name itself.


And what other user interface would you propose? The only alternative I see
is inventing a device manager (like you're implying below that btrfs does),
which would needlessly complicate the usual, single-device, case.


The 'needless complication', as you named it, usually should be the default
to use. Avoiding LVM? Then take care of repartitioning. Avoiding mdadm?
No easy way to RAID the drive (there are device-mapper tricks, they are
just way more complicated). Even attaching SSD cache is not trivial
without preparations (for bcache being the absolutely necessary, much
easier with LVM in place).
For a bog-standard client system, all of those _ARE_ overkill (and 
actually, so is BTRFS in many cases too, it's just that we're the only 
option for main-line filesystem-level snapshots at the moment).



If btrfs pretends to be device manager it should expose more states,


But it doesn't pretend to.


Why mounting sda2 requires sdb2 in my setup then?
First off, it shouldn't unless you're using a profile that doesn't 
tolerate any missing devices and have provided the `degraded` mount 
option.  It doesn't in your case because you are using systemd.


Second, BTRFS is not a volume manager, it's a filesystem with 
multi-device support.  The difference is that it's not a block layer, 
despite the fact that systemd is treating it as such.   Yes, BTRFS has 
failure modes that result in regular operations being refused based on 
what storage devices are present, but so does every single distributed 
filesystem in existence, and none of those are volume managers either.



especially "ready to be mounted, but not fully populated" (i.e.
"degraded mount possible"). Then systemd could _fallback_ after timing
out to degraded mount automatically according to some systemd-level
option.


You're assuming that btrfs somehow knows this itself.


"It's quite obvious who's the culprit: every single volume manager keeps
track of it's component devices".


  Unlike the bogus
assumption systemd does that by counting devices you can know whether a
degraded or non-degraded mount is possible, it is in general not possible to
know whether a mount attempt will succeed without actually trying.


There is a term 

Re: degraded permanent mount option

2018-01-29 Thread Austin S. Hemmelgarn

On 2018-01-29 06:24, Adam Borowski wrote:

On Mon, Jan 29, 2018 at 09:54:04AM +0100, Tomasz Pala wrote:

it is a btrfs drawback that doesn't provice anything else except for this
IOCTL with it's logic


How can it provide you with something it doesn't yet have?  If you want the
information, call mount().  And as others in this thread have mentioned,
what, pray tell, would you want to know "would a mount succeed?" for if you
don't want to mount?
And more importantly, WHY THE HELL DO YOU _WANT_ A TOCTOU RACE CONDITION 
INVOLVED?


Seriously, _THERE IS A RACE CONDITION IN SYSTEMD'S CURRENT HANDLING OF 
THIS_.  It's functionally no different than prefacing an attempt to send 
a signal to a process by checking if the process exists, or trying to 
see if some other process is using a file that might be locked by 
scanning /proc instead of just trying to lock the file yourself, or 
scheduling something to check if a RAID array is out of sync before even 
trying to start a scrub.  No sane programmer would do any of that 
(although a lot of rather poorly educated sysadmins do the third), 
because _IT'S NOT RELIABLE_.  The process you're trying to send a signal 
to might disappear after checking for it (or worse, might be a different 
process), the file might get locked by something with a low PID while 
you're busy scanning /proc, or the array could completely die right 
after you check if it's OK.

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: Fix UAF

2018-01-29 Thread Anand Jain



On 01/29/2018 03:01 PM, Nikolay Borisov wrote:



On 29.01.2018 04:38, Anand Jain wrote:



On 01/26/2018 09:20 PM, Nikolay Borisov wrote:

Commit 4fde46f0cc71 ("Btrfs: free the stale device") introduced
btrfs_free_stale_device which iterates the device lists for all
registered btrfs filesystems and deletes those devices which aren't
mounted. In a btrfs_devices structure has only 1 device attached to it
and it is unused then btrfs_free_stale_devices will proceed to also
free the btrfs_fs_devices struct itself. Currently this leads to a UAF
since list_for_each_entry will try to perform a check on the already-
freed memory to see if it has to terminated the loop.

The fix is to use 'break' when we know we are freeing the current
fs_devs.


  No break is needed as we need to iterate all stale devices and delete
  the found stale entry, so commit [1] used list_for_each_entry_safe()
  and removed the break,


We only do the break if we know we have a single device in the current
fs_devs struct. And executing free_fs_devices would have already freed
the device + fs_devs.



  [1]
   commit 38cf665d338fca33af4b16f9ec7cad6637fc0fec
   Author: Anand Jain 
     btrfs: make btrfs_free_stale_device() to iterate all stales


  I am guessing UAF might be in[2], instead ?

  [2]
     free_fs_devices(fs_devs)
::
     while (!list_empty(_devices->devices)) {
     device = list_entry(fs_devices->devices.next,
     struct btrfs_device, dev_list);


It's not that, I thought so at first. But here using list_empty you are
awlays accessing the head the of the list, which is guaranteed to be
valid since you do the freeing of fs_devices outside of the while loop.


break when num_devices == 1 is fine;

  Reviewed-by: Anand Jain 

Thanks, Anand



Thanks, Anand


Fixes: 4fde46f0cc71 ("Btrfs: free the stale device")
Signed-off-by: Nikolay Borisov 

---
   fs/btrfs/volumes.c | 1 +
   1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f7147740b68e..c3ab55336ee0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -645,6 +645,7 @@ static void btrfs_free_stale_devices(const char
*path,
   btrfs_sysfs_remove_fsid(fs_devs);
   list_del(_devs->list);
   free_fs_devices(fs_devs);
+    break;
   } else {
   fs_devs->num_devices--;
   list_del(>dev_list);




--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[GIT PULL] inode->i_version rework for v4.16

2018-01-29 Thread Jeff Layton
The following changes since commit 50c4c4e268a2d7a3e58ebb698ac74da0de40ae36:

  Linux 4.15-rc3 (2017-12-10 17:56:26 -0800)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux.git 
tags/iversion-v4.16-1

for you to fetch changes up to f02a9ad1f15daf4378afeda025a53455f72645dd:

  fs: handle inode->i_version more efficiently (2018-01-29 06:42:21 -0500)


Hi Linus,

This pile of patches is a rework of the inode->i_version field. We have
traditionally incremented that field on every inode data or metadata
change. Typically this increment needs to be logged on disk even when
nothing else has changed, which is rather expensive.

It turns out though that none of the consumers of that field actually
require this behavior. The only real requirement for all of them is that
it be different iff the inode has changed since the last time the field
was checked.

Given that, we can optimize away most of the i_version increments and
avoid dirtying inode metadata when the only change is to the i_version 
and no one is querying it. Queries of the i_version field are rather
rare, so we can help write performance under many common workloads.

This patch series converts existing accesses of the i_version field to a
new API, and then converts all of the in-kernel filesystems to use it.
The last patch in the series then converts the backend implementation to
a scheme that optimizes away a large portion of the metadata updates
when no one is looking at it.

In my own testing this series significantly helps performance with small
I/O sizes. I also got this email for Christmas this year from the kernel
test robot (a 244% r/w bandwidth improvement with XFS over DAX, with 4k
writes):

https://lkml.org/lkml/2017/12/25/8

A few of the earlier patches in this pile are also flowing to you via
other trees (mm, integrity, and nfsd trees in particular), so there may
be some minor merge conflicts here. Hopefully they're trivial to
resolve, but let me know if there are problems.

Thanks!

Jeff Layton (21):
  lustre: don't set f_version in ll_readdir
  ntfs: remove i_version handling
  fs: new API for handling inode->i_version
  fs: don't take the i_lock in inode_inc_iversion
  fat: convert to new i_version API
  affs: convert to new i_version API
  afs: convert to new i_version API
  btrfs: convert to new i_version API
  exofs: switch to new i_version API
  ext2: convert to new i_version API
  ext4: convert to new i_version API
  nfs: convert to new i_version API
  nfsd: convert to new i_version API
  ocfs2: convert to new i_version API
  ufs: use new i_version API
  xfs: convert to new i_version API
  IMA: switch IMA over to new i_version API
  fs: only set S_VERSION when updating times if necessary
  xfs: avoid setting XFS_ILOG_CORE if i_version doesn't need incrementing
  btrfs: only dirty the inode in btrfs_update_time if something was changed
  fs: handle inode->i_version more efficiently

Sascha Hauer (1):
  ima: Use i_version only when filesystem supports it

 drivers/staging/lustre/lustre/llite/dir.c |   3 -
 fs/affs/amigaffs.c|   5 +-
 fs/affs/dir.c |   5 +-
 fs/affs/super.c   |   3 +-
 fs/afs/fsclient.c |   3 +-
 fs/afs/inode.c|   5 +-
 fs/btrfs/delayed-inode.c  |   7 +-
 fs/btrfs/file.c   |   1 +
 fs/btrfs/inode.c  |  12 +-
 fs/btrfs/ioctl.c  |   1 +
 fs/btrfs/tree-log.c   |   4 +-
 fs/btrfs/xattr.c  |   1 +
 fs/exofs/dir.c|   9 +-
 fs/exofs/super.c  |   3 +-
 fs/ext2/dir.c |   9 +-
 fs/ext2/super.c   |   5 +-
 fs/ext4/dir.c |   9 +-
 fs/ext4/inline.c  |   7 +-
 fs/ext4/inode.c   |  13 +-
 fs/ext4/ioctl.c   |   3 +-
 fs/ext4/namei.c   |   5 +-
 fs/ext4/super.c   |   3 +-
 fs/ext4/xattr.c   |   5 +-
 fs/fat/dir.c  |   3 +-
 fs/fat/inode.c|   9 +-
 fs/fat/namei_msdos.c  |   7 +-
 fs/fat/namei_vfat.c   |  22 +-
 fs/inode.c|  11 +-
 fs/nfs/delegation.c   |   3 +-
 fs/nfs/fscache-index.c|   5 +-
 fs/nfs/inode.c|  18 +-
 fs/nfs/nfs4proc.c |  10 +-
 fs/nfs/nfstrace.h |   5 +-
 fs/nfs/write.c  

Re: [PATCH v2] btrfs: Gracefully handle errors in btrfs_qgroup_trace_extent_post

2018-01-29 Thread Qu Wenruo


On 2018年01月29日 19:21, Nikolay Borisov wrote:
> 
> 
> On 29.01.2018 13:09, Qu Wenruo wrote:
>>
>>
>> On 2018年01月29日 15:44, Nikolay Borisov wrote:
>>> Running generic/019 with qgroups on the scratch device enabled is
>>> almost guaranteed to trigger the BUG_ON in btrfs_free_tree_block. It's
>>> supposed to trigger only on -ENOMEM, in reality, however, it's possible
>>> to get -EIO from btrfs_qgroup_trace_extent_post. This function just
>>> finds the roots of the extent being tracked and sets the qrecord->old_roots
>>> list. If this operation fails nothing critical happens except the
>>> quota accounting can be considered wrong. In such case just set the
>>> INCONSISTENT flag for the quota and print a warning.
>>>
>>> Signed-off-by: Nikolay Borisov 
>>> ---
>>>
>>> V2: 
>>>  * Always print a warning if btrfs_qgroup_trace_extent_post fails 
>>>  * Set quota inconsistent flag if btrfs_qgroup_trace_extent_post fails
>>>
>>>  fs/btrfs/delayed-ref.c | 7 +--
>>>  fs/btrfs/qgroup.c  | 6 --
>>>  2 files changed, 9 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
>>> index a1a40cf382e3..5b2789a28a13 100644
>>> --- a/fs/btrfs/delayed-ref.c
>>> +++ b/fs/btrfs/delayed-ref.c
>>> @@ -820,8 +820,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info 
>>> *fs_info,
>>>  num_bytes, parent, ref_root, level, action);
>>> spin_unlock(_refs->lock);
>>>  
>>> -   if (qrecord_inserted)
>>> -   return btrfs_qgroup_trace_extent_post(fs_info, record);
>>> +   if (qrecord_inserted) {
>>> +   int ret = btrfs_qgroup_trace_extent_post(fs_info, record);
>>> +   if (ret < 0)
>>> +   btrfs_warn(fs_info, "Error accounting new delayed refs 
>>> extent (err code: %d). Quota inconsistent", ret);
>>
>> Sorry that I didn't point it out in previous review, there are 2 callers
>> in delayed-ref.c using btrfs_qgroup_trace_extent_post().
>>
>> One is the one you're fixing, and the other one is
>> btrfs_add_delayed_data_ref().
> 
> Yes, but the callers of btrfs_add_delayed_data_ref seem to be expecting
> error values and actually handling them.

Not exactly.

A quick search leads to extra unhandled btrfs_add_delayed_data_ref().

walk_down_proc()
|- btrfs_dec_ref()
   |- __btrfs_mod_ref()
  |- btrfs_free_extent()
 |- btrfs_add_delayed_data_ref()
|- btrfs_qgroup_trace_extent_post()

And this leads to another BUG_ON().

> So a failure doesn't
> necessarily mean the fs is in inconsistent state.

But at the cost of aborting current transaction.

> 
>>
>> So it would be even better if the warning message can be integrated into
>> btrfs_qgroup_trace_extent_post().
> 
> See below why I don't think integrating the warning is a good idea. In
> the case being modified in this patch we will continue operating
> normally, hence the warning and INCONSISTENT flag make sense.
> 
>>
>> Also btrfs_qgroup_trace_extent_post() also needs to ignore the return
>> value from btrfs_qgroup_trace_extent_post().
> 
> I don't think so, if we are able to handle failures as is the case in
> the delayed_data_ref case then we might abort the current transaction
> and this should leave the fs in a consistent state.

Here comes the trade-off.

Keep the on-disk data consistent while abort current transaction and
make fs read-only.

VS

Make the fs continue running while just discard the qgroup data.


Although the truth is, either way we may eventually goes
abort_transaction() since we failed to read some tree blocks.

But since there are still some BUG_ON() wondering around the wild, the
latter one seems a little better.

> In that case even
> the "STATUS_FLAG_INCONSISTENT" being set in qgroup_trace_extent_post
> might be "wrong" in the sense that a failure from this function doesn't
> necessarily make the quota inconsistent if upper layers can handle the
> failures and revert their work.

Well, most of them just abort the transaction and leads to above trade-off.

Unfortunately, there is not much thing we can do in error handler. :(

Thanks,
Qu

> So I'm now starting to think that the
> inconsistent flag should be set in add_delayed_tree_ref, but this sort
> of leaks the qgroup implementation detail into the delayed tree ref
> function...
>>
>> Thanks,
>> Qu
>>
>>> +   }
>>> return 0;
>>>  
>>>  free_head_ref:
>>> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
>>> index b2ab5f795816..33f9dba44e92 100644
>>> --- a/fs/btrfs/qgroup.c
>>> +++ b/fs/btrfs/qgroup.c
>>> @@ -1440,8 +1440,10 @@ int btrfs_qgroup_trace_extent_post(struct 
>>> btrfs_fs_info *fs_info,
>>> int ret;
>>>  
>>> ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, _root, false);
>>> -   if (ret < 0)
>>> +   if (ret < 0) {
>>> +   fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
>>> return ret;
>>> +   }
>>>  
>>> /*
>>>  * Here we don't need to get the lock of
>>> @@ -2933,7 +2935,7 @@ 

Re: degraded permanent mount option

2018-01-29 Thread Adam Borowski
On Mon, Jan 29, 2018 at 09:54:04AM +0100, Tomasz Pala wrote:
> On Sun, Jan 28, 2018 at 17:00:46 -0700, Chris Murphy wrote:
> 
> > systemd can't possibly need to know more information than a person
> > does in the exact same situation in order to do the right thing. No
> > human would wait 10 minutes, let alone literally the heat death of the
> > planet for "all devices have appeared" but systemd will. And it does
> 
> We're already repeating - systemd waits for THE btrfs-compound-device,
> not ALL the block-devices.

Because there is NO compound device.  You can't wait for something that
doesn't exist.  The user wants a filesystem, not some mythical compound
device, and as knowing whether we have enough requires doing most of mount
work, we can as well complete the mount instead of backing off and
reporting, so you can then racily repeat the work.

> Just like it 'waits' for someone to plug USB pendrive in.

Plugging an USB pendrive is an event -- there's no user request.  On the
other hand, we already know we want to mount -- the user requested so either
by booting ("please mount everything in fstab") or by an explicit mount
command.

So any event (the user's request) has already happened.  A rc system, of
which systemd is one, knows whether we reached the "want root filesystem" or
"want secondary filesystems" stage.  Once you're there, you can issue the
mount() call and let the kernel do the work.

> It is a btrfs choice to not expose compound device as separate one (like
> every other device manager does)

Btrfs is not a device manager, it's a filesystem.

> it is a btrfs drawback that doesn't provice anything else except for this
> IOCTL with it's logic

How can it provide you with something it doesn't yet have?  If you want the
information, call mount().  And as others in this thread have mentioned,
what, pray tell, would you want to know "would a mount succeed?" for if you
don't want to mount?

> it is a btrfs drawback that there is nothing to push assembling into "OK,
> going degraded" state

The way to do so is to timeout, then retry with -o degraded.

> I've told already - pretend the /dev/sda1 device doesn't
> exist until assembled.

It does... you're confusing a block device (a _part_ of the filesystem) with
the filesystem itself.  MD takes a bunch of such block devices and provides
you with another block devices, btrfs takes a bunch of block devices and
provides you with a filesystem.

> If this overlapping usage was designed with 'easier mounting' on mind,
> this is simply bad design.

No other rc system but systemd has a problem.

> > that by its own choice, its own policy. That's the complaint. It's
> > choosing to do something a person wouldn't do, given identical
> > available information.
> 
> You are expecting systemd to mix in functions of kernel and udev.
> There is NO concept of 'assembled stuff' in systemd AT ALL.
> There is NO concept of 'waiting' in udev AT ALL.
> If you want to do some crazy interlayer shortcuts just implement btrfsd.

No, I don't want systemd, or any userspace daemon, to try knowing kernel
stuff better than the kernel.  Just call mount(), and that's it.

Let me explain via a car analogy.  There is a flood that covers many roads,
the phone network is unreliable, and you want to drive to help relatives at
place X.

You can ask someone who was there yesterday how to get there (ie, ask a
device; it can tell you "when I was a part of the filesystem last time, its
layout was such and such").  Usually, this is reliable (you don't reshape an
array every day), but if there's flooding (you're contemplating a degraded
mount), yesterday's data being stale shouldn't be a surprise.

So, you climb into the car and drive.  It's possible that the road you
wanted to take has changed, it's also possible some other roads you didn't
even know about are now driveable.  Once you have X in sight, do you retrace
all the way home, tell your mom (systemd) who's worrying but has no way to
help, that the road is clear, and only then get to X?  Or do you stop,
search for a spot with working phone coverage to phone mom asking for
advice, despite her having no informations you don't have?  The reasonable
thing to do (and what all other rc systems do) is to get to X, help the
relatives, and only then tell mom that all is ok.

But with mom wanting to control everything, things can go worse.  If you,
without mom's prior knowledge (the user typed "mount" by hand) manage to
find a side road to X, she shouldn't tell you "I hear you telling me you're
at X -- as the road is flooded, that's impossible, so get home this instant"
(ie, systemd thinking the filesystem not being complete, despite it being
already mounted).

> > There's nothing the kernel is doing that's
> > telling systemd to wait for goddamn ever.
> 
> There's nothing the kernel is doing that's
> telling udev there IS a degraded device assembled to be used.

Because there is no device.

> There's nothing a userspace-thing is doing that's
> 

Re: [PATCH v2] btrfs: Gracefully handle errors in btrfs_qgroup_trace_extent_post

2018-01-29 Thread Nikolay Borisov


On 29.01.2018 13:09, Qu Wenruo wrote:
> 
> 
> On 2018年01月29日 15:44, Nikolay Borisov wrote:
>> Running generic/019 with qgroups on the scratch device enabled is
>> almost guaranteed to trigger the BUG_ON in btrfs_free_tree_block. It's
>> supposed to trigger only on -ENOMEM, in reality, however, it's possible
>> to get -EIO from btrfs_qgroup_trace_extent_post. This function just
>> finds the roots of the extent being tracked and sets the qrecord->old_roots
>> list. If this operation fails nothing critical happens except the
>> quota accounting can be considered wrong. In such case just set the
>> INCONSISTENT flag for the quota and print a warning.
>>
>> Signed-off-by: Nikolay Borisov 
>> ---
>>
>> V2: 
>>  * Always print a warning if btrfs_qgroup_trace_extent_post fails 
>>  * Set quota inconsistent flag if btrfs_qgroup_trace_extent_post fails
>>
>>  fs/btrfs/delayed-ref.c | 7 +--
>>  fs/btrfs/qgroup.c  | 6 --
>>  2 files changed, 9 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
>> index a1a40cf382e3..5b2789a28a13 100644
>> --- a/fs/btrfs/delayed-ref.c
>> +++ b/fs/btrfs/delayed-ref.c
>> @@ -820,8 +820,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info 
>> *fs_info,
>>   num_bytes, parent, ref_root, level, action);
>>  spin_unlock(_refs->lock);
>>  
>> -if (qrecord_inserted)
>> -return btrfs_qgroup_trace_extent_post(fs_info, record);
>> +if (qrecord_inserted) {
>> +int ret = btrfs_qgroup_trace_extent_post(fs_info, record);
>> +if (ret < 0)
>> +btrfs_warn(fs_info, "Error accounting new delayed refs 
>> extent (err code: %d). Quota inconsistent", ret);
> 
> Sorry that I didn't point it out in previous review, there are 2 callers
> in delayed-ref.c using btrfs_qgroup_trace_extent_post().
> 
> One is the one you're fixing, and the other one is
> btrfs_add_delayed_data_ref().

Yes, but the callers of btrfs_add_delayed_data_ref seem to be expecting
error values and actually handling them. So a failure doesn't
necessarily mean the fs is in inconsistent state.

> 
> So it would be even better if the warning message can be integrated into
> btrfs_qgroup_trace_extent_post().

See below why I don't think integrating the warning is a good idea. In
the case being modified in this patch we will continue operating
normally, hence the warning and INCONSISTENT flag make sense.

> 
> Also btrfs_qgroup_trace_extent_post() also needs to ignore the return
> value from btrfs_qgroup_trace_extent_post().

I don't think so, if we are able to handle failures as is the case in
the delayed_data_ref case then we might abort the current transaction
and this should leave the fs in a consistent state. In that case even
the "STATUS_FLAG_INCONSISTENT" being set in qgroup_trace_extent_post
might be "wrong" in the sense that a failure from this function doesn't
necessarily make the quota inconsistent if upper layers can handle the
failures and revert their work. So I'm now starting to think that the
inconsistent flag should be set in add_delayed_tree_ref, but this sort
of leaks the qgroup implementation detail into the delayed tree ref
function...
> 
> Thanks,
> Qu
> 
>> +}
>>  return 0;
>>  
>>  free_head_ref:
>> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
>> index b2ab5f795816..33f9dba44e92 100644
>> --- a/fs/btrfs/qgroup.c
>> +++ b/fs/btrfs/qgroup.c
>> @@ -1440,8 +1440,10 @@ int btrfs_qgroup_trace_extent_post(struct 
>> btrfs_fs_info *fs_info,
>>  int ret;
>>  
>>  ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, _root, false);
>> -if (ret < 0)
>> +if (ret < 0) {
>> +fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
>>  return ret;
>> +}
>>  
>>  /*
>>   * Here we don't need to get the lock of
>> @@ -2933,7 +2935,7 @@ static int __btrfs_qgroup_release_data(struct inode 
>> *inode,
>>  if (free && reserved)
>>  return qgroup_free_reserved_data(inode, reserved, start, len);
>>  extent_changeset_init();
>> -ret = clear_record_extent_bits(_I(inode)->io_tree, start, 
>> +ret = clear_record_extent_bits(_I(inode)->io_tree, start,
>>  start + len -1, EXTENT_QGROUP_RESERVED, );
>>  if (ret < 0)
>>  goto out;
>>
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] btrfs: Gracefully handle errors in btrfs_qgroup_trace_extent_post

2018-01-29 Thread Qu Wenruo


On 2018年01月29日 15:44, Nikolay Borisov wrote:
> Running generic/019 with qgroups on the scratch device enabled is
> almost guaranteed to trigger the BUG_ON in btrfs_free_tree_block. It's
> supposed to trigger only on -ENOMEM, in reality, however, it's possible
> to get -EIO from btrfs_qgroup_trace_extent_post. This function just
> finds the roots of the extent being tracked and sets the qrecord->old_roots
> list. If this operation fails nothing critical happens except the
> quota accounting can be considered wrong. In such case just set the
> INCONSISTENT flag for the quota and print a warning.
> 
> Signed-off-by: Nikolay Borisov 
> ---
> 
> V2: 
>  * Always print a warning if btrfs_qgroup_trace_extent_post fails 
>  * Set quota inconsistent flag if btrfs_qgroup_trace_extent_post fails
> 
>  fs/btrfs/delayed-ref.c | 7 +--
>  fs/btrfs/qgroup.c  | 6 --
>  2 files changed, 9 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
> index a1a40cf382e3..5b2789a28a13 100644
> --- a/fs/btrfs/delayed-ref.c
> +++ b/fs/btrfs/delayed-ref.c
> @@ -820,8 +820,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info 
> *fs_info,
>num_bytes, parent, ref_root, level, action);
>   spin_unlock(_refs->lock);
>  
> - if (qrecord_inserted)
> - return btrfs_qgroup_trace_extent_post(fs_info, record);
> + if (qrecord_inserted) {
> + int ret = btrfs_qgroup_trace_extent_post(fs_info, record);
> + if (ret < 0)
> + btrfs_warn(fs_info, "Error accounting new delayed refs 
> extent (err code: %d). Quota inconsistent", ret);

Sorry that I didn't point it out in previous review, there are 2 callers
in delayed-ref.c using btrfs_qgroup_trace_extent_post().

One is the one you're fixing, and the other one is
btrfs_add_delayed_data_ref().

So it would be even better if the warning message can be integrated into
btrfs_qgroup_trace_extent_post().

Also btrfs_qgroup_trace_extent_post() also needs to ignore the return
value from btrfs_qgroup_trace_extent_post().

Thanks,
Qu

> + }
>   return 0;
>  
>  free_head_ref:
> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> index b2ab5f795816..33f9dba44e92 100644
> --- a/fs/btrfs/qgroup.c
> +++ b/fs/btrfs/qgroup.c
> @@ -1440,8 +1440,10 @@ int btrfs_qgroup_trace_extent_post(struct 
> btrfs_fs_info *fs_info,
>   int ret;
>  
>   ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, _root, false);
> - if (ret < 0)
> + if (ret < 0) {
> + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
>   return ret;
> + }
>  
>   /*
>* Here we don't need to get the lock of
> @@ -2933,7 +2935,7 @@ static int __btrfs_qgroup_release_data(struct inode 
> *inode,
>   if (free && reserved)
>   return qgroup_free_reserved_data(inode, reserved, start, len);
>   extent_changeset_init();
> - ret = clear_record_extent_bits(_I(inode)->io_tree, start, 
> + ret = clear_record_extent_bits(_I(inode)->io_tree, start,
>   start + len -1, EXTENT_QGROUP_RESERVED, );
>   if (ret < 0)
>   goto out;
> 



signature.asc
Description: OpenPGP digital signature


Re: [PATCH 04/26] libbtrfsutil: add btrfs_util_is_subvolume() and btrfs_util_subvolume_id()

2018-01-29 Thread Nikolay Borisov


On 26.01.2018 20:40, Omar Sandoval wrote:
> From: Omar Sandoval 
> 
> These are the most trivial helpers in the library and will be used to
> implement several of the more involved functions.
> 
> Signed-off-by: Omar Sandoval 
> ---
>  Makefile|   2 +-
>  libbtrfsutil/btrfsutil.h|  33 +++
>  libbtrfsutil/python/btrfsutilpy.h   |   3 +
>  libbtrfsutil/python/module.c|  12 +++
>  libbtrfsutil/python/setup.py|   1 +
>  libbtrfsutil/python/subvolume.c |  73 +++
>  libbtrfsutil/python/tests/__init__.py   |  66 ++
>  libbtrfsutil/python/tests/test_subvolume.py |  57 
>  libbtrfsutil/subvolume.c| 137 
> 
>  9 files changed, 383 insertions(+), 1 deletion(-)
>  create mode 100644 libbtrfsutil/python/subvolume.c
>  create mode 100644 libbtrfsutil/python/tests/test_subvolume.py
>  create mode 100644 libbtrfsutil/subvolume.c
> 
> diff --git a/Makefile b/Makefile
> index 02b03e81..48a558a9 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -123,7 +123,7 @@ libbtrfs_headers = send-stream.h send-utils.h send.h 
> kernel-lib/rbtree.h btrfs-l
>  kernel-lib/radix-tree.h kernel-lib/sizes.h kernel-lib/raid56.h \
>  extent-cache.h extent_io.h ioctl.h ctree.h btrfsck.h version.h
>  libbtrfsutil_version := 0.1
> -libbtrfsutil_objects = libbtrfsutil/errors.o
> +libbtrfsutil_objects = libbtrfsutil/errors.o libbtrfsutil/subvolume.o
>  convert_objects = convert/main.o convert/common.o convert/source-fs.o \
> convert/source-ext2.o convert/source-reiserfs.o
>  mkfs_objects = mkfs/main.o mkfs/common.o
> diff --git a/libbtrfsutil/btrfsutil.h b/libbtrfsutil/btrfsutil.h
> index fe1091ca..dff6599d 100644
> --- a/libbtrfsutil/btrfsutil.h
> +++ b/libbtrfsutil/btrfsutil.h
> @@ -20,6 +20,8 @@
>  #ifndef BTRFS_UTIL_H
>  #define BTRFS_UTIL_H
>  
> +#include 
> +
>  #ifdef __cplusplus
>  extern "C" {
>  #endif
> @@ -65,6 +67,37 @@ enum btrfs_util_error {
>   */
>  const char *btrfs_util_strerror(enum btrfs_util_error err);
>  
> +/**
> + * btrfs_util_is_subvolume() - Return whether a given path is a Btrfs 
> subvolume.
> + * @path: Path to check.
> + *
> + * Return: %BTRFS_UTIL_OK if @path is a Btrfs subvolume,
> + * %BTRFS_UTIL_ERROR_NOT_BTRFS if @path is not on a Btrfs filesystem,
> + * %BTRFS_UTIL_ERROR_NOT_SUBVOLUME if @path is not a subvolume, non-zero 
> error
> + * code on any other failure.
> + */
> +enum btrfs_util_error btrfs_util_is_subvolume(const char *path);
> +
> +/**
> + * btrfs_util_f_is_subvolume() - See btrfs_util_is_subvolume().
> + */
> +enum btrfs_util_error btrfs_util_f_is_subvolume(int fd);
> +
> +/**
> + * btrfs_util_subvolume_id() - Get the ID of the subvolume containing a path.
> + * @path: Path on a Btrfs filesystem.
> + * @id_ret: Returned subvolume ID.
> + *
> + * Return: %BTRFS_UTIL_OK on success, non-zero error code on failure.
> + */
> +enum btrfs_util_error btrfs_util_subvolume_id(const char *path,
> +   uint64_t *id_ret);
> +
> +/**
> + * btrfs_util_f_subvolume_id() - See btrfs_util_subvolume_id().
> + */
> +enum btrfs_util_error btrfs_util_f_subvolume_id(int fd, uint64_t *id_ret);
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/libbtrfsutil/python/btrfsutilpy.h 
> b/libbtrfsutil/python/btrfsutilpy.h
> index 6d82f7e1..9a04fda7 100644
> --- a/libbtrfsutil/python/btrfsutilpy.h
> +++ b/libbtrfsutil/python/btrfsutilpy.h
> @@ -52,6 +52,9 @@ void SetFromBtrfsUtilErrorWithPaths(enum btrfs_util_error 
> err,
>   struct path_arg *path1,
>   struct path_arg *path2);
>  
> +PyObject *is_subvolume(PyObject *self, PyObject *args, PyObject *kwds);
> +PyObject *subvolume_id(PyObject *self, PyObject *args, PyObject *kwds);
> +
>  void add_module_constants(PyObject *m);
>  
>  #endif /* BTRFSUTILPY_H */
> diff --git a/libbtrfsutil/python/module.c b/libbtrfsutil/python/module.c
> index d7398808..d492cbc7 100644
> --- a/libbtrfsutil/python/module.c
> +++ b/libbtrfsutil/python/module.c
> @@ -132,6 +132,18 @@ void path_cleanup(struct path_arg *path)
>  }
>  
>  static PyMethodDef btrfsutil_methods[] = {
> + {"is_subvolume", (PyCFunction)is_subvolume,
> +  METH_VARARGS | METH_KEYWORDS,
> +  "is_subvolume(path) -> bool\n\n"
> +  "Get whether a file is a subvolume.\n\n"
> +  "Arguments:\n"
> +  "path -- string, bytes, path-like object, or open file descriptor"},
> + {"subvolume_id", (PyCFunction)subvolume_id,
> +  METH_VARARGS | METH_KEYWORDS,
> +  "subvolume_id(path) -> int\n\n"
> +  "Get the ID of the subvolume containing a file.\n\n"
> +  "Arguments:\n"
> +  "path -- string, bytes, path-like object, or open file descriptor"},
>   {},
>  };
>  
> diff --git a/libbtrfsutil/python/setup.py 

[PATCH v2] btrfs: Gracefully handle errors in btrfs_qgroup_trace_extent_post

2018-01-29 Thread Nikolay Borisov
Running generic/019 with qgroups on the scratch device enabled is
almost guaranteed to trigger the BUG_ON in btrfs_free_tree_block. It's
supposed to trigger only on -ENOMEM, in reality, however, it's possible
to get -EIO from btrfs_qgroup_trace_extent_post. This function just
finds the roots of the extent being tracked and sets the qrecord->old_roots
list. If this operation fails nothing critical happens except the
quota accounting can be considered wrong. In such case just set the
INCONSISTENT flag for the quota and print a warning.

Signed-off-by: Nikolay Borisov 
---

V2: 
 * Always print a warning if btrfs_qgroup_trace_extent_post fails 
 * Set quota inconsistent flag if btrfs_qgroup_trace_extent_post fails

 fs/btrfs/delayed-ref.c | 7 +--
 fs/btrfs/qgroup.c  | 6 --
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index a1a40cf382e3..5b2789a28a13 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -820,8 +820,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info 
*fs_info,
 num_bytes, parent, ref_root, level, action);
spin_unlock(_refs->lock);
 
-   if (qrecord_inserted)
-   return btrfs_qgroup_trace_extent_post(fs_info, record);
+   if (qrecord_inserted) {
+   int ret = btrfs_qgroup_trace_extent_post(fs_info, record);
+   if (ret < 0)
+   btrfs_warn(fs_info, "Error accounting new delayed refs 
extent (err code: %d). Quota inconsistent", ret);
+   }
return 0;
 
 free_head_ref:
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index b2ab5f795816..33f9dba44e92 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1440,8 +1440,10 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info 
*fs_info,
int ret;
 
ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, _root, false);
-   if (ret < 0)
+   if (ret < 0) {
+   fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
return ret;
+   }
 
/*
 * Here we don't need to get the lock of
@@ -2933,7 +2935,7 @@ static int __btrfs_qgroup_release_data(struct inode 
*inode,
if (free && reserved)
return qgroup_free_reserved_data(inode, reserved, start, len);
extent_changeset_init();
-   ret = clear_record_extent_bits(_I(inode)->io_tree, start, 
+   ret = clear_record_extent_bits(_I(inode)->io_tree, start,
start + len -1, EXTENT_QGROUP_RESERVED, );
if (ret < 0)
goto out;
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 06/10] writeback: introduce super_operations->write_metadata

2018-01-29 Thread Chandan Rajendra
On Wednesday, January 3, 2018 9:59:24 PM IST Josef Bacik wrote:
> On Wed, Jan 03, 2018 at 05:26:03PM +0100, Jan Kara wrote:

> 
> Oh ok well if that's the case then I'll fix this up to be a ratio, test
> everything, and send it along probably early next week.  Thanks,
> 

Hi Josef,

Did you get a chance to work on the next version of this patchset?


-- 
chandan

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: degraded permanent mount option

2018-01-29 Thread Tomasz Pala
On Sun, Jan 28, 2018 at 17:00:46 -0700, Chris Murphy wrote:

> systemd can't possibly need to know more information than a person
> does in the exact same situation in order to do the right thing. No
> human would wait 10 minutes, let alone literally the heat death of the
> planet for "all devices have appeared" but systemd will. And it does

We're already repeating - systemd waits for THE btrfs-compound-device,
not ALL the block-devices. Just like it 'waits' for someone to plug USB
pendrive in.

It is a btrfs choice to not expose compound device as separate one (like
every other device manager does), it is a btrfs drawback that doesn't
provice anything else except for this IOCTL with it's logic, it is a
btrfs drawback that there is nothing to push assembling into "OK, going
degraded" state, it is btrfs drawback that there are no states...

I've told already - pretend the /dev/sda1 device doesn't
exist until assembled. If this overlapping usage was designed with
'easier mounting' on mind, this is simply bad design.

> that by its own choice, its own policy. That's the complaint. It's
> choosing to do something a person wouldn't do, given identical
> available information.

You are expecting systemd to mix in functions of kernel and udev.
There is NO concept of 'assembled stuff' in systemd AT ALL.
There is NO concept of 'waiting' in udev AT ALL.
If you want to do some crazy interlayer shortcuts just implement btrfsd.

> There's nothing the kernel is doing that's
> telling systemd to wait for goddamn ever.

There's nothing the kernel is doing that's
telling udev there IS a degraded device assembled to be used.

There's nothing a userspace-thing is doing that's
telling udev to mark degraded device as mountable.

There is NO DEVICE to be mounted, so systemd doesn't mount it.

The difference is:

YOU think that sda1 device is ephemeral, as it's covered by sda1 btrfs device 
that COULD BE mounted.

I think that there is real sda1 device, following Linux rules of system
registration, which CAN be overtaken by ephemeral btrfs-compound device.
Can I mount that thing above sda1 block device? ONLY when it's properly
registered in the system.

Does btrfs-compound-device register in the system? - Yes, but only fully 
populated.

Just don't expect people will break their code with broken designs just
to overcome your own limitations. If you want systemd to mount degraded
btrfs volume, just MAKE IT REGISTER in the system.

How can btrfs register in the system being degraded? Either by some
userspace daemon handling btrfs volumes states (which are missing from
the kernel), or by some IOCTLs altering in-kernel states.


So for the last time: nobody will break his own code to patch missing
code from other (actively maintained) subsystem.

If you expect degraded mounts, there are 2 choices:

1. implement degraded STATE _some_where_ - udev would handle falling
   back to degraded mount after specified timeout,

2. change this IOCTL to _always_ return 1 - udev would register any
   btrfs device, but you will get random behaviour of mounting
   degraded/populated. But you should expect that since there is no
   concept of any state below.


Actually, this is ridiculous - you expect the degradation to be handled
in some 3rd party software?! In init system? With the only thing you got
is 'degraded' mount option?!
What next - moving MD and LVM logic into systemd?

This is not systemd's job - there are
btrfs-specific kernel cmdline options to be parsed (allowing degraded
volumes), there is tracking of volume health required.
Yes, device-manager needs to track it's components, RAID controller
needs to track minimum required redundancy. It's not only about
mounting. But doing the degraded mounting is easy, only this one
particular ioctl needs to be fixed:

1. counted devices not_ready

2. counted devices ok_degraded

3. counted devices==all => ok


If btrfs DISTINGUISHES these two states, systemd would be able to use them.


You might ask why this is important for the state to be kept inside some
btrfs-related stuff, like kernel or btrfsd, while the systemd timer
could do the same and 'just mount degraded'. The answear is simple:
systemd.timer is just a sane default CONFIGURATION, that can be EASILY
changed by system administrator. But somewhere, sometime, someone would
have a NEED for totally different set of rules for handling degraded
volumes, just like MD or LVM does. This would be totally irresponsible
to hardcode any mount-degraded rule inside systemd itself.

That is exactly why this must go through the udev - udev is responsible
for handling devices in Linux world. How can I register btrfs device
in udev, since it's overlapping the block device? I can't - the ioctl
is one-way, doesn't accept any userspace feedback.

-- 
Tomasz Pala 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at