Re: SLES 11 SP4: can't mount btrfs

2017-10-19 Thread Andrei Borzenkov
19.10.2017 23:04, Chris Murphy пишет:
> Btrfs
> is not just supported by SUSE, it's the default file system.
> 

It is default choice for root starting with SLES12, not in SLES11. But
yes, it should still be supported.

I would not hold my breath though. As far as I can tell, transid errors
are usually fatal, and if this is root, it may be easier and faster to
just reinstall.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/7] btrfs-progs: mkfs/rootdir: Shrink fs for rootdir option

2017-10-19 Thread Qu Wenruo
Use the new dev extent based shrink method for rootdir option.

Signed-off-by: Qu Wenruo 
---
 mkfs/main.c|   5 +++
 mkfs/rootdir.c | 111 +
 mkfs/rootdir.h |   1 +
 3 files changed, 117 insertions(+)

diff --git a/mkfs/main.c b/mkfs/main.c
index 7b78cfe3550e..0866e40d155f 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -1154,6 +1154,11 @@ raid_groups:
error("error wihle filling filesystem: %d", ret);
goto out;
}
+   ret = btrfs_mkfs_shrink_fs(fs_info, NULL);
+   if (ret < 0) {
+   error("error while shrinking filesystem: %d", ret);
+   goto out;
+   }
}
 
if (verbose) {
diff --git a/mkfs/rootdir.c b/mkfs/rootdir.c
index 1ca37996a3b3..9593bbc25b39 100644
--- a/mkfs/rootdir.c
+++ b/mkfs/rootdir.c
@@ -821,3 +821,114 @@ out:
btrfs_release_path();
return ret;
 }
+
+/*
+ * Set device size to @new_size.
+ *
+ * Only used for --rootdir option.
+ * We will need to reset the following values:
+ * 1) dev item in chunk tree
+ * 2) super->dev_item
+ * 3) super->total_bytes
+ */
+static int set_device_size(struct btrfs_fs_info *fs_info,
+  struct btrfs_device *device, u64 new_size)
+{
+   struct btrfs_root *chunk_root = fs_info->chunk_root;
+   struct btrfs_trans_handle *trans;
+   struct btrfs_dev_item *di;
+   struct btrfs_path path;
+   struct btrfs_key key;
+   int ret;
+
+   /*
+* Update in-meory device->total_bytes, so that at trans commit time,
+* it super->dev_item will also get updated
+*/
+   device->total_bytes = new_size;
+   btrfs_init_path();
+
+   /* Update device item in chunk tree */
+   trans = btrfs_start_transaction(chunk_root, 1);
+   if (IS_ERR(trans)) {
+   ret = PTR_ERR(trans);
+   error("failed to start transaction: %d (%s)", ret,
+   strerror(-ret));
+   return ret;
+   }
+   key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+   key.type = BTRFS_DEV_ITEM_KEY;
+   key.offset = device->devid;
+
+   ret = btrfs_search_slot(trans, chunk_root, , , 0, 1);
+   if (ret < 0)
+   goto err;
+   if (ret > 0)
+   ret = -ENOENT;
+   di = btrfs_item_ptr(path.nodes[0], path.slots[0],
+   struct btrfs_dev_item);
+   btrfs_set_device_total_bytes(path.nodes[0], di, new_size);
+   btrfs_mark_buffer_dirty(path.nodes[0]);
+
+   /*
+* Update super->total_bytes, since it's only used for --rootdir,
+* there is only one device, just use the @new_size.
+*/
+   btrfs_set_super_total_bytes(fs_info->super_copy, new_size);
+
+   /*
+* Commit transaction to reflect the updated super->total_bytes and
+* super->dev_item
+*/
+   ret = btrfs_commit_transaction(trans, chunk_root);
+   if (ret < 0)
+   error("failed to commit current transaction: %d (%s)",
+   ret, strerror(-ret));
+   btrfs_release_path();
+   return ret;
+
+err:
+   btrfs_release_path();
+   /*
+* Commit trans here won't cause problem since the fs still has
+* bad magic, and something wrong already happened, we don't
+* care the return value anyway.
+*/
+   btrfs_commit_transaction(trans, chunk_root);
+   return ret;
+}
+
+int btrfs_mkfs_shrink_fs(struct btrfs_fs_info *fs_info, u64 *new_size_ret)
+{
+   u64 new_size;
+   struct btrfs_device *device;
+   struct list_head *cur;
+   int nr_devs = 0;
+   int ret;
+
+   list_for_each(cur, _info->fs_devices->devices)
+   nr_devs++;
+
+   if (nr_devs > 1) {
+   error("cannot shrink fs with more than 1 device");
+   return -ENOTTY;
+   }
+
+   ret = get_device_extent_end(fs_info, 1, _size);
+   if (ret < 0) {
+   error("failed to get minimal device size: %d (%s)",
+   ret, strerror(-ret));
+   return ret;
+   }
+
+   BUG_ON(!IS_ALIGNED(new_size, fs_info->sectorsize));
+
+   device = list_entry(fs_info->fs_devices->devices.next,
+  struct btrfs_device, dev_list);
+   ret = set_device_size(fs_info, device, new_size);
+   if (ret < 0)
+   return ret;
+   if (new_size_ret)
+   *new_size_ret = new_size;
+   return ret;
+}
diff --git a/mkfs/rootdir.h b/mkfs/rootdir.h
index 75169f37e026..e5b739ede48a 100644
--- a/mkfs/rootdir.h
+++ b/mkfs/rootdir.h
@@ -29,4 +29,5 @@ int btrfs_mkfs_fill_dir(const char *source_dir, struct 
btrfs_root *root,
bool verbose);
 u64 btrfs_mkfs_size_dir(const char *dir_name, u32 sectorsize, u64 min_dev_size,
u64 meta_profile, u64 

[PATCH 6/7] btrfs-progs: mkfs: Update allocation info before verbose output

2017-10-19 Thread Qu Wenruo
Since the new --rootdir can allocate chunks, it will modify the chunk
allocation result.

This patch updates the allocation info before the verbose output to
reflect such changes.

Signed-off-by: Qu Wenruo 
---
 mkfs/main.c | 33 +
 1 file changed, 33 insertions(+)

diff --git a/mkfs/main.c b/mkfs/main.c
index 0866e40d155f..6aefb50a8033 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -670,6 +670,38 @@ out:
return ret;
 }
 
+/*
+ * Just update chunk allocation info, since --rootdir may allocate new
+ * chunks which is not updated in @allocation structure.
+ */
+static void update_chunk_allocation(struct btrfs_fs_info *fs_info,
+   struct mkfs_allocation *allocation)
+{
+   struct btrfs_block_group_cache *bg_cache;
+   u64 mixed_flag = BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA;
+   u64 search_start = 0;
+
+   allocation->mixed = 0;
+   allocation->data = 0;
+   allocation->metadata = 0;
+   allocation->system = 0;
+   while (1) {
+   bg_cache = btrfs_lookup_first_block_group(fs_info,
+ search_start);
+   if (!bg_cache)
+   break;
+   if ((bg_cache->flags & mixed_flag) == mixed_flag)
+   allocation->mixed += bg_cache->key.offset;
+   else if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA)
+   allocation->data += bg_cache->key.offset;
+   else if (bg_cache->flags & BTRFS_BLOCK_GROUP_METADATA)
+   allocation->metadata += bg_cache->key.offset;
+   else
+   allocation->system += bg_cache->key.offset;
+   search_start = bg_cache->key.objectid + bg_cache->key.offset;
+   }
+}
+
 int main(int argc, char **argv)
 {
char *file;
@@ -1164,6 +1196,7 @@ raid_groups:
if (verbose) {
char features_buf[64];
 
+   update_chunk_allocation(fs_info, );
printf("Label:  %s\n", label);
printf("UUID:   %s\n", mkfs_cfg.fs_uuid);
printf("Node size:  %u\n", nodesize);
-- 
2.14.2

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/7] btrfs-progs: mkfs: Only zero out the first 1M for rootdir

2017-10-19 Thread Qu Wenruo
It's a waste of IO to fill the whole image before creating btrfs on it.
Just wipe the first 1M, and then write 1 byte to the last position to
create a sparse file.

Signed-off-by: Qu Wenruo 
---
 mkfs/main.c | 19 +--
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/mkfs/main.c b/mkfs/main.c
index 1355089505ca..7b78cfe3550e 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -404,18 +404,25 @@ static int zero_output_file(int out_fd, u64 size)
 {
int loop_num;
u64 location = 0;
-   char buf[4096];
+   char buf[SZ_4K];
int ret = 0, i;
ssize_t written;
 
-   memset(buf, 0, 4096);
-   loop_num = size / 4096;
+   memset(buf, 0, SZ_4K);
+
+   /* Only zero out the first 1M */
+   loop_num = SZ_1M / SZ_4K;
for (i = 0; i < loop_num; i++) {
-   written = pwrite64(out_fd, buf, 4096, location);
-   if (written != 4096)
+   written = pwrite64(out_fd, buf, SZ_4K, location);
+   if (written != SZ_4K)
ret = -EIO;
-   location += 4096;
+   location += SZ_4K;
}
+
+   /* Then enlarge the file to size */
+   written = pwrite64(out_fd, buf, 1, size - 1);
+   if (written < 1)
+   ret = -EIO;
return ret;
 }
 
-- 
2.14.2

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 7/7] btrfs-progs: mkfs: Separate shrink from rootdir

2017-10-19 Thread Qu Wenruo
Make --shrink a separate option for --rootdir, and make it default to
off.
This will cause less confusion.

Signed-off-by: Qu Wenruo 
---
 Documentation/mkfs.btrfs.asciidoc | 11 +++
 mkfs/main.c   | 27 +--
 mkfs/rootdir.c| 21 -
 mkfs/rootdir.h|  3 ++-
 4 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/Documentation/mkfs.btrfs.asciidoc 
b/Documentation/mkfs.btrfs.asciidoc
index d53d9e265fb7..5ddbedbcea97 100644
--- a/Documentation/mkfs.btrfs.asciidoc
+++ b/Documentation/mkfs.btrfs.asciidoc
@@ -106,6 +106,17 @@ Please see the mount option 'discard' for that in 
`btrfs`(5).
 *-r|--rootdir *::
 Populate the toplevel subvolume with files from 'rootdir'.  This does not
 require root permissions and does not mount the filesystem.
++
+NOTE: This option may enlarge the image or file to ensure it's large enough to
+contain the files from 'rootdir'.
+
+*--shrink*:
+Shrink the filesystem to its minimal size, only works with *-r|--rootdir*
+option.
++
+NOTE: If the destination is regular file, this option will also reduce the
+file size. Or it will only reduce the filesystem available space.
+Extra space will not be usable unless resized using 'btrfs filesystem resize'.
 
 *-O|--features [,...]*::
 A list of filesystem features turned on at mkfs time. Not all features are
diff --git a/mkfs/main.c b/mkfs/main.c
index 6aefb50a8033..1d72702414bc 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -731,9 +731,11 @@ int main(int argc, char **argv)
int ssd = 0;
int force_overwrite = 0;
char *source_dir = NULL;
-   int source_dir_set = 0;
+   bool source_dir_set = false;
+   bool shrink_rootdir = false;
u64 source_dir_size = 0;
u64 min_dev_size;
+   u64 shrink_size;
int dev_cnt = 0;
int saved_optind;
char fs_uuid[BTRFS_UUID_UNPARSED_SIZE] = { 0 };
@@ -743,6 +745,7 @@ int main(int argc, char **argv)
 
while(1) {
int c;
+   enum { GETOPT_VAL_SHRINK = 257 };
static const struct option long_options[] = {
{ "alloc-start", required_argument, NULL, 'A'},
{ "byte-count", required_argument, NULL, 'b' },
@@ -760,6 +763,7 @@ int main(int argc, char **argv)
{ "features", required_argument, NULL, 'O' },
{ "uuid", required_argument, NULL, 'U' },
{ "quiet", 0, NULL, 'q' },
+   { "shrink", no_argument, NULL, GETOPT_VAL_SHRINK },
{ "help", no_argument, NULL, GETOPT_VAL_HELP },
{ NULL, 0, NULL, 0}
};
@@ -827,7 +831,7 @@ int main(int argc, char **argv)
goto success;
case 'r':
source_dir = optarg;
-   source_dir_set = 1;
+   source_dir_set = true;
break;
case 'U':
strncpy(fs_uuid, optarg,
@@ -839,6 +843,10 @@ int main(int argc, char **argv)
case 'q':
verbose = 0;
break;
+   case GETOPT_VAL_SHRINK:
+   shrink_rootdir = true;
+   break;
+   break;
case GETOPT_VAL_HELP:
default:
print_usage(c != GETOPT_VAL_HELP);
@@ -861,6 +869,10 @@ int main(int argc, char **argv)
error("the option -r is limited to a single device");
goto error;
}
+   if (shrink_rootdir && !source_dir_set) {
+   error("the option --shrink can only be paired with -r");
+   goto error;
+   }
 
if (*fs_uuid) {
uuid_t dummy_uuid;
@@ -1186,10 +1198,13 @@ raid_groups:
error("error wihle filling filesystem: %d", ret);
goto out;
}
-   ret = btrfs_mkfs_shrink_fs(fs_info, NULL);
-   if (ret < 0) {
-   error("error while shrinking filesystem: %d", ret);
-   goto out;
+   if (shrink_rootdir) {
+   ret = btrfs_mkfs_shrink_fs(fs_info, _size,
+  shrink_rootdir);
+   if (ret < 0) {
+   error("error while shrinking filesystem: %d", 
ret);
+   goto out;
+   }
}
}
 
diff --git a/mkfs/rootdir.c b/mkfs/rootdir.c
index 9593bbc25b39..aa42d186a43f 100644
--- a/mkfs/rootdir.c
+++ b/mkfs/rootdir.c
@@ -898,11 +898,13 @@ err:
   

[PATCH 4/7] btrfs-progs: mkfs/rootdir: Introduce function to get end position of last device extent

2017-10-19 Thread Qu Wenruo
Useful for later 'mkfs.btrfs --rootdir' shrink support.

Signed-off-by: Qu Wenruo 
---
 mkfs/rootdir.c | 43 +++
 1 file changed, 43 insertions(+)

diff --git a/mkfs/rootdir.c b/mkfs/rootdir.c
index 99022afaa030..1ca37996a3b3 100644
--- a/mkfs/rootdir.c
+++ b/mkfs/rootdir.c
@@ -26,6 +26,7 @@
 #include 
 #include 
 #include "ctree.h"
+#include "volumes.h"
 #include "internal.h"
 #include "disk-io.h"
 #include "messages.h"
@@ -778,3 +779,45 @@ u64 btrfs_mkfs_size_dir(const char *dir_name, u32 
sectorsize, u64 min_dev_size,
total_size = data_chunk_size + meta_chunk_size + min_dev_size;
return total_size;
 }
+
+/*
+ * Get the end position of the last device extent for given @devid;
+ * @size_ret is exclsuive (means it should be aligned to sectorsize)
+ */
+static int get_device_extent_end(struct btrfs_fs_info *fs_info,
+u64 devid, u64 *size_ret)
+{
+   struct btrfs_root *dev_root = fs_info->dev_root;
+   struct btrfs_key key;
+   struct btrfs_path path;
+   struct btrfs_dev_extent *de;
+   int ret;
+
+   key.objectid = devid;
+   key.type = BTRFS_DEV_EXTENT_KEY;
+   key.offset = (u64)-1;
+
+   btrfs_init_path();
+   ret = btrfs_search_slot(NULL, dev_root, , , 0, 0);
+   /* Not really possible */
+   BUG_ON(ret == 0);
+
+   ret = btrfs_previous_item(dev_root, , devid, BTRFS_DEV_EXTENT_KEY);
+   if (ret < 0)
+   goto out;
+
+   /* No dev_extent at all, not really possible for rootdir case*/
+   if (ret > 0) {
+   *size_ret = 0;
+   ret = -EUCLEAN;
+   goto out;
+   }
+
+   btrfs_item_key_to_cpu(path.nodes[0], , path.slots[0]);
+   de = btrfs_item_ptr(path.nodes[0], path.slots[0],
+   struct btrfs_dev_extent);
+   *size_ret = key.offset + btrfs_dev_extent_length(path.nodes[0], de);
+out:
+   btrfs_release_path();
+   return ret;
+}
-- 
2.14.2

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/7] btrfs-progs: mkfs/rootdir: Use over-reserve method to make size estimate easier

2017-10-19 Thread Qu Wenruo
Use an easier method to estimate the device size for mkfs.btrfs
--rootdir.

The new method will over-estimate, but should ensure we won't encounter
ENOSPC.

It relies on the following data to estimate:
1) number of inodes
   for metadata chunk size
2) rounded up data size of each regular inode
   for data chunk size.

Total meta chunk size = round_up(nr_inode * (PATH_MAX * 3 + sectorsize),
min_chunk_size) * profile_multiplier

PATH_MAX is the maximum size possible for INODE_REF/DIR_INDEX/DIR_ITEM.
Sectorsize is the maximum size possible for an inline extent.
min_chunk_size is 8M for SINGLE and 32M for DUP, taken from
btrfs_alloc_chunk().
profile_multiplier is 1 for SINGLE, 2 for DUP.

Total data chunk size is much easier.
Total data chunk size = round_up(total_data_usage, min_chunk_size) *
profile_multiplier

total_data_usage is the sum of the *rounded up* size used by each
regular inode.
min_chunk_size is 8M for SINGLE, 64M for DUP, taken from
btrfs_alloc_chunk().
profile_multiplier is the same as for metadata.

This calculation is, of course, an over-estimate.
But since we will later shrink the fs to its real usage, it doesn't
matter much now.

Signed-off-by: Qu Wenruo 
---
 mkfs/main.c| 109 ++--
 mkfs/rootdir.c | 119 +++--
 mkfs/rootdir.h |   4 +-
 3 files changed, 139 insertions(+), 93 deletions(-)

diff --git a/mkfs/main.c b/mkfs/main.c
index 5b8de6f690bb..1355089505ca 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -693,8 +693,6 @@ int main(int argc, char **argv)
int force_overwrite = 0;
char *source_dir = NULL;
int source_dir_set = 0;
-   u64 num_of_meta_chunks = 0;
-   u64 size_of_data = 0;
u64 source_dir_size = 0;
u64 min_dev_size;
int dev_cnt = 0;
@@ -909,6 +907,34 @@ int main(int argc, char **argv)
 
min_dev_size = btrfs_min_dev_size(nodesize, mixed, metadata_profile,
  data_profile);
+   /*
+* Enlarge the destination file or create new one, using the
+* size calculated from source dir.
+*
+* This must be done before minimal device size check.
+*/
+   if (source_dir_set) {
+   fd = open(file, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP |
+ S_IWGRP | S_IROTH);
+   if (fd < 0) {
+   error("unable to open %s: %s", file, strerror(errno));
+   goto error;
+   }
+
+   source_dir_size = btrfs_mkfs_size_dir(source_dir, sectorsize,
+   min_dev_size, metadata_profile, data_profile);
+   if (block_count < source_dir_size)
+   block_count = source_dir_size;
+   ret = zero_output_file(fd, block_count);
+   if (ret) {
+   error("unable to zero the output file");
+   close(fd);
+   goto error;
+   }
+   /* our "device" is the new image file */
+   dev_block_count = block_count;
+   close(fd);
+   }
/* Check device/block_count after the nodesize is determined */
if (block_count && block_count < min_dev_size) {
error("size %llu is too small to make a usable filesystem",
@@ -942,51 +968,28 @@ int main(int argc, char **argv)
 
dev_cnt--;
 
-   if (!source_dir_set) {
-   /*
-* open without O_EXCL so that the problem should not
-* occur by the following processing.
-* (btrfs_register_one_device() fails if O_EXCL is on)
-*/
-   fd = open(file, O_RDWR);
-   if (fd < 0) {
-   error("unable to open %s: %s", file, strerror(errno));
-   goto error;
-   }
-   ret = btrfs_prepare_device(fd, file, _block_count,
-   block_count,
-   (zero_end ? PREP_DEVICE_ZERO_END : 0) |
-   (discard ? PREP_DEVICE_DISCARD : 0) |
-   (verbose ? PREP_DEVICE_VERBOSE : 0));
-   if (ret) {
-   goto error;
-   }
-   if (block_count && block_count > dev_block_count) {
-   error("%s is smaller than requested size, expected 
%llu, found %llu",
-   file,
-   (unsigned long long)block_count,
-   (unsigned long long)dev_block_count);
-   goto error;
-   }
-   } else {
-   fd = open(file, O_CREAT | O_RDWR,
-   S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | 
S_IROTH);
-   if (fd < 0) {
-   error("unable 

[PATCH 1/7] btrfs-progs: mkfs: Don't use custom chunk allocator for rootdir

2017-10-19 Thread Qu Wenruo
Remove the custom chunk allocator for mkfs.
Use the generic btrfs chunk allocator instead.

Signed-off-by: Qu Wenruo 
---
 mkfs/main.c | 75 ++---
 1 file changed, 7 insertions(+), 68 deletions(-)

diff --git a/mkfs/main.c b/mkfs/main.c
index 423b35579722..5b8de6f690bb 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -400,53 +400,6 @@ static char *parse_label(const char *input)
return strdup(input);
 }
 
-static int create_chunks(struct btrfs_trans_handle *trans,
-struct btrfs_root *root, u64 num_of_meta_chunks,
-u64 size_of_data,
-struct mkfs_allocation *allocation)
-{
-   struct btrfs_fs_info *fs_info = root->fs_info;
-   u64 chunk_start;
-   u64 chunk_size;
-   u64 meta_type = BTRFS_BLOCK_GROUP_METADATA;
-   u64 data_type = BTRFS_BLOCK_GROUP_DATA;
-   u64 minimum_data_chunk_size = SZ_8M;
-   u64 i;
-   int ret;
-
-   for (i = 0; i < num_of_meta_chunks; i++) {
-   ret = btrfs_alloc_chunk(trans, fs_info,
-   _start, _size, meta_type);
-   if (ret)
-   return ret;
-   ret = btrfs_make_block_group(trans, fs_info, 0,
-meta_type, 
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-chunk_start, chunk_size);
-   allocation->metadata += chunk_size;
-   if (ret)
-   return ret;
-   set_extent_dirty(>fs_info->free_space_cache,
-chunk_start, chunk_start + chunk_size - 1);
-   }
-
-   if (size_of_data < minimum_data_chunk_size)
-   size_of_data = minimum_data_chunk_size;
-
-   ret = btrfs_alloc_data_chunk(trans, fs_info,
-_start, size_of_data, data_type, 0);
-   if (ret)
-   return ret;
-   ret = btrfs_make_block_group(trans, fs_info, 0,
-data_type, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-chunk_start, size_of_data);
-   allocation->data += size_of_data;
-   if (ret)
-   return ret;
-   set_extent_dirty(>fs_info->free_space_cache,
-chunk_start, chunk_start + size_of_data - 1);
-   return ret;
-}
-
 static int zero_output_file(int out_fd, u64 size)
 {
int loop_num;
@@ -1180,34 +1133,20 @@ raid_groups:
goto out;
}
 
-   if (source_dir_set) {
-   trans = btrfs_start_transaction(root, 1);
-   BUG_ON(IS_ERR(trans));
-   ret = create_chunks(trans, root,
-   num_of_meta_chunks, size_of_data,
-   );
-   if (ret) {
-   error("unable to create chunks: %d", ret);
-   goto out;
-   }
-   ret = btrfs_commit_transaction(trans, root);
-   if (ret) {
-   error("transaction commit failed: %d", ret);
-   goto out;
-   }
+   ret = cleanup_temp_chunks(fs_info, , data_profile,
+ metadata_profile, metadata_profile);
+   if (ret < 0) {
+   error("failed to cleanup temporary chunks: %d", ret);
+   goto out;
+   }
 
+   if (source_dir_set) {
ret = btrfs_mkfs_fill_dir(source_dir, root, verbose);
if (ret) {
error("error wihle filling filesystem: %d", ret);
goto out;
}
}
-   ret = cleanup_temp_chunks(fs_info, , data_profile,
- metadata_profile, metadata_profile);
-   if (ret < 0) {
-   error("failed to cleanup temporary chunks: %d", ret);
-   goto out;
-   }
 
if (verbose) {
char features_buf[64];
-- 
2.14.2

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/7] btrfs-progs: mkfs: Reword --rootdir

2017-10-19 Thread Qu Wenruo
Can be fetched from github:
https://github.com/adam900710/btrfs-progs/tree/mkfs_rootdir_rework

Fetching from github is the preferred way to test, as this patchset
has 2 prerequisites:

1) Minimal device size patchset
   The image size estimate algorithm heavily relies on the minimal
   device size calculation

2) Rootdir refactor
   To make life a little easier.

Neither prerequisite is modified further in this patchset; both are the
versions submitted to the mailing list, rebased onto v4.13.3 without any
conflict.

Rework 'mkfs.btrfs --rootdir' by:

1) Not using a custom chunk allocator
   Use btrfs_alloc_chunk() only.
   Currently the chunk allocator in btrfs-progs is not small-device
   friendly, as it will try to allocate large chunks.
   This can be addressed by the image size estimate algorithm, so it
   won't cause too much of a problem.
   (But it still follows the minimal device size from normal mkfs, which
is over 100M for the default profile)

2) New image size estimate algorithm
   Use an over-reserve-for-metadata method, which should ensure we can
   write all content into the image.
   And rely on a later shrink to reduce the fs size to the minimum.

   Although the method itself is based on over-reserving, in fact it's
   quite space efficient in most cases.
   For the empty file case, we will use the data/meta space already
   allocated by normal mkfs, so no shrink is really needed.

   For large file and small metadata case, the size difference between
   shrunk and unshrunk image is less than 1%.

   However, since by nature we over-reserve for metadata, for
   extremely unbalanced data/meta cases, like tons of empty files, we
   really need to rely on the shrink functionality.

   And the algorithm itself only needs minimal amount of data.
   It only uses number of inodes and file size of each regular inode.

3) Shrinking the fs by device extent
   As implemented in almost all versions of the rework which include
   shrinking; tried and true, and easier to implement.
   And shrinking is completely independent now, so it can easily be
   modified to shrink a multi-device btrfs.

4) Separate shrink functionality into the '--shrink' option
   This causes less confusion.
   And due to my poor English, I only added a basic explanation to the
   mkfs doc, although I think this is enough since each piece of
   functionality is now easier to understand.

5) Not wasting IO to wipe the whole image
   Only wipe the first 1M and create a sparse file.

Qu Wenruo (7):
  btrfs-progs: mkfs: Don't use custom chunk allocator for rootdir
  btrfs-progs: mkfs/rootdir: Use over-reserve method to make size
estimate easier
  btrfs-progs: mkfs: Only zero out the first 1M for rootdir
  btrfs-progs: mkfs/rootdir: Introduce function to get end position of
last device extent
  btrfs-progs: mkfs/rootdir: Shrink fs for rootdir option
  btrfs-progs: mkfs: Update allocation info before verbose output
  btrfs-progs: mkfs: Separate shrink from rootdir

 Documentation/mkfs.btrfs.asciidoc |  11 ++
 mkfs/main.c   | 260 -
 mkfs/rootdir.c| 292 +-
 mkfs/rootdir.h|   6 +-
 4 files changed, 400 insertions(+), 169 deletions(-)

-- 
2.14.2

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 2/2] btrfs-progs: doc: add description of missing and example, of device remove

2017-10-19 Thread Misono, Tomohiro
This patch updates help/document of "btrfs device remove" in two points:

1. Add an explanation of 'missing' for 'device remove'. This is currently
only documented on the wiki page.
(https://btrfs.wiki.kernel.org/index.php/Using_Btrfs_with_Multiple_Devices)

2. Add an example of device removal to the man page. This is because
the explanation of "remove" says "See the example section below", but
there is currently no example of removal.

Signed-off-by: Tomohiro Misono 
Reviewed-by: Satoru Takeuchi 
---
 Documentation/btrfs-device.asciidoc | 24 +++-
 cmds-device.c   |  8 
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/Documentation/btrfs-device.asciidoc 
b/Documentation/btrfs-device.asciidoc
index 88822ec..2ed1e61 100644
--- a/Documentation/btrfs-device.asciidoc
+++ b/Documentation/btrfs-device.asciidoc
@@ -68,13 +68,22 @@ Remove device(s) from a filesystem identified by 
 Device removal must satisfy the profile constraints, otherwise the command
 fails. The filesystem must be converted to profile(s) that would allow the
 removal. This can typically happen when going down from 2 devices to 1 and
-using the RAID1 profile. See the example section below.
+using the RAID1 profile. See the *TYPICAL USECASES* section below.
 +
 The operation can take long as it needs to move all data from the device.
 +
 It is possible to delete the device that was used to mount the filesystem. The
 device entry in mount table will be replaced by another device name with the
 lowest device id.
++
+If filesystem is mounted in degraded mode (-o degraded), special term "missing"
+can be used for . In that case, the first device that is described by
+the filesystem metadata, but not present at the mount time will be removed.
++
+NOTE: In most cases, there is only one missing device in degraded mode,
+otherwise mount fails. If there are two or more devices missing (e.g. possible
+in RAID 6), you need specify "missing" as many times as the number of missing
+devices to remove all of them.
 
 *delete* | [|...] ::
 Alias of remove kept for backward compatibility
@@ -206,6 +215,19 @@ data or the block groups occupy the whole first device.
 The device size of '/dev/sdb' as seen by the filesystem remains unchanged, but
 the logical space from 50-100GiB will be unused.
 
+ REMOVE DEVICE 
+
+Device removal must satisfy the profile constraints, otherwise the command
+fails. For example:
+
+ $ btrfs device remove /dev/sda /mnt
+ ERROR: error removing device '/dev/sda': unable to go below two devices on 
raid1
+
+In order to remove a device, you need to convert the profile in this case:
+
+ $ btrfs balance start -mconvert=dup -dconvert=single /mnt
+ $ btrfs device remove /dev/sda /mnt
+
 DEVICE STATS
 
 
diff --git a/cmds-device.c b/cmds-device.c
index 3b6b985..e7e9ed5 100644
--- a/cmds-device.c
+++ b/cmds-device.c
@@ -224,9 +224,16 @@ static int _cmd_device_remove(int argc, char **argv,
return !!ret;
 }
 
+#define COMMON_USAGE_REMOVE_DELETE \
+   "", \
+   "If 'missing' is specified for , the first device that is", \
+   "described by the filesystem metadata, but not present at the mount", \
+   "time will be removed. (only in degraded mode)"
+
 static const char * const cmd_device_remove_usage[] = {
"btrfs device remove | [|...] ",
"Remove a device from a filesystem",
+   COMMON_USAGE_REMOVE_DELETE,
NULL
 };
 
@@ -238,6 +245,7 @@ static int cmd_device_remove(int argc, char **argv)
 static const char * const cmd_device_delete_usage[] = {
"btrfs device delete | [|...] ",
"Remove a device from a filesystem (alias of \"btrfs device remove\")",
+   COMMON_USAGE_REMOVE_DELETE,
NULL
 };
 
-- 
2.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 0/2] btrfs-progs: doc: update btrfs device remove

2017-10-19 Thread Misono, Tomohiro
This updates help/doc of "btrfs device remove".

The first patch adds to the help message the explanation that delete is
an alias of remove.
The second patch adds the description of "remove missing", which is
currently only documented on the wiki page, and an example of device removal.

v1->v2:
 split the patch and updates the messages
v2->v3
 withdrow "remove missing-all" feature
v3->v4
 add more description about missing to man

Tomohiro Misono (2):
  btrfs-progs: device: add description of alias to help message
  btrfs-progs: doc: add description of missing and example of device
remove

 Documentation/btrfs-device.asciidoc | 24 +++-
 cmds-device.c   | 10 +-
 2 files changed, 32 insertions(+), 2 deletions(-)

-- 
2.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 1/2] btrfs-progs: device: add description of alias to help message

2017-10-19 Thread Misono, Tomohiro
State that 'delete' is an alias of 'remove', as the man page says.

Signed-off-by: Tomohiro Misono 
Reviewed-by: Satoru Takeuchi 
---
 cmds-device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmds-device.c b/cmds-device.c
index 4337eb2..3b6b985 100644
--- a/cmds-device.c
+++ b/cmds-device.c
@@ -237,7 +237,7 @@ static int cmd_device_remove(int argc, char **argv)
 
 static const char * const cmd_device_delete_usage[] = {
"btrfs device delete | [|...] ",
-   "Remove a device from a filesystem",
+   "Remove a device from a filesystem (alias of \"btrfs device remove\")",
NULL
 };
 
-- 
2.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 11/49] btrfs: avoid access to .bi_vcnt directly

2017-10-19 Thread Ming Lei
On Thu, Aug 10, 2017 at 04:29:59AM -0700, Christoph Hellwig wrote:
> > +static unsigned int get_bio_pages(struct bio *bio)
> > +{
> > +   unsigned i;
> > +   struct bio_vec *bv;
> > +
> > +   bio_for_each_segment_all(bv, bio, i)
> > +   ;
> > +
> > +   return i;
> > +}
> 
> s/get_bio_pages/bio_nr_pages/ ?

Yeah, the name of bio_nr_pages() is much better.

> 
> Also this seems like a useful helper for bio.h

OK.


-- 
Ming
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 0/6] Btrfs: populate heuristic with code

2017-10-19 Thread Timofey Titovets
2017-10-19 18:39 GMT+03:00 David Sterba :
> On Fri, Sep 29, 2017 at 06:22:00PM +0200, David Sterba wrote:
>> On Thu, Sep 28, 2017 at 05:33:35PM +0300, Timofey Titovets wrote:
>> > Compile tested, hand tested on live system
>> >
>> > Change v7 -> v8
>> >   - All code moved to compression.c (again)
>> >   - Heuristic workspaces implemented another way
>> > i.e. only share logic with compression workspaces
>> >   - Some style fixes suggested by David
>> >   - Move sampling function from heuristic code
>> > (I'm afraid of big functions)
>> >   - Much more comments and explanations
>>
>> Thanks for the update, I went through the patches and they looked good
>> enough to be put into for-next. I may have more comments about a few
>> things, but nothing serious that would hinder testing.
>
> I did a final pass through the patches and edited comments where I was
> not able to understand them. Please check the updated patches in [1] if
> I did not accidentally change the meaning.

I don't see link [1] in the mail; maybe you forgot to include it?
I looked at my patches in the for-next branch, and they don't look
changed, so I assume your link does not point at kernel.org %).

> I'm about to add the patchset to the main patch pile for 4.15 soon.
> Further tuning is possible and such patches will be probably accepted
> during the 4.15 development cycle once the as parts have landed. It's
> desirable to gather some testing results of heuristic effects on various
> data types. So far I've been watching for performance drops only.

Just for my information, you compare compress + heuristic with
compression force?

P.S.
Just to sync on what we expect from the heuristic:
some performance drop is expected on easily compressible data, because
the heuristic is not free,
but how big is this drop?
The main reason for the heuristic is to save CPU/latency cost on badly
compressible data,
compared to direct compression.
That allows us to provide stable worst-case latency/throughput for userspace.

P.S.S.
I sent some emails before, where I showed slow paths in the heuristic
(sort(), ilog2()).
So I expect that the kernel can see the same slowdowns on those paths.
But I don't have enough skills for now to perform kernel profiling.

> In case the heuristic would turn out to cause problems we can't fix
> during 4.15 cycle, we can still disable it. This is only a last resort
> measure but we need to be prepared.
kk

Thanks.


-- 
Have a nice day,
Timofey.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: SLES 11 SP4: can't mount btrfs

2017-10-19 Thread Chris Murphy
On Thu, Oct 19, 2017 at 6:43 PM, Lentes, Bernd
 wrote:
> Hi,
>
> this is the continuation of a thread i started on a SLES forum 
> (https://forums.suse.com/showthread.php?10109-lv-with-btrfs-corrupt-some-tips-please),
>  but i think this is the more appropriate place.

Maybe, but as this is SLES, you're effectively paying for support, and
it's actually better to open a support instance, in my opinion. Btrfs
is not just supported by SUSE, it's the default file system.


> I booted the system now with knoppix 8.1, which has kernel 4.12.7 and 
> btrfs-progs 4.7.3-1. Is that ok ?

Since it's SLES, you have to use whatever they recommend. I don't know
Knoppix kernels, so I can't say with any certainty what's in 4.12.7,
but 4.7.3 for btrfs-progs is rather old. There have been many
improvements since that time, in particular with 'btrfs check'

What I usually recommend is getting a current version of Fedora
because it'll have very recent kernels, the gotcha being that to also
get recent btrfs-progs you have to get a copy of Rawhide or like right
now you could use Fedora 27 Beta which is decently safe and also has
new kernel and progs, and in terms of Btrfs or block level stuff we
care about, Fedora runs pretty much identical to kernel.org code,
so it's not like developers have to use secret decoder rings to know
what the kernel version really means.
https://getfedora.org/en/workstation/prerelease/



> I tried:
>
> mount /dev/vg1/lv_root /lv_root -o recovery,ro
> and got:
> mount: wrong fs type, bad option, bad superblock on /dev/mapper/vg1-lv_root,
>missing codepage or helper program, or other error
>
>In some cases useful info is found in syslog - try
>dmesg | tail or so.
>
>
> and got via dmesg:
> [92518.955408] BTRFS info (device dm-0): disk space caching is enabled
> [92518.990561] BTRFS error (device dm-0): parent transid verify failed on 
> 196314759168 wanted 793932 found 793496
> [92518.990911] BTRFS error (device dm-0): parent transid verify failed on 
> 196314759168 wanted 793932 found 793496
> [92518.990919] BTRFS error (device dm-0): failed to read block groups: -5
> [92519.070084] BTRFS error (device dm-0): open_ctree failed





>
> next step:
>
> root@Microknoppix:~# btrfs device scan
> Scanning for Btrfs filesystems
> root@Microknoppix:~#
>
> no result !?!

Normal. Scan just tells btrfs to go look for devices, it doesn't
return a result.

>
> Now i changed to https://btrfs.wiki.kernel.org/index.php/Restore:
>
> btrfs restore -smSvi /dev/vg1/lv_root 
> /mnt/idg-2/SysAdmin_AG_Wurst/recover/ha-idg-1/ |tee 
> /mnt/idg-2/SysAdmin_AG_Wurst/recover/ha-idg-1/recover.log
>
> I just started it. I get lines like:
> offset is 61440
> offset is 98304
> offset is 4096
> offset is 143360
> offset is 8192
> offset is 184320
>
> or
>
> Error searching 
> /mnt/idg-2/SysAdmin_AG_Wurst/recover/ha-idg-1/@/tmp/localhpsum/assets/doc/help/en
> Error searching 
> /mnt/idg-2/SysAdmin_AG_Wurst/recover/ha-idg-1/@/tmp/localhpsum/assets/doc/help/ja/images/callouts
>
> What does that mean ?

btrfs restore is pretty much just a brute force hammer for scraping
data off a volume, pretty much it either works or doesn't work,
there's not a lot of in between.

You're probably better off doing both 'btrfs check' and 'btrfs check
--mode=lowmem' without the --repair option, and report back what both
results are.

You can cancel the restore that's in progress; it sounds like it's
still doing something, but stuck in a loop maybe. I'm not sure if
there are many restore fixes since btrfs-progs 4.7 though; you could
check the changelog which is in the wiki.

I think the way forward is to get a little more information for
developers, and then maybe they'll be able to say whether --repair is
safe to use or not in your situation.

In any case, report back what you find out. It'd be useful.

-- 
Chris Murphy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Is it safe to use btrfs on top of different types of devices?

2017-10-19 Thread Peter Grandi
[ ... ]

>> are USB drives really that unreliable [ ... ]
[ ... ]
> There are similar SATA chips too (occasionally JMicron and
> Marvell for example are somewhat less awesome than they could
> be), and practically all Firewire bridge chips of old "lied" a
> lot [ ... ]
> That plus Btrfs is designed to work on top of a "well defined"
> block device abstraction that is assumed to "work correctly"
> (except for data corruption), [ ... ]

When I insist on the reminder that Btrfs is designed to use the
block-device protocol and state machine, rather than USB and
SATA devices, it is because that makes more explicit that the
various layers between the USB and SATA device can "lie" too,
including for example the Linux page cache which is just below
the block-device layer. But also the disk scheduler, the SCSI
protocol handler, the USB and SATA drivers and disk drivers, the
PCIe chipset, the USB or SATA host bus adapter, the cable, the
backplane.

This paper reports the results of some testing of "enterprise
grade" storage systems at CERN, and some of the symptoms imply
that "lies" can happen *anywhere*. It is scary. It supports
having data checksumming in the filesystem, a rather extreme
choice.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: SLES 11 SP4: can't mount btrfs

2017-10-19 Thread Lentes, Bernd

> -Original Message-
> From: linux-btrfs-ow...@vger.kernel.org [mailto:linux-btrfs-
> ow...@vger.kernel.org] On Behalf Of Lentes, Bernd
> Sent: Thursday, October 19, 2017 7:44 PM
> To: Btrfs ML 
> Subject: SLES 11 SP4: can't mount btrfs
>
> Hi,
>
> this is the continuation of a thread i started on a SLES forum
> (https://forums.suse.com/showthread.php?10109-lv-with-btrfs-corrupt-
> some-tips-please), but i think this is the more appropriate place.
> I have a SLES 11 SP4 with a btrfs on top of a logical volume i can't mount
> anymore. The host was fenced in a two-node cluster, and the boot
> procedure can't mount the lv, and i reside in simple shell (i assume the
> one from initrd).
>
> I have a second nearly identical node, so i can give you some information:
>
> ha-idg-2:/etc/corosync # uname -a
> Linux ha-idg-2 3.0.101-84-default #1 SMP Tue Oct 18 10:32:51 UTC 2016
> (15251d6) x86_64 x86_64 x86_64 GNU/Linux
>
> ha-idg-2:/etc/corosync # rpm -qa|grep -i btrfs
> libbtrfs0-3.18.2-0.40.48
> btrfsmaintenance-0.1-3.1
> btrfsprogs-3.18.2-0.40.48
>
> I try to follow the recommendations on
> https://btrfs.wiki.kernel.org/index.php/Problem_FAQ.
>
> I booted the system now with knoppix 8.1, which has kernel 4.12.7 and
> btrfs-progs 4.7.3-1. Is that ok ?
> I tried:
>
> mount /dev/vg1/lv_root /lv_root -o recovery,ro and got:
> mount: wrong fs type, bad option, bad superblock on /dev/mapper/vg1-
> lv_root,
>missing codepage or helper program, or other error
>
>In some cases useful info is found in syslog - try
>dmesg | tail or so.
>
>
> and got via dmesg:
> [92518.955408] BTRFS info (device dm-0): disk space caching is enabled
> [92518.990561] BTRFS error (device dm-0): parent transid verify failed on
> 196314759168 wanted 793932 found 793496 [92518.990911] BTRFS error
> (device dm-0): parent transid verify failed on 196314759168 wanted
> 793932 found 793496 [92518.990919] BTRFS error (device dm-0): failed to
> read block groups: -5 [92519.070084] BTRFS error (device dm-0):
> open_ctree failed
>
> next step:
>
> root@Microknoppix:~# btrfs device scan
> Scanning for Btrfs filesystems
> root@Microknoppix:~#
>
> no result !?!
>
> Now i changed to https://btrfs.wiki.kernel.org/index.php/Restore:
>
> btrfs restore -smSvi /dev/vg1/lv_root /mnt/idg-
> 2/SysAdmin_AG_Wurst/recover/ha-idg-1/ |tee /mnt/idg-
> 2/SysAdmin_AG_Wurst/recover/ha-idg-1/recover.log
>
> I just started it. I get lines like:
> offset is 61440
> offset is 98304
> offset is 4096
> offset is 143360
> offset is 8192
> offset is 184320
>
> or
>
> Error searching /mnt/idg-2/SysAdmin_AG_Wurst/recover/ha-idg-
> 1/@/tmp/localhpsum/assets/doc/help/en
> Error searching /mnt/idg-2/SysAdmin_AG_Wurst/recover/ha-idg-
> 1/@/tmp/localhpsum/assets/doc/help/ja/images/callouts
>
> What does that mean ?
>
>
> Bernd
>

The process does not continue, but it's still visible with ps. Also top does 
not show that this process is consuming resources. iotop does not show any 
activity on my lv.
My logfile is unchanged for two hours. Does that mean that btrfs gave up or 
is it still struggling ?

Bernd
 

Helmholtz Zentrum Muenchen
Deutsches Forschungszentrum fuer Gesundheit und Umwelt (GmbH)
Ingolstaedter Landstr. 1
85764 Neuherberg
www.helmholtz-muenchen.de
Aufsichtsratsvorsitzende: MinDir'in Baerbel Brumme-Bothe
Geschaeftsfuehrer: Prof. Dr. Guenther Wess, Heinrich Bassler, Dr. Alfons Enhsen
Registergericht: Amtsgericht Muenchen HRB 6466
USt-IdNr: DE 129521671

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Is it safe to use btrfs on top of different types of devices?

2017-10-19 Thread Peter Grandi
[ ... ]
>>> Oh please, please a bit less silliness would be welcome here.
>>> In a previous comment on this tedious thread I had written:

> If the block device abstraction layer and lower layers work
> correctly, Btrfs does not have problems of that sort when
> adding new devices; conversely if the block device layer and
> lower layers do not work correctly, no mainline Linux
> filesystem I know can cope with that.

> Note: "work correctly" does not mean "work error-free".

>>> The last line is very important and I added it advisedly.
[ ... ]
>> Filesystems run on top of *block-devices* with a definite
>> interface and a definite state machine, and filesystems in
>> general assume that the block-device works *correctly*.

> They do run on top of USB or SATA devices, otherwise a
> significant majority of systems running Linux and/or BSD
> should not be operating right now.

That would be big news to any Linux/UNIX filesystem developer,
who would have to rush to add SATA and USB protocol and state
machine handling to their implementations, which currently only
support the block-device protocol and state machine.
Please send patches :-)

  Note to some readers: there are filesystems designed to work
  on top not of block devices, like on top the MTD abstraction
  layer, for example.

> Yes, they don't directly access them, but the block layer
> isn't much more than command translation, scheduling, and
> accounting, so this distinction is meaningless and largely
> irrelevant.

More tedious silliness and grossly ignorant too, because the
protocol and state machine of the block-device layer is
completely different from that of both SATA and USB, and the
mapping of the SATA or USB protocols and state machines onto the
block-device ones is actually a very complex, difficult, and
error prone task, involving mountains of very hairy code. In
particular since the block-device protocol and state machine are
rather simplistic, a lot is lost in translation.

  Note: the SATA handling firmware in disk device often involves
  *dozens of thousands* of lines of code, and "all it does" is
  "just" reading the device and passing the content over the IO
  bus.

Filesystems are designed to that very simplistic protocol and
state machine for good reasons, and sometimes they are designed
to even just a subset; for example most filesystem designs
assume that block-device writes never fail (that is, bad sector
sparing is done by a lower layer), and only some handle
gracefully block-device read failures.

> [ ... ] to refer to a block-device connected via interface 'X'
> as an 'X device' or an 'X storage device'.

More tedious silliness as this is a grossly misleading shorthand
when the point of the discussion is the error recovery protocol
and state machine assumed by filesystem designers. To me it seems
that if people use that shorthand in that context, as if it was
not a shorthand, they don't know what they are talking about, or
they are trying to mislead the discussion.

> [ ... ] For an end user, it generally doesn't matter whether a
> given layer reported the error or passed it on (or generated
> it), it matters whether it was corrected or not. [ ... ]

You seem unable or unwilling to appreciate how detected and
undetected errors are fundamentally different, and how layering
of greatly different protocols is a complicated issue highly
relevant to error recovery, so you seem to assume that other end
users are likewise unable or unwilling.

But I am not so dismissive of "end users", and I assume that
there are end users that can eventually understand that Btrfs in
the main is not designed to handle devices that "lie" because
Btrfs actually is designed to use the block-device layer which
is assumed to "work correctly" (except for checksums).
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Is it safe to use btrfs on top of different types of devices?

2017-10-19 Thread Peter Grandi
> [ ... ] when writes to a USB device fail due to a temporary
> disconnection, the kernel can actually recognize that a write
> error happened. [ ... ]

Usually, but who knows? Maybe half transfer gets written; maybe
the data gets written to the wrong address; maybe stuff gets
written but failure is reported, and this not just if the
connection dies, but also if it does not.

> are USB drives really that unreliable [ ... ]

Welcome to the "real world", also called "Shenzhen" :-).

There aren't that many "USB drives", as I wrote somewhere there
are usually USB host bus adapters (on the system side) and USB
IO bus (usually SATA) bridges (on the device side).

They both have to do difficult feats of conversion and signaling,
and in the USB case they are usually designed by a stressed,
overworked engineer in Guangzhou or Taiwan employed by a no-name
contractor who submitted the lowest bid to a no-name
manufacturer, and was told to do the cheapest design to fabricate
in the shortest possible time. Most of the time they mostly work,
good enough for keyboard and mice, and for photos of cats on usb
sticks; most users just unplug and replug them in if they flake
out. BTW my own USB keyboard and mice and their USB host bus
adapter occasionally crash too, and the cases where my webcam
flakes out are more common than when it does not. USB is a mixed
bag of poorly designed protocols and complex too, and it is very
easy to do a bad implementation.

There are similar SATA chips too (occasionally JMicron and
Marvell for example are somewhat less awesome than they could
be), and practically all Firewire bridge chips of old "lied" a
lot except a few Oxford Semi ones (the legendary 911 series).
I have even seen lying SAS "enterprise" grade storage
interconnects. I had indeed previously written:

  > If you have concerns about the reliability of specific
  > storage and system configurations you should become or find a
  > system integration and qualification engineer who understands
  > the many subtleties of storage devices and device-system
  > interconnects and who would run extensive tests on it;
  > storage and system commissioning is often far from trivial
  > even in seemingly simple cases, due in part to the enormous
  > complexity of interfaces, even when they have few bugs, and
  > tests made with one combination often do not have the same
  > results even on apparently similar combinations.

On the #Btrfs IRC channel there is a small group of cynical
helpers, and when someone mentions "strange things happening" one
of them usually immediately asks "USB?" and in most cases the
answer is "how did you know?".

That plus Btrfs is designed to work on top of a "well defined"
block device abstraction that is assumed to "work correctly"
(except for data corruption), and the Linux block device
abstraction and SATA and USB layers beneath it are not designed
to handle devices that "lie" (well, there are blacklists with
workarounds for known systematic bugs, but that is partial).
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Is it safe to use btrfs on top of different types of devices?

2017-10-19 Thread Peter Grandi
> [ ... ] However, the disappearance of the device doesn't get
> propagated up to the filesystem correctly,

Indeed, sometimes it does, sometimes it does not, in part
because of chipset bugs, in part because the USB protocol
signaling side does not handle errors well even if the chipset
were bug free.

> and that is what causes the biggest issue with BTRFS. Because
> BTRFS just knows writes are suddenly failing for some reason,
> it doesn't try to release the device so that things get
> properly cleaned up in the kernel, and thus when the same
> device reappears (as it will when the disconnect was due to a
> transient bus error, which happens a lot), it shows up as a
> different device node, which gets scanned for filesystems by
> udev, and BTRFS then gets really confused because it now sees
> 3 (or more) devices for a 2 device filesystem.

That's a good description that should be on the wiki.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 8/8] btrfs: move btrfs_truncate_block out of trans handle

2017-10-19 Thread Josef Bacik
Since we do a delalloc reserve in btrfs_truncate_block we can deadlock
with freeze.  If somebody else is trying to allocate metadata for this
inode and it gets stuck in start_delalloc_inodes because of freeze we
will deadlock.  Be safe and move this outside of a trans handle.  This
also has a side-effect of making sure that we're not leaving stale data
behind in the other_encoding or encryption case.  Not an issue now since
nobody uses it, but it would be a problem in the future.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/inode.c | 119 ---
 1 file changed, 44 insertions(+), 75 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 68e28375e159..c94e8938b574 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4357,47 +4357,11 @@ static int truncate_space_check(struct 
btrfs_trans_handle *trans,
 
 }
 
-static int truncate_inline_extent(struct inode *inode,
- struct btrfs_path *path,
- struct btrfs_key *found_key,
- const u64 item_end,
- const u64 new_size)
-{
-   struct extent_buffer *leaf = path->nodes[0];
-   int slot = path->slots[0];
-   struct btrfs_file_extent_item *fi;
-   u32 size = (u32)(new_size - found_key->offset);
-   struct btrfs_root *root = BTRFS_I(inode)->root;
-
-   fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
-
-   if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
-   loff_t offset = new_size;
-   loff_t page_end = ALIGN(offset, PAGE_SIZE);
-
-   /*
-* Zero out the remaining of the last page of our inline extent,
-* instead of directly truncating our inline extent here - that
-* would be much more complex (decompressing all the data, then
-* compressing the truncated data, which might be bigger than
-* the size of the inline extent, resize the extent, etc).
-* We release the path because to get the page we might need to
-* read the extent item from disk (data not in the page cache).
-*/
-   btrfs_release_path(path);
-   return btrfs_truncate_block(inode, offset, page_end - offset,
-   0);
-   }
-
-   btrfs_set_file_extent_ram_bytes(leaf, fi, size);
-   size = btrfs_file_extent_calc_inline_size(size);
-   btrfs_truncate_item(root->fs_info, path, size, 1);
-
-   if (test_bit(BTRFS_ROOT_REF_COWS, >state))
-   inode_sub_bytes(inode, item_end + 1 - new_size);
-
-   return 0;
-}
+/*
+ * Return this if we need to call truncate_block for the last bit of the
+ * truncate.
+ */
+#define NEED_TRUNCATE_BLOCK 1
 
 /*
  * this can truncate away extent items, csum items and directory items.
@@ -4558,11 +4522,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle 
*trans,
if (found_type != BTRFS_EXTENT_DATA_KEY)
goto delete;
 
-   if (del_item)
-   last_size = found_key.offset;
-   else
-   last_size = new_size;
-
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
u64 num_dec;
extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
@@ -4604,40 +4563,29 @@ int btrfs_truncate_inode_items(struct 
btrfs_trans_handle *trans,
 */
if (!del_item &&
btrfs_file_extent_encryption(leaf, fi) == 0 &&
-   btrfs_file_extent_other_encoding(leaf, fi) == 0) {
-
+   btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
+   btrfs_file_extent_compression(leaf, fi) == 0) {
+   u32 size = (u32)(new_size - found_key.offset);
+   btrfs_set_file_extent_ram_bytes(leaf, fi, size);
+   size = btrfs_file_extent_calc_inline_size(size);
+   btrfs_truncate_item(root->fs_info, path, size, 
1);
+   } else if (!del_item) {
/*
-* Need to release path in order to truncate a
-* compressed extent. So delete any accumulated
-* extent items so far.
+* We have to bail so the last_size is set to
+* just before this extent.
 */
-   if (btrfs_file_extent_compression(leaf, fi) !=
-   BTRFS_COMPRESS_NONE && pending_del_nr) {
-   err = btrfs_del_items(trans, root, 

[PATCH 4/8] btrfs: switch args for comp_*_refs

2017-10-19 Thread Josef Bacik
Make it more consistent, we want the inserted ref to be compared against
what's already in there.  This will make the order go from lowest seq ->
highest seq, which will make us more likely to make forward progress if
there's a seqlock currently held.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/delayed-ref.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index a2973340a94f..bc940bb374cf 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -40,8 +40,8 @@ struct kmem_cache *btrfs_delayed_extent_op_cachep;
 /*
  * compare two delayed tree backrefs with same bytenr and type
  */
-static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
- struct btrfs_delayed_tree_ref *ref1)
+static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1,
+ struct btrfs_delayed_tree_ref *ref2)
 {
if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
if (ref1->root < ref2->root)
@@ -60,8 +60,8 @@ static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
 /*
  * compare two delayed data backrefs with same bytenr and type
  */
-static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
- struct btrfs_delayed_data_ref *ref1)
+static int comp_data_refs(struct btrfs_delayed_data_ref *ref1,
+ struct btrfs_delayed_data_ref *ref2)
 {
if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
if (ref1->root < ref2->root)
-- 
2.7.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/8] btrfs: make the delalloc block rsv per inode

2017-10-19 Thread Josef Bacik
The way we handle delalloc metadata reservations has gotten
progressively more complicated over the years.  There is so much cruft
and weirdness around keeping the reserved count and outstanding counters
consistent and handling the error cases that it's impossible to
understand.

Fix this by making the delalloc block rsv per-inode.  This way we can
calculate the actual size of the outstanding metadata reservations every
time we make a change, and then reserve the delta based on that amount.
This greatly simplifies the code everywhere, and makes the error
handling in btrfs_delalloc_reserve_metadata far less terrifying.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/btrfs_inode.h   |  27 ++--
 fs/btrfs/ctree.h |   5 +-
 fs/btrfs/delayed-inode.c |  46 +--
 fs/btrfs/disk-io.c   |  18 ++-
 fs/btrfs/extent-tree.c   | 320 ---
 fs/btrfs/inode.c |  18 +--
 6 files changed, 141 insertions(+), 293 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5ebeafc19936..63f0ccc92a71 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -36,14 +36,13 @@
 #define BTRFS_INODE_ORPHAN_META_RESERVED   1
 #define BTRFS_INODE_DUMMY  2
 #define BTRFS_INODE_IN_DEFRAG  3
-#define BTRFS_INODE_DELALLOC_META_RESERVED 4
-#define BTRFS_INODE_HAS_ORPHAN_ITEM5
-#define BTRFS_INODE_HAS_ASYNC_EXTENT   6
-#define BTRFS_INODE_NEEDS_FULL_SYNC7
-#define BTRFS_INODE_COPY_EVERYTHING8
-#define BTRFS_INODE_IN_DELALLOC_LIST   9
-#define BTRFS_INODE_READDIO_NEED_LOCK  10
-#define BTRFS_INODE_HAS_PROPS  11
+#define BTRFS_INODE_HAS_ORPHAN_ITEM4
+#define BTRFS_INODE_HAS_ASYNC_EXTENT   5
+#define BTRFS_INODE_NEEDS_FULL_SYNC6
+#define BTRFS_INODE_COPY_EVERYTHING7
+#define BTRFS_INODE_IN_DELALLOC_LIST   8
+#define BTRFS_INODE_READDIO_NEED_LOCK  9
+#define BTRFS_INODE_HAS_PROPS  10
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -176,7 +175,8 @@ struct btrfs_inode {
 * of extent items we've reserved metadata for.
 */
unsigned outstanding_extents;
-   unsigned reserved_extents;
+
+   struct btrfs_block_rsv block_rsv;
 
/*
 * Cached values of inode properties
@@ -278,15 +278,6 @@ static inline void btrfs_mod_outstanding_extents(struct 
btrfs_inode *inode,
  mod);
 }
 
-static inline void btrfs_mod_reserved_extents(struct btrfs_inode *inode,
- int mod)
-{
-   lockdep_assert_held(>lock);
-   inode->reserved_extents += mod;
-   if (btrfs_is_free_space_inode(inode))
-   return;
-}
-
 static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
 {
int ret = 0;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9d950c2dd53f..0685ec774d72 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -763,8 +763,6 @@ struct btrfs_fs_info {
 * delayed dir index item
 */
struct btrfs_block_rsv global_block_rsv;
-   /* block reservation for delay allocation */
-   struct btrfs_block_rsv delalloc_block_rsv;
/* block reservation for metadata operations */
struct btrfs_block_rsv trans_block_rsv;
/* block reservation for chunk tree */
@@ -2756,6 +2754,9 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
  unsigned short type);
+void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
+  struct btrfs_block_rsv *rsv,
+  unsigned short type);
 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
  struct btrfs_block_rsv *rsv);
 void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 19e4ad2f3f2e..5d73f79ded8b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -581,7 +581,6 @@ static int btrfs_delayed_inode_reserve_metadata(
struct btrfs_block_rsv *dst_rsv;
u64 num_bytes;
int ret;
-   bool release = false;
 
src_rsv = trans->block_rsv;
dst_rsv = _info->delayed_block_rsv;
@@ -589,36 +588,13 @@ static int btrfs_delayed_inode_reserve_metadata(
num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
 
/*
-* If our block_rsv is the delalloc block reserve then check and see if
-* we have our extra reservation for updating the inode.  If not fall
-* through and try to reserve space quickly.
-*
-* We used to try and steal from the 

[PATCH 6/8] btrfs: track refs in a rb_tree instead of a list

2017-10-19 Thread Josef Bacik
If we get a significant amount of delayed refs for a single block (think
modifying multiple snapshots) we can end up spending an ungodly amount
of time looping through all of the entries trying to see if they can be
merged.  This is because we only add them to a list, so we have O(2n)
for every ref head.  This doesn't make any sense as we likely have refs
for different roots, and so they cannot be merged.  Tracking in a tree
will allow us to break as soon as we hit an entry that doesn't match,
making our worst case O(n).

With this we can also merge entries more easily.  Before we had to hope
that matching refs were on the ends of our list, but with the tree we
can search down to exact matches and merge them at insert time.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/backref.c |   5 ++-
 fs/btrfs/delayed-ref.c | 107 +
 fs/btrfs/delayed-ref.h |   5 +--
 fs/btrfs/disk-io.c |  10 +++--
 fs/btrfs/extent-tree.c |  21 ++
 5 files changed, 81 insertions(+), 67 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 33cba1abf8b6..9b627b895806 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -769,6 +769,7 @@ static int add_delayed_refs(const struct btrfs_fs_info 
*fs_info,
struct btrfs_key key;
struct btrfs_key tmp_op_key;
struct btrfs_key *op_key = NULL;
+   struct rb_node *n;
int count;
int ret = 0;
 
@@ -778,7 +779,9 @@ static int add_delayed_refs(const struct btrfs_fs_info 
*fs_info,
}
 
spin_lock(>lock);
-   list_for_each_entry(node, >ref_list, list) {
+   for (n = rb_first(>ref_tree); n; n = rb_next(n)) {
+   node = rb_entry(n, struct btrfs_delayed_ref_node,
+   ref_node);
if (node->seq > seq)
continue;
 
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index c4cfadb9768c..48a9b23774e6 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -143,6 +143,33 @@ static struct btrfs_delayed_ref_head *htree_insert(struct 
rb_root *root,
return NULL;
 }
 
+static struct btrfs_delayed_ref_node *
+tree_insert(struct rb_root *root, struct btrfs_delayed_ref_node *ins)
+{
+   struct rb_node **p = >rb_node;
+   struct rb_node *node = >ref_node;
+   struct rb_node *parent_node = NULL;
+   struct btrfs_delayed_ref_node *entry;
+
+   while (*p) {
+   int comp;
+   parent_node = *p;
+   entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
+ref_node);
+   comp = comp_refs(ins, entry, true);
+   if (comp < 0)
+   p = &(*p)->rb_left;
+   else if (comp > 0)
+   p = &(*p)->rb_right;
+   else
+   return entry;
+   }
+
+   rb_link_node(node, parent_node, p);
+   rb_insert_color(node, root);
+   return NULL;
+}
+
 /*
  * find an head entry based on bytenr. This returns the delayed ref
  * head if it was able to find one, or NULL if nothing was in that spot.
@@ -212,7 +239,8 @@ static inline void drop_delayed_ref(struct 
btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref)
 {
assert_spin_locked(>lock);
-   list_del(>list);
+   rb_erase(>ref_node, >ref_tree);
+   RB_CLEAR_NODE(>ref_node);
if (!list_empty(>add_list))
list_del(>add_list);
ref->in_tree = 0;
@@ -229,24 +257,18 @@ static bool merge_ref(struct btrfs_trans_handle *trans,
  u64 seq)
 {
struct btrfs_delayed_ref_node *next;
+   struct rb_node *node = rb_next(>ref_node);
bool done = false;
 
-   next = list_first_entry(>ref_list, struct btrfs_delayed_ref_node,
-   list);
-   while (!done && >list != >ref_list) {
+   while (!done && node) {
int mod;
-   struct btrfs_delayed_ref_node *next2;
-
-   next2 = list_next_entry(next, list);
-
-   if (next == ref)
-   goto next;
 
+   next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
+   node = rb_next(node);
if (seq && next->seq >= seq)
-   goto next;
-
+   break;
if (comp_refs(ref, next, false))
-   goto next;
+   break;
 
if (ref->action == next->action) {
mod = next->ref_mod;
@@ -270,8 +292,6 @@ static bool merge_ref(struct btrfs_trans_handle *trans,
WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
}
-next:
-   next = next2;
}
 
return done;
@@ -283,11 +303,12 @@ void 

[PATCH 5/8] btrfs: add a comp_refs() helper

2017-10-19 Thread Josef Bacik
Instead of open-coding the delayed ref comparisons, add a helper to do
the comparisons generically and use that everywhere.  We compare
sequence numbers last for following patches.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/delayed-ref.c | 54 --
 1 file changed, 30 insertions(+), 24 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index bc940bb374cf..c4cfadb9768c 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -85,6 +85,34 @@ static int comp_data_refs(struct btrfs_delayed_data_ref 
*ref1,
return 0;
 }
 
+static int comp_refs(struct btrfs_delayed_ref_node *ref1,
+struct btrfs_delayed_ref_node *ref2,
+bool check_seq)
+{
+   int ret = 0;
+   if (ref1->type < ref2->type)
+   return -1;
+   if (ref1->type > ref2->type)
+   return 1;
+   if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
+   ref1->type == BTRFS_SHARED_BLOCK_REF_KEY)
+   ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1),
+btrfs_delayed_node_to_tree_ref(ref2));
+   else
+   ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1),
+btrfs_delayed_node_to_data_ref(ref2));
+   if (ret)
+   return ret;
+   if (check_seq) {
+   if (ref1->seq < ref2->seq)
+   return -1;
+   if (ref1->seq > ref2->seq)
+   return 1;
+   }
+   return 0;
+}
+
+
 /* insert a new ref to head ref rbtree */
 static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
   struct rb_node *node)
@@ -217,18 +245,7 @@ static bool merge_ref(struct btrfs_trans_handle *trans,
if (seq && next->seq >= seq)
goto next;
 
-   if (next->type != ref->type)
-   goto next;
-
-   if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
-ref->type == BTRFS_SHARED_BLOCK_REF_KEY) &&
-   comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref),
-  btrfs_delayed_node_to_tree_ref(next)))
-   goto next;
-   if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY ||
-ref->type == BTRFS_SHARED_DATA_REF_KEY) &&
-   comp_data_refs(btrfs_delayed_node_to_data_ref(ref),
-  btrfs_delayed_node_to_data_ref(next)))
+   if (comp_refs(ref, next, false))
goto next;
 
if (ref->action == next->action) {
@@ -402,18 +419,7 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle 
*trans,
exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node,
   list);
/* No need to compare bytenr nor is_head */
-   if (exist->type != ref->type || exist->seq != ref->seq)
-   goto add_tail;
-
-   if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY ||
-exist->type == BTRFS_SHARED_BLOCK_REF_KEY) &&
-   comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist),
-  btrfs_delayed_node_to_tree_ref(ref)))
-   goto add_tail;
-   if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY ||
-exist->type == BTRFS_SHARED_DATA_REF_KEY) &&
-   comp_data_refs(btrfs_delayed_node_to_data_ref(exist),
-  btrfs_delayed_node_to_data_ref(ref)))
+   if (comp_refs(exist, ref, true))
goto add_tail;
 
/* Now we are sure we can merge */
-- 
2.7.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 7/8] btrfs: don't call btrfs_start_delalloc_roots in flushoncommit

2017-10-19 Thread Josef Bacik
We're holding the sb_start_intwrite lock at this point, and doing async
filemap_flush of the inodes will result in a deadlock if we freeze the
fs during this operation.  This is because we could do a
btrfs_join_transaction() in the thread we are waiting on which would
block at sb_start_intwrite, and thus deadlock.  Using
writeback_inodes_sb() side steps the problem by not introducing all of
these extra locking dependencies.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/transaction.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 68c3e1c04bca..5a8c2649af2f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1916,8 +1916,17 @@ static void cleanup_transaction(struct 
btrfs_trans_handle *trans,
 
 static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
 {
+   /*
+* We use writeback_inodes_sb here because if we used
+* btrfs_start_delalloc_roots we would deadlock with fs freeze.
+* Currently we are holding the fs freeze lock, if we do an async flush
+* we'll do btrfs_join_transaction() and deadlock because we need to
+* wait for the fs freeze lock.  Using the direct flushing we benefit
+* from already being in a transaction and our join_transaction doesn't
+* have to re-take the fs freeze lock.
+*/
if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
-   return btrfs_start_delalloc_roots(fs_info, 1, -1);
+   writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
return 0;
 }
 
-- 
2.7.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/8] btrfs: add tracepoints for outstanding extents mods

2017-10-19 Thread Josef Bacik
This is handy for tracing problems with modifying the outstanding
extents counters.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/btrfs_inode.h   |  2 ++
 include/trace/events/btrfs.h | 21 +
 2 files changed, 23 insertions(+)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index e3ac29e72714..5ebeafc19936 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -274,6 +274,8 @@ static inline void btrfs_mod_outstanding_extents(struct 
btrfs_inode *inode,
inode->outstanding_extents += mod;
if (btrfs_is_free_space_inode(inode))
return;
+   trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode),
+ mod);
 }
 
 static inline void btrfs_mod_reserved_extents(struct btrfs_inode *inode,
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index bfe2f23b578c..567dcf2022bb 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -1695,6 +1695,27 @@ DEFINE_EVENT(btrfs__prelim_ref, btrfs_prelim_ref_insert,
TP_ARGS(fs_info, oldref, newref, tree_size)
 );
 
+TRACE_EVENT(btrfs_inode_mod_outstanding_extents,
+   TP_PROTO(struct btrfs_root *root, u64 ino, int mod),
+
+   TP_ARGS(root, ino, mod),
+
+   TP_STRUCT__entry_btrfs(
+   __field(u64, root_objectid  )
+   __field(u64, ino)
+   __field(int, mod)
+   ),
+
+   TP_fast_assign_btrfs(root->fs_info,
+   __entry->root_objectid  = root->objectid;
+   __entry->ino= ino;
+   __entry->mod= mod;
+   ),
+
+   TP_printk_btrfs("root = %llu(%s) ino = %llu mod = %d",
+   show_root_type(__entry->root_objectid),
+   (unsigned long long)__entry->ino, __entry->mod)
+);
 #endif /* _TRACE_BTRFS_H */
 
 /* This part must be outside protection */
-- 
2.7.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/8] Btrfs: rework outstanding_extents

2017-10-19 Thread Josef Bacik
Right now we jump through a lot of weird hoops around outstanding_extents in order
to keep the extent count consistent.  This is because we logically
transfer the outstanding_extent count from the initial reservation
through the set_delalloc_bits.  This makes it pretty difficult to get a
handle on how and when we need to mess with outstanding_extents.

Fix this by revamping the rules of how we deal with outstanding_extents.
Now instead everybody that is holding on to a delalloc extent is
required to increase the outstanding extents count for itself.  This
means we'll have something like this

btrfs_delalloc_reserve_metadata - outstanding_extents = 1
 btrfs_set_extent_delalloc  - outstanding_extents = 2
btrfs_delalloc_release_extents  - outstanding_extents = 1

for an initial file write.  Now take the append write where we extend an
existing delalloc range but still under the maximum extent size

btrfs_delalloc_reserve_metadata - outstanding_extents = 2
  btrfs_set_extent_delalloc
btrfs_set_bit_hook  - outstanding_extents = 3
btrfs_merge_extent_hook - outstanding_extents = 2
btrfs_delalloc_release_extents  - outstanding_extents = 1

In order to make the ordered extent transition we of course must now
make ordered extents carry their own outstanding_extent reservation, so
for cow_file_range we end up with

btrfs_add_ordered_extent- outstanding_extents = 2
clear_extent_bit- outstanding_extents = 1
btrfs_remove_ordered_extent - outstanding_extents = 0

This makes all manipulations of outstanding_extents much more explicit.
Every successful call to btrfs_delalloc_reserve_metadata _must_ now be
combined with btrfs_delalloc_release_extents, even in the error case, as
that is the only function that actually modifies the
outstanding_extents counter.

The drawback to this is now we are much more likely to have transient
cases where outstanding_extents is much larger than it actually should
be.  This could happen before as we manipulated the delalloc bits, but
now it happens basically at every write.  This may put more pressure on
the ENOSPC flushing code, but I think making this code simpler is worth
the cost.  I have another change coming to mitigate this side-effect
somewhat.

I also added trace points for the counter manipulation.  These were used
by a bpf script I wrote to help track down leak issues.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/btrfs_inode.h   |  18 ++
 fs/btrfs/ctree.h |   2 +
 fs/btrfs/extent-tree.c   | 139 ---
 fs/btrfs/file.c  |  22 +++
 fs/btrfs/inode-map.c |   3 +-
 fs/btrfs/inode.c | 114 +++
 fs/btrfs/ioctl.c |   2 +
 fs/btrfs/ordered-data.c  |  21 ++-
 fs/btrfs/relocation.c|   3 +
 fs/btrfs/tests/inode-tests.c |  18 ++
 10 files changed, 186 insertions(+), 156 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index eccadb5f62a5..e3ac29e72714 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -267,6 +267,24 @@ static inline bool btrfs_is_free_space_inode(struct 
btrfs_inode *inode)
return false;
 }
 
+static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
+int mod)
+{
+   lockdep_assert_held(>lock);
+   inode->outstanding_extents += mod;
+   if (btrfs_is_free_space_inode(inode))
+   return;
+}
+
+static inline void btrfs_mod_reserved_extents(struct btrfs_inode *inode,
+ int mod)
+{
+   lockdep_assert_held(>lock);
+   inode->reserved_extents += mod;
+   if (btrfs_is_free_space_inode(inode))
+   return;
+}
+
 static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
 {
int ret = 0;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7bda8429e93f..9d950c2dd53f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2747,6 +2747,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root 
*root,
 u64 *qgroup_reserved, bool use_global_rsv);
 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
  struct btrfs_block_rsv *rsv);
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
+
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes);
 int btrfs_delalloc_reserve_space(struct inode *inode,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4f874d02f310..aaa346562df6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5954,42 +5954,31 @@ void btrfs_subvolume_release_metadata(struct 
btrfs_fs_info *fs_info,
 }
 
 /**
- * drop_outstanding_extent - drop an outstanding extent
+ * 

[PATCH 0/8] Remaining queue

2017-10-19 Thread Josef Bacik
Here's the updated batch of the remaining queue of patches from me.  I've
addressed all of the outstanding review feedback for everything and they've been
pretty thoroughly tested.  Most of the changes are around changelogs and adding
comments, as well as switching to lockdep_assert_held from whatever crap I was
using before.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 01/21] Btrfs: rework outstanding_extents

2017-10-19 Thread Josef Bacik
On Fri, Oct 13, 2017 at 04:55:58PM +0300, Nikolay Borisov wrote:
> 
> > 
> > The outstanding_extents accounting is consistent with only the items needed 
> > to
> > handle the outstanding extent items.  However since changing the inode 
> > requires
> > updating the inode item as well we have to keep this floating reservation 
> > for
> > the inode item until we have 0 outstanding extents.  The way we do this is 
> > with
> > the BTRFS_INODE_DELALLOC_META_RESERVED flag.  So if it isn't set we will
> > allocate nr_extents + 1 in btrfs_delalloc_reserve_metadata() and then set 
> > our
> > bit.  If we ever steal this reservation we make sure to clear the flag so we
> > know we don't have to clean it up when outstanding_extents goes to 0.  It's 
> > not
> > super intuitive but needs to be done under the BTRFS_I(inode)->lock so this 
> > was
> > the best place to put it.  I suppose we could move the logic out of here 
> > and put
> > it somewhere else to make it more clear.
> 
> I think defining this logic in its own, discrete block of code would be
> best w.r.t readability. It's not super obvious.
> 

I went to do this and realized that I rip all of this out when we switch to
per-inode block rsvs, so I'm just going to leave this patch as is.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: Remove WARN_ON for unaligned device created before v4.13 and adds more user friendly output

2017-10-19 Thread David Sterba
On Sat, Sep 23, 2017 at 03:22:36PM +0800, Qu Wenruo wrote:
> >>> --- a/fs/btrfs/volumes.c
> >>> +++ b/fs/btrfs/volumes.c
> >>> @@ -6472,15 +6472,23 @@ static int read_one_chunk(struct btrfs_fs_info 
> >>> *fs_info, struct btrfs_key *key,
> >>>   return 0;
> >>>   }
> >>>   
> >>> -static void fill_device_from_item(struct extent_buffer *leaf,
> >>> -  struct btrfs_dev_item *dev_item,
> >>> -  struct btrfs_device *device)
> >>> +static void fill_device_from_item(struct btrfs_fs_info *fs_info,
> >>> +   struct extent_buffer *leaf,
> >>> +   struct btrfs_dev_item *dev_item,
> >>> +   struct btrfs_device *device)
> >>>   {
> >>>   unsigned long ptr;
> >>>   
> >>>   device->devid = btrfs_device_id(leaf, dev_item);
> >>>   device->disk_total_bytes = btrfs_device_total_bytes(leaf, 
> >>> dev_item);
> >>>   device->total_bytes = device->disk_total_bytes;
> >>> + if (!IS_ALIGNED(device->total_bytes, fs_info->sectorsize)) {
> >>> + btrfs_warn(fs_info,
> >>> +"devid %llu has unaligned total bytes %llu",
> >>> +device->devid, device->disk_total_bytes);
> >>> + btrfs_warn(fs_info,
> >>> +"please shrink the device a little and resize back 
> >>> to fix it");
> >>> + }
> >>
> >> How about telling users that device->total_bytes should be aligned
> >> to fs_info->sectorsize here?
> >>
> >> Thanks,
> > 
> > I should make my comment clearer, sorry.
> > 
> > ===
> > +   if (!IS_ALIGNED(device->total_bytes, fs_info->sectorsize)) {
> > +   btrfs_warn(fs_info,
> > +  "devid %llu: total bytes %llu should be aligned to 
> > %u bytes",
> > +  device->devid, device->disk_total_bytes, 
> > fs_info->sectorsize);
> > +   btrfs_warn(fs_info,
> > +  "please shrink the device a little and resize back 
> > to fix it");
> > +   }
> > ===
> 
> That's better.
> 
> But I'm also considering modifying the total_bytes directly here.

Yeah, I think it would be better to fix here, without a warning even.
The rounding error is below 4k and nodesize, we would never use this
space for block groups so no accidental data loss.

> So that any time DEV_ITEM and super block get updated, new aligned value 
> will be written back to disk, and since the value is aligned in memory, 
> it won't cause WARN_ON() any longer.
> 
> I'll test and check the code for confirmation before updating the patch.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


SLES 11 SP4: can't mount btrfs

2017-10-19 Thread Lentes, Bernd
Hi,

this is the continuation of a thread i started on a SLES forum 
(https://forums.suse.com/showthread.php?10109-lv-with-btrfs-corrupt-some-tips-please),
 but i think this is the more appropriate place.
I have a SLES 11 SP4 with a btrfs on top of a logical volume i can't mount 
anymore. The host was fenced in a two-node cluster, and the boot procedure 
can't mount the lv, and i reside in simple shell (i assume the one from initrd).

I have a second nearly identical node, so i can give you some information:

ha-idg-2:/etc/corosync # uname -a
Linux ha-idg-2 3.0.101-84-default #1 SMP Tue Oct 18 10:32:51 UTC 2016 (15251d6) 
x86_64 x86_64 x86_64 GNU/Linux

ha-idg-2:/etc/corosync # rpm -qa|grep -i btrfs
libbtrfs0-3.18.2-0.40.48
btrfsmaintenance-0.1-3.1
btrfsprogs-3.18.2-0.40.48

I try to follow the recommendations on 
https://btrfs.wiki.kernel.org/index.php/Problem_FAQ.

I booted the system now with knoppix 8.1, which has kernel 4.12.7 and 
btrfs-progs 4.7.3-1. Is that ok ?
I tried:

mount /dev/vg1/lv_root /lv_root -o recovery,ro
and got:
mount: wrong fs type, bad option, bad superblock on /dev/mapper/vg1-lv_root,
   missing codepage or helper program, or other error

   In some cases useful info is found in syslog - try
   dmesg | tail or so.


and got via dmesg:
[92518.955408] BTRFS info (device dm-0): disk space caching is enabled
[92518.990561] BTRFS error (device dm-0): parent transid verify failed on 
196314759168 wanted 793932 found 793496
[92518.990911] BTRFS error (device dm-0): parent transid verify failed on 
196314759168 wanted 793932 found 793496
[92518.990919] BTRFS error (device dm-0): failed to read block groups: -5
[92519.070084] BTRFS error (device dm-0): open_ctree failed

next step:

root@Microknoppix:~# btrfs device scan
Scanning for Btrfs filesystems
root@Microknoppix:~#

no result !?!

Now i changed to https://btrfs.wiki.kernel.org/index.php/Restore:

btrfs restore -smSvi /dev/vg1/lv_root 
/mnt/idg-2/SysAdmin_AG_Wurst/recover/ha-idg-1/ |tee 
/mnt/idg-2/SysAdmin_AG_Wurst/recover/ha-idg-1/recover.log

I just started it. I get lines like:
offset is 61440
offset is 98304
offset is 4096
offset is 143360
offset is 8192
offset is 184320

or

Error searching 
/mnt/idg-2/SysAdmin_AG_Wurst/recover/ha-idg-1/@/tmp/localhpsum/assets/doc/help/en
Error searching 
/mnt/idg-2/SysAdmin_AG_Wurst/recover/ha-idg-1/@/tmp/localhpsum/assets/doc/help/ja/images/callouts

What does that mean ?


Bernd



-- 
Bernd Lentes 

Systemadministration 
institute of developmental genetics 
Gebäude 35.34 - Raum 208 
HelmholtzZentrum München 
bernd.len...@helmholtz-muenchen.de 
phone: +49 (0)89 3187 1241 
fax: +49 (0)89 3187 2294 

no backup - no mercy
 

Helmholtz Zentrum Muenchen
Deutsches Forschungszentrum fuer Gesundheit und Umwelt (GmbH)
Ingolstaedter Landstr. 1
85764 Neuherberg
www.helmholtz-muenchen.de
Aufsichtsratsvorsitzende: MinDir'in Baerbel Brumme-Bothe
Geschaeftsfuehrer: Prof. Dr. Guenther Wess, Heinrich Bassler, Dr. Alfons Enhsen
Registergericht: Amtsgericht Muenchen HRB 6466
USt-IdNr: DE 129521671

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/3] btrfs: increase output size for LOGICAL_INO_V2 ioctl

2017-10-19 Thread David Sterba
Hi,

On Sat, Sep 23, 2017 at 11:06:42PM +0200, Hans van Kranenburg wrote:
> Reviewed-by: Hans van Kranenburg 
> Tested-by: Hans van Kranenburg 

the patches look good to me and the usecase and testing coverage seem
sufficient to take the patches to 4.15, though we're close to the
informal merging deadline.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents

2017-10-19 Thread David Sterba
On Fri, Sep 22, 2017 at 01:58:45PM -0400, Zygo Blaxell wrote:
> The LOGICAL_INO ioctl provides a backward mapping from extent bytenr and
> offset (encoded as a single logical address) to a list of extent refs.
> LOGICAL_INO complements TREE_SEARCH, which provides the forward mapping
> (extent ref -> extent bytenr and offset, or logical address).  These are
> useful capabilities for programs that manipulate extents and extent
> references from userspace (e.g. dedup and defrag utilities).
> 
> When the extents are uncompressed (and not encrypted and not other),
> check_extent_in_eb performs filtering of the extent refs to remove any
> extent refs which do not contain the same extent offset as the 'logical'
> parameter's extent offset.  This prevents LOGICAL_INO from returning
> references to more than a single block.
> 
> To find the set of extent references to an uncompressed extent from [a,
> b), userspace has to run a loop like this pseudocode:
> 
>   for (i = a; i < b; ++i)
>   extent_ref_set += LOGICAL_INO(i);
> 
> At each iteration of the loop (up to 32768 iterations for a 128M extent),
> data we are interested in is collected in the kernel, then deleted by
> the filter in check_extent_in_eb.
> 
> When the extents are compressed (or encrypted or other), the 'logical'
> parameter must be an extent bytenr (the 'a' parameter in the loop).
> No filtering by extent offset is done (or possible?) so the result is
> the complete set of extent refs for the entire extent.  This removes
> the need for the loop, since we get all the extent refs in one call.
> 
> Add an 'ignore_offset' argument to iterate_inodes_from_logical,
> [...several levels of function call graph...], and check_extent_in_eb, so
> that we can disable the extent offset filtering for uncompressed extents.
> This flag can be set by an improved version of the LOGICAL_INO ioctl to
> get either behavior as desired.
> 
> There is no functional change in this patch.  The new flag is always
> false.
> 
> Signed-off-by: Zygo Blaxell 

Reviewed-by: David Sterba 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs-progs: qgroup: show subvol path when qgroup show

2017-10-19 Thread David Sterba
On Wed, Oct 18, 2017 at 11:36:21AM +0800, Lu Fengqi wrote:
> >> @@ -1140,7 +1249,8 @@ static int __qgroups_search(int fd, struct 
> >> qgroup_lookup *qgroup_lookup)
> >>goto skip;
> >>add_qgroup(qgroup_lookup,
> >>   btrfs_search_header_offset(sh), 0,
> >> - 0, 0, 0, 0, 0, 0, 0, 0, 0, bq, bq1);
> >> + 0, 0, 0, 0, 0, 0, 0, 0, 0, bq, bq1,
> >> + NULL);
> >
> >Oh no, yet another argument, so it's 15 in total.
> 
> So many arguments are really hard to accept, I will rework this.

This could be done independently of this patch, but cleaning up this
area is highly desired.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[no subject]

2017-10-19 Thread Denis 'GNUtoo' Carikli
subscribe linux-btrfs
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 0/6] Btrfs: populate heuristic with code

2017-10-19 Thread David Sterba
On Fri, Sep 29, 2017 at 06:22:00PM +0200, David Sterba wrote:
> On Thu, Sep 28, 2017 at 05:33:35PM +0300, Timofey Titovets wrote:
> > Compile tested, hand tested on live system
> > 
> > Change v7 -> v8
> >   - All code moved to compression.c (again)
> >   - Heuristic workspaces implemented another way
> > i.e. only share logic with compression workspaces
> >   - Some style fixes suggested by David
> >   - Move sampling function from heuristic code
> > (I'm afraid of big functions)
> >   - Much more comments and explanations
> 
> Thanks for the update, I went through the patches and they looked good
> enough to be put into for-next. I may have more comments about a few
> things, but nothing serious that would hinder testing.

I did a final pass through the patches and edited comments where I was
not able to understand them. Please check the updated patches in [1] if
I did not accidentally change the meaning.

I'm about to add the patchset to the main patch pile for 4.15 soon.
Further tuning is possible and such patches will be probably accepted
during the 4.15 development cycle once the main parts have landed. It's
desirable to gather some testing results of heuristic effects on various
data types. So far I've been watching for performance drops only.

In case the heuristic would turn out to cause problems we can't fix
during 4.15 cycle, we can still disable it. This is only a last resort
measure but we need to be prepared.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Is it safe to use btrfs on top of different types of devices?

2017-10-19 Thread Austin S. Hemmelgarn

On 2017-10-19 10:42, Zoltan wrote:

On Thu, Oct 19, 2017 at 4:27 PM, Austin S. Hemmelgarn
 wrote:


and thus when the same device reappears (as it will when the disconnect was
due to a transient bus error, which happens a lot), it shows up as a
different device node, which gets scanned for filesystems by udev, and BTRFS
then gets really confused because it now sees 3 (or more) devices for a 2
device filesystem.


And what would happen with a regular, single-device BTRFS volume after
a reconnect? Isn't this issue just as bad for that case?
No, because the multi-device code only gets used if the filesystem 
claims to have more than one device, and it's a bug in the multi-device 
code that causes this problem.  From a data safety perspective, the 
disconnect will look like a power loss event if it was a single device 
filesystem, and BTRFS handles that situation fine (though you would 
probably need to remount the filesystem).


FWIW, the same bug causes similar data loss problems with block-level 
copies of BTRFS filesystems (if you then mount either the original or 
the copy while both are visible to the system), and allows you to screw 
up multi-device filesystems by connecting a storage device with a 
carefully crafted bogus BTRFS filesystem on it.  Overall though, it's 
not been seen as a high priority bug because:


1. Nobody has come up with a reliable method of handling it that doesn't 
break anything or require revising the on-disk layout.
2. It's easy to work around (don't do block level copies and ensure 
proper physical security of the system like you should be doing anyway).

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Is it safe to use btrfs on top of different types of devices?

2017-10-19 Thread Zoltan
On Thu, Oct 19, 2017 at 4:27 PM, Austin S. Hemmelgarn
 wrote:

> and thus when the same device reappears (as it will when the disconnect was
> due to a transient bus error, which happens a lot), it shows up as a
> different device node, which gets scanned for filesystems by udev, and BTRFS
> then gets really confused because it now sees 3 (or more) devices for a 2
> device filesystem.

And what would happen with a regular, single-device BTRFS volume after
a reconnect? Isn't this issue just as bad for that case?

Thanks,

Zoltan
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Is it safe to use btrfs on top of different types of devices?

2017-10-19 Thread Austin S. Hemmelgarn

On 2017-10-19 09:48, Zoltan wrote:

Hi,

On Thu, Oct 19, 2017 at 1:01 PM, Peter Grandi  
wrote:


What the OP was doing was using "unreliable" both for the case
where the device "lies" and the case where the device does not
"lie" but reports a failure. Both of these are malfunctions in a
wide sense:

   * The [block] device "lies" as to its status or what it has done.
   * The [block] device reports truthfully that an action has failed.


Thanks for making this point, it made me realize that I had different
assumption than what you use in your reasoning. I assumed that when
writes to a USB device fail due to a temporary disconnection, the
kernel can actually recognize that a write error happened. So are you
saying that a write error due to USB problems can go completely
unnoticed? That seems very strange to me; are USB drives really that
unreliable or is that some software limitation?

It depends on what type of write error happens.

If it's a case where the data gets corrupted on its way over the bus, 
or the device just drops the write, or you have a bogus storage device 
(this is actually a pretty big issue with flash drives and SD cards, 
check [1], and [2] for more info on this, and [3] for a tool you can use 
to check things), then it generally won't be detected by the kernel, but 
might be by the filesystem driver when it tries to read data.


However, it doesn't go completely undetected if the device disconnects 
(which is where the big issue with BTRFS comes in), the kernel will 
detect the disconnect, issue a bus reset (which will cause performance 
issues with other USB devices on the same controller), and generally 
recover.  However, the disappearance of the device doesn't get 
propagated up to the filesystem correctly, and that is what causes the 
biggest issue with BTRFS.  Because BTRFS just knows writes are suddenly 
failing for some reason, it doesn't try to release the device so that 
things get properly cleaned up in the kernel, and thus when the same 
device reappears (as it will when the disconnect was due to a transient 
bus error, which happens a lot), it shows up as a different device node, 
which gets scanned for filesystems by udev, and BTRFS then gets really 
confused because it now sees 3 (or more) devices for a 2 device 
filesystem.  That final resultant state is what's so dangerous about 
using USB devices with BTRFS right now, as it's pretty much guaranteed 
to result in data corruption.



[1] https://fightflashfraud.wordpress.com/
[2] https://sosfakeflash.wordpress.com/
[3] http://oss.digirati.com.br/f3/
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 0/3]: settable compression level for zstd

2017-10-19 Thread David Sterba
On Fri, Sep 15, 2017 at 05:34:57PM +0200, Adam Borowski wrote:
> Hi!
> Here's a patch set that allows changing the compression level for zstd,
> currently at mount time only.  I've played with it for a month, so despite
> being a quick hack, it's reasonably well tested.  Tested on 4.13 +
> btrfs-for-4.14 only, though -- I've booted 11th-day-of-merge-window only an
> hour ago on one machine, no explosions yet.
> 
> As a quick hack, it doesn't conserve memory as it should: all workspace
> allocations assume level 15 and waste space otherwise.
> 
> Because of an (easily changeable) quirk of compression level encoding, the
> max is set at 15, but I guess higher levels are pointless for 128KB blocks. 
> Nick and co can tell us more -- for me zstd is mostly a black box so it's
> you who knows anything about tuning it.
> 
> There are three patches:
> * [David Sterba] btrfs: allow to set compression level for zlib
>   Unmodified version of the patch from Jul 24, I'm re-sending it for
>   convenience.
> * btrfs: allow setting zlib compression level via :9
>   Some bikeshedding: it looks like Chris Mason also favours zlib:9 over
>   zlib9 as the former is more readable.  If you disagree... well, it's up
>   to you to decide anyway.  This patch accepts both syntaxes.

FYI, I'm going to add the patches 1 and 2 to 4.15 pull, ie. there will
be support for zlib levels.

Final syntax of the level specification is with ":". I've updated the
patches, dropping the "zlib9" way, plus some more changelog updates.

> * btrfs: allow setting zstd level

Before the zstd levels are supported, we'll have to update the workspace
allocation.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Is it safe to use btrfs on top of different types of devices?

2017-10-19 Thread Zoltan
Hi,

On Thu, Oct 19, 2017 at 1:01 PM, Peter Grandi  
wrote:

> What the OP was doing was using "unreliable" both for the case
> where the device "lies" and the case where the device does not
> "lie" but reports a failure. Both of these are malfunctions in a
> wide sense:
>
>   * The [block] device "lies" as to its status or what it has done.
>   * The [block] device reports truthfully that an action has failed.

Thanks for making this point, it made me realize that I had a different
assumption than the one you use in your reasoning. I assumed that when
writes to a USB device fail due to a temporary disconnection, the
kernel can actually recognize that a write error happened. So are you
saying that a write error due to USB problems can go completely
unnoticed? That seems very strange to me; are USB drives really that
unreliable or is that some software limitation?

Thanks,

Zoltan
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 2/2] btrfs: Fix transaction abort during failure in btrfs_rm_dev_item

2017-10-19 Thread Nikolay Borisov


On 19.10.2017 14:54, David Sterba wrote:
> On Thu, Sep 28, 2017 at 11:45:27AM +0300, Nikolay Borisov wrote:
>> btrfs_rm_dev_item calls several functions under an active transaction, however
>> it fails to abort it if an error happens. Fix this by adding explicit
>> btrfs_abort_transaction/btrfs_end_transaction calls
>>
>> Signed-off-by: Nikolay Borisov 
>> ---
>>
>> v2: 
>>  * Explicitly handle every failure case w.r.t transaction abort rather than 
>>  rely on final btrfs_commit_transaction() to do the right thing. 
>>
>>  * Also consider the -ENOENT case from btrfs_search_slot as a failure.
>>
>>  fs/btrfs/volumes.c | 15 ---
>>  1 file changed, 12 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index 0e8f16c305df..4709c7919ef2 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -1765,20 +1765,29 @@ static int btrfs_rm_dev_item(struct btrfs_fs_info 
>> *fs_info,
>>  key.offset = device->devid;
>>  
>>  ret = btrfs_search_slot(trans, root, , path, -1, 1);
>> -if (ret < 0)
>> +if (ret < 0) {
>> +btrfs_abort_transaction(trans, ret);
>> +btrfs_end_transaction(trans);
>>  goto out;
>> +}
>>  
>>  if (ret > 0) {
>>  ret = -ENOENT;
>> +btrfs_abort_transaction(trans, ret);
>> +btrfs_end_transaction(trans);
>>  goto out;
>>  }
>>  
>>  ret = btrfs_del_item(trans, root, path);
>> -if (ret)
>> +if (ret) {
>> +btrfs_abort_transaction(trans, ret);
>> +btrfs_end_transaction(trans);
>>  goto out;
>> +}
>> +
>> +ret = btrfs_commit_transaction(trans);
>>  out:
>>  btrfs_free_path(path);
>> -btrfs_commit_transaction(trans);
>>  return ret;
> 
> This is wrong and I don't know why. I've painfully bisected to this
> commit that causes a lockup of test btrfs/101. I'm going to remove it from
> misc-next.


Just had a call with Jeff and he suggested it might be due to the path
holding locks. Looking around the code it seems that indeed path is
first being freed every time before a transaction is committed.

> 
> [ 3845.295346] run fstests btrfs/101 at 2017-10-19 02:51:17
> [ 3864.988027] BTRFS: device fsid 9c721da4-271d-4499-8c1d-77ccb5e611a3 devid 
> 1 transid 5 /dev/sda5
> [ 3864.997284] BTRFS: device fsid 9c721da4-271d-4499-8c1d-77ccb5e611a3 devid 
> 2 transid 5 /dev/sdb7
> [ 3865.007986] BTRFS: device fsid 9c721da4-271d-4499-8c1d-77ccb5e611a3 devid 
> 3 transid 5 /dev/mapper/error-test
> [ 3865.040554] BTRFS info (device dm-0): disk space caching is enabled
> [ 3865.040559] BTRFS info (device dm-0): has skinny extents
> [ 3865.040563] BTRFS info (device dm-0): flagging fs with big metadata feature
> [ 3865.053175] BTRFS info (device dm-0): creating UUID tree
> [ 3910.214683] BTRFS info (device dm-0): relocating block group 29360128 
> flags metadata|raid1
> [ 3910.252405] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: 
> wr 1, rd 0, flush 0, corrupt 0, gen 0
> [ 3910.263339] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: 
> wr 2, rd 0, flush 0, corrupt 0, gen 0
> [ 3910.273859] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: 
> wr 3, rd 0, flush 0, corrupt 0, gen 0
> [ 3910.274053] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: 
> wr 4, rd 0, flush 0, corrupt 0, gen 0
> [ 3910.274178] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: 
> wr 5, rd 0, flush 0, corrupt 0, gen 0
> [ 3910.326212] BTRFS warning (device dm-0): lost page write due to IO error 
> on /dev/mapper/error-test
> [ 3910.335684] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: 
> wr 6, rd 0, flush 0, corrupt 0, gen 0
> [ 3910.346135] BTRFS warning (device dm-0): lost page write due to IO error 
> on /dev/mapper/error-test
> [ 3910.355378] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: 
> wr 7, rd 0, flush 0, corrupt 0, gen 0
> [ 3912.082763] BTRFS warning (device dm-0): lost page write due to IO error 
> on /dev/mapper/error-test
> [ 3912.092300] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: 
> wr 8, rd 0, flush 0, corrupt 0, gen 0
> [ 3912.102690] BTRFS warning (device dm-0): lost page write due to IO error 
> on /dev/mapper/error-test
> [ 3912.112326] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: 
> wr 9, rd 0, flush 0, corrupt 0, gen 0
> [ 3913.778330] BTRFS warning (device dm-0): lost page write due to IO error 
> on /dev/mapper/error-test
> [ 3913.787814] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: 
> wr 10, rd 0, flush 0, corrupt 0, gen 0
> [ 3913.798292] BTRFS warning (device dm-0): lost page write due to IO error 
> on /dev/mapper/error-test
> [ 3913.990997] BTRFS info (device dm-0): found 1675 extents
> [ 3914.048231] BTRFS warning (device dm-0): lost page write due to IO error 
> on /dev/mapper/error-test
> [ 3914.059503] BTRFS warning (device 

Re: Is it safe to use btrfs on top of different types of devices?

2017-10-19 Thread Austin S. Hemmelgarn

On 2017-10-19 07:01, Peter Grandi wrote:

[ ... ]


Oh please, please a bit less silliness would be welcome here.
In a previous comment on this tedious thread I had written:



If the block device abstraction layer and lower layers work
correctly, Btrfs does not have problems of that sort when
adding new devices; conversely if the block device layer and
lower layers do not work correctly, no mainline Linux
filesystem I know can cope with that.



Note: "work correctly" does not mean "work error-free".


The last line is very important and I added it advisedly.



Even looking at things that way though, Zoltan's assessment
that reliability is essentially a measure of error rate is
correct.


It is instead based on a grave confusion between two very
different kinds of "error rate", confusion also partially based
on the ridiculous misunderstanding, which I have already pointed
out, that UNIX filesystems run on top of SATA or USB devices:


Internal SATA devices absolutely can randomly drop off the bus
just like many USB storage devices do,


Filesystems run on top of *block devices* with a definite
interface and a definite state machine, and filesystems in
general assume that the block device works *correctly*.
They do run on top of USB or SATA devices, otherwise a significant 
majority of systems running Linux and/or BSD should not be operating 
right now.  Yes, they don't directly access them, but the block layer 
isn't much more than command translation, scheduling, and accounting, so 
this distinction is meaningless and largely irrelevant.  It's also 
pretty standard practice among most sane sysadmins who aren't trying to 
be jerks, as well as most kernel developers I've met, to refer to a 
block device connected via interface 'X' as an 'X device' or an 'X 
storage device'.



but it almost never happens (it's a statistical impossibility
if there are no hardware or firmware issues), so they are more
reliable in that respect.


What the OP was doing was using "unreliable" both for the case
where the device "lies" and the case where the device does not
"lie" but reports a failure. Both of these are malfunctions in a
wide sense:

   * The [block] device "lies" as to its status or what it has done.
   * The [block] device reports truthfully that an action has failed.

But they are of very different nature and need completely
different handling. Hint: one is an extensional property and the
other is a modal one, there is a huge difference between "this
data is wrong" and "I know that this data is wrong".

The really important "detail" is that filesystems are, as a rule
with very few exceptions, designed to work only if the block
device layer (and those below it) does not "lie" (see "Byzantine
failures" below), that is "works correctly": reports the failure
of every operation that fails and the success of every operation
that succeeds and never gets into an unexpected state.

In particular filesystem designs are nearly always based on the
assumption that there are no undetected errors at the block
device level or below. Then the expected *frequency* of detected
errors influences how much redundancy and what kind of recovery
are desirable, but the frequency of "lies" is assumed to be
zero.

The one case where Btrfs does not assume that the storage layer
works *correctly* is checksumming: it is quite expensive and
makes sense only if the block device is expected to (sometimes)
"lie" about having written the data correctly or having read it
correctly. The role of the checksum is to spot when a block
device "lies" and turn an undetected read error into a detected
one (they could be used also to detect correct writes that are
misreported as having failed).

The crucial difference that exists between SATA and USB is not
that USB chips have higher rates of detected failures (even if
they often do), but that in my experience SATA interfaces from
reputable suppliers don't "lie" (more realistically have
negligible "lie" rates), and USB interfaces (both host bus
adapters and IO bus bridges) "lie" both systematically and
statistically with non negligible rates, and anyhow the USB mass
storage protocol is not very good at error reporting and
handling.
You do realize you just said exactly what I was saying, just in a more 
general and much more verbose manner which involved explaining things 
that are either well known and documented or aren't even entirely 
relevant to the thread in question?


For an end user, it generally doesn't matter whether a given layer 
reported the error or passed it on (or generated it), it matters whether 
it was corrected or not.  If the subset of the storage stack below 
whatever layer is being discussed (in this case the filesystem) causes 
errors at a rate deemed unacceptable for the given application that it 
does not correct, it's unreliable, regardless of whether or not they get 
corrected at this layer or a higher layer.  Even if you're running BTRFS 
on top of it, a SATA connected 

Re: 4.13: "error in btrfs_run_delayed_refs:3009: errno=-28 No space left" with 1.3TB unallocated / 737G free?

2017-10-19 Thread Martin Raiber
On 19.10.2017 10:16 Vladimir Panteleev wrote:
> On Tue, 17 Oct 2017 16:21:04 -0700, Duncan wrote:
>> * try the balance on 4.14-rc5+, where the known bug should be fixed
>
> Thanks! However, I'm getting the same error on
> 4.14.0-rc5-g9aa0d2dde6eb. The stack trace is different, though:
>
> Aside from rebuilding the filesystem, what are my options? Should I
> try to temporarily add a file from another volume as a device and
> retry the balance? If so, what would be a good size for the temporary
> device?
>
Hi,

for me a work-around for something like this has been to reduce the
amount of dirty memory via e.g.

sysctl vm.dirty_background_bytes=$((100*1024*1024))
sysctl vm.dirty_bytes=$((400*1024*1024))

this reduces performance, however. You could also mount with
"enospc_debug" to give the devs more information about this issue.
I am having more ENOSPC issues with 4.9.x than with the latest 4.14.

Regards,
Martin

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 2/2] btrfs: Fix transaction abort during failure in btrfs_rm_dev_item

2017-10-19 Thread David Sterba
On Thu, Sep 28, 2017 at 11:45:27AM +0300, Nikolay Borisov wrote:
> btrfs_rm_dev_item calls several functions under an active transaction, however
> it fails to abort it if an error happens. Fix this by adding explicit
> btrfs_abort_transaction/btrfs_end_transaction calls
> 
> Signed-off-by: Nikolay Borisov 
> ---
> 
> v2: 
>  * Explicitly handle every failure case w.r.t transaction abort rather than 
>  rely on final btrfs_commit_transaction() to do the right thing. 
> 
>  * Also consider the -ENOENT case from btrfs_search_slot as a failure.
> 
>  fs/btrfs/volumes.c | 15 ---
>  1 file changed, 12 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 0e8f16c305df..4709c7919ef2 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -1765,20 +1765,29 @@ static int btrfs_rm_dev_item(struct btrfs_fs_info 
> *fs_info,
>   key.offset = device->devid;
>  
>   ret = btrfs_search_slot(trans, root, , path, -1, 1);
> - if (ret < 0)
> + if (ret < 0) {
> + btrfs_abort_transaction(trans, ret);
> + btrfs_end_transaction(trans);
>   goto out;
> + }
>  
>   if (ret > 0) {
>   ret = -ENOENT;
> + btrfs_abort_transaction(trans, ret);
> + btrfs_end_transaction(trans);
>   goto out;
>   }
>  
>   ret = btrfs_del_item(trans, root, path);
> - if (ret)
> + if (ret) {
> + btrfs_abort_transaction(trans, ret);
> + btrfs_end_transaction(trans);
>   goto out;
> + }
> +
> + ret = btrfs_commit_transaction(trans);
>  out:
>   btrfs_free_path(path);
> - btrfs_commit_transaction(trans);
>   return ret;

This is wrong and I don't know why. I've painfully bisected to this
commit that causes a lockup of test btrfs/101. I'm going to remove it from
misc-next.

[ 3845.295346] run fstests btrfs/101 at 2017-10-19 02:51:17
[ 3864.988027] BTRFS: device fsid 9c721da4-271d-4499-8c1d-77ccb5e611a3 devid 1 
transid 5 /dev/sda5
[ 3864.997284] BTRFS: device fsid 9c721da4-271d-4499-8c1d-77ccb5e611a3 devid 2 
transid 5 /dev/sdb7
[ 3865.007986] BTRFS: device fsid 9c721da4-271d-4499-8c1d-77ccb5e611a3 devid 3 
transid 5 /dev/mapper/error-test
[ 3865.040554] BTRFS info (device dm-0): disk space caching is enabled
[ 3865.040559] BTRFS info (device dm-0): has skinny extents
[ 3865.040563] BTRFS info (device dm-0): flagging fs with big metadata feature
[ 3865.053175] BTRFS info (device dm-0): creating UUID tree
[ 3910.214683] BTRFS info (device dm-0): relocating block group 29360128 flags 
metadata|raid1
[ 3910.252405] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: wr 
1, rd 0, flush 0, corrupt 0, gen 0
[ 3910.263339] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: wr 
2, rd 0, flush 0, corrupt 0, gen 0
[ 3910.273859] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: wr 
3, rd 0, flush 0, corrupt 0, gen 0
[ 3910.274053] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: wr 
4, rd 0, flush 0, corrupt 0, gen 0
[ 3910.274178] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: wr 
5, rd 0, flush 0, corrupt 0, gen 0
[ 3910.326212] BTRFS warning (device dm-0): lost page write due to IO error on 
/dev/mapper/error-test
[ 3910.335684] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: wr 
6, rd 0, flush 0, corrupt 0, gen 0
[ 3910.346135] BTRFS warning (device dm-0): lost page write due to IO error on 
/dev/mapper/error-test
[ 3910.355378] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: wr 
7, rd 0, flush 0, corrupt 0, gen 0
[ 3912.082763] BTRFS warning (device dm-0): lost page write due to IO error on 
/dev/mapper/error-test
[ 3912.092300] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: wr 
8, rd 0, flush 0, corrupt 0, gen 0
[ 3912.102690] BTRFS warning (device dm-0): lost page write due to IO error on 
/dev/mapper/error-test
[ 3912.112326] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: wr 
9, rd 0, flush 0, corrupt 0, gen 0
[ 3913.778330] BTRFS warning (device dm-0): lost page write due to IO error on 
/dev/mapper/error-test
[ 3913.787814] BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: wr 
10, rd 0, flush 0, corrupt 0, gen 0
[ 3913.798292] BTRFS warning (device dm-0): lost page write due to IO error on 
/dev/mapper/error-test
[ 3913.990997] BTRFS info (device dm-0): found 1675 extents
[ 3914.048231] BTRFS warning (device dm-0): lost page write due to IO error on 
/dev/mapper/error-test
[ 3914.059503] BTRFS warning (device dm-0): lost page write due to IO error on 
/dev/mapper/error-test
[ 3914.191222] BTRFS warning (device dm-0): lost page write due to IO error on 
/dev/mapper/error-test
[ 3914.200609] BTRFS warning (device dm-0): lost page write due to IO error on 
/dev/mapper/error-test
[ 3914.303028] BTRFS info (device dm-0): relocating block group 20971520 flags 
system|raid1
[ 3914.599728] 

Re: Is it safe to use btrfs on top of different types of devices?

2017-10-19 Thread Peter Grandi
[ ... ]

>> Oh please, please a bit less silliness would be welcome here.
>> In a previous comment on this tedious thread I had written:

>> > If the block device abstraction layer and lower layers work
>> > correctly, Btrfs does not have problems of that sort when
>> > adding new devices; conversely if the block device layer and
>> > lower layers do not work correctly, no mainline Linux
>> > filesystem I know can cope with that.
>> 
>> > Note: "work correctly" does not mean "work error-free".
>> 
>> The last line is very important and I added it advisedly.

> Even looking at things that way though, Zoltan's assessment
> that reliability is essentially a measure of error rate is
> correct.

It is instead based on a grave confusion between two very
different kinds of "error rate", confusion also partially based
on the ridiculous misunderstanding, which I have already pointed
out, that UNIX filesystems run on top of SATA or USB devices:

> Internal SATA devices absolutely can randomly drop off the bus
> just like many USB storage devices do,

Filesystems run on top of *block devices* with a definite
interface and a definite state machine, and filesystems in
general assume that the block device works *correctly*.

> but it almost never happens (it's a statistical impossibility
> if there are no hardware or firmware issues), so they are more
> reliable in that respect.

What the OP was doing was using "unreliable" both for the case
where the device "lies" and the case where the device does not
"lie" but reports a failure. Both of these are malfunctions in a
wide sense:

  * The [block] device "lies" as to its status or what it has done.
  * The [block] device reports truthfully that an action has failed.

But they are of very different nature and need completely
different handling. Hint: one is an extensional property and the
other is a modal one, there is a huge difference between "this
data is wrong" and "I know that this data is wrong".

The really important "detail" is that filesystems are, as a rule
with very few exceptions, designed to work only if the block
device layer (and those below it) does not "lie" (see "Byzantine
failures" below), that is "works correctly": reports the failure
of every operation that fails and the success of every operation
that succeeds and never gets into an unexpected state.

In particular filesystem designs are nearly always based on the
assumption that there are no undetected errors at the block
device level or below. Then the expected *frequency* of detected
errors influences how much redundancy and what kind of recovery
are desirable, but the frequency of "lies" is assumed to be
zero.

The one case where Btrfs does not assume that the storage layer
works *correctly* is checksumming: it is quite expensive and
makes sense only if the block device is expected to (sometimes)
"lie" about having written the data correctly or having read it
correctly. The role of the checksum is to spot when a block
device "lies" and turn an undetected read error into a detected
one (they could be used also to detect correct writes that are
misreported as having failed).

The crucial difference that exists between SATA and USB is not
that USB chips have higher rates of detected failures (even if
they often do), but that in my experience SATA interfaces from
reputable suppliers don't "lie" (more realistically have
negligible "lie" rates), and USB interfaces (both host bus
adapters and IO bus bridges) "lie" both systematically and
statistically with non negligible rates, and anyhow the USB mass
storage protocol is not very good at error reporting and
handling.

>> The "working incorrectly" general case is the so-called
>> "Byzantine generals problem" [ ... ]

This is compsci for beginners and someone dealing with storage
issues (and not just) should be intimately familiar with the
implications:

  https://en.wikipedia.org/wiki/Byzantine_fault_tolerance

  Byzantine failures are considered the most general and most
  difficult class of failures among the failure modes. The
  so-called fail-stop failure mode occupies the simplest end of
  the spectrum. Whereas fail-stop failure model simply means
  that the only way to fail is a node crash, detected by other
  nodes, Byzantine failures imply no restrictions, which means
  that the failed node can generate arbitrary data, pretending
  to be a correct one, which makes fault tolerance difficult.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs-progs: change mans to describe the third copy of superblock

2017-10-19 Thread Satoru Takeuchi
At Thu, 19 Oct 2017 17:05:18 +0800,
Qu Wenruo wrote:
> 
> 
> 
> On 2017年10月19日 16:34, Misono, Tomohiro wrote:
> > On 2017/10/19 16:45, Satoru Takeuchi wrote:
> >> Some tools can select which superblock these commands use by "-s 
> >> "
> >> option. Although this option says the valid values are 0-2, we can set 3
> >> if filesystem is very large.
> >>
> > 
> > Hello, 
> > Wiki says there are 4 superblocks. However in the implementation 
> > BTRFS_SUPER_MIRROR_MAX
> > is 3 and 0 indicates the block at 64K (disk-io.h of btrfs-progs), therefore 
> > I think
> > there is no 4th superblock actually.
> 
> Kernel implementation also shows that it will only update up to 3
> superblocks:
> 
> ---
>   if (max_mirrors == 0)
>   max_mirrors = BTRFS_SUPER_MIRROR_MAX;
> 
>   for (i = 0; i < max_mirrors; i++) {
>   bytenr = btrfs_sb_offset(i);
>   if (bytenr + BTRFS_SUPER_INFO_SIZE >=
>   device->commit_total_bytes)
>   break;
> ---
> 
> And BTRFS_SUPER_MIRROR_MAX is 3:
> ---
> #define BTRFS_SUPER_MIRROR_MAX 3
> ---
> 
> So even though you can set any value and btrfs_sb_offset() can calculate the
> super block offset, you will just read out some garbage.

My fault, sorry. I should have read the source more carefully. And thank you
both for letting me know about my mistake.

Thanks,
Satoru

> 
> Thanks,
> Qu
> > 
> > Regards,
> > Tomohiro
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> > the body of a message to majord...@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs-progs: change mans to describe the third copy of superblock

2017-10-19 Thread Qu Wenruo


On 2017年10月19日 16:34, Misono, Tomohiro wrote:
> On 2017/10/19 16:45, Satoru Takeuchi wrote:
>> Some tools can select which superblock these commands use by "-s 
>> "
>> option. Although this option says the valid values are 0-2, we can set 3
>> if filesystem is very large.
>>
> 
> Hello, 
> Wiki says there are 4 superblocks. However in the implementation 
> BTRFS_SUPER_MIRROR_MAX
> is 3 and 0 indicates the block at 64K (disk-io.h of btrfs-progs), therefore I 
> think
> there is no 4th superblock actually.

Kernel implementation also shows that it will only update up to 3
superblocks:

---
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;

for (i = 0; i < max_mirrors; i++) {
bytenr = btrfs_sb_offset(i);
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
device->commit_total_bytes)
break;
---

And BTRFS_SUPER_MIRROR_MAX is 3:
---
#define BTRFS_SUPER_MIRROR_MAX   3
---

So even though you can set any value and btrfs_sb_offset() can calculate the
super block offset, you will just read out some garbage.

Thanks,
Qu
> 
> Regards,
> Tomohiro
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: check: "warning line 4144"

2017-10-19 Thread Qu Wenruo


On 2017年10月19日 16:53, Tom Hale wrote:
> In running btrfs check, I got the following message:
> 
> warning line 4144
> 
> Could this be a little more descriptive?
> 
> * Does it mean I should rebuild my FS from scratch?
> * Is there anything I can do to remove this warning?
> 
> Complete output below:
> 
> ==
> $ sudo btrfs check --repair -p /dev/mapper/fix-backup

--repair is dangerous; don't use it unless you're sure the problem can be
fixed by it.

What's the output of "btrfs check" and "btrfs check --mode=lowmem" after
doing the repair?

Thanks,
Qu

> enabling repair mode
> Checking filesystem on /dev/mapper/fix-backup
> UUID: 0f5b7713-929d-41e7-b214-32500b5c77fc
> ref mismatch on [195215851520 16384] extent item 0, found 1
> Backref 195215851520 parent 1463 root 1463 not found in extent tree
> backpointer mismatch on [195215851520 16384]
> owner ref check failed [195215851520 16384]
> repair deleting extent record: key 195215851520 169 1
> adding new tree backref on start 195215851520 len 16384 parent 0 root 1463
> Repaired extent references for 195215851520
> 
> Fixed 0 roots.
> cache and super generation don't match, space cache will be invalidated
> warning line 4144 [o]
> 
> checking csums
> checking root refs
> found 294913892352 bytes used, no error found
> total csum bytes: 282955440
> total tree bytes: 5166727168
> total fs tree bytes: 4489232384
> total extent tree bytes: 353075200
> btree space waste bytes: 886405127
> file data blocks allocated: 7930446508032
>  referenced 1126675288064
> ==
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


check: "warning line 4144"

2017-10-19 Thread Tom Hale
In running btrfs check, I got the following message:

warning line 4144

Could this be a little more descriptive?

* Does it mean I should rebuild my FS from scratch?
* Is there anything I can do to remove this warning?

Complete output below:

==
$ sudo btrfs check --repair -p /dev/mapper/fix-backup
enabling repair mode
Checking filesystem on /dev/mapper/fix-backup
UUID: 0f5b7713-929d-41e7-b214-32500b5c77fc
ref mismatch on [195215851520 16384] extent item 0, found 1
Backref 195215851520 parent 1463 root 1463 not found in extent tree
backpointer mismatch on [195215851520 16384]
owner ref check failed [195215851520 16384]
repair deleting extent record: key 195215851520 169 1
adding new tree backref on start 195215851520 len 16384 parent 0 root 1463
Repaired extent references for 195215851520

Fixed 0 roots.
cache and super generation don't match, space cache will be invalidated
warning line 4144 [o]

checking csums
checking root refs
found 294913892352 bytes used, no error found
total csum bytes: 282955440
total tree bytes: 5166727168
total fs tree bytes: 4489232384
total extent tree bytes: 353075200
btree space waste bytes: 886405127
file data blocks allocated: 7930446508032
 referenced 1126675288064
==

-- 
Tom Hale
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs-progs: change mans to describe the third copy of superblock

2017-10-19 Thread Misono, Tomohiro
On 2017/10/19 16:45, Satoru Takeuchi wrote:
> Some tools can select which superblock these commands use by "-s "
> option. Although this option says the valid values are 0-2, we can set 3
> if filesystem is very large.
> 

Hello, 
The wiki says there are 4 superblocks. However, in the implementation
BTRFS_SUPER_MIRROR_MAX is 3 and 0 indicates the block at 64K (disk-io.h of
btrfs-progs), therefore I think there is actually no 4th superblock.

Regards,
Tomohiro

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 4.13: "error in btrfs_run_delayed_refs:3009: errno=-28 No space left" with 1.3TB unallocated / 737G free?

2017-10-19 Thread Vladimir Panteleev

On Tue, 17 Oct 2017 16:21:04 -0700, Duncan wrote:

* try the balance on 4.14-rc5+, where the known bug should be fixed


Thanks! However, I'm getting the same error on 4.14.0-rc5-g9aa0d2dde6eb. 
The stack trace is different, though:


[25886.024757] BTRFS: Transaction aborted (error -28)
[25886.024793] ------------[ cut here ]------------
[25886.024807] WARNING: CPU: 3 PID: 1904 at fs/btrfs/extent-tree.c:7062 
__btrfs_free_extent.isra.24+0xc23/0xda0 [btrfs]
[25886.024808] Modules linked in: ctr fuse xt_nat vhost_net vhost tap 
xt_CHECKSUM iptable_mangle xt_conntrack ipt_REJECT nf_reject_ipv4 
xt_tcpudp ebtable_filter ebtables ip6table_filter ip6_tables 
iptable_filter devlink tun nls_utf8 cifs ccm dns_resolver fscache uinput 
it87 hwmon_vid ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat 
nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack 
libcrc32c crc32c_generic sit tunnel4 ip_tunnel snd_hda_codec_hdmi 8021q 
mrp snd_hda_codec_realtek snd_hda_codec_generic iTCO_wdt 
iTCO_vendor_support nls_iso8859_1 nls_cp437 mxm_wmi vfat fat 
nvidia_drm(PO) intel_rapl nvidia_modeset(PO) x86_pkg_temp_thermal 
intel_powerclamp nvidia(PO) coretemp kvm_intel kvm irqbypass 
crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc arc4 aesni_intel 
aes_x86_64 ath9k crypto_simd
[25886.024830]  glue_helper cryptd ath9k_common ath9k_hw intel_cstate 
ath3k ath intel_rapl_perf btusb snd_hda_intel btrtl btbcm pl2303 btintel 
drm_kms_helper uvcvideo snd_usb_audio snd_hda_codec videobuf2_vmalloc 
snd_usbmidi_lib mac80211 bluetooth videobuf2_memops snd_rawmidi 
videobuf2_v4l2 ecdh_generic usbserial crc16 i2c_i801 lpc_ich cdc_acm 
snd_seq_device snd_hda_core videobuf2_core drm e1000e cfg80211 snd_hwdep 
snd_pcm syscopyarea r8169 sysfillrect snd_timer videodev sysimgblt mii 
rfkill mei_me ptp fb_sys_fops snd mousedev input_leds joydev evdev 
ioatdma led_class mac_hid media soundcore mei pps_core dca shpchp wmi 
bridge tpm_infineon tpm_tis tpm_tis_core stp llc tpm button sch_fq_codel 
sg ip_tables x_tables sr_mod cdrom btrfs xor zstd_decompress 
zstd_compress xxhash raid6_pq sd_mod hid_generic
[25886.024855]  hid_dr ff_memless usbhid hid crc32c_intel isci xhci_pci 
ahci libsas ehci_pci xhci_hcd scsi_transport_sas libahci ehci_hcd 
usbcore libata usb_common scsi_mod serio
[25886.024863] CPU: 3 PID: 1904 Comm: btrfs-transacti Tainted: P 
  O4.14.0-rc5-g9aa0d2dde6eb #2
[25886.024864] Hardware name: Gigabyte Technology Co., Ltd. To be filled 
by O.E.M./X79S-UP5, BIOS F5f 03/19/2014

[25886.024865] task: 880eb8f1d880 task.stack: c9000c81c000
[25886.024871] RIP: 0010:__btrfs_free_extent.isra.24+0xc23/0xda0 [btrfs]
[25886.024871] RSP: 0018:c9000c81fc28 EFLAGS: 00010282
[25886.024873] RAX: 0026 RBX: 0854ddb4 RCX: 

[25886.024873] RDX:  RSI: 880fff2cdc48 RDI: 
880fff2cdc48
[25886.024874] RBP: c9000c81fcd0 R08: 0613 R09: 
0007
[25886.024875] R10: 1000 R11: 0001 R12: 
880ec87c6000
[25886.024876] R13: ffe4 R14:  R15: 
880ff4f4a690
[25886.024877] FS:  () GS:880fff2c() 
knlGS:

[25886.024878] CS:  0010 DS:  ES:  CR0: 80050033
[25886.024879] CR2: 7f1c6cb9c0d0 CR3: 02c09003 CR4: 
001606e0

[25886.024880] Call Trace:
[25886.024887]  ? btrfs_previous_extent_item+0xe1/0x110 [btrfs]
[25886.024895]  ? btrfs_merge_delayed_refs+0x8c/0x550 [btrfs]
[25886.024901]  __btrfs_run_delayed_refs+0x6ee/0x12f0 [btrfs]
[25886.024909]  btrfs_run_delayed_refs+0x6b/0x250 [btrfs]
[25886.024916]  btrfs_commit_transaction+0x48/0x920 [btrfs]
[25886.024922]  ? start_transaction+0x99/0x420 [btrfs]
[25886.024929]  transaction_kthread+0x182/0x1b0 [btrfs]
[25886.024932]  kthread+0x125/0x140
[25886.024939]  ? btrfs_cleanup_transaction+0x520/0x520 [btrfs]
[25886.024940]  ? kthread_create_on_node+0x70/0x70
[25886.024942]  ret_from_fork+0x25/0x30
[25886.024944] Code: d7 e0 0f ff eb d0 44 89 ee 48 c7 c7 68 b7 40 a0 e8 
c4 8d d7 e0 0f ff e9 7c fb ff ff 44 89 ee 48 c7 c7 68 b7 40 a0 e8 ae 8d 
d7 e0 <0f> ff e9 00 f5 ff ff 8b 55 20 48 89 c1 49 89 d8 48 c7 c6 48 b8

[25886.024961] ---[ end trace 3570a54b286cb501 ]---
[25886.024966] BTRFS: error (device sda1) in __btrfs_free_extent:7062: 
errno=-28 No space left

[25886.024968] BTRFS info (device sda1): forced readonly
[25886.024969] BTRFS: error (device sda1) in 
btrfs_run_delayed_refs:3089: errno=-28 No space left


Aside from rebuilding the filesystem, what are my options? Should I try 
to temporarily add a file from another volume as a device and retry the 
balance? If so, what would be a good size for the temporary device?


--
Best regards,
 Vladimir
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Unmountable fs - missing generation?

2017-10-19 Thread Satoru Takeuchi
At Thu, 19 Oct 2017 12:03:08 +0900,
satoru takeuchi wrote:
> 
> Resend it since I forgot to CC linux-btrfs ML >Larkin
> 
> On Oct 17, 2017, at 0:16, Larkin Lowrey 
> wrote:
> 
> I am unable to mount one of my filesystems. The superblock thinks
> the latest generation is 2220927 but I can't seem to find a root
> with that number. I can find 2220926 and 2220928 but not 2220927.
> Is there anything that I can do to recover this FS?
> 
> 
> `btrfs-select-super` may help you. Please see the following steps.
> 
> 1. Backup the current unmountable fs image if possible.
> 2. If possible, salvage as many of your files as you can by following
> the document below.
> 
> https://btrfs.wiki.kernel.org/index.php/Restore
> 
> 3. Execute `btrfs-select-super -s 1 /dev/Cached/Backups`. Please note
> that this command changes the contents of /dev/Cached/Backups, so if
> this command fails, things would get worse.

I forgot to mention one important point. You should run the above-mentioned
command only if the 1st copy of the superblock is valid. This can be confirmed
with `btrfs inspect-internal dump-super (or btrfs-dump-super)` as follows.

* valid case

```
$ sudo btrfs inspect dump-super -a /dev/sdb4
...
superblock: bytenr=67108864, device=/dev/sdb4   # 1st copy of superblock
-
csum0x423bcd19 [match]
bytenr  67108864
flags   0x1
...
```

* invalid case

```
$ sudo btrfs inspect dump-super -a /dev/sdb4
...
superblock: bytenr=67108864, device=/dev/sdb4
-
ERROR: bad magic on superblock on /dev/sdb4 at 67108864
...
```

Thanks,
Satoru

> Thanks,
> Satoru
> 
> 
> # btrfs check /dev/Cached/Backups
> checksum verify failed on 159057884594176 found 15284E33 wanted
> C8C5B54E
> checksum verify failed on 159057884594176 found 15284E33 wanted
> C8C5B54E
> checksum verify failed on 159057884594176 found 472037C9 wanted
> 9ACDCCB4
> checksum verify failed on 159057884594176 found 472037C9 wanted
> 9ACDCCB4
> Csum didn't match
> Couldn't setup extent tree
> Couldn't open file system
> 
> # btrfs-find-root -g 2220927 /dev/Cached/Backups
> Couldn't setup extent tree
> Couldn't setup device tree
> Superblock thinks the generation is 2220927
> Superblock thinks the level is 2
> 
> Found tree root at 159057884577792 gen 2220927 level 2
> Well block 101489031790592(gen: 2220928 level: 2) seems good, but
> generation/level doesn't match, want gen: 2220927 level: 2
> 
> # btrfs check --tree-root 159057884577792 /dev/Cached/Backups
> checksum verify failed on 159057884594176 found 15284E33 wanted
> C8C5B54E
> checksum verify failed on 159057884594176 found 15284E33 wanted
> C8C5B54E
> checksum verify failed on 159057884594176 found 472037C9 wanted
> 9ACDCCB4
> checksum verify failed on 159057884594176 found 472037C9 wanted
> 9ACDCCB4
> Csum didn't match
> Couldn't setup extent tree
> Couldn't open file system
> 
> # btrfs check --tree-root 101489031790592 /dev/Cached/Backups
> parent transid verify failed on 101489031790592 wanted 2220927
> found 2220928
> parent transid verify failed on 101489031790592 wanted 2220927
> found 2220928
> parent transid verify failed on 101489031790592 wanted 2220927
> found 2220928
> parent transid verify failed on 101489031790592 wanted 2220927
> found 2220928
> Ignoring transid failure
> parent transid verify failed on 159057595138048 wanted 2220927
> found 2220920
> parent transid verify failed on 159057595138048 wanted 2220927
> found 2220920
> parent transid verify failed on 159057595138048 wanted 2220927
> found 2220920
> parent transid verify failed on 159057595138048 wanted 2220927
> found 2220920
> Ignoring transid failure
> parent transid verify failed on 158652658122752 wanted 2220927
> found 2220911
> parent transid verify failed on 158652658122752 wanted 2220927
> found 2220911
> parent transid verify failed on 158652658122752 wanted 2220927
> found 2220911
> parent transid verify failed on 158652658122752 wanted 2220927
> found 2220911
> Ignoring transid failure
> Checking filesystem on /dev/Cached/Backups
> UUID: 1b213dfd-6486-47d8-8459-bc5825882023
> checking extents
> parent transid verify failed on 116329711550464 wanted 2220928
> found 2220921
> parent transid verify failed on 116329711550464 wanted 2220928
> found 2220921
> parent transid verify failed on 116329711550464 wanted 2220928
> found 2220921
> parent transid verify failed on 116329711550464 wanted 2220928
> found 2220921
> Ignoring transid failure
> parent transid verify failed on 116325928206336 wanted 2220928
> found 2220921
> 

[PATCH] btrfs-progs: change mans to describe the third copy of superblock

2017-10-19 Thread Satoru Takeuchi
Some tools can select which superblock these commands use with the
"-s <superblock>" option. Although the documentation of this option says the
valid values are 0-2, we can set 3 if the filesystem is very large.

Signed-off-by: Satoru Takeuchi 
---
 Documentation/btrfs-check.asciidoc| 2 +-
 Documentation/btrfs-restore.asciidoc  | 2 +-
 Documentation/btrfs-select-super.asciidoc | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/Documentation/btrfs-check.asciidoc 
b/Documentation/btrfs-check.asciidoc
index fbf4884..a557cff 100644
--- a/Documentation/btrfs-check.asciidoc
+++ b/Documentation/btrfs-check.asciidoc
@@ -74,7 +74,7 @@ run in read-only mode, this option exists to calm potential 
panic when users
 are going to run the checker
 
 -s|--super ::
-use 'superblock'th superblock copy, valid values are 0, 1 or 2 if the
+use 'superblock'th superblock copy, valid values are 0, 1, 2 or 3 if the
 respective superblock offset is within the device size
 +
 This can be used to use a different starting point if some of the primary
diff --git a/Documentation/btrfs-restore.asciidoc 
b/Documentation/btrfs-restore.asciidoc
index 090dcc5..c19e0e2 100644
--- a/Documentation/btrfs-restore.asciidoc
+++ b/Documentation/btrfs-restore.asciidoc
@@ -63,7 +63,7 @@ use  to read the root tree
 only restore files that are under specified subvolume root pointed by 
 
 -u|--super ::
-use given superblock mirror identified by , it can be 0,1 or 2
+use given superblock mirror identified by , it can be 0, 1, 2 or 3
 
 -r|--root ::
 only restore files that are under a specified subvolume whose objectid is 

diff --git a/Documentation/btrfs-select-super.asciidoc 
b/Documentation/btrfs-select-super.asciidoc
index 6e94a03..7f96bd8 100644
--- a/Documentation/btrfs-select-super.asciidoc
+++ b/Documentation/btrfs-select-super.asciidoc
@@ -32,13 +32,14 @@ Superblock copies exist in the following offsets on the 
device:
 - primary: '64KiB' (65536)
 - 1st copy: '64MiB' (67108864)
 - 2nd copy: '256GiB' (274877906944)
+- 3rd copy: '1PiB' (1125899906842624)
 
 A superblock size is '4KiB' (4096).
 
 OPTIONS
 ---
 -s|--super ::
-use 'superblock'th superblock copy, valid values are 0 1 or 2 if the
+use 'superblock'th superblock copy, valid values are 0, 1, 2 or 3 if the
 respective superblock offset is within the device size
 
 SEE ALSO
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html