[PATCH] btrfs-progs: clarify manpage for btrfstune seeding option

2014-05-21 Thread Gui Hecheng
The btrfstune -S option accepts a positive value to enable seeding
and zero to disable seeding; negative values are not allowed.

Add positive, zero, negative sentences to btrfstune manpage.

Signed-off-by: Gui Hecheng guihc.f...@cn.fujitsu.com
---
 Documentation/btrfstune.txt | 5 +++--
 btrfstune.c | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/Documentation/btrfstune.txt b/Documentation/btrfstune.txt
index bf4bfce..89d5996 100644
--- a/Documentation/btrfstune.txt
+++ b/Documentation/btrfstune.txt
@@ -17,8 +17,9 @@ enable/disable some extended features for btrfs.
 OPTIONS
 ---
 -S value::
-Updates the seeding value, it forces a fs readonly so that you can use it to
-build other filesystems.
+Updates the seeding value.
+A positive value will enable seeding, zero will disable seeding, negtive is 
not allowed.
+Enable seeding forces a fs readonly so that you can use it to build other 
filesystems.
 -r::
 Enable extended inode refs.
 -x::
diff --git a/btrfstune.c b/btrfstune.c
index 2c26fe9..3f2f0cd 100644
--- a/btrfstune.c
+++ b/btrfstune.c
@@ -100,9 +100,9 @@ static int enable_skinny_metadata(struct btrfs_root *root)
 static void print_usage(void)
 {
fprintf(stderr, usage: btrfstune [options] device\n);
-   fprintf(stderr, \t-S value\tenable/disable seeding\n);
+   fprintf(stderr, \t-S value\tpositive value will enable seeding, zero 
to disable, negative is not allowed\n);
fprintf(stderr, \t-r \t\tenable extended inode refs\n);
-   fprintf(stderr, \t-x enable skinny metadata extent refs\n);
+   fprintf(stderr, \t-x \t\tenable skinny metadata extent refs\n);
 }
 
 int main(int argc, char *argv[])
-- 
1.8.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 5/8] Btrfs: subpagesize-blocksize: Read tree blocks whose size is PAGE_CACHE_SIZE.

2014-05-21 Thread Chandan Rajendra
In the case of subpagesize-blocksize, this patch makes it possible to read
only a single metadata block from the disk instead of all the metadata blocks
that map into a page.

Signed-off-by: Chandan Rajendra chan...@linux.vnet.ibm.com
---
 fs/btrfs/disk-io.c   |  45 -
 fs/btrfs/disk-io.h   |   3 ++
 fs/btrfs/extent_io.c | 135 +++
 3 files changed, 137 insertions(+), 46 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index bda2157..b2c4e9d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -413,7 +413,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root 
*root,
int mirror_num = 0;
int failed_mirror = 0;
 
-   clear_bit(EXTENT_BUFFER_CORRUPT, eb-bflags);
+   clear_bit(EXTENT_BUFFER_CORRUPT, eb-ebflags);
io_tree = BTRFS_I(root-fs_info-btree_inode)-io_tree;
while (1) {
ret = read_extent_buffer_pages(io_tree, eb, start,
@@ -432,7 +432,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root 
*root,
 * there is no reason to read the other copies, they won't be
 * any less wrong.
 */
-   if (test_bit(EXTENT_BUFFER_CORRUPT, eb-bflags))
+   if (test_bit(EXTENT_BUFFER_CORRUPT, eb-ebflags))
break;
 
num_copies = btrfs_num_copies(root-fs_info,
@@ -564,12 +564,13 @@ static noinline int check_leaf(struct btrfs_root *root,
return 0;
 }
 
-static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
- u64 phy_offset, struct page *page,
- u64 start, u64 end, int mirror)
+int verify_extent_buffer_read(struct btrfs_io_bio *io_bio,
+   struct page *page,
+   u64 start, u64 end, int mirror)
 {
u64 found_start;
int found_level;
+   struct extent_buffer_head *ebh;
struct extent_buffer *eb;
struct btrfs_root *root = BTRFS_I(page-mapping-host)-root;
int ret = 0;
@@ -579,18 +580,26 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio 
*io_bio,
goto out;
 
eb = (struct extent_buffer *)page-private;
+   do {
+   if ((eb-start = start)  (eb-start + eb-len - 1  start))
+   break;
+   } while ((eb = eb-eb_next) != NULL);
+
+   BUG_ON(!eb);
+
+   ebh = eb_head(eb);
 
/* the pending IO might have been the only thing that kept this buffer
 * in memory.  Make sure we have a ref for all this other checks
 */
extent_buffer_get(eb);
 
-   reads_done = atomic_dec_and_test(eb-io_pages);
+   reads_done = atomic_dec_and_test(ebh-io_bvecs);
if (!reads_done)
goto err;
 
eb-read_mirror = mirror;
-   if (test_bit(EXTENT_BUFFER_IOERR, eb-bflags)) {
+   if (test_bit(EXTENT_BUFFER_IOERR, eb-ebflags)) {
ret = -EIO;
goto err;
}
@@ -632,7 +641,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio 
*io_bio,
 * return -EIO.
 */
if (found_level == 0  check_leaf(root, eb)) {
-   set_bit(EXTENT_BUFFER_CORRUPT, eb-bflags);
+   set_bit(EXTENT_BUFFER_CORRUPT, eb-ebflags);
ret = -EIO;
}
 
@@ -640,7 +649,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio 
*io_bio,
set_extent_buffer_uptodate(eb);
 err:
if (reads_done 
-   test_and_clear_bit(EXTENT_BUFFER_READAHEAD, eb-bflags))
+   test_and_clear_bit(EXTENT_BUFFER_READAHEAD, eb-ebflags))
btree_readahead_hook(root, eb, eb-start, ret);
 
if (ret) {
@@ -649,7 +658,7 @@ err:
 * again, we have to make sure it has something
 * to decrement
 */
-   atomic_inc(eb-io_pages);
+   atomic_inc(eb_head(eb)-io_bvecs);
clear_extent_buffer_uptodate(eb);
}
free_extent_buffer(eb);
@@ -657,20 +666,6 @@ out:
return ret;
 }
 
-static int btree_io_failed_hook(struct page *page, int failed_mirror)
-{
-   struct extent_buffer *eb;
-   struct btrfs_root *root = BTRFS_I(page-mapping-host)-root;
-
-   eb = (struct extent_buffer *)page-private;
-   set_bit(EXTENT_BUFFER_IOERR, eb-bflags);
-   eb-read_mirror = failed_mirror;
-   atomic_dec(eb-io_pages);
-   if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, eb-bflags))
-   btree_readahead_hook(root, eb, eb-start, -EIO);
-   return -EIO;/* we fixed nothing */
-}
-
 static void end_workqueue_bio(struct bio *bio, int err)
 {
struct end_io_wq *end_io_wq = bio-bi_private;
@@ -4109,8 +4104,6 @@ static int btrfs_cleanup_transaction(struct btrfs_root 
*root)
 }
 
 static struct extent_io_ops btree_extent_io_ops = {
-   .readpage_end_io_hook = 

[RFC PATCH 8/8] Btrfs: subpagesize-blocksize: Compute and look up csums based on sectorsized blocks.

2014-05-21 Thread Chandan Rajendra
Checksums are applicable to sectorsize units. The current code uses
bio->bv_len units to compute and look up checksums. This works on machines
where sectorsize == PAGE_CACHE_SIZE. This patch makes the checksum
computation and lookup code work with sectorsize units.

Signed-off-by: Chandan Rajendra chan...@linux.vnet.ibm.com
---
 fs/btrfs/file-item.c | 85 
 fs/btrfs/inode.c | 24 ++-
 2 files changed, 68 insertions(+), 41 deletions(-)

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 9d84658..16deb87 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -172,6 +172,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
u64 item_start_offset = 0;
u64 item_last_offset = 0;
u64 disk_bytenr;
+   u64 page_bytes_left;
u32 diff;
int nblocks;
int bio_index = 0;
@@ -220,6 +221,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
disk_bytenr = (u64)bio-bi_sector  9;
if (dio)
offset = logical_offset;
+
+   page_bytes_left = bvec-bv_len;
while (bio_index  bio-bi_vcnt) {
if (!dio)
offset = page_offset(bvec-bv_page) + bvec-bv_offset;
@@ -243,7 +246,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
if (BTRFS_I(inode)-root-root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
set_extent_bits(io_tree, offset,
-   offset + bvec-bv_len - 1,
+   offset + root-sectorsize - 1,
EXTENT_NODATASUM, GFP_NOFS);
} else {

btrfs_info(BTRFS_I(inode)-root-fs_info,
@@ -281,11 +284,17 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root 
*root,
 found:
csum += count * csum_size;
nblocks -= count;
+
while (count--) {
-   disk_bytenr += bvec-bv_len;
-   offset += bvec-bv_len;
-   bio_index++;
-   bvec++;
+   disk_bytenr += root-sectorsize;
+   offset += root-sectorsize;
+   page_bytes_left -= root-sectorsize;
+   if (!page_bytes_left) {
+   bio_index++;
+   bvec++;
+   page_bytes_left = bvec-bv_len;
+   }
+
}
}
btrfs_free_path(path);
@@ -442,6 +451,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct 
inode *inode,
struct bio_vec *bvec = bio-bi_io_vec;
int bio_index = 0;
int index;
+   int nr_sectors;
+   int i;
unsigned long total_bytes = 0;
unsigned long this_sum_bytes = 0;
u64 offset;
@@ -468,41 +479,49 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct 
inode *inode,
if (!contig)
offset = page_offset(bvec-bv_page) + bvec-bv_offset;
 
-   if (offset = ordered-file_offset + ordered-len ||
-   offset  ordered-file_offset) {
-   unsigned long bytes_left;
-   sums-len = this_sum_bytes;
-   this_sum_bytes = 0;
-   btrfs_add_ordered_sum(inode, ordered, sums);
-   btrfs_put_ordered_extent(ordered);
+   data = kmap_atomic(bvec-bv_page);
 
-   bytes_left = bio-bi_size - total_bytes;
+   nr_sectors = (bvec-bv_len + root-sectorsize - 1)
+root-fs_info-sb-s_blocksize_bits;
+
+   for (i = 0; i  nr_sectors; i++) {
+   if (offset = ordered-file_offset + ordered-len ||
+   offset  ordered-file_offset) {
+   unsigned long bytes_left;
+   sums-len = this_sum_bytes;
+   this_sum_bytes = 0;
+   btrfs_add_ordered_sum(inode, ordered, sums);
+   btrfs_put_ordered_extent(ordered);
+
+   bytes_left = bio-bi_size - total_bytes;
+
+   sums = kzalloc(btrfs_ordered_sum_size(root, 
bytes_left),
+   GFP_NOFS);
+   BUG_ON(!sums); /* -ENOMEM */
+   sums-len = bytes_left;
+   ordered = btrfs_lookup_ordered_extent(inode, 
offset);
+   BUG_ON(!ordered); /* Logic error */
+   sums-bytenr = ((u64)bio-bi_sector  9) +
+

[RFC PATCH 7/8] Btrfs: subpagesize-blocksize: Allow mounting filesystems where sectorsize != PAGE_SIZE

2014-05-21 Thread Chandan Rajendra
From: Chandra Seetharaman sekha...@us.ibm.com

This patch allows mounting filesystems with blocksize smaller than the
PAGE_SIZE.

Signed-off-by: Chandra Seetharaman sekha...@us.ibm.com
Signed-off-by: Chandan Rajendra chan...@linux.vnet.ibm.com
---
 fs/btrfs/disk-io.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 28a45f6..3bb7072 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2599,12 +2599,6 @@ int open_ctree(struct super_block *sb,
goto fail_sb_buffer;
}
 
-   if (sectorsize != PAGE_SIZE) {
-   printk(KERN_WARNING BTRFS: Incompatible sector size(%lu) 
-  found on %s\n, (unsigned long)sectorsize, sb-s_id);
-   goto fail_sb_buffer;
-   }
-
mutex_lock(fs_info-chunk_mutex);
ret = btrfs_read_sys_array(tree_root);
mutex_unlock(fs_info-chunk_mutex);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 0/8] Btrfs: Subpagesize-blocksize: Get rid of whole page I/O

2014-05-21 Thread Chandan Rajendra
This patchset continues with the work posted earlier at
http://www.mail-archive.com/linux-btrfs@vger.kernel.org/msg32143.html. The
following are the high level changes brought in by this patchset:
1. Rewrite 'extent buffer' handling code to incorporate comments posted to
   Chandra Seetharaman's patchset
   http://www.mail-archive.com/linux-btrfs@vger.kernel.org/msg29534.html.
2. Compute and look up checksums for data extents in sectorsize units.

Xfstests' generic tests were run on an x86_64 machine with the patches
applied.

On multiple runs of the tests with 4k blocksize, the 'umount' process would
sometimes get blocked indefinitely, causing the 'hung task detector' to print
the function call trace. Also, there are occasional instances where warning
messages from btree_invalidatepage() are printed to indicate that the
PG_private flag of a page is still set.

For 2k blocksize only a few Xfstests' generic tests pass.

The following is a list of known TODO items which will be implemented in
future revisions of this patchset:
1. Create separate slab caches for 'extent buffer head' and 'extent buffer'.
2. Add 'leak list' tracking for 'extent buffer' instances.
3. Rename EXTENT_BUFFER_TREE_REF and EXTENT_BUFFER_IN_TREE to
   EXTENT_BUFFER_HEAD_TREE_REF and EXTENT_BUFFER_HEAD_IN_TREE respectively.
4. Remove usage of bvec->{bv_offset, bv_len} from end_bio_extent_readpage(),
   end_bio_extent_writepage() and btrfs_csum_one_bio().
5. Get Xfstests' generic tests to successfully run on both 4k and 2k
   blocksizes.

Chandan Rajendra (6):
  Btrfs: subpagesize-blocksize: Get rid of whole page reads.
  Btrfs: subpagesize-blocksize: Get rid of whole page writes.
  Btrfs: subpagesize-blocksize: __btrfs_buffered_write: Reserve/release
extents aligned to block size.
  Btrfs: subpagesize-blocksize: Read tree blocks whose size is
PAGE_CACHE_SIZE.
  Btrfs: subpagesize-blocksize: Write only dirty extent buffers
belonging to a page
  Btrfs: subpagesize-blocksize: Compute and look up csums based on
sectorsized blocks.

Chandra Seetharaman (2):
  Btrfs: subpagesize-blocksize: Define extent_buffer_head.
  Btrfs: subpagesize-blocksize: Allow mounting filesystems where
sectorsize != PAGE_SIZE

 fs/btrfs/backref.c   |   2 +-
 fs/btrfs/ctree.c |   2 +-
 fs/btrfs/ctree.h |   6 +-
 fs/btrfs/disk-io.c   | 117 +++---
 fs/btrfs/disk-io.h   |   3 +
 fs/btrfs/extent-tree.c   |   6 +-
 fs/btrfs/extent_io.c | 980 +--
 fs/btrfs/extent_io.h |  46 +-
 fs/btrfs/file-item.c |  85 ++--
 fs/btrfs/file.c  |  32 +-
 fs/btrfs/inode.c |  24 +-
 fs/btrfs/volumes.c   |   2 +-
 fs/btrfs/volumes.h   |   3 +
 include/trace/events/btrfs.h |   2 +-
 14 files changed, 947 insertions(+), 363 deletions(-)

-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 1/8] Btrfs: subpagesize-blocksize: Get rid of whole page reads.

2014-05-21 Thread Chandan Rajendra
Based on original patch from Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

bio_vec->{bv_offset, bv_len} cannot be relied upon by the end bio functions
to track the file offset range operated on by the bio. Hence this patch adds
two new members to 'struct btrfs_io_bio' to track the file offset range.

This patch also brings back check_page_locked() to reliably unlock pages in
readpage's end bio function.

Signed-off-by: Chandan Rajendra chan...@linux.vnet.ibm.com
---
 fs/btrfs/extent_io.c | 120 +--
 fs/btrfs/volumes.h   |   3 ++
 2 files changed, 80 insertions(+), 43 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index fbe501d..fd6f011 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1943,15 +1943,29 @@ int test_range_bit(struct extent_io_tree *tree, u64 
start, u64 end,
  * helper function to set a given page up to date if all the
  * extents in the tree for that page are up to date
  */
-static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
+static void check_page_uptodate(struct extent_io_tree *tree, struct page *page,
+   struct extent_state *cached)
 {
u64 start = page_offset(page);
u64 end = start + PAGE_CACHE_SIZE - 1;
-   if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
+   if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, cached))
SetPageUptodate(page);
 }
 
 /*
+ * helper function to unlock a page if all the extents in the tree
+ * for that page are unlocked
+ */
+static void check_page_locked(struct extent_io_tree *tree, struct page *page)
+{
+   u64 start = page_offset(page);
+   u64 end = start + PAGE_CACHE_SIZE - 1;
+
+   if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
+   unlock_page(page);
+   }
+}
+
  * When IO fails, either with EIO or csum verification fails, we
  * try other mirrors that might have a good copy of the data.  This
  * io_failure_record is used to record state as we go through all the
@@ -2414,16 +2428,33 @@ static void end_bio_extent_writepage(struct bio *bio, 
int err)
bio_put(bio);
 }
 
-static void
-endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
- int uptodate)
+static void unlock_extent_and_page(struct address_space *mapping,
+  struct extent_io_tree *tree,
+  struct btrfs_io_bio *io_bio)
 {
-   struct extent_state *cached = NULL;
-   u64 end = start + len - 1;
+   pgoff_t index;
+   u64 offset, len;
+   /*
+* This btrfs_io_bio may span multiple pages.
+* We need to unlock the pages convered by them
+* if we got endio callback for all the blocks in the page.
+* btrfs_io_bio also contain contigous blocks of the file
+* look at submit_extent_page for more details.
+*/
 
-   if (uptodate  tree-track_uptodate)
-   set_extent_uptodate(tree, start, end, cached, GFP_ATOMIC);
-   unlock_extent_cached(tree, start, end, cached, GFP_ATOMIC);
+   offset = io_bio-start_offset;
+   len= io_bio-len;
+   unlock_extent(tree, offset, offset + len - 1);
+
+   index = offset  PAGE_CACHE_SHIFT;
+   while (offset  io_bio-start_offset + len) {
+   struct page *page;
+   page = find_get_page(mapping, index);
+   check_page_locked(tree, page);
+   page_cache_release(page);
+   index++;
+   offset += PAGE_CACHE_SIZE;
+   }
 }
 
 /*
@@ -2443,13 +2474,13 @@ static void end_bio_extent_readpage(struct bio *bio, 
int err)
struct bio_vec *bvec_end = bio-bi_io_vec + bio-bi_vcnt - 1;
struct bio_vec *bvec = bio-bi_io_vec;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+   struct address_space *mapping = bio-bi_io_vec-bv_page-mapping;
struct extent_io_tree *tree;
+   struct extent_state *cached = NULL;
u64 offset = 0;
u64 start;
u64 end;
u64 len;
-   u64 extent_start = 0;
-   u64 extent_len = 0;
int mirror;
int ret;
 
@@ -2482,8 +2513,8 @@ static void end_bio_extent_readpage(struct bio *bio, int 
err)
bvec-bv_offset, bvec-bv_len);
}
 
-   start = page_offset(page);
-   end = start + bvec-bv_offset + bvec-bv_len - 1;
+   start = page_offset(page) + bvec-bv_offset;
+   end = start + bvec-bv_len - 1;
len = bvec-bv_len;
 
if (++bvec = bvec_end)
@@ -2540,40 +2571,24 @@ readpage_ok:
offset = i_size  (PAGE_CACHE_SIZE-1);
if (page-index == end_index  offset)
zero_user_segment(page, offset, 
PAGE_CACHE_SIZE);
-   

[RFC PATCH 6/8] Btrfs: subpagesize-blocksize: Write only dirty extent buffers belonging to a page

2014-05-21 Thread Chandan Rajendra
For the subpagesize-blocksize scenario, this patch adds the ability to write a
single extent buffer to the disk.

Signed-off-by: Chandan Rajendra chan...@linux.vnet.ibm.com
---
 fs/btrfs/disk-io.c   |  20 ++--
 fs/btrfs/extent_io.c | 277 ++-
 2 files changed, 243 insertions(+), 54 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b2c4e9d..28a45f6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -466,17 +466,23 @@ static int btree_read_extent_buffer_pages(struct 
btrfs_root *root,
 
 static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 {
-   u64 start = page_offset(page);
-   u64 found_start;
struct extent_buffer *eb;
+   u64 found_start;
 
eb = (struct extent_buffer *)page-private;
-   if (page != eb-pages[0])
+   if (page != eb_head(eb)-pages[0])
return 0;
-   found_start = btrfs_header_bytenr(eb);
-   if (WARN_ON(found_start != start || !PageUptodate(page)))
-   return 0;
-   csum_tree_block(root, eb, 0);
+   do {
+   if (!test_bit(EXTENT_BUFFER_WRITEBACK, eb-ebflags))
+   continue;
+   if (WARN_ON(!test_bit(EXTENT_BUFFER_UPTODATE, eb-ebflags)))
+   continue;
+   found_start = btrfs_header_bytenr(eb);
+   if (WARN_ON(found_start != eb-start))
+   return 0;
+   csum_tree_block(root, eb, 0);
+   } while ((eb = eb-eb_next) != NULL);
+
return 0;
 }
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5d23935..7f88dbd 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3495,32 +3495,53 @@ void wait_on_extent_buffer_writeback(struct 
extent_buffer *eb)
TASK_UNINTERRUPTIBLE);
 }
 
-static int lock_extent_buffer_for_io(struct extent_buffer *eb,
-struct btrfs_fs_info *fs_info,
-struct extent_page_data *epd)
+static void lock_extent_buffer_pages(struct extent_buffer_head *ebh,
+   struct extent_page_data *epd)
 {
+   struct extent_buffer *eb = ebh-eb;
unsigned long i, num_pages;
-   int flush = 0;
+
+   num_pages = num_extent_pages(eb-start, eb-len);
+   for (i = 0; i  num_pages; i++) {
+   struct page *p = extent_buffer_page(eb, i);
+
+   if (!trylock_page(p)) {
+   flush_write_bio(epd);
+   lock_page(p);
+   }
+   }
+
+   return;
+}
+
+static int lock_extent_buffer_for_io(struct extent_buffer *eb,
+   struct btrfs_fs_info *fs_info,
+   struct extent_page_data *epd)
+{
+   int dirty;
int ret = 0;
 
if (!btrfs_try_tree_write_lock(eb)) {
-   flush = 1;
flush_write_bio(epd);
btrfs_tree_lock(eb);
}
 
-   if (test_bit(EXTENT_BUFFER_WRITEBACK, eb-bflags)) {
+   if (test_bit(EXTENT_BUFFER_WRITEBACK, eb-ebflags)) {
+   dirty = test_bit(EXTENT_BUFFER_DIRTY, eb-ebflags);
btrfs_tree_unlock(eb);
-   if (!epd-sync_io)
-   return 0;
-   if (!flush) {
-   flush_write_bio(epd);
-   flush = 1;
+   if (!epd-sync_io) {
+   if (!dirty)
+   return 1;
+   else
+   return 2;
}
+
+   flush_write_bio(epd);
+
while (1) {
wait_on_extent_buffer_writeback(eb);
btrfs_tree_lock(eb);
-   if (!test_bit(EXTENT_BUFFER_WRITEBACK, eb-bflags))
+   if (!test_bit(EXTENT_BUFFER_WRITEBACK, eb-ebflags))
break;
btrfs_tree_unlock(eb);
}
@@ -3531,27 +3552,25 @@ static int lock_extent_buffer_for_io(struct 
extent_buffer *eb,
 * under IO since we can end up having no IO bits set for a short period
 * of time.
 */
-   spin_lock(eb-refs_lock);
-   if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, eb-bflags)) {
-   set_bit(EXTENT_BUFFER_WRITEBACK, eb-bflags);
-   spin_unlock(eb-refs_lock);
+   spin_lock(eb_head(eb)-refs_lock);
+   if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, eb-ebflags)) {
+   set_bit(EXTENT_BUFFER_WRITEBACK, eb-ebflags);
+   spin_unlock(eb_head(eb)-refs_lock);
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
__percpu_counter_add(fs_info-dirty_metadata_bytes,
 -eb-len,
 fs_info-dirty_metadata_batch);
-   ret = 1;
+   ret = 0;

[RFC PATCH 2/8] Btrfs: subpagesize-blocksize: Get rid of whole page writes.

2014-05-21 Thread Chandan Rajendra
This commit brings back functions that set/clear EXTENT_WRITEBACK bits. These
are required to reliably clear PG_writeback page flag.

Signed-off-by: Chandan Rajendra chan...@linux.vnet.ibm.com
---
 fs/btrfs/extent_io.c | 76 +---
 1 file changed, 73 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index fd6f011..17ff01b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1293,6 +1293,20 @@ int clear_extent_uptodate(struct extent_io_tree *tree, 
u64 start, u64 end,
cached_state, mask);
 }
 
+static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 
end,
+   struct extent_state **cached_state, gfp_t mask)
+{
+   return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, NULL,
+   cached_state, mask);
+}
+
+static int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 
end,
+   struct extent_state **cached_state, gfp_t mask)
+{
+   return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0,
+   cached_state, mask);
+}
+
 /*
  * either insert or lock state struct between start and end use mask to tell
  * us if waiting is desired.
@@ -1399,6 +1413,7 @@ static int set_range_writeback(struct extent_io_tree 
*tree, u64 start, u64 end)
page_cache_release(page);
index++;
}
+   set_extent_writeback(tree, start, end, NULL, GFP_NOFS);
return 0;
 }
 
@@ -1966,6 +1981,16 @@ static void check_page_locked(struct extent_io_tree 
*tree, struct page *page)
}
 }
 
+static void check_page_writeback(struct extent_io_tree *tree, struct page 
*page)
+{
+   u64 start = page_offset(page);
+   u64 end = start + PAGE_CACHE_SIZE - 1;
+
+   if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0, NULL))
+   end_page_writeback(page);
+}
+
+/*
  * When IO fails, either with EIO or csum verification fails, we
  * try other mirrors that might have a good copy of the data.  This
  * io_failure_record is used to record state as we go through all the
@@ -2378,6 +2403,32 @@ int end_extent_writepage(struct page *page, int err, u64 
start, u64 end)
return 0;
 }
 
+static void clear_extent_and_page_writeback(struct address_space *mapping,
+   struct extent_io_tree *tree,
+   struct btrfs_io_bio *io_bio)
+{
+   struct page *page;
+   pgoff_t index;
+   u64 offset, len;
+
+   offset  = io_bio-start_offset;
+   len = io_bio-len;
+
+   clear_extent_writeback(tree, offset, offset + len - 1, NULL,
+   GFP_ATOMIC);
+
+   index = offset  PAGE_CACHE_SHIFT;
+   while (offset  io_bio-start_offset + len) {
+   page = find_get_page(mapping, index);
+   check_page_writeback(tree, page);
+   page_cache_release(page);
+   index++;
+   offset += page_offset(page) + PAGE_CACHE_SIZE - offset;
+   }
+
+   unlock_extent(tree, io_bio-start_offset, io_bio-start_offset + len - 
1);
+}
+
 /*
  * after a writepage IO is done, we need to:
  * clear the uptodate bits on error
@@ -2389,6 +2440,9 @@ int end_extent_writepage(struct page *page, int err, u64 
start, u64 end)
  */
 static void end_bio_extent_writepage(struct bio *bio, int err)
 {
+   struct address_space *mapping =  bio-bi_io_vec-bv_page-mapping;
+   struct extent_io_tree *tree = BTRFS_I(mapping-host)-io_tree;
+   struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
struct bio_vec *bvec = bio-bi_io_vec + bio-bi_vcnt - 1;
u64 start;
u64 end;
@@ -2413,8 +2467,8 @@ static void end_bio_extent_writepage(struct bio *bio, int 
err)
bvec-bv_offset, bvec-bv_len);
}
 
-   start = page_offset(page);
-   end = start + bvec-bv_offset + bvec-bv_len - 1;
+   start = page_offset(page) + bvec-bv_offset;
+   end = start + bvec-bv_len - 1;
 
if (--bvec = bio-bi_io_vec)
prefetchw(bvec-bv_page-flags);
@@ -2422,9 +2476,10 @@ static void end_bio_extent_writepage(struct bio *bio, 
int err)
if (end_extent_writepage(page, err, start, end))
continue;
 
-   end_page_writeback(page);
} while (bvec = bio-bi_io_vec);
 
+   clear_extent_and_page_writeback(mapping, tree, io_bio);
+
bio_put(bio);
 }
 
@@ -3151,6 +3206,7 @@ static int __extent_writepage(struct page *page, struct 
writeback_control *wbc,
u64 last_byte = i_size_read(inode);
u64 block_start;
u64 iosize;
+   u64 unlock_start = start;
sector_t sector;
struct extent_state *cached_state = NULL;
struct extent_map *em;
@@ -3233,6 

[RFC PATCH 3/8] Btrfs: subpagesize-blocksize: __btrfs_buffered_write: Reserve/release extents aligned to block size.

2014-05-21 Thread Chandan Rajendra
Currently, the code reserves/releases extents in multiples of PAGE_CACHE_SIZE
units. Fix this by reserving/releasing extents aligned to the block size.

Signed-off-by: Chandan Rajendra chan...@linux.vnet.ibm.com
---
 fs/btrfs/file.c | 32 
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 006af2f..541e227 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1339,18 +1339,21 @@ fail:
 static noinline int
 lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
size_t num_pages, loff_t pos,
+   size_t write_bytes,
u64 *lockstart, u64 *lockend,
struct extent_state **cached_state)
 {
+   struct btrfs_root *root = BTRFS_I(inode)-root;
u64 start_pos;
u64 last_pos;
int i;
int ret = 0;
 
-   start_pos = pos  ~((u64)PAGE_CACHE_SIZE - 1);
-   last_pos = start_pos + ((u64)num_pages  PAGE_CACHE_SHIFT) - 1;
+   start_pos = pos  ~((u64)root-sectorsize - 1);
+   last_pos = start_pos
+   + ALIGN(pos + write_bytes - start_pos, root-sectorsize) - 1;
 
-   if (start_pos  inode-i_size) {
+   if (start_pos  inode-i_size) {
struct btrfs_ordered_extent *ordered;
lock_extent_bits(BTRFS_I(inode)-io_tree,
 start_pos, last_pos, 0, cached_state);
@@ -1468,6 +1471,7 @@ static noinline ssize_t __btrfs_buffered_write(struct 
file *file,
 
while (iov_iter_count(i)  0) {
size_t offset = pos  (PAGE_CACHE_SIZE - 1);
+   size_t sector_offset;
size_t write_bytes = min(iov_iter_count(i),
 nrptrs * (size_t)PAGE_CACHE_SIZE -
 offset);
@@ -1488,7 +1492,9 @@ static noinline ssize_t __btrfs_buffered_write(struct 
file *file,
break;
}
 
-   reserve_bytes = num_pages  PAGE_CACHE_SHIFT;
+   sector_offset = pos  (root-sectorsize - 1);
+   reserve_bytes = ALIGN(write_bytes + sector_offset, 
root-sectorsize);
+
ret = btrfs_check_data_free_space(inode, reserve_bytes);
if (ret == -ENOSPC 
(BTRFS_I(inode)-flags  (BTRFS_INODE_NODATACOW |
@@ -1503,7 +1509,9 @@ static noinline ssize_t __btrfs_buffered_write(struct 
file *file,
num_pages = (write_bytes + offset +
 PAGE_CACHE_SIZE - 1) 
PAGE_CACHE_SHIFT;
-   reserve_bytes = num_pages  PAGE_CACHE_SHIFT;
+
+   reserve_bytes = ALIGN(write_bytes + 
sector_offset,
+   root-sectorsize);
ret = 0;
} else {
ret = -ENOSPC;
@@ -1536,8 +1544,8 @@ again:
break;
 
ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
- pos, lockstart, lockend,
- cached_state);
+   pos, write_bytes, lockstart, 
lockend,
+   cached_state);
if (ret  0) {
if (ret == -EAGAIN)
goto again;
@@ -1574,9 +1582,9 @@ again:
 * we still have an outstanding extent for the chunk we actually
 * managed to copy.
 */
-   if (num_pages  dirty_pages) {
-   release_bytes = (num_pages - dirty_pages) 
-   PAGE_CACHE_SHIFT;
+   if (write_bytes  copied) {
+   release_bytes = (write_bytes - copied)
+~((u64)root-sectorsize - 1);
if (copied  0) {
spin_lock(BTRFS_I(inode)-lock);
BTRFS_I(inode)-outstanding_extents++;
@@ -1590,7 +1598,7 @@ again:
 release_bytes);
}
 
-   release_bytes = dirty_pages  PAGE_CACHE_SHIFT;
+   release_bytes = ALIGN(copied + sector_offset, root-sectorsize);
 
if (copied  0)
ret = btrfs_dirty_pages(root, inode, pages,
@@ -1609,7 +1617,7 @@ again:
if (only_release_metadata  copied  0) {
u64 lockstart = round_down(pos, root-sectorsize);
u64 lockend = lockstart +
-   (dirty_pages  PAGE_CACHE_SHIFT) - 1;
+   ALIGN(copied, root-sectorsize) - 1;
 

[RFC PATCH 4/8] Btrfs: subpagesize-blocksize: Define extent_buffer_head.

2014-05-21 Thread Chandan Rajendra
From: Chandra Seetharaman sekha...@us.ibm.com

In order to handle multiple extent buffers per page, first we need to create a
way to handle all the extent buffers that are attached to a page.

This patch creates a new data structure 'struct extent_buffer_head', and moves
fields that are common to all extent buffers in a page from 'struct
extent_buffer' to 'struct extent_buffer_head'.

Also, this patch moves EXTENT_BUFFER_TREE_REF, EXTENT_BUFFER_DUMMY and
EXTENT_BUFFER_IN_TREE flags from extent_buffer->ebflags to
extent_buffer_head->bflags.

Signed-off-by: Chandra Seetharaman sekha...@us.ibm.com
Signed-off-by: Chandan Rajendra chan...@linux.vnet.ibm.com
---
 fs/btrfs/backref.c   |   2 +-
 fs/btrfs/ctree.c |   2 +-
 fs/btrfs/ctree.h |   6 +-
 fs/btrfs/disk-io.c   |  46 --
 fs/btrfs/extent-tree.c   |   6 +-
 fs/btrfs/extent_io.c | 372 +--
 fs/btrfs/extent_io.h |  46 --
 fs/btrfs/volumes.c   |   2 +-
 include/trace/events/btrfs.h |   2 +-
 9 files changed, 326 insertions(+), 158 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index a88da72..603ae44 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1272,7 +1272,7 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, 
struct btrfs_path *path,
eb = path-nodes[0];
/* make sure we can use eb after releasing the path */
if (eb != eb_in) {
-   atomic_inc(eb-refs);
+   atomic_inc(eb_head(eb)-refs);
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cbd3a7d..0d4ad91 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -169,7 +169,7 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root 
*root)
 * the inc_not_zero dance and if it doesn't work then
 * synchronize_rcu and try again.
 */
-   if (atomic_inc_not_zero(eb-refs)) {
+   if (atomic_inc_not_zero(eb_head(eb)-refs)) {
rcu_read_unlock();
break;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index dac6653..901ada2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2138,14 +2138,16 @@ static inline void btrfs_set_token_##name(struct 
extent_buffer *eb, \
 #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits)\
 static inline u##bits btrfs_##name(struct extent_buffer *eb)   \
 {  \
-   type *p = page_address(eb-pages[0]);   \
+   type *p = page_address(eb_head(eb)-pages[0]) + \
+   (eb-start  (PAGE_CACHE_SIZE -1)); \
u##bits res = le##bits##_to_cpu(p-member); \
return res; \
 }  \
 static inline void btrfs_set_##name(struct extent_buffer *eb,  \
u##bits val)\
 {  \
-   type *p = page_address(eb-pages[0]);   \
+   type *p = page_address(eb_head(eb)-pages[0]) + \
+   (eb-start  (PAGE_CACHE_SIZE -1)); \
p-member = cpu_to_le##bits(val);   \
 }
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index cc1b423..bda2157 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1018,13 +1018,21 @@ static int btree_set_page_dirty(struct page *page)
 {
 #ifdef DEBUG
struct extent_buffer *eb;
+   int i, dirty = 0;
 
BUG_ON(!PagePrivate(page));
eb = (struct extent_buffer *)page-private;
BUG_ON(!eb);
-   BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, eb-bflags));
-   BUG_ON(!atomic_read(eb-refs));
-   btrfs_assert_tree_locked(eb);
+
+   do {
+   dirty = test_bit(EXTENT_BUFFER_DIRTY, eb-ebflags);
+   if (dirty)
+   break;
+   } while ((eb = eb-eb_next) != NULL);
+
+   BUG_ON(!dirty);
+   BUG_ON(!atomic_read((eb_head(eb)-refs)));
+   btrfs_assert_tree_locked(ebh-eb);
 #endif
return __set_page_dirty_nobuffers(page);
 }
@@ -1068,7 +1076,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 
bytenr, u32 blocksize,
if (!buf)
return 0;
 
-   set_bit(EXTENT_BUFFER_READAHEAD, buf-bflags);
+   set_bit(EXTENT_BUFFER_READAHEAD, buf-ebflags);
 
ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
   btree_get_extent, mirror_num);
@@ -1077,7 

Re: problem with degraded boot and systemd

2014-05-21 Thread Duncan
On Tue, 20 May 2014 18:51:26 -0600
Chris Murphy li...@colorremedies.com wrote:

 
 On May 20, 2014, at 6:03 PM, Duncan 1i5t5.dun...@cox.net wrote:
  
  
  I'd actually argue that's functioning as it should, since I see
  forced manual intervention in order to mount degraded as a
  FEATURE, NOT A BUG.
 
 Manual intervention is OK for now, when it takes the form of dropping
 to a dracut shell, and only requires the user to pass mount -o
 degraded. To mount degraded automatically is worse because without a
 notification API for user space, it will lead users to make bad
 choices resulting in data loss.
 
 But the needed sequence is fairly burdensome: force shutdown, boot
 again, use rd.break=premount, then use mount -o degraded, and then
 exit a couple of times.

I haven't had the rootfs fail to mount due to that, but every time it
has failed for other reasons[2], I've been dropped to an emergency shell
prompt, from which I could run the mount manually, or do whatever else
I needed to do.  No force shutdown, boot again...  Just do the manual
mount or whatever, exit, and let the boot process continue from where
it errored out and dropped to the emergency shell.

But now that I think about it, I believe that automatic dropping to an
emergency shell when something goes wrong is a dracut option, that I
must have enabled by default.  I can't imagine why anyone would want it
off, thus forcing the reboot and manually adding the rd.break=whatever,
but apparently some folks do, so it's an option.  And I guess if you're
having to do the reboot and add the rd.break manually, you must not
have that option on when you do your dracut initr* builds.

  [1] dracut: I use it here on gentoo as well, because my rootfs is a
  multi-device btrfs and a kernel rootflags=device= line won't parse
  correctly, apparently due to splitting at the wrong =, so I must
  use an initr* despite my preference for a direct initr*-less boot,
  and I use dracut to generate it.
 
 rootflags doesn't take a device argument, it only applies to the
 volume to be mounted at /sysroot, so only one = is needed.

You misunderstand.  To mount a multi-device btrfs, one of two things
must happen to let the kernel know about all the devices.

A) btrfs device scan.

That's userspace, so for a multi-device btrfs
rootfs, it requires an initr* with the btrfs command and something to
trigger it (with dracut it's a udev rule that triggers the btrfs device
scan), before the mount is attempted.

B) btrfs has the device= mount option.

This can be given several times, once for each device in the
multi-device filesystem.

Under normal conditions, the rootflags= kernel commandline option could
thus be used to pass appropriate device= options to be used to mount
the rootfs, thus avoiding the need for an initr* with btrfs device scan
or the device= options passed to a userspace mount.

But trying rootflags=device=/dev/sda5,device=/dev/sdb5,... doesn't
work and the kernel will not mount the filesystem.

But rootflags=degraded works, but then activates the filesystem with
only the single device listed, say root=/dev/sda5, without the other
device.

So obviously rootflags= works since rootflags=degraded works.  But
rootflags=device= does NOT work.  The obvious difference and likely bug
is as I said, the multiple equals, with the kernel commandline parser
apparently trying to parse a parameter called rootflags=device, instead
of a parameter called rootflags, with device= as part of its value.
And of course rootflags=device isn't rootflags, so it doesn't do what
it's supposed to do.

Make more sense now? =:^)

Since I realized the double-equal parsing must be the problem, I've
been going to file a kernel bug on it and hopefully get the kernel
commandline parser fixed.  But apparently I have yet to find an
appropriately rounded tuit, since I've not done so yet. =:^(

FWIW, the btrfs kernel devs are aware that using device= with
rootflags= is broken, as it was one of them that originally mentioned
it to me when I was still asking about things before I had setup my
multi-device btrfs rootfs.  So it's a known issue.  But I'm not sure
they had looked into why, they just knew it didn't work.  And since it
only affects (1) btrfs users who (2) use a multi-device rootfs, *BUT*
(3) do NOT wish to use an initr*, I guess the priority simply hasn't
been high enough for them to investigate further.

So I really should file that bug[3] and get it to the right people.

---
[2] Back a few dracut versions ago, building the initr* with host-only
would tie the initr* to the UUID of the default rootfs.  As long as
that UUID could be found, the usual root= could be used on the kernel
commandline to boot any other rootfs if desired, and naturally, that's
how I tested my backup, with the main rootfs still there and thus its
UUID available.  But then I renewed my backup, tested again that I
could boot to it using root=, and did a fresh mkfs on the main rootfs,
thus of course killing the UUID 

Re: btrfs check, btrfsck, fsck.btrfs

2014-05-21 Thread Duncan
On Tue, 20 May 2014 18:26:59 -0600
Chris Murphy li...@colorremedies.com wrote:

 That ought to be true, but at least on a systemd 212-4 system, it
 assumes the system root needs to be fsck'd before mounting it. Since
 the fs isn't mounted, fstab isn't available. And the fstab.empty file
 I found in the initramfs is in fact empty. So even with fs_passno set
 to 0, systemd is trying to run fsck.btrfs, which it fails to find,
 warns about, then moves on.
 
 I filed that bug here:
 https://bugzilla.redhat.com/show_bug.cgi?id=1098799

[@ Chris M, I sent this to you only first.  So this one's to the list
only, but it'll have a different message-id, so you'll probably get it
as two different messages, one to the list, one direct to you.]

Hmm... it isn't doing so here.  dracut-037, systemd-212-r4 (the -r4
indicating four gentoo package level revision bumps since the initial
in-tree release of the upstream 037 version).

But I have an install-customized dracut config (tho I no longer use
host-only as explained in my last post, to the degraded boot and
systemd thread), all kernel modules built-in, etc.  If you're running a
generic everything-including-the-kitchen-sink dracut, that might
explain it, since I guess on most filesystems (not reiserfs/xfs/btrfs,
however) it would need to be run.

-- 
Duncan - No HTML messages please, as they are filtered as spam.
Every nonfree program has a lord, a master --
and if you use the program, he is your master.  Richard Stallman
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: btrfs check, btrfsck, fsck.btrfs

2014-05-21 Thread Chris Murphy

On May 21, 2014, at 4:47 AM, Duncan 1i5t5.dun...@cox.net wrote:

 On Tue, 20 May 2014 18:26:59 -0600
 Chris Murphy li...@colorremedies.com wrote:
 
 That ought to be true, but at least on a systemd 212-4 system, it
 assumes the system root needs to be fsck'd before mounting it. Since
 the fs isn't mounted, fstab isn't available. And the fstab.empty file
 I found in the initramfs is in fact empty. So even with fs_passno set
 to 0, systemd is trying to run fsck.btrfs, which it fails to find,
 warns about, then moves on.
 
 I filed that bug here:
 https://bugzilla.redhat.com/show_bug.cgi?id=1098799
 
 
 Hmm... it isn't doing so here.  dracut-037, systemd-212-r4 (the -r4
 indicating four gentoo package level revision bumps since the initial
 in-tree release of the upstream 037 version).

systemd 212 and dracut 037 here also.


 But I have an install-customized dracut config (tho I no longer use
 host-only as explained in my last post, to the degraded boot and
 systemd thread), all kernel modules built-in, etc.  If you're running a
 generic everything-including-the-kitchen-sink dracut, that might
 explain it, since I guess on most filesystems (not reiserfs/xfs/btrfs,
 however) it would need to be run.

I've tried both types of initramfs's. The fsck on root is always called, the 
difference being fsck.btrfs is not in the host-only initramfs, but is in the 
no-host-only one.

[1.779007] localhost systemd[1]: Failed to load configuration for 
systemd-fsck-root.service: No such file or directory
[…snip…]
[1.780811] localhost systemd[1]: Installed new job 
initrd-root-fs.target/start as 30
[1.780818] localhost systemd[1]: Installed new job sysroot.mount/start as 31
[1.780826] localhost systemd[1]: Installed new job 
dev-disk-by\x2duuid-d372e5d1\x2d386f\x2d460c\x2db036\x2d611469e0155e.device/start
 as 32
[1.780834] localhost systemd[1]: Installed new job 
systemd-fsck@dev-disk-by\x2duuid-d372e5d1\x2d386f\x2d460c\x2db036\x2d611469e0155e.service/start
 as 33

The first and last entries are mysteries. There is a 
/usr/lib/systemd/system/systemd-fsck-root.service so I don't know why it fails 
to load. The last entry looks like it occurs not by systemd-fsck-root.service 
but rather /usr/lib/systemd/system/systemd-fsck@.service, which is in the same 
directory. It reads:

[Unit]
Description=File System Check on %f
Documentation=man:systemd-fsck@.service(8)
DefaultDependencies=no
BindsTo=%i.device
After=systemd-readahead-collect.service systemd-readahead-replay.service 
%i.device systemd-fsck-root.service
Before=shutdown.target

[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/usr/lib/systemd/systemd-fsck %f
StandardOutput=journal+console
TimeoutSec=0

So I'm not sure what's going on.


Chris Murphy

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: send, fix corrupted paths strings for long paths

2014-05-21 Thread Filipe David Borba Manana
If a path has more than 230 characters, we allocate a new buffer to
use for the path, but we were forgetting to copy the contents of the
previous buffer into the new one, which has random content from the
kmalloc call.

Test:

mkfs.btrfs -f /dev/sdd
mount /dev/sdd /mnt


TEST_PATH=/mnt/fdmanana/.config/google-chrome-mysetup/Default/Pepper_Data/Shockwave_Flash/WritableRoot/#SharedObjects/JSHJ4ZKN/s.wsj.net/[[IMPORT]]/players.edgesuite.net/flash/plugins/osmf/advanced-streaming-plugin/v2.7/osmf1.6/Ak#
mkdir -p $TEST_PATH
echo "hello world" > $TEST_PATH/amaiAdvancedStreamingPlugin.txt

btrfs subvolume snapshot -r /mnt /mnt/mysnap1
btrfs send /mnt/mysnap1 -f /tmp/1.snap

A test for xfstests follows.

Signed-off-by: Filipe David Borba Manana fdman...@gmail.com
Cc: Marc Merlin m...@merlins.org
---
 fs/btrfs/send.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f6bbc1e..70c5e8c 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -368,10 +368,13 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
/*
 * First time the inline_buf does not suffice
 */
-   if (p-buf == p-inline_buf)
+   if (p-buf == p-inline_buf) {
tmp_buf = kmalloc(len, GFP_NOFS);
-   else
+   if (tmp_buf)
+   memcpy(tmp_buf, p-buf, old_buf_len);
+   } else {
tmp_buf = krealloc(p-buf, len, GFP_NOFS);
+   }
if (!tmp_buf)
return -ENOMEM;
p-buf = tmp_buf;
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] xfstests: add test for btrfs send with long paths

2014-05-21 Thread Filipe David Borba Manana
Regression test for btrfs send where long paths (exceeding 230 characters)
made send produce paths with random characters from a memory buffer returned
by kmalloc, as send forgot to populate the new buffer with the path string.

This issue is fixed by the following linux kernel btrfs patch:

   Btrfs: send, fix corrupted path strings for long paths

Signed-off-by: Filipe David Borba Manana fdman...@gmail.com
---
 tests/btrfs/051 | 85 +
 tests/btrfs/051.out |  1 +
 tests/btrfs/group   |  1 +
 3 files changed, 87 insertions(+)
 create mode 100755 tests/btrfs/051
 create mode 100644 tests/btrfs/051.out

diff --git a/tests/btrfs/051 b/tests/btrfs/051
new file mode 100755
index 000..53df664
--- /dev/null
+++ b/tests/btrfs/051
@@ -0,0 +1,85 @@
+#! /bin/bash
+# FS QA Test No. btrfs/051
+#
+# Regression test for btrfs send where long paths (exceeding 230 characters)
+# made send produce paths with random characters from a memory buffer returned
+# by kmalloc, as send forgot to populate the new buffer with the path string.
+#
+# This issue is fixed by the following linux kernel btrfs patch:
+#
+#   Btrfs: send, fix corrupted path strings for long paths
+#
+#---
+# Copyright (c) 2014 Filipe Manana.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo QA output created by $seq
+
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap _cleanup; exit \$status 0 1 2 3 15
+
+_cleanup()
+{
+rm -fr $send_files_dir
+rm -fr $tmp
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch
+_require_fssum
+_need_to_be_root
+
+send_files_dir=$TEST_DIR/btrfs-test-$seq
+
+rm -f $seqres.full
+rm -fr $send_files_dir
+mkdir $send_files_dir
+
+_scratch_mkfs /dev/null 21
+_scratch_mount
+
+TEST_PATH=$SCRATCH_MNT/fdmanana/.config/google-chrome-mysetup/Default/Pepper_Data/Shockwave_Flash/WritableRoot/#SharedObjects/JSHJ4ZKN/s.wsj.net/[[IMPORT]]/players.edgesuite.net/flash/plugins/osmf/advanced-streaming-plugin/v2.7/osmf1.6/Ak#
+
+mkdir -p $TEST_PATH
+echo hello world  $TEST_PATH/amaiAdvancedStreamingPlugin.txt
+
+_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap1
+run_check $FSSUM_PROG -A -f -w $send_files_dir/1.fssum $SCRATCH_MNT/mysnap1
+_run_btrfs_util_prog send $SCRATCH_MNT/mysnap1 -f $send_files_dir/1.snap
+
+_scratch_unmount
+_check_scratch_fs
+
+_scratch_mkfs /dev/null 21
+_scratch_mount
+
+_run_btrfs_util_prog receive $SCRATCH_MNT -f $send_files_dir/1.snap
+run_check $FSSUM_PROG -r $send_files_dir/1.fssum $SCRATCH_MNT/mysnap1
+
+_check_scratch_fs
+
+status=0
+exit
diff --git a/tests/btrfs/051.out b/tests/btrfs/051.out
new file mode 100644
index 000..636dcef
--- /dev/null
+++ b/tests/btrfs/051.out
@@ -0,0 +1 @@
+QA output created by 051
diff --git a/tests/btrfs/group b/tests/btrfs/group
index 69a80e0..0673449 100644
--- a/tests/btrfs/group
+++ b/tests/btrfs/group
@@ -53,3 +53,4 @@
 048 auto quick
 049 auto quick
 050 auto
+051 auto quick
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] Btrfs: send, fix corrupted path strings for long paths

2014-05-21 Thread Filipe David Borba Manana
If a path has more than 230 characters, we allocate a new buffer to
use for the path, but we were forgetting to copy the contents of the
previous buffer into the new one, which has random content from the
kmalloc call.

Test:

mkfs.btrfs -f /dev/sdd
mount /dev/sdd /mnt


TEST_PATH=/mnt/fdmanana/.config/google-chrome-mysetup/Default/Pepper_Data/Shockwave_Flash/WritableRoot/#SharedObjects/JSHJ4ZKN/s.wsj.net/[[IMPORT]]/players.edgesuite.net/flash/plugins/osmf/advanced-streaming-plugin/v2.7/osmf1.6/Ak#
mkdir -p $TEST_PATH
echo "hello world" > $TEST_PATH/amaiAdvancedStreamingPlugin.txt

btrfs subvolume snapshot -r /mnt /mnt/mysnap1
btrfs send /mnt/mysnap1 -f /tmp/1.snap

A test for xfstests follows.

Signed-off-by: Filipe David Borba Manana fdman...@gmail.com
Cc: Marc Merlin m...@merlins.org
---

V2: Fix change title, paths to path.

 fs/btrfs/send.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f6bbc1e..70c5e8c 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -368,10 +368,13 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
/*
 * First time the inline_buf does not suffice
 */
-   if (p-buf == p-inline_buf)
+   if (p-buf == p-inline_buf) {
tmp_buf = kmalloc(len, GFP_NOFS);
-   else
+   if (tmp_buf)
+   memcpy(tmp_buf, p-buf, old_buf_len);
+   } else {
tmp_buf = krealloc(p-buf, len, GFP_NOFS);
+   }
if (!tmp_buf)
return -ENOMEM;
p-buf = tmp_buf;
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Add some simple end-to-end tests for btrfs-convert.

2014-05-21 Thread Adam Buchbinder
These use the system's mke2fs, and don't require loop devices
or root privileges.

They don't pick up anything with the default flags right now,
but they do pick up some sanitizer issues when the tools are
compiled with any of -fsanitize={address,memory,thread}.

Signed-off-by: Adam Buchbinder abuchbin...@google.com
---
 Makefile   |  2 +-
 tests/convert-tests.sh | 35 +++
 2 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 tests/convert-tests.sh

diff --git a/Makefile b/Makefile
index da05197..8f002f3 100644
--- a/Makefile
+++ b/Makefile
@@ -20,7 +20,7 @@ libbtrfs_objects = send-stream.o send-utils.o rbtree.o 
btrfs-list.o crc32c.o \
 libbtrfs_headers = send-stream.h send-utils.h send.h rbtree.h btrfs-list.h \
   crc32c.h list.h kerncompat.h radix-tree.h extent-cache.h \
   extent_io.h ioctl.h ctree.h btrfsck.h
-TESTS = fsck-tests.sh
+TESTS = fsck-tests.sh convert-tests.sh
 
 INSTALL = install
 prefix ?= /usr/local
diff --git a/tests/convert-tests.sh b/tests/convert-tests.sh
new file mode 100644
index 000..87369c5
--- /dev/null
+++ b/tests/convert-tests.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+#
+# convert ext2/3/4 images to btrfs images, and make sure the results are
+# clean.
+#
+
+here=`pwd`
+
+_fail()
+{
+   echo $* | tee -a convert-tests-results.txt
+   exit 1
+}
+
+rm -f convert-tests-results.txt
+rm -f test.img
+
+test(){
+   echo  [TEST]$1
+shift
+echo creating ext image with: $*  convert-tests-results.txt
+   # 256MB is the smallest acceptable btrfs image.
+   dd if=/dev/zero of=$here/test.img bs=1024 count=$((256*1024)) \
+convert-tests-results.txt 21 || _fail dd failed
+   $* -F $here/test.img  convert-tests-results.txt 21 \
+   || _fail filesystem create failed
+   $here/btrfs-convert $here/test.img  convert-tests-results.txt 21 \
+   || _fail btrfs-convert failed
+   $here/btrfsck $here/test.img  convert-tests-results.txt 21 \
+   || _fail btrfsck detected errors
+}
+
+test ext2, 4k blocksize mke2fs -b 4096
+test ext3, 4k blocksize mke2fs -j -b 4096
+test ext4, 4k blocksize mke2fs -t ext4 -b 4096
-- 
1.9.1.423.g4596e3a

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 1/2] btrfs: Add missing device check in dev_info/rm_dev ioctl

2014-05-21 Thread Brendan Hide

On 2014/05/21 06:15 AM, Qu Wenruo wrote:

[snip]

 Further on top of your check_missing patch I am writing
 code to handle disk reappear. I should be sending them
 all soon.

The disk reappear problem is also reproducible here.

I am interested in how your patch will deal with it.
Is your patch going to check the super generation to determine the previously 
missing device and

wipe the reappeared superblock? (Wang mentioned it in the mail in Jan.)



With md we have the bitmap feature that helps prevent resynchronising 
the entire disk when doing a re-add. Wiping the superblock is *better* 
than what we currently have (corruption) - but hopefully the end goal is 
to be able to have it re-add *without* introducing corruption.




IMO the reappear disk problem can also be resolved by not swapping 
tgtdev->uuid and srcdev->uuid,

which means tgtdev will not use the same uuid of srcdev.

Thanks,
Qu


Thanks, Anand




--
__
Brendan Hide
http://swiftspirit.co.za/
http://www.webafrica.co.za/?AFF1E97

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: destroyed disk in btrfs raid

2014-05-21 Thread laie

On 2014-05-14 23:44, laie wrote:

On 2014-05-14 20:44, Hugo Mills wrote:

On Wed, May 14, 2014 at 08:43:41PM +0200, laie wrote:

On 2014-05-11 16:19, Hugo Mills wrote:
On Tue, May 13, 2014 at 10:16:59AM +0200, laie wrote:
On 2014-05-09 20:01, Hugo Mills wrote:
On Fri, May 09, 2014 at 06:58:27PM +0100, Hugo Mills wrote:
On Fri, May 09, 2014 at 08:02:45PM +0200, laie wrote:
 Now I'm looking for a way to tell btrfs to provide me with a list of the
 corrupted files and delete them afterwards. This would be great, because
 otherwise it would take very long to get the data back from slow backups.

   Simple solution: cat every file to /dev/null, and see which ones
fail with an I/O error. With RAID-0 data, losing a device is going to
damage most files, though, so don't necessarily expect much to survive.

I finished building the List, about 40% of the Data is gone. So far 
so good.


As next step I planned to delete these files. This is not possible 
because

I'm not able to mount the fs r/w.

btrfs: allowing degraded mounts
btrfs: bdev /dev/mapper/luks-0 errs: wr 37519, rd 32783, flush 0, 
corrupt 0,

gen 0
Btrfs: too many missing devices, writeable mount is not allowed
btrfs: open_ctree failed

Is it correct to remove the missing device now:

btrfs device delete missing /mnt

Or do I have to add the replacement first?


   You'd have to mount r/w before you can add a new disk. :)

   You should be able to mount r/w using the -o degraded mount option.


The error above occurs when I try to mount with

mount -o degraded source target

only

mount -o degraded,ro source target

works.


I'm still stuck here, does anybody have an idea how it's possible to get 
this filesystem to r/w mode?


The only solution I can think of is to copy the undamaged Data to a new 
filesystem. I don't like to do that.

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ditto blocks on ZFS

2014-05-21 Thread Martin
Very good comment from Ashford.


Sorry, but I see no advantages from Russell's replies other than for a
feel-good factor or a dangerous false sense of security. At best,
there is a weak justification that for metadata, again going from 2% to
4% isn't going to be a great problem (storage is cheap and fast).

I thought an important idea behind btrfs was that we avoid by design in
the first place the very long and vulnerable RAID rebuild scenarios
suffered for block-level RAID...


On 21/05/14 03:51, Russell Coker wrote:
 Absolutely. Hopefully this discussion will inspire the developers to
 consider this an interesting technical challenge and a feature that
 is needed to beat ZFS.

Sorry, but I think that is completely the wrong reasoning. ...Unless
that is you are some proprietary sales droid hyping features and big
numbers! :-P


Personally I'm not convinced we gain anything beyond what btrfs will
eventually offer in any case for the n-way raid or the raid-n Cauchy stuff.

Also note that usually, data is wanted to be 100% reliable and
retrievable. Or if that fails, you go to your backups instead. Gambling
proportions and importance rather than *ensuring* fault/error
tolerance is a very human thing... ;-)


Sorry:

Interesting idea but not convinced there's any advantage for disk/SSD
storage.


Regards,
Martin




--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ditto blocks on ZFS

2014-05-21 Thread Konstantinos Skarlatos

On 20/5/2014 5:07 πμ, Russell Coker wrote:

On Mon, 19 May 2014 23:47:37 Brendan Hide wrote:

This is extremely difficult to measure objectively. Subjectively ... see
below.


[snip]

*What other failure modes* should we guard against?

I know I'd sleep a /little/ better at night knowing that a double disk
failure on a raid5/1/10 configuration might ruin a ton of data along
with an obscure set of metadata in some long tree paths - but not the
entire filesystem.

My experience is that most disk failures that don't involve extreme physical
damage (EG dropping a drive on concrete) don't involve totally losing the
disk.  Much of the discussion about RAID failures concerns entirely failed
disks, but I believe that is due to RAID implementations such as Linux
software RAID that will entirely remove a disk when it gives errors.

I have a disk which had ~14,000 errors of which ~2000 errors were corrected by
duplicate metadata.  If two disks with that problem were in a RAID-1 array
then duplicate metadata would be a significant benefit.


The other use-case/failure mode - where you are somehow unlucky enough
to have sets of bad sectors/bitrot on multiple disks that simultaneously
affect the only copies of the tree roots - is an extremely unlikely
scenario. As unlikely as it may be, the scenario is a very painful
consequence in spite of VERY little corruption. That is where the
peace-of-mind/bragging rights come in.

http://research.cs.wisc.edu/adsl/Publications/corruption-fast08.html

The NetApp research on latent errors on drives is worth reading.  On page 12
they report latent sector errors on 9.5% of SATA disks per year.  So if you
lose one disk entirely the risk of having errors on a second disk is higher
than you would want for RAID-5.  While losing the root of the tree is
unlikely, losing a directory in the middle that has lots of subdirectories is
a risk.
Seeing the results of that paper, I think erasure coding is a better 
solution. Instead of having many copies of metadata or data, we could do 
erasure coding using something like zfec[1] that is being used by 
Tahoe-LAFS, increasing their size by let's say 5-10%, and be quite safe 
even from multiple continuous bad sectors.


[1] https://pypi.python.org/pypi/zfec


I can understand why people wouldn't want ditto blocks to be mandatory.  But
why are people arguing against them as an option?


As an aside, I'd really like to be able to set RAID levels by subtree.  I'd
like to use RAID-1 with ditto blocks for my important data and RAID-0 for
unimportant data.



--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: don't remove raid type sysfs entries until unmount

2014-05-21 Thread Chris Mason

The Btrfs sysfs code removes entries for raid types that are no
longer in use.  This means that if you have a raid0 FS and use balance
to turn it into a raid1 FS, the raid0 sysfs entries will go away.

The rough chain of events is:

__link_block_group() - see we're the first RAIDX, add sysfs entry

btrfs_remove_block_group() - notice we're removing the last RAIDX
remove sysfs entry

This all makes sense until we try to add RAIDX back into the FS again.
The problem is that our RAID kobjects are just in an array that gets
freed at unmount time, instead of an array of pointers to kobjects that
get freed when the great sysfs in the sky is done with them.

When we remove the sysfs entry for a given raid level, the sysfs code
frees the name.  When we use the same kobject to add back the RAIDX
entry again, sysfs sees the old name pointer and tries to free it again.

All of which is a long way of saying we're using sysfs wrong.  For now,
just don't remove entries for raid levels that we're no longer using.

Signed-off-by: Chris Mason c...@fb.com
Reported-by: Dave Sterba dste...@suse.cz

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ddf16bf..acdc7ed 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8535,12 +8535,14 @@ static void __link_block_group(struct btrfs_space_info 
*space_info,
struct kobject *kobj = space_info-block_group_kobjs[index];
int ret;
 
-   kobject_get(space_info-kobj); /* put in release */
-   ret = kobject_add(kobj, space_info-kobj, %s,
- get_raid_name(index));
-   if (ret) {
-   pr_warn(BTRFS: failed to add kobject for block cache. 
ignoring.\n);
-   kobject_put(space_info-kobj);
+   if (!kobj-name) {
+   kobject_get(space_info-kobj); /* put in release */
+   ret = kobject_add(kobj, space_info-kobj, %s,
+ get_raid_name(index));
+   if (ret) {
+   pr_warn(BTRFS: failed to add kobject for block 
cache. ignoring.\n);
+   kobject_put(space_info-kobj);
+   }
}
}
 }
@@ -8976,8 +8978,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle 
*trans,
 */
list_del_init(block_group-list);
if (list_empty(block_group-space_info-block_groups[index])) {
-   kobject_del(block_group-space_info-block_group_kobjs[index]);
-   kobject_put(block_group-space_info-block_group_kobjs[index]);
clear_avail_alloc_bits(root-fs_info, block_group-flags);
}
up_write(block_group-space_info-groups_sem);
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs: don't remove raid type sysfs entries until unmount

2014-05-21 Thread Jeff Mahoney
On 05/21/2014 08:12 PM, Chris Mason wrote:
 
 The Btrfs sysfs code removes entries for raid types that are no
 longer in use.  This means that if you have a raid0 FS and use balance
 to turn it into a raid1 FS, the raid0 sysfs entries will go away.
 
 The rough chain of events is:
 
 __link_block_group() - see we're the first RAIDX, add sysfs entry
 
 btrfs_remove_block_group() - notice we're removing the last RAIDX
 remove sysfs entry
 
 This all makes sense until we try to add RAIDX back into the FS again.
 The problem is that our RAID kobjects are just in an array that gets
 freed at unmount time, instead of an array of pointers to kobjects that
 get freed when the great sysfs in the sky is done with them.
 
 When we remove the sysfs entry for a given raid level, the sysfs code
 frees the name.  When we use the same kobject to add back the RAIDX
 entry again, sysfs sees the old name pointer and tries to free it again.
 
 All of which is a long way of saying we're using sysfs wrong.  For now,
 just don't remove entries for raid levels that we're no longer using.

Hi Chris -

Thanks for posting the problem. I disagree that sysfs is being used
wrong here - or well, not on purpose. The problem is only that I didn't
anticipate raid levels going away and only initialize the kobjects in
update_space_info.

The name being double-freed is only part of the problem. The kobject
isn't reinitialized at all. Clearing ->name and re-calling kobject_init
would fix it - but I think we can do one better.

kobject_cleanup's comment for freeing ->name claims that ->name being
set means that it was allocated by the kobject infrastructure. The raid
names come directly out of an array anyway, so we don't even need to
copy it. We can avoid the string allocation entirely and handle the
reuse properly.

The following (untested, tonight anyway) patch should fix it.

-Jeff

--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3401,10 +3401,8 @@ static int update_space_info(struct btrfs_fs_info *info, 
u64 flags,
return ret;
}
 
-   for (i = 0; i  BTRFS_NR_RAID_TYPES; i++) {
+   for (i = 0; i  BTRFS_NR_RAID_TYPES; i++)
INIT_LIST_HEAD(found-block_groups[i]);
-   kobject_init(found-block_group_kobjs[i], btrfs_raid_ktype);
-   }
init_rwsem(found-groups_sem);
spin_lock_init(found-lock);
found-flags = flags  BTRFS_BLOCK_GROUP_TYPE_MASK;
@@ -8356,8 +8354,9 @@ static void __link_block_group(struct btrfs_space_info 
*space_info,
int ret;
 
kobject_get(space_info-kobj); /* put in release */
-   ret = kobject_add(kobj, space_info-kobj, %s,
- get_raid_name(index));
+   kobject-name = get_raid_name(index);
+   ret = kobject_init_and_add(kobj, btrfs_raid_ktype,
+  space_info-kobj, NULL);
if (ret) {
pr_warn(BTRFS: failed to add kobject for block cache. 
ignoring.\n);
kobject_put(space_info-kobj);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index c5eb214..d742d79 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -288,6 +288,7 @@ static struct attribute *raid_attributes[] = {
 
 static void release_raid_kobj(struct kobject *kobj)
 {
+   kobj-name = NULL;
kobject_put(kobj-parent);
 }
 


-Jeff


 Signed-off-by: Chris Mason c...@fb.com
 Reported-by: Dave Sterba dste...@suse.cz
 
 diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
 index ddf16bf..acdc7ed 100644
 --- a/fs/btrfs/extent-tree.c
 +++ b/fs/btrfs/extent-tree.c
 @@ -8535,12 +8535,14 @@ static void __link_block_group(struct 
 btrfs_space_info *space_info,
   struct kobject *kobj = space_info-block_group_kobjs[index];
   int ret;
  
 - kobject_get(space_info-kobj); /* put in release */
 - ret = kobject_add(kobj, space_info-kobj, %s,
 -   get_raid_name(index));
 - if (ret) {
 - pr_warn(BTRFS: failed to add kobject for block cache. 
 ignoring.\n);
 - kobject_put(space_info-kobj);
 + if (!kobj-name) {
 + kobject_get(space_info-kobj); /* put in release */
 + ret = kobject_add(kobj, space_info-kobj, %s,
 +   get_raid_name(index));
 + if (ret) {
 + pr_warn(BTRFS: failed to add kobject for block 
 cache. ignoring.\n);
 + kobject_put(space_info-kobj);
 + }
   }
   }
  }
 @@ -8976,8 +8978,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle 
 *trans,
*/
   list_del_init(block_group-list);
   if (list_empty(block_group-space_info-block_groups[index])) {
 - kobject_del(block_group-space_info-block_group_kobjs[index]);
 - 

Should btrfs reuse the src_dev's dev UUID when doing dev replacing?

2014-05-21 Thread Qu Wenruo

Hi,

[Current dev replace]
As the kernel code shows, 'btrfs dev replace' will swap tgt_dev's uuid with 
src_dev's uuid.
This method works fine most of the time, since it doesn't need to change 
the chunk tree.


[Problem with re-appear missing device]
(Anand Jain reported the problem in Jan 2014)
Take the following situation as an example:
/dev/sda, /dev/sdb, /dev/sdc as btrfs RAID1.
1, 2, 3 as their dev id.

1)/dev/sdb is missing,
Mount them in degraded mode.

2) 'btrfs dev replace start 2 /dev/sdd' will replace missing /dev/sdb.

3) /dev/sdb is online again.

4) umount /BTRFS/MOUNT/POINT; mount /dev/sda
After mount, btrfs will still use /dev/sdb but not /dev/sdd

[Cause of the bug]
When it comes to a missing device, since the src_dev is missing, neither 
UUID swap nor superblock wipe will
work. So if the device reappears, the next mount will scan the fsid and 
dev uuid, and if btrfs scans the re-appeared

device first, it will use the re-appeared device.

[Method to fix]
IMO there are 2 possible methods to fix the bug.
1) Don't reuse the src_dev's dev UUID.
I don't think any of the UUID in btrfs should be reused, so if every 
device in btrfs has its own UUID,
it is quite easy to distinguish different devices, and we don't even need 
to wipe the superblock of src_dev.

(But superblock wipe is still needed for other reasons)

2) Do generation check in device_list_add.
When multiple devices with the same dev UUID are found, only add the one 
whose generation is the same as that of the

other devices.
IMO this is just a workaround.

I think it is better to be decided before any related patch sent.

Any suggestions?

Thanks,
Qu
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] Btrfs: send, fix corrupted path strings for long paths

2014-05-21 Thread Marc MERLIN
On Wed, May 21, 2014 at 05:38:13PM +0100, Filipe David Borba Manana wrote:
 If a path has more than 230 characters, we allocate a new buffer to
 use for the path, but we were forgetting to copy the contents of the
 previous buffer into the new one, which has random content from the
 kmalloc call.

I've confirmed this fixes the problem I was seeing when applied to
3.15rc5.
Thanks for taking that down.

Tested-by: Marc MERLIN m...@merlins.org

(while we're at it, I post with my personal address, but I work at
Google, I'm supposed to state that :) )

Marc

 Test:
 
 mkfs.btrfs -f /dev/sdd
 mount /dev/sdd /mnt
 
 
 TEST_PATH=/mnt/fdmanana/.config/google-chrome-mysetup/Default/Pepper_Data/Shockwave_Flash/WritableRoot/#SharedObjects/JSHJ4ZKN/s.wsj.net/[[IMPORT]]/players.edgesuite.net/flash/plugins/osmf/advanced-streaming-plugin/v2.7/osmf1.6/Ak#
 mkdir -p $TEST_PATH
 echo hello world  $TEST_PATH/amaiAdvancedStreamingPlugin.txt
 
 btrfs subvolume snapshot -r /mnt /mnt/mysnap1
 btrfs send /mnt/mysnap1 -f /tmp/1.snap
 
 A test for xfstests follows.
 
 Signed-off-by: Filipe David Borba Manana fdman...@gmail.com
 Cc: Marc Merlin m...@merlins.org
 ---
 
 V2: Fix change title, paths to path.
 
  fs/btrfs/send.c | 7 +--
  1 file changed, 5 insertions(+), 2 deletions(-)
 
 diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
 index f6bbc1e..70c5e8c 100644
 --- a/fs/btrfs/send.c
 +++ b/fs/btrfs/send.c
 @@ -368,10 +368,13 @@ static int fs_path_ensure_buf(struct fs_path *p, int 
 len)
   /*
* First time the inline_buf does not suffice
*/
 - if (p-buf == p-inline_buf)
 + if (p-buf == p-inline_buf) {
   tmp_buf = kmalloc(len, GFP_NOFS);
 - else
 + if (tmp_buf)
 + memcpy(tmp_buf, p-buf, old_buf_len);
 + } else {
   tmp_buf = krealloc(p-buf, len, GFP_NOFS);
 + }
   if (!tmp_buf)
   return -ENOMEM;
   p-buf = tmp_buf;
 -- 
 1.9.1
 
 

-- 
A mouse is a device used to point at the xterm you want to type in - A.S.R.
Microsoft is to operating systems 
   what McDonalds is to gourmet cooking
Home page: http://marc.merlins.org/ | PGP 1024R/763BE901
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2 v2] btrfs: label should not contain return char

2014-05-21 Thread Anand Jain




(Random aside: why does btrfs support online fs relabeling, anyway?)

-Eric


  Online you mean when mounted ?

  But my question is whether we should support setting the label through
  the sysfs interface when that (sysfs) interface can't communicate the
  module's specific errors back to the user?

Thanks
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2 v2] btrfs: label should not contain return char

2014-05-21 Thread Eric Sandeen
On 5/21/14, 9:05 PM, Anand Jain wrote:
 

 (Random aside: why does btrfs support online fs relabeling, anyway?)

 -Eric
 
   Online you mean when mounted ?

Yep - I'm just not sure who would ever want to do that.

Aren't labels primarily used for mounting, during the mount process?

So changing it while mounted seems like a feature looking for a usecase...

-Eric
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Should btrfs reuse the src_dev's dev UUID when doing dev replacing?

2014-05-21 Thread Anand Jain



 Thanks Qu for bringing up this topic. We definitely need some focus
 on the btrfs volume management related bugs/features/enhancements.

 more inline..

On 22/05/14 09:35, Qu Wenruo wrote:

Hi,

[Current dev replace]
As kernel codes show, 'btrfs dev replace' will swap tgt_dev's uuid with
src_dev's uuid.
This method works fine most of the time, since it doesn't need to change
the chunk tree.

[Problem with re-appear missing device]
(Anand Jain reported the problem in Jan 2014)
Take the following situation as an example:
/dev/sda, /dev/sdb, /dev/sdc as btrfs RAID1.
1, 2, 3 as their dev id.

1)/dev/sdb is missing,
Mount them in degraded mode.

2) 'btrfs dev replace start 2 /dev/sdd' will replace missing /dev/sdb.

3) /dev/sdb is online again.

4) umount /BTRFS/MOUNT/POINT; mount /dev/sda
After mount, btrfs will still use /dev/sdb but not /dev/sdd


 Yeah, it's weird that grouping depends on the mercy of the chronological
 order of device probing. The _last_ device probed stays in the list.
 But the weirdest part is that if the FS is mounted and is followed by a dev
 scan, it would just overwrite the btrfs_device struct.
 I have sent out interim fix to both of these bugs a long time back.


[Cause of the bug]
When it comes to a missing device, since the src_dev is missing, neither
UUID swap nor superblock wipe will
work. So if the device reappears, the next mount will scan the fsid and
dev uuid, and if btrfs scans the re-appeared
device first, it will use the re-appeared device.

[Method to fix]
IMO there are 2 possible methods to fix the bug.
1) Don't reuse the src_dev's dev UUID.
I don't think any of the UUID in btrfs should be reused, so if every
device in btrfs has its own UUID,
it is quite easy to distinguish different devices, and even don't need
to wipe the superblock of src_dev.
(But superblock wipe is still needed for other reasons)


 Yep, that's the right way IMO too. The UUID must be unique to a disk, even in
 the case of replace.


2) Do generation check in device_list_add.
When multiple devices with the same dev UUID are found, only add the one
whose generation is the same as that of the
other devices.
IMO this is just a workaround.


 yes an interim fix, patch was sent out a long time back.


I think it is better to be decided before any related patch sent.

Any suggestions?

Thanks,
Qu
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Linuxcon-JP Btrfs talk

2014-05-21 Thread Marc MERLIN
If you're new with Btrfs, this may be a useful walkthrough for you.

You can go through the slides which I wrote to be readable without the
video, but the video is available too if you'd like:
http://marc.merlins.org/perso/btrfs/post_2014-05-21_My-Btrfs-Talk-at-Linuxcon-JP-2014.html

If you've already been using btrfs for a while, this probably won't tell
you anything you didn't already know :)

Marc
-- 
A mouse is a device used to point at the xterm you want to type in - A.S.R.
Microsoft is to operating systems 
   what McDonalds is to gourmet cooking
Home page: http://marc.merlins.org/ | PGP 1024R/763BE901
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2 v2] btrfs: label should not contain return char

2014-05-21 Thread Roman Mamedov
On Wed, 21 May 2014 21:14:07 -0500
Eric Sandeen sand...@redhat.com wrote:

  (Random aside: why does btrfs support online fs relabeling, anyway?)
 
  -Eric
  
Online you mean when mounted ?
 
 Yep - I'm just not sure who would ever want to do that.
 
 Aren't labels primarily used for mounting, during the mount process?

Well if you want to change the label of your root filesystem, how else would
you go about that? If Btrfs did not support this, then only while booted from a
rescue LiveCD? That's quite a bit more involved and not even always feasible
in case of remote machines with no IPMI or similar management.

Extfs supports online change of the volume label as well, albeit probably not
via a sysfs file, but with the tune2fs utility:

  tune2fs -L name /dev/blockdevice

-- 
With respect,
Roman


signature.asc
Description: PGP signature


Re: historical backups with hardlinks vs cp --reflink vs snapshots

2014-05-21 Thread Russell Coker
On Tue, 20 May 2014 20:59:28 Marc MERLIN wrote:
 I just wrote a blog post about the 3 ways of doing historical snapshots:
 http://marc.merlins.org/perso/btrfs/post_2014-05-20_Historical-Snapshots-With-Btrfs.html
 I love reflink, but that forces me to use btrfs send as the only way to
 copy a filesystem without losing the reflink relationship, and I have no
 good way from user space to see the blocks shared to see how many are
 shared or whether some just got duped in a copy.
 As a result, for now I still use hardlinks.

It would be nice if someone patched rsync to look for files with identical 
contents and use reflink or hardlinks (optionally at user request) instead of 
making multiple copies of the same data.  Also it would be nice if rsync would 
look for matching blocks in different files to save transfer.

-- 
My Main Blog http://etbe.coker.com.au/
My Documents Blog http://doc.coker.com.au/

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html