This commit gets the file defragmentation code to work in the subpage-blocksize scenario (i.e. when the filesystem block size is smaller than the page size). It does this by keeping track of the page offsets that mark block boundaries and passing them as arguments to the functions that implement the defragmentation logic.
Signed-off-by: Chandan Rajendra <chan...@linux.vnet.ibm.com> --- fs/btrfs/ioctl.c | 198 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 136 insertions(+), 62 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 001c111..fb92566 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -904,12 +904,13 @@ out_unlock: static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh) { struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_map *em = NULL; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; u64 end; read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE); + em = lookup_extent_mapping(em_tree, offset, root->sectorsize); read_unlock(&em_tree->lock); if (em) { @@ -999,7 +1000,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map *em; - u64 len = PAGE_SIZE; + u64 len = BTRFS_I(inode)->root->sectorsize; /* * hopefully we have this extent in the tree already, try without @@ -1118,37 +1119,47 @@ out: * before calling this. 
*/ static int cluster_pages_for_defrag(struct inode *inode, - struct page **pages, - unsigned long start_index, - unsigned long num_pages) + struct page **pages, + unsigned long start_index, + size_t pg_offset, + unsigned long num_blks) { - unsigned long file_end; u64 isize = i_size_read(inode); + u64 start_blk; + u64 end_blk; u64 page_start; u64 page_end; u64 page_cnt; + u64 blk_cnt; int ret; int i; int i_done; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_io_tree *tree; + struct btrfs_root *root; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - file_end = (isize - 1) >> PAGE_SHIFT; - if (!isize || start_index > file_end) + root = BTRFS_I(inode)->root; + start_blk = (start_index << PAGE_SHIFT) + pg_offset; + start_blk >>= inode->i_blkbits; + end_blk = (isize - 1) >> inode->i_blkbits; + if (!isize || start_blk > end_blk) return 0; - page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); + blk_cnt = min_t(u64, (u64)num_blks, (u64)end_blk - start_blk + 1); ret = btrfs_delalloc_reserve_space(inode, - start_index << PAGE_SHIFT, - page_cnt << PAGE_SHIFT); + start_blk << inode->i_blkbits, + blk_cnt << inode->i_blkbits); if (ret) return ret; i_done = 0; tree = &BTRFS_I(inode)->io_tree; + page_cnt = DIV_ROUND_UP(pg_offset + (blk_cnt << inode->i_blkbits), + PAGE_SIZE); + /* step one, lock all the pages */ for (i = 0; i < page_cnt; i++) { struct page *page; @@ -1159,12 +1170,22 @@ again: break; page_start = page_offset(page); - page_end = page_start + PAGE_SIZE - 1; + + if (i == 0) + page_start += pg_offset; + + if (i == page_cnt - 1) { + page_end = (start_index << PAGE_SHIFT) + pg_offset; + page_end += (blk_cnt << inode->i_blkbits) - 1; + } else { + page_end = page_offset(page) + PAGE_SIZE - 1; + } + while (1) { lock_extent_bits(tree, page_start, page_end, &cached_state); - ordered = btrfs_lookup_ordered_extent(inode, - page_start); + ordered = btrfs_lookup_ordered_range(inode, page_start, + page_end - 
page_start + 1); unlock_extent_cached(tree, page_start, page_end, &cached_state, GFP_NOFS); if (!ordered) @@ -1203,7 +1224,7 @@ again: } pages[i] = page; - i_done++; + i_done += (page_end - page_start + 1) >> inode->i_blkbits; } if (!i_done || ret) goto out; @@ -1215,55 +1236,77 @@ again: * so now we have a nice long stream of locked * and up to date pages, lets wait on them */ - for (i = 0; i < i_done; i++) + page_cnt = DIV_ROUND_UP(pg_offset + (i_done << inode->i_blkbits), + PAGE_SIZE); + for (i = 0; i < page_cnt; i++) wait_on_page_writeback(pages[i]); - page_start = page_offset(pages[0]); - page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE; + page_start = page_offset(pages[0]) + pg_offset; + page_end = page_start + (i_done << inode->i_blkbits) - 1; lock_extent_bits(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, &cached_state); + page_start, page_end, &cached_state); clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, - page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | + page_end, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); - if (i_done != page_cnt) { + if (i_done != blk_cnt) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, - start_index << PAGE_SHIFT, - (page_cnt - i_done) << PAGE_SHIFT); + start_blk << inode->i_blkbits, + (blk_cnt - i_done) << inode->i_blkbits); } - set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, - &cached_state); + set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end, + &cached_state); unlock_extent_cached(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, &cached_state, + page_start, page_end, &cached_state, GFP_NOFS); - for (i = 0; i < i_done; i++) { + for (i = 0; i < page_cnt; i++) { clear_page_dirty_for_io(pages[i]); ClearPageChecked(pages[i]); set_page_extent_mapped(pages[i]); + + page_start = page_offset(pages[i]); + if (i == 0) + 
page_start += pg_offset; + + if (i == page_cnt - 1) { + page_end = page_offset(pages[0]) + pg_offset; + page_end += (i_done << inode->i_blkbits) - 1; + } else { + page_end = page_offset(pages[i]) + PAGE_SIZE - 1; + } + + if (root->sectorsize < PAGE_SIZE) + set_page_blks_state(pages[i], + 1 << BLK_STATE_UPTODATE | 1 << BLK_STATE_DIRTY, + page_start, page_end); set_page_dirty(pages[i]); unlock_page(pages[i]); put_page(pages[i]); } return i_done; out: - for (i = 0; i < i_done; i++) { - unlock_page(pages[i]); - put_page(pages[i]); + if (i_done) { + page_cnt = DIV_ROUND_UP(pg_offset + (i_done << inode->i_blkbits), + PAGE_SIZE); + for (i = 0; i < page_cnt; i++) { + unlock_page(pages[i]); + put_page(pages[i]); + } } + btrfs_delalloc_release_space(inode, - start_index << PAGE_SHIFT, - page_cnt << PAGE_SHIFT); + start_blk << inode->i_blkbits, + blk_cnt << inode->i_blkbits); return ret; - } int btrfs_defrag_file(struct inode *inode, struct file *file, @@ -1272,19 +1315,24 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, { struct btrfs_root *root = BTRFS_I(inode)->root; struct file_ra_state *ra = NULL; + unsigned long first_off, last_off; + unsigned long first_block, last_block; unsigned long last_index; u64 isize = i_size_read(inode); u64 last_len = 0; u64 skip = 0; u64 defrag_end = 0; u64 newer_off = range->start; + u64 start; + u64 page_cnt; unsigned long i; unsigned long ra_index = 0; + size_t pg_offset; int ret; int defrag_count = 0; int compress_type = BTRFS_COMPRESS_ZLIB; u32 extent_thresh = range->extent_thresh; - unsigned long max_cluster = SZ_256K >> PAGE_SHIFT; + unsigned long max_cluster = SZ_256K >> inode->i_blkbits; unsigned long cluster = max_cluster; u64 new_align = ~((u64)SZ_128K - 1); struct page **pages = NULL; @@ -1318,8 +1366,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra = &file->f_ra; } - pages = kmalloc_array(max_cluster, sizeof(struct page *), - GFP_NOFS); + /* + * In subpage-blocksize scenario the first of 
"max_cluster" blocks + * may start on a non-zero page offset. In such scenarios we need one + * page more than what would be needed in the case where the first block + * maps to first block of a page. + */ + page_cnt = (max_cluster >> (PAGE_SHIFT - inode->i_blkbits)) + 1; + pages = kmalloc_array(page_cnt, sizeof(struct page *), GFP_NOFS); if (!pages) { ret = -ENOMEM; goto out_ra; @@ -1327,12 +1381,15 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, /* find the last page to defrag */ if (range->start + range->len > range->start) { - last_index = min_t(u64, isize - 1, - range->start + range->len - 1) >> PAGE_SHIFT; + last_off = min_t(u64, isize - 1, range->start + range->len - 1); } else { - last_index = (isize - 1) >> PAGE_SHIFT; + last_off = isize - 1; } + last_off = round_up(last_off, root->sectorsize) - 1; + last_block = last_off >> inode->i_blkbits; + last_index = last_off >> PAGE_SHIFT; + if (newer_than) { ret = find_new_extents(root, inode, newer_than, &newer_off, SZ_64K); @@ -1342,14 +1399,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, * we always align our defrag to help keep * the extents in the file evenly spaced */ - i = (newer_off & new_align) >> PAGE_SHIFT; + first_off = newer_off & new_align; } else goto out_ra; } else { - i = range->start >> PAGE_SHIFT; + first_off = range->start; } + + first_off = round_down(first_off, root->sectorsize); + first_block = first_off >> inode->i_blkbits; + i = first_off >> PAGE_SHIFT; + pg_offset = first_off & (PAGE_SIZE - 1); + if (!max_to_defrag) - max_to_defrag = last_index - i + 1; + max_to_defrag = last_block - first_block + 1; /* * make writeback starts from i, so the defrag range can be @@ -1373,39 +1436,50 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, break; } - if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT, - extent_thresh, &last_len, &skip, - &defrag_end, range->flags & - BTRFS_DEFRAG_RANGE_COMPRESS)) { + start = pg_offset + ((u64)i << PAGE_SHIFT); 
+ if (!should_defrag_range(inode, start, + extent_thresh, &last_len, &skip, + &defrag_end, range->flags & + BTRFS_DEFRAG_RANGE_COMPRESS)) { unsigned long next; /* * the should_defrag function tells us how much to skip * bump our counter by the suggested amount */ - next = DIV_ROUND_UP(skip, PAGE_SIZE); - i = max(i + 1, next); + next = max(skip, start + root->sectorsize); + next >>= inode->i_blkbits; + + first_off = next << inode->i_blkbits; + i = first_off >> PAGE_SHIFT; + pg_offset = first_off & (PAGE_SIZE - 1); continue; } if (!newer_than) { - cluster = (PAGE_ALIGN(defrag_end) >> - PAGE_SHIFT) - i; + cluster = (defrag_end >> inode->i_blkbits) + - (start >> inode->i_blkbits); + cluster = min(cluster, max_cluster); } else { cluster = max_cluster; } - if (i + cluster > ra_index) { + page_cnt = pg_offset + (cluster << inode->i_blkbits) - 1; + page_cnt = DIV_ROUND_UP(page_cnt, PAGE_SIZE); + if (i + page_cnt > ra_index) { ra_index = max(i, ra_index); btrfs_force_ra(inode->i_mapping, ra, file, ra_index, - cluster); - ra_index += cluster; + page_cnt); + ra_index += DIV_ROUND_UP(pg_offset + + (cluster << inode->i_blkbits), + PAGE_SIZE); } inode_lock(inode); if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) BTRFS_I(inode)->force_compress = compress_type; - ret = cluster_pages_for_defrag(inode, pages, i, cluster); + ret = cluster_pages_for_defrag(inode, pages, i, pg_offset, + cluster); if (ret < 0) { inode_unlock(inode); goto out_ra; @@ -1419,29 +1493,29 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (newer_off == (u64)-1) break; - if (ret > 0) - i += ret; - newer_off = max(newer_off + 1, - (u64)i << PAGE_SHIFT); + start + (ret << inode->i_blkbits)); ret = find_new_extents(root, inode, newer_than, &newer_off, SZ_64K); if (!ret) { range->start = newer_off; - i = (newer_off & new_align) >> PAGE_SHIFT; + first_off = newer_off & new_align; } else { break; } } else { if (ret > 0) { - i += ret; - last_len += ret << PAGE_SHIFT; + first_off = start + (ret << 
inode->i_blkbits); + last_len += ret << inode->i_blkbits; } else { - i++; + first_off = start + root->sectorsize; last_len = 0; } } + first_off = round_down(first_off, root->sectorsize); + i = first_off >> PAGE_SHIFT; + pg_offset = first_off & (PAGE_SIZE - 1); } if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) { -- 2.5.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html