Hi Daniel:

On Fri, Jan 16, 2009 at 8:37 AM, Daniel Phillips <[email protected]> wrote:
> This is very cool.  Issues to consider:
>
>  * It should support variable block size, which all the rest of Tux3
>    already does.  Let's look at what the tie to page size is, and
>    figure out what to do about it.

This implementation is very simple: it uses the nobh_ routines to defer
block allocation and a private page flag to reserve space. Support for
variable blocksize should be achievable by using the buffer head's delay
flag (BH_Delay); two rough sketches of what I mean are appended at the
end of this mail, before the patch. Of course, more code is needed.

>  * It has to fit with atomic commit.  Details of atomic commit are
>    just now falling into place, so this is an iterative process.

I agree. I am about to start on the atomic commit part :-)

>  * We may not use mpage_writepages to drive delalloc, because it uses
>    the narrow ->get_block interface to map pages, forcing Tux3 to do
>    a btree probe for every block.  We really need to implement
>    ->writepages directly, using a more direct interface to tux3's
>    map_region that can map a bigger logical address range with a
>    single btree probe.

Yep. We need a tux3_da_writepages() that finds extents of dirty pages and
maps/allocates chunks of contiguous disk blocks (see the second sketch
below).

>  * Planned merge of delalloc would be after we have atomic commit
>    working, so that we can enter the review cycle as early as
>    possible, with as simple a code base as possible.  But if
>    delalloc actually makes atomic commit easier then we will do it
>    now.  This question should be settled over the next few days,
>    I hope you will be involved in the discussion.
>
> Welcome to the Tux3 hall of fame!  We can really use another developer
> with the level of VFS skill that you obviously have.
>
> Could you please have a look at the new block-oriented page cache
> interfaces?
>
>    http://mailman.tux3.org/pipermail/tux3/2009-January/000657.html
>    "Polymorphic blockread for kernel"

Yes. Very glad to.

> This may help in thinking about how to do the variable size block
> support.  I will post a full patch pretty soon.
>
> More comments after I try your patch.

Please try the revised patch below (only one line changed), which does the
block reservation correctly, if you like. Besides the blocksize limit, its
free-block statistics might have problems with truncate. Please treat this
patch as a proof of concept; it is still premature. I'll rework it after
reading through all of the Tux3 code.

Many thanks for your comments and encouragement.
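Here is the first sketch: per-buffer reservation, for the variable
blocksize case. This is untested and not part of the patch below. It
reuses tux3_get_block() and the tux3_da_reserve_blocks() helper from the
patch, and it assumes the buffer-head based block_write_begin() path
rather than the nobh_ one, so more glue would be needed (some filesystems
also map the buffer to a dummy block at this point):

static int tux3_da_get_block(struct inode *inode, sector_t iblock,
                struct buffer_head *bh, int create)
{
        /* probe only: never allocate blocks at write() time */
        int err = tux3_get_block(inode, iblock, bh, 0);

        if (err || buffer_mapped(bh) || !create)
                return err;

        /* block not allocated yet: reserve one block for it now */
        err = tux3_da_reserve_blocks(inode->i_sb, 1);
        if (err)
                return err;     /* report -ENOSPC at write() time, not writeout */

        set_buffer_new(bh);
        set_buffer_delay(bh);   /* BH_Delay: do the real allocation at writeout */
        return 0;
}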
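And the second sketch: the rough shape of a tux3_da_writepages() that
would replace the mpage_writepages() based one in the patch, batching
runs of contiguous dirty pages so that one probe can map a whole extent.
tux3_map_extent() is a made-up name standing in for whatever interface we
end up exposing from map_region; page locking, writeback state, bio
submission and the wbc range/nr_to_write handling are all left out here:

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>

/* hypothetical: map/allocate blocks for 'count' pages starting at logical
 * page 'start' with a single btree probe, then write them out */
int tux3_map_extent(struct inode *inode, pgoff_t start, unsigned count);

static int tux3_da_writepages(struct address_space *mapping,
                struct writeback_control *wbc)
{
        struct inode *inode = mapping->host;
        pgoff_t index = 0, run_start = 0;
        unsigned run_len = 0;
        struct pagevec pvec;
        int err = 0;

        pagevec_init(&pvec, 0);
        while (!err && pagevec_lookup_tag(&pvec, mapping, &index,
                                PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE)) {
                unsigned i;

                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];

                        /* run broken: map/write the previous extent in one go */
                        if (run_len && page->index != run_start + run_len) {
                                err = tux3_map_extent(inode, run_start, run_len);
                                run_len = 0;
                                if (err)
                                        break;
                        }
                        if (!run_len)
                                run_start = page->index;
                        run_len++;
                }
                pagevec_release(&pvec);
        }
        /* flush the final extent, if any */
        if (run_len && !err)
                err = tux3_map_extent(inode, run_start, run_len);
        return err;
}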
Regards,
xiaofeng
--

diff -pNur tux-orig/balloc.c tux-hack/balloc.c
--- tux-orig/balloc.c   2009-01-14 20:00:22.000000000 +0800
+++ tux-hack/balloc.c   2009-01-15 14:29:26.000000000 +0800
@@ -281,6 +281,7 @@ int bfree(struct sb *sb, block_t start,
         clear_bits(bufdata(buffer), start, blocks);
         brelse_dirty(buffer);
         sb->freeblocks += blocks;
+        tux3_release_blocks(sb, blocks);
         //set_sb_dirty(sb);
         mutex_unlock(&sb->bitmap->i_mutex);
         return 0;
diff -pNur tux-orig/filemap.c tux-hack/filemap.c
--- tux-orig/filemap.c  2009-01-14 20:00:22.000000000 +0800
+++ tux-hack/filemap.c  2009-01-16 11:37:20.000000000 +0800
@@ -505,4 +505,117 @@ const struct address_space_operations tu
         .sync_page = block_sync_page,
         .write_begin = tux3_vol_write_begin,
 };
+
+
+/*
+ * Tux3's delayed allocation
+ * Note: support blocksize == pagesize only
+ * Written by XiaoFeng LIU <[email protected]>
+ */
+
+/* proof of concept */
+#define NR_RESERV_BLOCKS 32
+
+static int tux3_da_reserve_blocks(struct super_block *sb, int count)
+{
+        long free_blocks;
+        struct sb *sbi = tux_sb(sb);
+        free_blocks = percpu_counter_read_positive(freeblocks_counter(sbi));
+        xtrace("freeblocks_counter %ld", free_blocks);
+
+        if (free_blocks < count + NR_RESERV_BLOCKS)
+                return -ENOSPC;
+        percpu_counter_sub(freeblocks_counter(sbi), count);
+        return 0;
+}
+
+static void tux3_da_release_blocks(struct super_block *sb, int count)
+{
+        struct sb *sbi = tux_sb(sb);
+        if (count) {
+                percpu_counter_add(freeblocks_counter(sbi), count);
+                sb->s_dirt = 1;
+        }
+}
+
+static int tux3_get_block_delay(struct inode *inode, sector_t iblock,
+                struct buffer_head *bh_rslt, int create)
+{
+        return tux3_get_block(inode, iblock, bh_rslt, 0);
+}
+
+/*
+ * a get_block() called at the writeout time.
+ */
+static int tux3_get_block_write(struct inode *inode, sector_t iblock,
+                struct buffer_head *bh_rslt, int create)
+{
+        pgoff_t index = (pgoff_t) (iblock >> (PAGE_CACHE_SHIFT - inode->i_blkbits));
+        struct page *page = find_get_page(inode->i_mapping, index);
+
+        /* the page should be here, and dirty */
+        if (unlikely(!page)) {
+                xtrace("find_get_page ret NULL.");
+                goto out;
+        }
+        if (create && PageChecked(page)) {
+                ClearPageChecked(page);
+                tux3_da_release_blocks(inode->i_sb, 1);
+        }
+        if (page)
+                page_cache_release(page);
+
+out:
+        return tux3_get_block(inode, iblock, bh_rslt, create);
+}
+
+static int tux3_da_write_begin(struct file *file, struct address_space *mapping,
+                loff_t pos, unsigned len, unsigned flags,
+                struct page **pagep, void **fsdata)
+{
+        return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+                                tux3_get_block_delay);
+}
+
+static int tux3_da_write_end(struct file *file, struct address_space *mapping,
+                loff_t pos, unsigned len, unsigned copied,
+                struct page *page, void *fsdata)
+{
+        /*
+         * Never do block reservation if the block has been allocated.
+         * In that case, tux3_da_write_begin sets the page mapped-to-disk.
+         */
+        if (!PageMappedToDisk(page) && !PageChecked(page)) {
+                int ret = tux3_da_reserve_blocks(mapping->host->i_sb, 1);
+                if (ret)
+                        return ret;
+                SetPageChecked(page);
+        }
+
+        return nobh_write_end(file, mapping, pos, len, copied, page, fsdata);
+}
+
+static int tux3_da_writepage(struct page *page, struct writeback_control *wbc)
+{
+        return nobh_writepage(page, tux3_get_block_write, wbc);
+}
+static int tux3_da_writepages(struct address_space *mapping,
+                struct writeback_control *wbc)
+{
+        return mpage_writepages(mapping, wbc, tux3_get_block_write);
+}
+
+const struct address_space_operations tux_da_aops = {
+        .readpage = tux3_readpage,
+        .readpages = tux3_readpages,
+        .writepage = tux3_da_writepage,
+        .writepages = tux3_da_writepages,
+        .sync_page = block_sync_page,
+        .write_begin = tux3_da_write_begin,
+        .write_end = tux3_da_write_end,
+        .bmap = tux3_bmap,
+        .direct_IO = tux3_direct_IO,
+        .migratepage = buffer_migrate_page,
+};
+
 #endif /* __KERNEL__ */
diff -pNur tux-orig/inode.c tux-hack/inode.c
--- tux-orig/inode.c    2009-01-14 20:00:22.000000000 +0800
+++ tux-hack/inode.c    2009-01-15 15:29:43.000000000 +0800
@@ -438,7 +438,7 @@ static void tux_setup_inode(struct inode
         case S_IFREG:
                 inode->i_op = &tux_file_iops;
                 inode->i_fop = &tux_file_fops;
-                inode->i_mapping->a_ops = &tux_aops;
+                inode->i_mapping->a_ops = &tux_da_aops;
                 break;
         case S_IFDIR:
                 inode->i_op = &tux_dir_iops;
diff -pNur tux-orig/modules.order tux-hack/modules.order
--- tux-orig/modules.order      1970-01-01 08:00:00.000000000 +0800
+++ tux-hack/modules.order      2009-01-16 10:08:26.000000000 +0800
@@ -0,0 +1 @@
+kernel//home/xiaofeng/tux3bed/tux-hack/tux3.ko
diff -pNur tux-orig/super.c tux-hack/super.c
--- tux-orig/super.c    2009-01-14 20:00:22.000000000 +0800
+++ tux-hack/super.c    2009-01-15 14:27:53.000000000 +0800
@@ -106,6 +106,9 @@ static void tux3_put_super(struct super_
         iput(sbi->volmap);
         iput(sbi->logmap);
 
+        /* destroy block allocation info */
+        tux3_balloc_info_destroy(sbi);
+
         sb->s_fs_info = NULL;
         kfree(sbi);
 }
@@ -172,6 +175,10 @@ static int tux3_fill_super(struct super_
         err = tux_load_sb(sb, silent);
         if (err)
                 goto error;
+
+        /* initialize block allocation info */
+        tux3_balloc_info_init(sbi);
+
         printk("%s: sb %p, ops %p, depth %Lu, block %Lu, entries_per_leaf %d\n",
                __func__, sbi->itable.sb, sbi->itable.ops,
diff -pNur tux-orig/trace.h tux-hack/trace.h
--- tux-orig/trace.h    2009-01-14 20:00:22.000000000 +0800
+++ tux-hack/trace.h    2009-01-15 15:04:49.000000000 +0800
@@ -22,4 +22,15 @@
                 die(100); \
         } while (0)
 
+
+#ifdef __KERNEL__
+/* debug macro, xiaofeng */
+#define xtrace(f, a...) { \
+        printk ("(%s, %d): %s:", \
+                __FILE__, __LINE__, __FUNCTION__); \
+        printk (f, ## a); \
+        printk ("\n"); \
+        }
+
+#endif
 #endif
diff -pNur tux-orig/tux3.h tux-hack/tux3.h
--- tux-orig/tux3.h     2009-01-14 20:00:22.000000000 +0800
+++ tux-hack/tux3.h     2009-01-15 15:34:37.000000000 +0800
@@ -9,6 +9,8 @@
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
 #include <linux/mutex.h>
+#include <linux/mm.h>
+#include <linux/percpu_counter.h>
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,27)
 #include <linux/cred.h> // fsuid
@@ -213,6 +215,13 @@ struct cursor {
         } path[];
 };
 
+/* Tux3 block allocation information */
+struct tux3_balloc_info {
+        struct percpu_counter freeblocks_counter;
+        /* nextalloc_counter, and others */
+};
+#define freeblocks_counter(sbi) (&sbi->balloc_info.freeblocks_counter)
+
 /* Tux3-specific sb is a handle for the entire volume state */
 
 struct sb {
@@ -241,6 +250,7 @@ struct sb {
         struct mutex loglock; /* serialize log entries (spinlock me) */
 #ifdef __KERNEL__
         struct super_block *vfs_sb; /* Generic kernel superblock */
+        struct tux3_balloc_info balloc_info; /* control info for block allocation */
 #else
         struct dev *dev; /* userspace block device */
 #endif
@@ -620,6 +630,25 @@ static inline struct inode *buffer_inode
         return buffer->b_page->mapping->host;
 }
 
+static inline void tux3_balloc_info_init(struct sb* sbi)
+{
+        percpu_counter_init(freeblocks_counter(sbi), sbi->freeblocks);
+}
+static inline void tux3_balloc_info_destroy(struct sb* sbi)
+{
+        percpu_counter_destroy(freeblocks_counter(sbi));
+}
+
+static inline void tux3_release_blocks(struct sb* sbi, int count)
+{
+        percpu_counter_add(freeblocks_counter(sbi), count);
+}
+
+static inline void tux3_reserve_blocks(struct sb* sbi, int count)
+{
+        percpu_counter_sub(freeblocks_counter(sbi), count);
+}
+
 /* btree.c */
 struct buffer_head *cursor_leafbuf(struct cursor *cursor);
 void release_cursor(struct cursor *cursor);
@@ -678,6 +707,7 @@ int tux3_get_block(struct inode *inode,
 extern const struct address_space_operations tux_aops;
 extern const struct address_space_operations tux_blk_aops;
 extern const struct address_space_operations tux_vol_aops;
+extern const struct address_space_operations tux_da_aops;
 
 /* iattr.c */
 unsigned encode_asize(unsigned bits);
_______________________________________________
Tux3 mailing list
[email protected]
http://mailman.tux3.org/cgi-bin/mailman/listinfo/tux3
