Re: [PATCH 41/49] ext4: Add multi block allocator for ext4
updated patch. Waiting for the test results. I am only attaching the diff. Mballoc patch is really large. -aneesh diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 4f329af..ec7d349 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -89,6 +89,8 @@ When mounting an ext4 filesystem, the following option are accepted: extentsext4 will use extents to address file data. The file system will no longer be mountable by ext3. +noextents ext4 will not use extents for new files created. + journal_checksum Enable checksumming of the journal transactions. This will allow the recovery code in e2fsck and the kernel to detect corruption in the kernel. It is a @@ -206,6 +208,10 @@ nobh (a) cache disk block mapping information nobh option tries to avoid associating buffer heads (supported only for writeback mode). +mballoc(*) Use the mutliblock allocator for block allocation +nomballoc disabled multiblock allocator for block allocation. +stripe=n filesystem blocks per stripe for a RAID configuration. + Data Mode - diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index dec9945..4413a2d 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -857,6 +857,45 @@ CPUs. The procs_blocked line gives the number of processes currently blocked, waiting for I/O to complete. +1.9 Ext4 file system parameters +-- +Ext4 file system have one directory per partition under /proc/fs/ext4/ +# ls /proc/fs/ext4/hdc/ +group_prealloc max_to_scan mb_groups mb_history min_to_scan order2_req +stats stream_req + +mb_groups: +This file gives the details of mutiblock allocator buddy cache of free blocks + +mb_history: +Multiblock allocation history. + +stats: +This file indicate whether the multiblock allocator should start collecting +statistics. The statistics are shown during unmount + +group_prealloc: +The multiblock allocator normalize the block allocation request to +group_prealloc filesystem blocks if we don't have strip value set. +The stripe value can be specified at mount time or during mke2fs. + +max_to_scan: +How long multiblock allocator can look for a best extent (in found extents) + +min_to_scan: +How long multiblock allocator must look for a best extent + +order2_req: +Multiblock allocator use 2^N search using buddies only for requests greater +than or equal to order2_req. The request size is specfied in file system +blocks. A value of 2 indicate only if the requests are greater than or equal +to 4 blocks. + +stream_req: +Files smaller than stream_req are served by the stream allocator, whose +purpose is to pack requests as close each to other as possible to +produce smooth I/O traffic. Avalue of 16 indicate that file smaller than 16 +filesystem block size will use group based preallocation. -- Summary diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 0398aa0..310bad6 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -489,7 +489,7 @@ struct ext4_free_extent { */ struct ext4_locality_group { /* for allocator */ - struct semaphorelg_sem; /* to serialize allocates */ + struct mutexlg_sem; /* to serialize allocates */ struct list_headlg_prealloc_list;/* list of preallocations */ spinlock_t lg_prealloc_lock; }; @@ -563,7 +563,10 @@ struct ext4_buddy { #define EXT4_MB_BUDDY(e4b) ((e4b)-bd_buddy) #ifndef EXT4_MB_HISTORY -#define ext4_mb_store_history(ac) +static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) +{ + return; +} #else static void ext4_mb_store_history(struct ext4_allocation_context *ac); #endif @@ -641,6 +644,10 @@ static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, static inline int mb_test_bit(int bit, void *addr) { + /* +* ext4_test_bit on architecture like powerpc +* needs unsigned long aligned address +*/ mb_correct_addr_and_bit(bit, addr); return ext4_test_bit(bit, addr); } @@ -669,7 +676,7 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) ext4_clear_bit_atomic(lock, bit, addr); } -static inline void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) +static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) { char *bb; @@ -752,9 +759,20 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) } #else -#define mb_free_blocks_double(a, b, c, d) -#define mb_mark_used_double(a, b, c) -#define
Re: [PATCH 41/49] ext4: Add multi block allocator for ext4
On Thu, Jan 24, 2008 at 01:26:14PM +0530, Aneesh Kumar K.V wrote: +/* find most significant bit */ +static int fmsb(unsigned short word) +{ + int order; + + if (word 255) { + order = 7; + word = 8; + } else { + order = -1; + } + + do { + order++; + word = 1; + } while (word != 0); + + return order; +} Did we just reinvent fls()? replaced by fls. That should be fls() - 1; The full patch is at http://www.radian.org/~kvaneesh/ext4/jan-24-2008/mballoc-core.patch The patch is too big to inline. -aneesh - To unsubscribe from this list: send the line unsubscribe linux-ext4 in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 41/49] ext4: Add multi block allocator for ext4
On Mon, 21 Jan 2008 22:02:20 -0500 Theodore Ts'o [EMAIL PROTECTED] wrote: From: Alex Tomas [EMAIL PROTECTED] Signed-off-by: Alex Tomas [EMAIL PROTECTED] Signed-off-by: Andreas Dilger [EMAIL PROTECTED] Signed-off-by: Aneesh Kumar K.V [EMAIL PROTECTED] Signed-off-by: Eric Sandeen [EMAIL PROTECTED] Signed-off-by: Theodore Ts'o [EMAIL PROTECTED] ... +#if BITS_PER_LONG == 64 +#define mb_correct_addr_and_bit(bit, addr) \ +{\ + bit += ((unsigned long) addr 7UL) 3; \ + addr = (void *) ((unsigned long) addr ~7UL); \ +} +#elif BITS_PER_LONG == 32 +#define mb_correct_addr_and_bit(bit, addr) \ +{\ + bit += ((unsigned long) addr 3UL) 3; \ + addr = (void *) ((unsigned long) addr ~3UL); \ +} +#else +#error how many bits you are?! +#endif Why do these exist? +static inline int mb_test_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit, addr); + return ext4_test_bit(bit, addr); +} ext2_test_bit() already handles bitnum wordsize. If mb_correct_addr_and_bit() is actually needed then some suitable comment would help. +static inline void mb_set_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit, addr); + ext4_set_bit(bit, addr); +} + +static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr) +{ + mb_correct_addr_and_bit(bit, addr); + ext4_set_bit_atomic(lock, bit, addr); +} + +static inline void mb_clear_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit, addr); + ext4_clear_bit(bit, addr); +} + +static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) +{ + mb_correct_addr_and_bit(bit, addr); + ext4_clear_bit_atomic(lock, bit, addr); +} + +static inline void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) uninlining this will save about eighty squigabytes of text. Please review all of ext4/jbd2 with a view to removig unnecessary and wrong inlings. +{ + char *bb; + + /* FIXME!! is this needed */ + BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(max == NULL); + + if (order e4b-bd_blkbits + 1) { + *max = 0; + return NULL; + } + + /* at order 0 we see each particular block */ + *max = 1 (e4b-bd_blkbits + 3); + if (order == 0) + return EXT4_MB_BITMAP(e4b); + + bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b-bd_sb)-s_mb_offsets[order]; + *max = EXT4_SB(e4b-bd_sb)-s_mb_maxs[order]; + + return bb; +} + ... +#else +#define mb_free_blocks_double(a, b, c, d) +#define mb_mark_used_double(a, b, c) +#define mb_cmp_bitmaps(a, b) +#endif Please use the do{}while(0) thing. Or, better, proper C functions which have typechecking (unless this will cause undefined-var compile errors, which happens sometimes) +/* find most significant bit */ +static int fmsb(unsigned short word) +{ + int order; + + if (word 255) { + order = 7; + word = 8; + } else { + order = -1; + } + + do { + order++; + word = 1; + } while (word != 0); + + return order; +} Did we just reinvent fls()? +/* FIXME!! need more doc */ +static void ext4_mb_mark_free_simple(struct super_block *sb, + void *buddy, unsigned first, int len, + struct ext4_group_info *grp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned short min; + unsigned short max; + unsigned short chunk; + unsigned short border; + + BUG_ON(len = EXT4_BLOCKS_PER_GROUP(sb)); + + border = 2 sb-s_blocksize_bits; Won't this explode with = 32k blocksize? + while (len 0) { + /* find how many blocks can be covered since this position */ + max = ffs(first | border) - 1; + + /* find how many blocks of power 2 we need to mark */ + min = fmsb(len); + + if (max min) + min = max; + chunk = 1 min; + + /* mark multiblock chunks only */ + grp-bb_counters[min]++; + if (min 0) + mb_clear_bit(first min, + buddy + sbi-s_mb_offsets[min]); + + len -= chunk; + first += chunk; + } +} + ... +static int ext4_mb_init_cache(struct page *page, char *incore) +{ + int blocksize; + int blocks_per_page; + int groups_per_page; + int err = 0; + int i; + ext4_group_t first_group; + int first_block; + struct super_block *sb; + struct buffer_head *bhs; + struct buffer_head **bh; + struct inode *inode; + char *data; + char *bitmap; + + mb_debug(init page %lu\n, page-index);
Re: [PATCH 41/49] ext4: Add multi block allocator for ext4
On Jan 23, 2008 14:07 -0800, Andrew Morton wrote: +#define mb_correct_addr_and_bit(bit, addr) \ +{ \ + bit += ((unsigned long) addr 3UL) 3; \ + addr = (void *) ((unsigned long) addr ~3UL); \ +} Why do these exist? They seem to be a holdover from when mballoc stored the buddy bitmaps on disk. That no longer happens (to avoid bitmap vs. buddy consistency problems), so I suspect they can be removed. I can't comment on many of the other issues because Alex wrote most of the code. Gosh what a lot of code. Is it faster? Yes, and also importantly it uses a lot less CPU to do a given amount of allocation, which is critical in our environments where there is very high disk bandwidth on a single node and CPU becomes the limiting factor of the IO speed. This of course also helps any write-intensive environment where the CPU is doing something useful. Some older test results include: https://ols2006.108.redhat.com/2007/Reprints/mathur-Reprint.pdf (Section 7) In the linux-ext4 thread compilebench numbers for ext4: http://www.mail-archive.com/linux-ext4@vger.kernel.org/msg03834.html http://oss.oracle.com/~mason/compilebench/ext4/ext-create-compare.png http://oss.oracle.com/~mason/compilebench/ext4/ext-compile-compare.png http://oss.oracle.com/~mason/compilebench/ext4/ext-read-compare.png http://oss.oracle.com/~mason/compilebench/ext4/ext-rm-compare.png note the ext-read-compare.png graph shows lower read performance, but a couple of bugs in mballoc were since fixed to have ext4 allocate more contiguous extents. In the old linux-ext4 thread [RFC] delayed allocation testing on node zefir http://www.mail-archive.com/linux-ext4@vger.kernel.org/msg00587.html : dd2048rw : REAL UTIME STIME READWRITTEN DETAILS EXT3: 58.46 23 1491 25722097292 17 extents EXT4: 44.56 19 1018 12 2097244 19 extents REISERFS: 56.80 26 1370 29522097336 457 extents JFS : 45.77 22 9840 2097216 1 extents XFS : 50.97 20 1394 0 2100825 7 extents : kernuntar : REAL UTIME STIME READWRITTEN DETAILS EXT3: 56.99 5037 65168 252016 EXT4: 55.03 5034 55336 249884 REISERFS: 52.55 4996 85464 238068 JFS : 70.15 5057 630496 288116 XFS : 72.84 5052 953132 316798 : kernstat : REAL UTIME STIME READWRITTEN DETAILS EXT3: 2.83 8 15 58920 EXT4: 0.51 9 10 58920 REISERFS: 0.81 7 49 26960 JFS : 6.19 11 49 12552 0 XFS : 2.09 9 61 65040 : kerncat : REAL UTIME STIME READWRITTEN DETAILS EXT3: 9.48 25 213241624 0 EXT4: 6.29 27 197238560 0 REISERFS: 14.69 33 230234744 0 JFS : 23.51 23 231244596 0 XFS : 18.24 36 254238548 0 : kernrm : REAL UTIME STIME READWRITTEN DETAILS EXT3: 4.82 4 10896284672 EXT4: 1.61 5 11065364632 REISERFS: 3.15 8 2762768236 JFS : 33.90 7 16814400 33048 XFS : 20.03 8 296663286160 Cheers, Andreas -- Andreas Dilger Sr. Staff Engineer, Lustre Group Sun Microsystems of Canada, Inc. - To unsubscribe from this list: send the line unsubscribe linux-ext4 in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 41/49] ext4: Add multi block allocator for ext4
On Wed, Jan 23, 2008 at 02:07:27PM -0800, Andrew Morton wrote: On Mon, 21 Jan 2008 22:02:20 -0500 Theodore Ts'o [EMAIL PROTECTED] wrote: From: Alex Tomas [EMAIL PROTECTED] Signed-off-by: Alex Tomas [EMAIL PROTECTED] Signed-off-by: Andreas Dilger [EMAIL PROTECTED] Signed-off-by: Aneesh Kumar K.V [EMAIL PROTECTED] Signed-off-by: Eric Sandeen [EMAIL PROTECTED] Signed-off-by: Theodore Ts'o [EMAIL PROTECTED] ... +#if BITS_PER_LONG == 64 +#define mb_correct_addr_and_bit(bit, addr) \ +{ \ + bit += ((unsigned long) addr 7UL) 3; \ + addr = (void *) ((unsigned long) addr ~7UL); \ +} +#elif BITS_PER_LONG == 32 +#define mb_correct_addr_and_bit(bit, addr) \ +{ \ + bit += ((unsigned long) addr 3UL) 3; \ + addr = (void *) ((unsigned long) addr ~3UL); \ +} +#else +#error how many bits you are?! +#endif Why do these exist? Initial version on mballoc supported on x86 32 this was there to give compile warning on 64 bit platform. I guess we can remove that now. Or may be we can keep it as such because it is harmless. +static inline int mb_test_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit, addr); + return ext4_test_bit(bit, addr); +} ext2_test_bit() already handles bitnum wordsize. If mb_correct_addr_and_bit() is actually needed then some suitable comment would help. ext4_test_bit on powerpc needs the addr to be 8 byte aligned. Othewise it fails +static inline void mb_set_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit, addr); + ext4_set_bit(bit, addr); +} + +static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr) +{ + mb_correct_addr_and_bit(bit, addr); + ext4_set_bit_atomic(lock, bit, addr); +} + +static inline void mb_clear_bit(int bit, void *addr) +{ + mb_correct_addr_and_bit(bit, addr); + ext4_clear_bit(bit, addr); +} + +static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) +{ + mb_correct_addr_and_bit(bit, addr); + ext4_clear_bit_atomic(lock, bit, addr); +} + +static inline void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) uninlining this will save about eighty squigabytes of text. Fixed Please review all of ext4/jbd2 with a view to removig unnecessary and wrong inlings. +{ + char *bb; + + /* FIXME!! is this needed */ + BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(max == NULL); + + if (order e4b-bd_blkbits + 1) { + *max = 0; + return NULL; + } + + /* at order 0 we see each particular block */ + *max = 1 (e4b-bd_blkbits + 3); + if (order == 0) + return EXT4_MB_BITMAP(e4b); + + bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b-bd_sb)-s_mb_offsets[order]; + *max = EXT4_SB(e4b-bd_sb)-s_mb_maxs[order]; + + return bb; +} + ... +#else +#define mb_free_blocks_double(a, b, c, d) +#define mb_mark_used_double(a, b, c) +#define mb_cmp_bitmaps(a, b) +#endif Please use the do{}while(0) thing. Or, better, proper C functions which have typechecking (unless this will cause undefined-var compile errors, which happens sometimes) makde static inline void. +/* find most significant bit */ +static int fmsb(unsigned short word) +{ + int order; + + if (word 255) { + order = 7; + word = 8; + } else { + order = -1; + } + + do { + order++; + word = 1; + } while (word != 0); + + return order; +} Did we just reinvent fls()? replaced by fls. +/* FIXME!! need more doc */ +static void ext4_mb_mark_free_simple(struct super_block *sb, + void *buddy, unsigned first, int len, + struct ext4_group_info *grp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned short min; + unsigned short max; + unsigned short chunk; + unsigned short border; + + BUG_ON(len = EXT4_BLOCKS_PER_GROUP(sb)); + + border = 2 sb-s_blocksize_bits; Won't this explode with = 32k blocksize? + while (len 0) { + /* find how many blocks can be covered since this position */ + max = ffs(first | border) - 1; + + /* find how many blocks of power 2 we need to mark */ + min = fmsb(len); + + if (max min) + min = max; + chunk = 1 min; + + /* mark multiblock chunks only */ + grp-bb_counters[min]++; + if (min 0) + mb_clear_bit(first min, +buddy + sbi-s_mb_offsets[min]); + + len -= chunk; + first += chunk; + } +} + ... +static