Re: [PATCH 41/49] ext4: Add multi block allocator for ext4

2008-01-24 Thread Aneesh Kumar K.V
On Thu, Jan 24, 2008 at 01:26:14PM +0530, Aneesh Kumar K.V wrote:
> 
> > 
> > > +/* find most significant bit */
> > > +static int fmsb(unsigned short word)
> > > +{
> > > + int order;
> > > +
> > > + if (word > 255) {
> > > + order = 7;
> > > + word >>= 8;
> > > + } else {
> > > + order = -1;
> > > + }
> > > +
> > > + do {
> > > + order++;
> > > + word >>= 1;
> > > + } while (word != 0);
> > > +
> > > + return order;
> > > +}
> > 
> > Did we just reinvent fls()?
> 
> replaced by fls.
> 
> > 

That should be fls() - 1;

The full patch is at

http://www.radian.org/~kvaneesh/ext4/jan-24-2008/mballoc-core.patch

The patch is too big to inline.

-aneesh
-
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 41/49] ext4: Add multi block allocator for ext4

2008-01-24 Thread Aneesh Kumar K.V
updated patch. Waiting for the test results.

I am only attaching the diff. Mballoc patch is really large.

-aneesh
diff --git a/Documentation/filesystems/ext4.txt 
b/Documentation/filesystems/ext4.txt
index 4f329af..ec7d349 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -89,6 +89,8 @@ When mounting an ext4 filesystem, the following option are 
accepted:
 extentsext4 will use extents to address file data.  The
file system will no longer be mountable by ext3.
 
+noextents  ext4 will not use extents for new files created.
+
 journal_checksum   Enable checksumming of the journal transactions.
This will allow the recovery code in e2fsck and the
kernel to detect corruption in the kernel.  It is a
@@ -206,6 +208,10 @@ nobh   (a) cache disk block mapping 
information
"nobh" option tries to avoid associating buffer
heads (supported only for "writeback" mode).
 
+mballoc(*) Use the mutliblock allocator for block 
allocation
+nomballoc  disabled multiblock allocator for block allocation.
+stripe=n   filesystem blocks per stripe for a RAID configuration.
+
 
 Data Mode
 -
diff --git a/Documentation/filesystems/proc.txt 
b/Documentation/filesystems/proc.txt
index dec9945..4413a2d 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -857,6 +857,45 @@ CPUs.
 The   "procs_blocked" line gives  the  number of  processes currently blocked,
 waiting for I/O to complete.
 
+1.9 Ext4 file system parameters
+--
+Ext4 file system have one directory per partition under /proc/fs/ext4/
+# ls /proc/fs/ext4/hdc/
+group_prealloc  max_to_scan  mb_groups  mb_history  min_to_scan  order2_req
+stats  stream_req
+
+mb_groups:
+This file gives the details of mutiblock allocator buddy cache of free blocks
+
+mb_history:
+Multiblock allocation history.
+
+stats:
+This file indicate whether the multiblock allocator should start collecting
+statistics. The statistics are shown during unmount
+
+group_prealloc:
+The multiblock allocator normalize the block allocation request to
+group_prealloc filesystem blocks if we don't have strip value set.
+The stripe value can be specified at mount time or during mke2fs.
+
+max_to_scan:
+How long multiblock allocator can look for a best extent (in found extents)
+
+min_to_scan:
+How long multiblock allocator  must look for a best extent
+
+order2_req:
+Multiblock allocator use  2^N search using buddies only for requests greater
+than or equal to order2_req. The request size is specfied in file system
+blocks. A value of 2 indicate only if the requests are greater than or equal
+to 4 blocks.
+
+stream_req:
+Files smaller than stream_req are served by the stream allocator, whose
+purpose is to pack requests as close each to other as possible to
+produce smooth I/O traffic. Avalue of 16 indicate that file smaller than 16
+filesystem block size will use group based preallocation.
 
 --
 Summary
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 0398aa0..310bad6 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -489,7 +489,7 @@ struct ext4_free_extent {
  */
 struct ext4_locality_group {
/* for allocator */
-   struct semaphorelg_sem; /* to serialize allocates */
+   struct mutexlg_sem; /* to serialize allocates */
struct list_headlg_prealloc_list;/* list of preallocations */
spinlock_t  lg_prealloc_lock;
 };
@@ -563,7 +563,10 @@ struct ext4_buddy {
 #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
 
 #ifndef EXT4_MB_HISTORY
-#define ext4_mb_store_history(ac)
+static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
+{
+   return;
+}
 #else
 static void ext4_mb_store_history(struct ext4_allocation_context *ac);
 #endif
@@ -641,6 +644,10 @@ static ext4_fsblk_t ext4_grp_offs_to_block(struct 
super_block *sb,
 
 static inline int mb_test_bit(int bit, void *addr)
 {
+   /*
+* ext4_test_bit on architecture like powerpc
+* needs unsigned long aligned address
+*/
mb_correct_addr_and_bit(bit, addr);
return ext4_test_bit(bit, addr);
 }
@@ -669,7 +676,7 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, 
int bit, void *addr)
ext4_clear_bit_atomic(lock, bit, addr);
 }
 
-static inline void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
+static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
 {
char *bb;
 
@@ -752,9 +759,20 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void 
*bitmap)
 }
 
 #else
-#define mb_free_blocks_double(a, b, c, d)
-#define mb_mark_used_double(a, b, c)
-#defin

Re: [PATCH 41/49] ext4: Add multi block allocator for ext4

2008-01-23 Thread Aneesh Kumar K.V
On Wed, Jan 23, 2008 at 02:07:27PM -0800, Andrew Morton wrote:
> > On Mon, 21 Jan 2008 22:02:20 -0500 "Theodore Ts'o" <[EMAIL PROTECTED]> 
> > wrote:
> > From: Alex Tomas <[EMAIL PROTECTED]>
> > 
> > Signed-off-by: Alex Tomas <[EMAIL PROTECTED]>
> > Signed-off-by: Andreas Dilger <[EMAIL PROTECTED]>
> > Signed-off-by: Aneesh Kumar K.V <[EMAIL PROTECTED]>
> > Signed-off-by: Eric Sandeen <[EMAIL PROTECTED]>
> > Signed-off-by: "Theodore Ts'o" <[EMAIL PROTECTED]>
> >
> > ...
> >
> > +#if BITS_PER_LONG == 64
> > +#define mb_correct_addr_and_bit(bit, addr) \
> > +{  \
> > +   bit += ((unsigned long) addr & 7UL) << 3;   \
> > +   addr = (void *) ((unsigned long) addr & ~7UL);  \
> > +}
> > +#elif BITS_PER_LONG == 32
> > +#define mb_correct_addr_and_bit(bit, addr) \
> > +{  \
> > +   bit += ((unsigned long) addr & 3UL) << 3;   \
> > +   addr = (void *) ((unsigned long) addr & ~3UL);  \
> > +}
> > +#else
> > +#error "how many bits you are?!"
> > +#endif
> 
> Why do these exist?

Initial version on mballoc supported on x86 32 this was there to give
compile warning on 64 bit platform. I guess we can remove that now.
Or may be we can keep it as such because it is harmless.


> 
> > +static inline int mb_test_bit(int bit, void *addr)
> > +{
> > +   mb_correct_addr_and_bit(bit, addr);
> > +   return ext4_test_bit(bit, addr);
> > +}
> 
> ext2_test_bit() already handles bitnum > wordsize.
> 
> If mb_correct_addr_and_bit() is actually needed then some suitable comment
> would help.

ext4_test_bit on powerpc needs the addr to be 8 byte aligned. Othewise
it fails

> 
> > +static inline void mb_set_bit(int bit, void *addr)
> > +{
> > +   mb_correct_addr_and_bit(bit, addr);
> > +   ext4_set_bit(bit, addr);
> > +}
> > +
> > +static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
> > +{
> > +   mb_correct_addr_and_bit(bit, addr);
> > +   ext4_set_bit_atomic(lock, bit, addr);
> > +}
> > +
> > +static inline void mb_clear_bit(int bit, void *addr)
> > +{
> > +   mb_correct_addr_and_bit(bit, addr);
> > +   ext4_clear_bit(bit, addr);
> > +}
> > +
> > +static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void 
> > *addr)
> > +{
> > +   mb_correct_addr_and_bit(bit, addr);
> > +   ext4_clear_bit_atomic(lock, bit, addr);
> > +}
> > +
> > +static inline void *mb_find_buddy(struct ext4_buddy *e4b, int order, int 
> > *max)
> 
> uninlining this will save about eighty squigabytes of text.

Fixed


> 
> Please review all of ext4/jbd2 with a view to removig unnecessary and wrong
> inlings.
> 
> > +{
> > +   char *bb;
> > +
> > +   /* FIXME!! is this needed */
> > +   BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
> > +   BUG_ON(max == NULL);
> > +
> > +   if (order > e4b->bd_blkbits + 1) {
> > +   *max = 0;
> > +   return NULL;
> > +   }
> > +
> > +   /* at order 0 we see each particular block */
> > +   *max = 1 << (e4b->bd_blkbits + 3);
> > +   if (order == 0)
> > +   return EXT4_MB_BITMAP(e4b);
> > +
> > +   bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
> > +   *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
> > +
> > +   return bb;
> > +}
> > +
> >
> > ...
> >
> > +#else
> > +#define mb_free_blocks_double(a, b, c, d)
> > +#define mb_mark_used_double(a, b, c)
> > +#define mb_cmp_bitmaps(a, b)
> > +#endif
> 
> Please use the do{}while(0) thing.  Or, better, proper C functions which
> have typechecking (unless this will cause undefined-var compile errors,
> which happens sometimes)

makde static inline void.

> 
> > +/* find most significant bit */
> > +static int fmsb(unsigned short word)
> > +{
> > +   int order;
> > +
> > +   if (word > 255) {
> > +   order = 7;
> > +   word >>= 8;
> > +   } else {
> > +   order = -1;
> > +   }
> > +
> > +   do {
> > +   order++;
> > +   word >>= 1;
> > +   } while (word != 0);
> > +
> > +   return order;
> > +}
> 
> Did we just reinvent fls()?

replaced by fls.

> 
> > +/* FIXME!! need more doc */
> > +static void ext4_mb_mark_free_simple(struct super_block *sb,
> > +   void *buddy, unsigned first, int len,
> > +   struct ext4_group_info *grp)
> > +{
> > +   struct ext4_sb_info *sbi = EXT4_SB(sb);
> > +   unsigned short min;
> > +   unsigned short max;
> > +   unsigned short chunk;
> > +   unsigned short border;
> > +
> > +   BUG_ON(len >= EXT4_BLOCKS_PER_GROUP(sb));
> > +
> > +   border = 2 << sb->s_blocksize_bits;
> 
> Won't this explode with >= 32k blocksize?
> 
> > +   while (len > 0) {
> > +   /* find how many blocks can be covered since this position */
> > +   max = ffs(first | border) - 1;
> > +
> > +   /* find how many blocks of power 2 we need to mark */
> > +   min = fmsb(len);
> > +
> > +   if (max < min)
> > +   min = max;
> > +   chunk =

Re: [PATCH 41/49] ext4: Add multi block allocator for ext4

2008-01-23 Thread Andreas Dilger
On Jan 23, 2008  14:07 -0800, Andrew Morton wrote:
> > +#define mb_correct_addr_and_bit(bit, addr) \
> > +{  \
> > +   bit += ((unsigned long) addr & 3UL) << 3;   \
> > +   addr = (void *) ((unsigned long) addr & ~3UL);  \
> > +}
> 
> Why do these exist?

They seem to be a holdover from when mballoc stored the buddy bitmaps
on disk.  That no longer happens (to avoid bitmap vs. buddy consistency
problems), so I suspect they can be removed.

I can't comment on many of the other issues because Alex wrote most
of the code.

> Gosh what a lot of code.  Is it faster?

Yes, and also importantly it uses a lot less CPU to do a given amount
of allocation, which is critical in our environments where there is
very high disk bandwidth on a single node and CPU becomes the limiting
factor of the IO speed.  This of course also helps any write-intensive
environment where the CPU is doing something "useful".

Some older test results include:
https://ols2006.108.redhat.com/2007/Reprints/mathur-Reprint.pdf (Section 7)

In the linux-ext4 thread "compilebench numbers for ext4":
http://www.mail-archive.com/linux-ext4@vger.kernel.org/msg03834.html

http://oss.oracle.com/~mason/compilebench/ext4/ext-create-compare.png
http://oss.oracle.com/~mason/compilebench/ext4/ext-compile-compare.png
http://oss.oracle.com/~mason/compilebench/ext4/ext-read-compare.png
http://oss.oracle.com/~mason/compilebench/ext4/ext-rm-compare.png

note the ext-read-compare.png graph shows lower read performance, but
a couple of bugs in mballoc were since fixed to have ext4 allocate more
contiguous extents.

In the old linux-ext4 thread "[RFC] delayed allocation testing on node zefir"
http://www.mail-archive.com/linux-ext4@vger.kernel.org/msg00587.html

: dd2048rw 
: REAL   UTIME  STIME  READWRITTEN DETAILS
EXT3: 58.46  23 1491   25722097292 17 extents
EXT4: 44.56  19 1018   12  2097244 19 extents
REISERFS: 56.80  26 1370   29522097336 457 extents
JFS : 45.77  22 9840   2097216 1 extents
XFS : 50.97  20 1394   0   2100825 7 extents

: kernuntar
: REAL   UTIME  STIME  READWRITTEN DETAILS
EXT3: 56.99  5037   65168  252016  
EXT4: 55.03  5034   55336  249884  
REISERFS: 52.55  4996   85464  238068  
JFS : 70.15  5057   630496 288116  
XFS : 72.84  5052   953132 316798  

: kernstat 
: REAL   UTIME  STIME  READWRITTEN DETAILS
EXT3: 2.83   8  15 58920   
EXT4: 0.51   9  10 58920   
REISERFS: 0.81   7  49 26960   
JFS : 6.19   11 49 12552   0   
XFS : 2.09   9  61 65040   

: kerncat  
: REAL   UTIME  STIME  READWRITTEN DETAILS
EXT3: 9.48   25 213241624  0   
EXT4: 6.29   27 197238560  0   
REISERFS: 14.69  33 230234744  0   
JFS : 23.51  23 231244596  0   
XFS : 18.24  36 254238548  0   

: kernrm   
: REAL   UTIME  STIME  READWRITTEN DETAILS
EXT3: 4.82   4  10896284672
EXT4: 1.61   5  11065364632
REISERFS: 3.15   8  2762768236 
JFS : 33.90  7  16814400   33048   
XFS : 20.03  8  296663286160   


Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

-
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 41/49] ext4: Add multi block allocator for ext4

2008-01-23 Thread Andrew Morton
> On Mon, 21 Jan 2008 22:02:20 -0500 "Theodore Ts'o" <[EMAIL PROTECTED]> wrote:
> From: Alex Tomas <[EMAIL PROTECTED]>
> 
> Signed-off-by: Alex Tomas <[EMAIL PROTECTED]>
> Signed-off-by: Andreas Dilger <[EMAIL PROTECTED]>
> Signed-off-by: Aneesh Kumar K.V <[EMAIL PROTECTED]>
> Signed-off-by: Eric Sandeen <[EMAIL PROTECTED]>
> Signed-off-by: "Theodore Ts'o" <[EMAIL PROTECTED]>
>
> ...
>
> +#if BITS_PER_LONG == 64
> +#define mb_correct_addr_and_bit(bit, addr)   \
> +{\
> + bit += ((unsigned long) addr & 7UL) << 3;   \
> + addr = (void *) ((unsigned long) addr & ~7UL);  \
> +}
> +#elif BITS_PER_LONG == 32
> +#define mb_correct_addr_and_bit(bit, addr)   \
> +{\
> + bit += ((unsigned long) addr & 3UL) << 3;   \
> + addr = (void *) ((unsigned long) addr & ~3UL);  \
> +}
> +#else
> +#error "how many bits you are?!"
> +#endif

Why do these exist?

> +static inline int mb_test_bit(int bit, void *addr)
> +{
> + mb_correct_addr_and_bit(bit, addr);
> + return ext4_test_bit(bit, addr);
> +}

ext2_test_bit() already handles bitnum > wordsize.

If mb_correct_addr_and_bit() is actually needed then some suitable comment
would help.

> +static inline void mb_set_bit(int bit, void *addr)
> +{
> + mb_correct_addr_and_bit(bit, addr);
> + ext4_set_bit(bit, addr);
> +}
> +
> +static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
> +{
> + mb_correct_addr_and_bit(bit, addr);
> + ext4_set_bit_atomic(lock, bit, addr);
> +}
> +
> +static inline void mb_clear_bit(int bit, void *addr)
> +{
> + mb_correct_addr_and_bit(bit, addr);
> + ext4_clear_bit(bit, addr);
> +}
> +
> +static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
> +{
> + mb_correct_addr_and_bit(bit, addr);
> + ext4_clear_bit_atomic(lock, bit, addr);
> +}
> +
> +static inline void *mb_find_buddy(struct ext4_buddy *e4b, int order, int 
> *max)

uninlining this will save about eighty squigabytes of text.

Please review all of ext4/jbd2 with a view to removig unnecessary and wrong
inlings.

> +{
> + char *bb;
> +
> + /* FIXME!! is this needed */
> + BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
> + BUG_ON(max == NULL);
> +
> + if (order > e4b->bd_blkbits + 1) {
> + *max = 0;
> + return NULL;
> + }
> +
> + /* at order 0 we see each particular block */
> + *max = 1 << (e4b->bd_blkbits + 3);
> + if (order == 0)
> + return EXT4_MB_BITMAP(e4b);
> +
> + bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
> + *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
> +
> + return bb;
> +}
> +
>
> ...
>
> +#else
> +#define mb_free_blocks_double(a, b, c, d)
> +#define mb_mark_used_double(a, b, c)
> +#define mb_cmp_bitmaps(a, b)
> +#endif

Please use the do{}while(0) thing.  Or, better, proper C functions which
have typechecking (unless this will cause undefined-var compile errors,
which happens sometimes)

> +/* find most significant bit */
> +static int fmsb(unsigned short word)
> +{
> + int order;
> +
> + if (word > 255) {
> + order = 7;
> + word >>= 8;
> + } else {
> + order = -1;
> + }
> +
> + do {
> + order++;
> + word >>= 1;
> + } while (word != 0);
> +
> + return order;
> +}

Did we just reinvent fls()?

> +/* FIXME!! need more doc */
> +static void ext4_mb_mark_free_simple(struct super_block *sb,
> + void *buddy, unsigned first, int len,
> + struct ext4_group_info *grp)
> +{
> + struct ext4_sb_info *sbi = EXT4_SB(sb);
> + unsigned short min;
> + unsigned short max;
> + unsigned short chunk;
> + unsigned short border;
> +
> + BUG_ON(len >= EXT4_BLOCKS_PER_GROUP(sb));
> +
> + border = 2 << sb->s_blocksize_bits;

Won't this explode with >= 32k blocksize?

> + while (len > 0) {
> + /* find how many blocks can be covered since this position */
> + max = ffs(first | border) - 1;
> +
> + /* find how many blocks of power 2 we need to mark */
> + min = fmsb(len);
> +
> + if (max < min)
> + min = max;
> + chunk = 1 << min;
> +
> + /* mark multiblock chunks only */
> + grp->bb_counters[min]++;
> + if (min > 0)
> + mb_clear_bit(first >> min,
> +  buddy + sbi->s_mb_offsets[min]);
> +
> + len -= chunk;
> + first += chunk;
> + }
> +}
> +
> 
> ...
>
> +static int ext4_mb_init_cache(struct page *page, char *incore)
> +{
> + int blocksize;
> + int blocks_per_page;
> + int groups_per_page;
> + int err = 0;
> + int i;
> + ext4_group_t first_group;
> + int first_block;