Re: [PATCH 41/49] ext4: Add multi block allocator for ext4

2008-01-24 Thread Aneesh Kumar K.V
updated patch. Waiting for the test results.

I am only attaching the diff. Mballoc patch is really large.

-aneesh
diff --git a/Documentation/filesystems/ext4.txt 
b/Documentation/filesystems/ext4.txt
index 4f329af..ec7d349 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -89,6 +89,8 @@ When mounting an ext4 filesystem, the following option are 
accepted:
 extentsext4 will use extents to address file data.  The
file system will no longer be mountable by ext3.
 
+noextents  ext4 will not use extents for new files created.
+
 journal_checksum   Enable checksumming of the journal transactions.
This will allow the recovery code in e2fsck and the
kernel to detect corruption in the kernel.  It is a
@@ -206,6 +208,10 @@ nobh   (a) cache disk block mapping 
information
nobh option tries to avoid associating buffer
heads (supported only for writeback mode).
 
+mballoc(*) Use the mutliblock allocator for block 
allocation
+nomballoc  disabled multiblock allocator for block allocation.
+stripe=n   filesystem blocks per stripe for a RAID configuration.
+
 
 Data Mode
 -
diff --git a/Documentation/filesystems/proc.txt 
b/Documentation/filesystems/proc.txt
index dec9945..4413a2d 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -857,6 +857,45 @@ CPUs.
 The   procs_blocked line gives  the  number of  processes currently blocked,
 waiting for I/O to complete.
 
+1.9 Ext4 file system parameters
+--
+Ext4 file system have one directory per partition under /proc/fs/ext4/
+# ls /proc/fs/ext4/hdc/
+group_prealloc  max_to_scan  mb_groups  mb_history  min_to_scan  order2_req
+stats  stream_req
+
+mb_groups:
+This file gives the details of mutiblock allocator buddy cache of free blocks
+
+mb_history:
+Multiblock allocation history.
+
+stats:
+This file indicate whether the multiblock allocator should start collecting
+statistics. The statistics are shown during unmount
+
+group_prealloc:
+The multiblock allocator normalize the block allocation request to
+group_prealloc filesystem blocks if we don't have strip value set.
+The stripe value can be specified at mount time or during mke2fs.
+
+max_to_scan:
+How long multiblock allocator can look for a best extent (in found extents)
+
+min_to_scan:
+How long multiblock allocator  must look for a best extent
+
+order2_req:
+Multiblock allocator use  2^N search using buddies only for requests greater
+than or equal to order2_req. The request size is specfied in file system
+blocks. A value of 2 indicate only if the requests are greater than or equal
+to 4 blocks.
+
+stream_req:
+Files smaller than stream_req are served by the stream allocator, whose
+purpose is to pack requests as close each to other as possible to
+produce smooth I/O traffic. Avalue of 16 indicate that file smaller than 16
+filesystem block size will use group based preallocation.
 
 --
 Summary
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 0398aa0..310bad6 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -489,7 +489,7 @@ struct ext4_free_extent {
  */
 struct ext4_locality_group {
/* for allocator */
-   struct semaphorelg_sem; /* to serialize allocates */
+   struct mutexlg_sem; /* to serialize allocates */
struct list_headlg_prealloc_list;/* list of preallocations */
spinlock_t  lg_prealloc_lock;
 };
@@ -563,7 +563,10 @@ struct ext4_buddy {
 #define EXT4_MB_BUDDY(e4b) ((e4b)-bd_buddy)
 
 #ifndef EXT4_MB_HISTORY
-#define ext4_mb_store_history(ac)
+static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
+{
+   return;
+}
 #else
 static void ext4_mb_store_history(struct ext4_allocation_context *ac);
 #endif
@@ -641,6 +644,10 @@ static ext4_fsblk_t ext4_grp_offs_to_block(struct 
super_block *sb,
 
 static inline int mb_test_bit(int bit, void *addr)
 {
+   /*
+* ext4_test_bit on architecture like powerpc
+* needs unsigned long aligned address
+*/
mb_correct_addr_and_bit(bit, addr);
return ext4_test_bit(bit, addr);
 }
@@ -669,7 +676,7 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, 
int bit, void *addr)
ext4_clear_bit_atomic(lock, bit, addr);
 }
 
-static inline void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
+static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
 {
char *bb;
 
@@ -752,9 +759,20 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void 
*bitmap)
 }
 
 #else
-#define mb_free_blocks_double(a, b, c, d)
-#define mb_mark_used_double(a, b, c)
-#define 

Re: [PATCH 41/49] ext4: Add multi block allocator for ext4

2008-01-24 Thread Aneesh Kumar K.V
On Thu, Jan 24, 2008 at 01:26:14PM +0530, Aneesh Kumar K.V wrote:
 
  
   +/* find most significant bit */
   +static int fmsb(unsigned short word)
   +{
   + int order;
   +
   + if (word  255) {
   + order = 7;
   + word = 8;
   + } else {
   + order = -1;
   + }
   +
   + do {
   + order++;
   + word = 1;
   + } while (word != 0);
   +
   + return order;
   +}
  
  Did we just reinvent fls()?
 
 replaced by fls.
 
  

That should be fls() - 1;

The full patch is at

http://www.radian.org/~kvaneesh/ext4/jan-24-2008/mballoc-core.patch

The patch is too big to inline.

-aneesh
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 41/49] ext4: Add multi block allocator for ext4

2008-01-23 Thread Andrew Morton
 On Mon, 21 Jan 2008 22:02:20 -0500 Theodore Ts'o [EMAIL PROTECTED] wrote:
 From: Alex Tomas [EMAIL PROTECTED]
 
 Signed-off-by: Alex Tomas [EMAIL PROTECTED]
 Signed-off-by: Andreas Dilger [EMAIL PROTECTED]
 Signed-off-by: Aneesh Kumar K.V [EMAIL PROTECTED]
 Signed-off-by: Eric Sandeen [EMAIL PROTECTED]
 Signed-off-by: Theodore Ts'o [EMAIL PROTECTED]

 ...

 +#if BITS_PER_LONG == 64
 +#define mb_correct_addr_and_bit(bit, addr)   \
 +{\
 + bit += ((unsigned long) addr  7UL)  3;   \
 + addr = (void *) ((unsigned long) addr  ~7UL);  \
 +}
 +#elif BITS_PER_LONG == 32
 +#define mb_correct_addr_and_bit(bit, addr)   \
 +{\
 + bit += ((unsigned long) addr  3UL)  3;   \
 + addr = (void *) ((unsigned long) addr  ~3UL);  \
 +}
 +#else
 +#error how many bits you are?!
 +#endif

Why do these exist?

 +static inline int mb_test_bit(int bit, void *addr)
 +{
 + mb_correct_addr_and_bit(bit, addr);
 + return ext4_test_bit(bit, addr);
 +}

ext2_test_bit() already handles bitnum  wordsize.

If mb_correct_addr_and_bit() is actually needed then some suitable comment
would help.

 +static inline void mb_set_bit(int bit, void *addr)
 +{
 + mb_correct_addr_and_bit(bit, addr);
 + ext4_set_bit(bit, addr);
 +}
 +
 +static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
 +{
 + mb_correct_addr_and_bit(bit, addr);
 + ext4_set_bit_atomic(lock, bit, addr);
 +}
 +
 +static inline void mb_clear_bit(int bit, void *addr)
 +{
 + mb_correct_addr_and_bit(bit, addr);
 + ext4_clear_bit(bit, addr);
 +}
 +
 +static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
 +{
 + mb_correct_addr_and_bit(bit, addr);
 + ext4_clear_bit_atomic(lock, bit, addr);
 +}
 +
 +static inline void *mb_find_buddy(struct ext4_buddy *e4b, int order, int 
 *max)

uninlining this will save about eighty squigabytes of text.

Please review all of ext4/jbd2 with a view to removig unnecessary and wrong
inlings.

 +{
 + char *bb;
 +
 + /* FIXME!! is this needed */
 + BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
 + BUG_ON(max == NULL);
 +
 + if (order  e4b-bd_blkbits + 1) {
 + *max = 0;
 + return NULL;
 + }
 +
 + /* at order 0 we see each particular block */
 + *max = 1  (e4b-bd_blkbits + 3);
 + if (order == 0)
 + return EXT4_MB_BITMAP(e4b);
 +
 + bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b-bd_sb)-s_mb_offsets[order];
 + *max = EXT4_SB(e4b-bd_sb)-s_mb_maxs[order];
 +
 + return bb;
 +}
 +

 ...

 +#else
 +#define mb_free_blocks_double(a, b, c, d)
 +#define mb_mark_used_double(a, b, c)
 +#define mb_cmp_bitmaps(a, b)
 +#endif

Please use the do{}while(0) thing.  Or, better, proper C functions which
have typechecking (unless this will cause undefined-var compile errors,
which happens sometimes)

 +/* find most significant bit */
 +static int fmsb(unsigned short word)
 +{
 + int order;
 +
 + if (word  255) {
 + order = 7;
 + word = 8;
 + } else {
 + order = -1;
 + }
 +
 + do {
 + order++;
 + word = 1;
 + } while (word != 0);
 +
 + return order;
 +}

Did we just reinvent fls()?

 +/* FIXME!! need more doc */
 +static void ext4_mb_mark_free_simple(struct super_block *sb,
 + void *buddy, unsigned first, int len,
 + struct ext4_group_info *grp)
 +{
 + struct ext4_sb_info *sbi = EXT4_SB(sb);
 + unsigned short min;
 + unsigned short max;
 + unsigned short chunk;
 + unsigned short border;
 +
 + BUG_ON(len = EXT4_BLOCKS_PER_GROUP(sb));
 +
 + border = 2  sb-s_blocksize_bits;

Won't this explode with = 32k blocksize?

 + while (len  0) {
 + /* find how many blocks can be covered since this position */
 + max = ffs(first | border) - 1;
 +
 + /* find how many blocks of power 2 we need to mark */
 + min = fmsb(len);
 +
 + if (max  min)
 + min = max;
 + chunk = 1  min;
 +
 + /* mark multiblock chunks only */
 + grp-bb_counters[min]++;
 + if (min  0)
 + mb_clear_bit(first  min,
 +  buddy + sbi-s_mb_offsets[min]);
 +
 + len -= chunk;
 + first += chunk;
 + }
 +}
 +
 
 ...

 +static int ext4_mb_init_cache(struct page *page, char *incore)
 +{
 + int blocksize;
 + int blocks_per_page;
 + int groups_per_page;
 + int err = 0;
 + int i;
 + ext4_group_t first_group;
 + int first_block;
 + struct super_block *sb;
 + struct buffer_head *bhs;
 + struct buffer_head **bh;
 + struct inode *inode;
 + char *data;
 + char *bitmap;
 +
 + mb_debug(init page %lu\n, page-index);

Re: [PATCH 41/49] ext4: Add multi block allocator for ext4

2008-01-23 Thread Andreas Dilger
On Jan 23, 2008  14:07 -0800, Andrew Morton wrote:
  +#define mb_correct_addr_and_bit(bit, addr) \
  +{  \
  +   bit += ((unsigned long) addr  3UL)  3;   \
  +   addr = (void *) ((unsigned long) addr  ~3UL);  \
  +}
 
 Why do these exist?

They seem to be a holdover from when mballoc stored the buddy bitmaps
on disk.  That no longer happens (to avoid bitmap vs. buddy consistency
problems), so I suspect they can be removed.

I can't comment on many of the other issues because Alex wrote most
of the code.

 Gosh what a lot of code.  Is it faster?

Yes, and also importantly it uses a lot less CPU to do a given amount
of allocation, which is critical in our environments where there is
very high disk bandwidth on a single node and CPU becomes the limiting
factor of the IO speed.  This of course also helps any write-intensive
environment where the CPU is doing something useful.

Some older test results include:
https://ols2006.108.redhat.com/2007/Reprints/mathur-Reprint.pdf (Section 7)

In the linux-ext4 thread compilebench numbers for ext4:
http://www.mail-archive.com/linux-ext4@vger.kernel.org/msg03834.html

http://oss.oracle.com/~mason/compilebench/ext4/ext-create-compare.png
http://oss.oracle.com/~mason/compilebench/ext4/ext-compile-compare.png
http://oss.oracle.com/~mason/compilebench/ext4/ext-read-compare.png
http://oss.oracle.com/~mason/compilebench/ext4/ext-rm-compare.png

note the ext-read-compare.png graph shows lower read performance, but
a couple of bugs in mballoc were since fixed to have ext4 allocate more
contiguous extents.

In the old linux-ext4 thread [RFC] delayed allocation testing on node zefir
http://www.mail-archive.com/linux-ext4@vger.kernel.org/msg00587.html

: dd2048rw 
: REAL   UTIME  STIME  READWRITTEN DETAILS
EXT3: 58.46  23 1491   25722097292 17 extents
EXT4: 44.56  19 1018   12  2097244 19 extents
REISERFS: 56.80  26 1370   29522097336 457 extents
JFS : 45.77  22 9840   2097216 1 extents
XFS : 50.97  20 1394   0   2100825 7 extents

: kernuntar
: REAL   UTIME  STIME  READWRITTEN DETAILS
EXT3: 56.99  5037   65168  252016  
EXT4: 55.03  5034   55336  249884  
REISERFS: 52.55  4996   85464  238068  
JFS : 70.15  5057   630496 288116  
XFS : 72.84  5052   953132 316798  

: kernstat 
: REAL   UTIME  STIME  READWRITTEN DETAILS
EXT3: 2.83   8  15 58920   
EXT4: 0.51   9  10 58920   
REISERFS: 0.81   7  49 26960   
JFS : 6.19   11 49 12552   0   
XFS : 2.09   9  61 65040   

: kerncat  
: REAL   UTIME  STIME  READWRITTEN DETAILS
EXT3: 9.48   25 213241624  0   
EXT4: 6.29   27 197238560  0   
REISERFS: 14.69  33 230234744  0   
JFS : 23.51  23 231244596  0   
XFS : 18.24  36 254238548  0   

: kernrm   
: REAL   UTIME  STIME  READWRITTEN DETAILS
EXT3: 4.82   4  10896284672
EXT4: 1.61   5  11065364632
REISERFS: 3.15   8  2762768236 
JFS : 33.90  7  16814400   33048   
XFS : 20.03  8  296663286160   


Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 41/49] ext4: Add multi block allocator for ext4

2008-01-23 Thread Aneesh Kumar K.V
On Wed, Jan 23, 2008 at 02:07:27PM -0800, Andrew Morton wrote:
  On Mon, 21 Jan 2008 22:02:20 -0500 Theodore Ts'o [EMAIL PROTECTED] 
  wrote:
  From: Alex Tomas [EMAIL PROTECTED]
  
  Signed-off-by: Alex Tomas [EMAIL PROTECTED]
  Signed-off-by: Andreas Dilger [EMAIL PROTECTED]
  Signed-off-by: Aneesh Kumar K.V [EMAIL PROTECTED]
  Signed-off-by: Eric Sandeen [EMAIL PROTECTED]
  Signed-off-by: Theodore Ts'o [EMAIL PROTECTED]
 
  ...
 
  +#if BITS_PER_LONG == 64
  +#define mb_correct_addr_and_bit(bit, addr) \
  +{  \
  +   bit += ((unsigned long) addr  7UL)  3;   \
  +   addr = (void *) ((unsigned long) addr  ~7UL);  \
  +}
  +#elif BITS_PER_LONG == 32
  +#define mb_correct_addr_and_bit(bit, addr) \
  +{  \
  +   bit += ((unsigned long) addr  3UL)  3;   \
  +   addr = (void *) ((unsigned long) addr  ~3UL);  \
  +}
  +#else
  +#error how many bits you are?!
  +#endif
 
 Why do these exist?

Initial version on mballoc supported on x86 32 this was there to give
compile warning on 64 bit platform. I guess we can remove that now.
Or may be we can keep it as such because it is harmless.


 
  +static inline int mb_test_bit(int bit, void *addr)
  +{
  +   mb_correct_addr_and_bit(bit, addr);
  +   return ext4_test_bit(bit, addr);
  +}
 
 ext2_test_bit() already handles bitnum  wordsize.
 
 If mb_correct_addr_and_bit() is actually needed then some suitable comment
 would help.

ext4_test_bit on powerpc needs the addr to be 8 byte aligned. Othewise
it fails

 
  +static inline void mb_set_bit(int bit, void *addr)
  +{
  +   mb_correct_addr_and_bit(bit, addr);
  +   ext4_set_bit(bit, addr);
  +}
  +
  +static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
  +{
  +   mb_correct_addr_and_bit(bit, addr);
  +   ext4_set_bit_atomic(lock, bit, addr);
  +}
  +
  +static inline void mb_clear_bit(int bit, void *addr)
  +{
  +   mb_correct_addr_and_bit(bit, addr);
  +   ext4_clear_bit(bit, addr);
  +}
  +
  +static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void 
  *addr)
  +{
  +   mb_correct_addr_and_bit(bit, addr);
  +   ext4_clear_bit_atomic(lock, bit, addr);
  +}
  +
  +static inline void *mb_find_buddy(struct ext4_buddy *e4b, int order, int 
  *max)
 
 uninlining this will save about eighty squigabytes of text.

Fixed


 
 Please review all of ext4/jbd2 with a view to removig unnecessary and wrong
 inlings.
 
  +{
  +   char *bb;
  +
  +   /* FIXME!! is this needed */
  +   BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
  +   BUG_ON(max == NULL);
  +
  +   if (order  e4b-bd_blkbits + 1) {
  +   *max = 0;
  +   return NULL;
  +   }
  +
  +   /* at order 0 we see each particular block */
  +   *max = 1  (e4b-bd_blkbits + 3);
  +   if (order == 0)
  +   return EXT4_MB_BITMAP(e4b);
  +
  +   bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b-bd_sb)-s_mb_offsets[order];
  +   *max = EXT4_SB(e4b-bd_sb)-s_mb_maxs[order];
  +
  +   return bb;
  +}
  +
 
  ...
 
  +#else
  +#define mb_free_blocks_double(a, b, c, d)
  +#define mb_mark_used_double(a, b, c)
  +#define mb_cmp_bitmaps(a, b)
  +#endif
 
 Please use the do{}while(0) thing.  Or, better, proper C functions which
 have typechecking (unless this will cause undefined-var compile errors,
 which happens sometimes)

makde static inline void.

 
  +/* find most significant bit */
  +static int fmsb(unsigned short word)
  +{
  +   int order;
  +
  +   if (word  255) {
  +   order = 7;
  +   word = 8;
  +   } else {
  +   order = -1;
  +   }
  +
  +   do {
  +   order++;
  +   word = 1;
  +   } while (word != 0);
  +
  +   return order;
  +}
 
 Did we just reinvent fls()?

replaced by fls.

 
  +/* FIXME!! need more doc */
  +static void ext4_mb_mark_free_simple(struct super_block *sb,
  +   void *buddy, unsigned first, int len,
  +   struct ext4_group_info *grp)
  +{
  +   struct ext4_sb_info *sbi = EXT4_SB(sb);
  +   unsigned short min;
  +   unsigned short max;
  +   unsigned short chunk;
  +   unsigned short border;
  +
  +   BUG_ON(len = EXT4_BLOCKS_PER_GROUP(sb));
  +
  +   border = 2  sb-s_blocksize_bits;
 
 Won't this explode with = 32k blocksize?
 
  +   while (len  0) {
  +   /* find how many blocks can be covered since this position */
  +   max = ffs(first | border) - 1;
  +
  +   /* find how many blocks of power 2 we need to mark */
  +   min = fmsb(len);
  +
  +   if (max  min)
  +   min = max;
  +   chunk = 1  min;
  +
  +   /* mark multiblock chunks only */
  +   grp-bb_counters[min]++;
  +   if (min  0)
  +   mb_clear_bit(first  min,
  +buddy + sbi-s_mb_offsets[min]);
  +
  +   len -= chunk;
  +   first += chunk;
  +   }
  +}
  +
  
  ...
 
  +static