Hi Mingming,

New patches for patch queue can be found at
http://www.radian.org/~kvaneesh/ext4/jan-10-2008-ver2/

The changes are
------------
a) mballoc patch got an explanation about regular allocator.
b) mballoc regular allocator we changed the usage of ffs to fls. I guess
it makes sense to use fls because we want to compare it against the
tunable s_mb_order2_reqs. Only request above this order are using
criteria 0 allocation.
c) stripe.patch to use the stripe size set in the super block for block
allocation.

The diff is attached for reference.

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 0d31817..0085fde 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -468,7 +468,6 @@ static void ext4_mb_free_committed_blocks(struct 
super_block *);
 static void ext4_mb_return_to_preallocation(struct inode *inode,
                                        struct ext4_buddy *e4b, sector_t block,
                                        int count);
-static void ext4_mb_show_ac(struct ext4_allocation_context *ac);
 static void ext4_mb_put_pa(struct ext4_allocation_context *, struct 
super_block *,
                                                struct ext4_prealloc_space *pa);
 static int ext4_mb_init_per_dev_proc(struct super_block *sb);
@@ -1838,14 +1837,23 @@ static int ext4_mb_regular_allocator(struct 
ext4_allocation_context *ac)
        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
                goto out;
 
-       i = ffs(ac->ac_g_ex.fe_len);
+       /*
+        * ac->ac2_order is set only if the fe_len is a power of 2
+        * if ac2_order is set we also set criteria to 0 so whtat we
+        * try exact allocation using buddy.
+        */
+       i = fls(ac->ac_g_ex.fe_len);
        ac->ac_2order = 0;
-       /* FIXME!!
-        * What happens if i is still greater than s_mb_order2_reqs
+       /*
+        * We search using buddy data only if the order of the request
+        * is greater than equal to the sbi_s_mb_order2_reqs
+        * You can tune it via /proc/fs/ext4/<partition>/order2_req
         */
        if (i >= sbi->s_mb_order2_reqs) {
-               i--;
-               if ((ac->ac_g_ex.fe_len & (~(1 << i))) == 0)
+               /*
+                * This should tell if fe_len is exactly power of 2
+                */
+               if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
                        ac->ac_2order = i;
        }
 
@@ -1865,17 +1873,17 @@ static int ext4_mb_regular_allocator(struct 
ext4_allocation_context *ac)
                spin_unlock(&sbi->s_md_lock);
        }
 
+       /* searching for the right group start from the goal value specified */
        group = ac->ac_g_ex.fe_group;
 
        /* Let's just scan groups to find more-less suitable blocks */
        cr = ac->ac_2order ? 0 : 1;
+       /*
+        * cr == 0 try to get exact allocation,
+        * cr == 3  try to get anything
+        */
 repeat:
        for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
-               /* FIXME!!
-                * We need to explain what criteria is and also
-                * need to define the number 0 to 4 for criteria
-                * What they actually means.
-                */
                ac->ac_criteria = cr;
                for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
                        struct ext4_group_info *grp;
@@ -1889,23 +1897,28 @@ repeat:
                        if (grp->bb_free == 0)
                                continue;
 
+                       /*
+                        * if the group is already init we check whether it is
+                        * a good group and if not we don't load the buddy
+                        */
                        if (EXT4_MB_GRP_NEED_INIT(EXT4_GROUP_INFO(sb, group))) {
-                               /* we need full data about the group
-                                * to make a good selection */
+                               /*
+                                * we need full data about the group
+                                * to make a good selection
+                                */
                                err = ext4_mb_load_buddy(sb, group, &e4b);
                                if (err)
                                        goto out;
                                ext4_mb_release_desc(&e4b);
                        }
 
-                       /* check is group good for our criteries */
+                       /*
+                        * If the particular group doesn't satisfy our
+                        * criteria we continue with the next group
+                        */
                        if (!ext4_mb_good_group(ac, group, cr))
                                continue;
 
-                       /* FIXME!!
-                        * here also we are loading the buddy. so what 
difference
-                        * does EXT4_MB_GRP_NEED_INIT actually make
-                        */
                        err = ext4_mb_load_buddy(sb, group, &e4b);
                        if (err)
                                goto out;
@@ -3726,10 +3739,9 @@ repeat:
                busy = 0;
                ext4_unlock_group(sb, group);
                /*
-                * We see this quiet rare. But if a particular workload is
-                * effected by this we may need to add a waitqueue
+                * Yield the CPU here so that we don't get soft lockup
                 */
-               schedule_timeout(HZ);
+               schedule();
                goto repeat;
        }
 
@@ -3808,7 +3820,7 @@ repeat:
                        printk(KERN_ERR "uh-oh! used pa while discarding\n");
                        dump_stack();
                        current->state = TASK_UNINTERRUPTIBLE;
-                       schedule();
+                       schedule_timeout(HZ);
                        goto repeat;
 
                }
@@ -3832,8 +3844,12 @@ repeat:
                 * pa from inode's list may access already
                 * freed memory, bad-bad-bad */
 
+               /* XXX: if this happens too often, we can
+                * add a flag to force wait only in case
+                * of ->clear_inode(), but not in case of
+                * regular truncate */
                current->state = TASK_UNINTERRUPTIBLE;
-               schedule();
+               schedule_timeout(HZ);
                goto repeat;
        }
        spin_unlock(&ei->i_prealloc_lock);
@@ -3878,7 +3894,7 @@ static void ext4_mb_return_to_preallocation(struct inode 
*inode,
 {
        BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
 }
-
+#ifdef MB_DEBUG
 static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 {
        struct super_block *sb = ac->ac_sb;
@@ -3928,6 +3944,9 @@ static void ext4_mb_show_ac(struct 
ext4_allocation_context *ac)
        }
        printk(KERN_ERR "\n");
 }
+#else
+#define ext4_mb_show_ac(x)
+#endif
 
 /*
  * We use locality group preallocation for small size file. The size of the
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c69f4e5..9d91c60 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1775,6 +1775,21 @@ static ext4_fsblk_t descriptor_loc(struct super_block 
*sb,
        return (has_super + ext4_group_first_block_no(sb, bg));
 }
 
+static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
+{
+       unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
+       unsigned long stripe_width = 
le32_to_cpu(sbi->s_es->s_raid_stripe_width);
+
+       if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) {
+               return sbi->s_stripe;
+       } else if (stripe_width <= sbi->s_blocks_per_group) {
+               return stripe_width;
+       } else if (stride <= sbi->s_blocks_per_group) {
+               return stride;
+       }
+
+       return 0;
+}
 
 static int ext4_fill_super (struct super_block *sb, void *data, int silent)
                                __releases(kernel_sem)
@@ -2131,6 +2146,13 @@ static int ext4_fill_super (struct super_block *sb, void 
*data, int silent)
        sbi->s_rsv_window_head.rsv_alloc_hit = 0;
        sbi->s_rsv_window_head.rsv_goal_size = 0;
        ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
+       /*
+        * set the stripe size. If we have specified it via mount option, then
+        * use the mount option value. If the value specified at mount time is
+        * greater than the blocks per group use the super block value.
+        * Allocator needs it be less than blocks per group.
+        */
+       sbi->s_stripe = ext4_get_stripe_size(sbi);
 
        /*
         * set up enough so that it can read an inode
-
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to