This is the first patch posted for review for directory inode reservation.
Basic functional testing is done; the benchmark results are still on the way
(the benchmarks are really time-consuming).

The previous patch (v0.1) introduced two special inodes, which were named
magic inodes. The magic inode scheme modified the ext4 on-disk format, which
raised concerns from several people.

This time the patch (v1) removes the magic inodes, so there is no on-disk
format modification in this patch. Also, the directory inode reservation
feature is enabled only by a mount option; if you do not want to test it,
just leave out the mount option dir_ireserve=low/normal/high.

I will post a detailed description later. Any comments on this patch are very welcome :-)

Signed-off-by: Coly Li <[EMAIL PROTECTED]>
Cc: Andreas Dilger <[EMAIL PROTECTED]>
Cc: Mingming Cao <[EMAIL PROTECTED]>
---
 fs/ext4/ialloc.c           |  203 ++++++++++++++++++++++++++++++++++++++++++--
 fs/ext4/super.c            |   18 ++++-
 include/linux/ext4_fs.h    |    8 ++
 include/linux/ext4_fs_sb.h |    2 +
 4 files changed, 221 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index d775170..cbb9db9 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -130,6 +130,53 @@ error_out:
 }

 /*
+ * ext4_update_itable_unused - hand trailing inode-table space back to
+ * bg_itable_unused after a directory inode has been freed.
+ *
+ * Does nothing unless the freed inode's bit index in its group is a multiple
+ * of s_dir_ireserve_nr (the mask test assumes that value is a power of two).
+ * Otherwise the inode bitmap is scanned backwards from the end of the in-use
+ * region, one s_dir_ireserve_nr-sized window at a time, and every window
+ * without a set bit is marked unused again.  Group 0 is never shrunk below
+ * EXT4_DIR_IRESERVE_NORMAL in-use inodes.
+ *
+ * The caller must already hold the group's sb_bgl_lock spinlock.
+ * @handle is not used at present.
+ */
+static void ext4_update_itable_unused(handle_t *handle, struct inode *inode,
+                        struct ext4_group_desc *gdp, struct buffer_head *bitmap_bh)
+{
+       struct super_block *sb = inode->i_sb;
+       int ires = EXT4_SB(sb)->s_dir_ireserve_nr;
+       int bit, offset, start;
+       int free, group;
+
+       bit = (inode->i_ino - 1) % EXT4_INODES_PER_GROUP(sb);
+       if (bit & (ires - 1))
+               return;
+       /* "free" is the size of the in-use region, i.e. the first unused index */
+       free = EXT4_INODES_PER_GROUP(sb) - le16_to_cpu(gdp->bg_itable_unused);
+       if (free < ires)
+               return;
+       group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
+       do {
+               /*
+                * The in-use region need not be a multiple of ires, so clamp
+                * the window start at 0; a negative value is not a bit index.
+                */
+               start = (free > ires) ? free - ires : 0;
+               offset = ext4_find_next_bit(bitmap_bh->b_data, free, start);
+               if (offset < free)
+                       break;
+               free = start;
+       } while (free > 0);
+       if (group == 0 && (free < EXT4_DIR_IRESERVE_NORMAL))
+               free = EXT4_DIR_IRESERVE_NORMAL;
+       gdp->bg_itable_unused = cpu_to_le16(
+               EXT4_INODES_PER_GROUP(sb) - free);
+}
+
+/*
  * NOTE! When we get the inode, we're the only people
  * that have access to it, and as such there are no
  * race conditions we have to worry about. The inode
@@ -225,9 +260,14 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
                        spin_lock(sb_bgl_lock(sbi, block_group));
                        gdp->bg_free_inodes_count = cpu_to_le16(
                                le16_to_cpu(gdp->bg_free_inodes_count) + 1);
-                       if (is_directory)
+                       if (is_directory) {
                                gdp->bg_used_dirs_count = cpu_to_le16(
                                  le16_to_cpu(gdp->bg_used_dirs_count) - 1);
+                               /* called with sb_bgl_lock held */
+                               if (test_opt(sb, DIR_IRESERVE))
+                                       ext4_update_itable_unused(
+                                               handle, inode, gdp, bitmap_bh);
+                       }
                        gdp->bg_checksum = ext4_group_desc_csum(sbi,
                                                        block_group, gdp);
                        spin_unlock(sb_bgl_lock(sbi, block_group));
@@ -264,9 +303,10 @@ static int find_group_dir(struct super_block *sb, struct 
inode *parent,
                          ext4_grpnum_t *best_group)
 {
        ext4_grpnum_t ngroups = EXT4_SB(sb)->s_groups_count;
+       int ires = EXT4_SB(sb)->s_dir_ireserve_nr;
        unsigned int freei, avefreei;
-       struct ext4_group_desc *desc, *best_desc = NULL;
-       ext4_grpnum_t group;
+       struct ext4_group_desc *desc, *best_desc = NULL, *best_ires_desc = NULL;
+       ext4_grpnum_t group, best_ires_group = -1;
        int ret = -1;

        freei = 
percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
@@ -285,7 +325,21 @@ static int find_group_dir(struct super_block *sb, struct 
inode *parent,
                        best_desc = desc;
                        ret = 0;
                }
+               if(test_opt(sb, DIR_IRESERVE)) {
+                       if((best_ires_desc &&
+                          (le16_to_cpu(desc->bg_itable_unused) >
+                          le16_to_cpu(best_ires_desc->bg_itable_unused))) ||
+                          ((!best_ires_desc) &&
+                          (le16_to_cpu(desc->bg_itable_unused) >= ires))) {
+                               best_ires_group = group;
+                               best_ires_desc = desc;
+                               ret = 0;
+                       }
+               }
        }
+       if (test_opt(sb, DIR_IRESERVE) && best_ires_desc)
+               *best_group = best_ires_group;
+       
        return ret;
 }

@@ -354,6 +408,10 @@ static int find_group_orlov(struct super_block *sb, struct 
inode *parent,
                        desc = ext4_get_group_desc(sb, grp, NULL);
                        if (!desc || !desc->bg_free_inodes_count)
                                continue;
+                       if (test_opt(sb, DIR_IRESERVE) &&
+                           (le16_to_cpu(desc->bg_itable_unused)
+                                               < 
EXT4_SB(sb)->s_dir_ireserve_nr))
+                               continue;
                        if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
                                continue;
                        if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
@@ -390,6 +448,10 @@ static int find_group_orlov(struct super_block *sb, struct 
inode *parent,
                desc = ext4_get_group_desc(sb, *group, NULL);
                if (!desc || !desc->bg_free_inodes_count)
                        continue;
+               if (test_opt(sb, DIR_IRESERVE) &&
+                   (le16_to_cpu(desc->bg_itable_unused)
+                                       < EXT4_SB(sb)->s_dir_ireserve_nr))
+                       continue;
                if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
                        continue;
                if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
@@ -479,6 +541,127 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 }

 /*
+ * ext4_ino_from_ireserve - choose an inode bit under directory inode reservation
+ * @handle: journal handle for the bitmap and group descriptor buffers
+ * @dir:    parent directory of the inode being created
+ * @mode:   mode of the new inode; S_ISDIR() selects the reserving path
+ * @group:  in: first group to try; out (directories only): group chosen
+ * @ino:    out: 0-based bit index within the group's inode bitmap
+ *
+ * Non-directories: nothing is allocated here; *@ino is set to
+ * dir->i_ino % EXT4_INODES_PER_GROUP(sb), which the caller uses as the
+ * starting point of its own bitmap search.
+ *
+ * Directories: walk the groups starting at *@group and use the first one
+ * whose bg_itable_unused is at least s_dir_ireserve_nr.  The bit just past
+ * the in-use region is set in the inode bitmap, and bg_itable_unused is
+ * lowered so that s_dir_ireserve_nr inodes from that bit on count as in use.
+ *
+ * Returns 0 on success.  Every failure, journal errors included, is
+ * reported as -ENOSPC.
+ *
+ * NOTE(review): @group is an int pointer while find_group_dir() uses
+ * ext4_grpnum_t for group numbers -- confirm the caller's variable type.
+ */
+static int ext4_ino_from_ireserve(handle_t *handle, struct inode *dir,
+                                 int mode, int *group, unsigned long *ino)
+{
+       struct ext4_group_desc *gdp = NULL;
+       struct super_block *sb;
+       struct ext4_sb_info *sbi;
+       struct buffer_head *gdp_bh = NULL, *bitmap_bh = NULL;
+       int free;
+       int i;
+       int retries;
+       unsigned long ires_ino;
+       int ires_group = *group;
+
+       sb = dir->i_sb;
+       sbi = EXT4_SB(sb);
+
+       /* if the inode number is not for directory,
+        * only try to allocate after directory's inode
+        */
+       if (!S_ISDIR(mode)) {
+               ires_ino = dir->i_ino % EXT4_INODES_PER_GROUP(sb);
+               goto find;
+       }
+
+       /* reserve inodes for new directory */
+       for (i = 0; i < sbi->s_groups_count; i++) {
+               gdp = ext4_get_group_desc(sb, ires_group, &gdp_bh);
+               if (!gdp)
+                       goto fail;
+               retries = 2;
+still_reserve_in_this_group:
+               if (le16_to_cpu(gdp->bg_itable_unused) >=
+                   sbi->s_dir_ireserve_nr) {
+
+                       brelse(bitmap_bh);
+                       bitmap_bh = read_inode_bitmap(sb, ires_group);
+                       if (!bitmap_bh)
+                               goto fail;
+
+                       BUFFER_TRACE(bitmap_bh, "get_write_access");
+                       if (ext4_journal_get_write_access(handle, bitmap_bh) != 0)
+                               goto fail;
+                       /* first bit past the in-use region of this group */
+                       free = EXT4_INODES_PER_GROUP(sb) -
+                               le16_to_cpu(gdp->bg_itable_unused);
+                       if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, ires_group),
+                                       free, bitmap_bh->b_data)) {
+                               /* we won it */
+                               BUFFER_TRACE(bitmap_bh,
+                                       "call ext4_journal_dirty_metadata");
+                               if (ext4_journal_dirty_metadata(handle,
+                                                       bitmap_bh) != 0)
+                                       goto fail;
+                               ires_ino = free;
+                               goto find;
+                       }
+                       /* we lost it */
+                       jbd2_journal_release_buffer(handle, bitmap_bh);
+                       if (--retries > 0)
+                               goto still_reserve_in_this_group;
+               }
+               if (++ires_group == sbi->s_groups_count)
+                       ires_group = 0;
+       }
+       goto fail;
+find:
+       if (S_ISDIR(mode)) {
+               free = ires_ino + sbi->s_dir_ireserve_nr;
+               if (free > EXT4_INODES_PER_GROUP(sb))
+                       free = EXT4_INODES_PER_GROUP(sb);
+
+               /*
+                * Getting journal write access may block, so do it before
+                * taking the group spinlock.
+                */
+               BUFFER_TRACE(gdp_bh, "call ext4_journal_get_write_access");
+               if (ext4_journal_get_write_access(handle, gdp_bh))
+                       goto fail;
+               spin_lock(sb_bgl_lock(sbi, ires_group));
+               /* bg_itable_unused is stored little-endian: convert on store */
+               if ((EXT4_INODES_PER_GROUP(sb) - free) <
+                    le16_to_cpu(gdp->bg_itable_unused))
+                       gdp->bg_itable_unused = cpu_to_le16(
+                               EXT4_INODES_PER_GROUP(sb) - free);
+               spin_unlock(sb_bgl_lock(sbi, ires_group));
+               BUFFER_TRACE(gdp_bh, "call ext4_journal_dirty_metadata");
+               if (ext4_journal_dirty_metadata(handle, gdp_bh) != 0)
+                       goto fail;
+               brelse(bitmap_bh);
+               *group = ires_group;
+       }
+       *ino = ires_ino;
+       return 0;
+fail:
+       brelse(bitmap_bh);
+       return -ENOSPC;
+}
+
+/*
  * There are two policies for allocating an inode.  If the new inode is
  * a directory, then a forward search is made for a block group with both
  * free space and a low directory-to-inode ratio; if that fails, then of
@@ -541,7 +705,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct 
inode * dir, int mode)
                        goto fail;

                ino = 0;
-
+               if (test_opt(sb, DIR_IRESERVE)) {
+                       err = ext4_ino_from_ireserve(handle, dir,
+                                                    mode, &group, &ino);
+                       if ((!err) && S_ISDIR(mode))
+                               goto got;
+               }
 repeat_in_this_group:
                ino = ext4_find_next_zero_bit((unsigned long *)
                                bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), 
ino);
@@ -633,6 +802,20 @@ got:
        }

        spin_lock(sb_bgl_lock(sbi, group));
+
+       if (test_opt(sb, DIR_IRESERVE)) {
+               free = EXT4_INODES_PER_GROUP(sb) -
+                       le16_to_cpu(gdp->bg_itable_unused);
+               if (ino > free) {
+                       free += sbi->s_dir_ireserve_nr;
+                       free = (free + sbi->s_dir_ireserve_nr - 1) &
+                               ~(sbi->s_dir_ireserve_nr - 1);
+                       if (free > EXT4_INODES_PER_GROUP(sb))
+                               free = EXT4_INODES_PER_GROUP(sb);
+                       gdp->bg_itable_unused = cpu_to_le16(
+                               EXT4_INODES_PER_GROUP(sb) - free);
+               }
+       }
        /* If we didn't allocate from within the initialized part of the inode
         * table then we need to initialize up to this inode. */
        if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
@@ -655,12 +838,14 @@ got:
                /*
                 * Check the relative inode number against the last used
                 * relative inode number in this group. if it is greater
-                * we need to  update the bg_itable_unused count
-                *
+                * we need to  update the bg_itable_unused count. If
+                * directory inode reservation is enabled, try to make it
+                * align on a s_dir_ireserve_nr boundary.
                 */
-               if (ino > free)
-                       gdp->bg_itable_unused =
-                               cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
+               if (ino > free) {
+                       gdp->bg_itable_unused = cpu_to_le16(
+                               EXT4_INODES_PER_GROUP(sb) - ino);
+               }
        }

        gdp->bg_free_inodes_count =
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 37afc41..159021b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -874,11 +874,12 @@ enum {
        Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
        Opt_journal_checksum, Opt_journal_async_commit,
        Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+       Opt_dir_ireserve_low, Opt_dir_ireserve_normal, Opt_dir_ireserve_high,
        Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
        Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
        Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
        Opt_grpquota, Opt_extents, Opt_noextents, Opt_delalloc,
-       Opt_mballoc, Opt_nomballoc, Opt_stripe,
+       Opt_mballoc, Opt_nomballoc, Opt_stripe,
 };

 static match_table_t tokens = {
@@ -919,6 +920,9 @@ static match_table_t tokens = {
        {Opt_data_journal, "data=journal"},
        {Opt_data_ordered, "data=ordered"},
        {Opt_data_writeback, "data=writeback"},
+       {Opt_dir_ireserve_low, "dir_ireserve=low"},
+       {Opt_dir_ireserve_normal, "dir_ireserve=normal"},
+       {Opt_dir_ireserve_high, "dir_ireserve=high"},
        {Opt_offusrjquota, "usrjquota="},
        {Opt_usrjquota, "usrjquota=%s"},
        {Opt_offgrpjquota, "grpjquota="},
@@ -1297,6 +1301,18 @@ clear_qf_name:
                                return 0;
                        sbi->s_stripe = option;
                        break;
+               case Opt_dir_ireserve_low:
+                       set_opt(sbi->s_mount_opt, DIR_IRESERVE);
+                       sbi->s_dir_ireserve_nr = EXT4_DIR_IRESERVE_LOW;
+                       break;
+               case Opt_dir_ireserve_normal:
+                       set_opt(sbi->s_mount_opt, DIR_IRESERVE);
+                       sbi->s_dir_ireserve_nr = EXT4_DIR_IRESERVE_NORMAL;
+                       break;
+               case Opt_dir_ireserve_high:
+                       set_opt(sbi->s_mount_opt, DIR_IRESERVE);
+                       sbi->s_dir_ireserve_nr = EXT4_DIR_IRESERVE_HIGH;
+                       break;
                default:
                        printk (KERN_ERR
                                "EXT4-fs: Unrecognized mount option \"%s\" "
diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h
index 8d56b86..a8332bd 100644
--- a/include/linux/ext4_fs.h
+++ b/include/linux/ext4_fs.h
@@ -92,6 +92,13 @@ struct ext4_allocation_request {
 #define EXT4_GOOD_OLD_FIRST_INO        11

 /*
+ * Macro-instructions used to reserve inodes for directories
+ */
+#define EXT4_DIR_IRESERVE_LOW          16
+#define EXT4_DIR_IRESERVE_NORMAL       64
+#define EXT4_DIR_IRESERVE_HIGH         128
+
+/*
  * Maximal count of links to a file
  */
 #define EXT4_LINK_MAX          65000
@@ -502,6 +509,7 @@ do {                                                        
                       \
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT        0x1000000 /* Journal Async 
Commit */
 #define EXT4_MOUNT_DELALLOC            0x2000000 /* Delalloc support */
 #define EXT4_MOUNT_MBALLOC             0x4000000 /* Buddy allocation support */
+#define EXT4_MOUNT_DIR_IRESERVE                0x10000000/* directory inodes 
reservation support */
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
 #define clear_opt(o, opt)              o &= ~EXT4_MOUNT_##opt
diff --git a/include/linux/ext4_fs_sb.h b/include/linux/ext4_fs_sb.h
index 4098d4f..fa5e866 100644
--- a/include/linux/ext4_fs_sb.h
+++ b/include/linux/ext4_fs_sb.h
@@ -147,6 +147,8 @@ struct ext4_sb_info {

        /* locality groups */
        struct ext4_locality_group *s_locality_groups;
+       /* directory inodes reservation number */
+       int s_dir_ireserve_nr;
 };
 #define EXT4_GROUP_INFO(sb, group)                                        \
        EXT4_SB(sb)->s_group_info[(group) >> EXT4_DESC_PER_BLOCK_BITS(sb)] \



-- 
Coly Li
SuSE PRC Labs
-
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to