Hello all ext3 users,
included is a patch to ext3-0.0.2d which fixes the problem with ext3
filesystems "losing inodes" upon a system crash.  What happens is that if
a file is open, and then the last directory entry is unlinked, the file
remains on the disk until it is closed (this is normal Unix behaviour).
However, if the system crashes before the file is closed, the ext3 code
does not do anything with the unlinked file - it sits on disk using up
blocks.  Since e2fsck is never run on an ext3 filesystem, these inodes
and their blocks are never recovered.

This patch (against vanilla 2.2.14 with Stephen Tweedie's ext3 0.0.2d
patch, ftp://ftp.uk.linux.org/pub/linux/sct/fs/jfs/) keeps a singly-linked
list of such "orphan" inodes on disk using the dtime (deletion time)
field in the on-disk inode, and a new "s_last_orphan" field in the
on-disk superblock.

At filesystem mount time (or remount time, if the filesystem is initially
mounted read-only), after journal recovery is complete, the kernel walks
the list of orphan inodes, disposing of them.

There is a slight risk if you crash your ext3 filesystem, and then don't
let the orphan code have a chance to clean up (e.g. by mounting the
filesystem read-only), run e2fsck, start using it as an ext2 filesystem,
and then go back to ext3.  However, I'm submitting a patch to Ted for
e2fsck to fix this behaviour (it has to do with how deleted inodes
are handled).

The patch also includes fixes for socket filetype handling, new code
for i_version/i_generation (from 2.2.14 ext2), as well as some newer
patches that Stephen and I worked on which fix a timer problem and an
ext3_write_super bug (you would get an oops if you unmounted a filesystem
right after writing into it, if you didn't run sync first).

The original idea for the implementation came from Stephen, but he has
not had a chance to look at the code yet, so I don't know whether he
likes or hates the way I've implemented this, but I'm throwing it out
here to get some feedback from other ext3 users.  All comments regarding
this patch should be directed to fsdevel or to me, not to Stephen.

I've tested the code with interruptions with open inodes, followed
by immediate recovery, delayed recovery after mounting ro then rw, and
delayed recovery immediately after e2fsck.  It appears to be working fine,
but like all things in life, YMMV.  Please don't use this on production
systems yet.

Cheers, Andreas
--- cut here ---
diff -ru linux-2.2.14-ext3/fs/ext3/ialloc.c linux-2.2.14-ext3e/fs/ext3/ialloc.c
--- linux-2.2.14-ext3/fs/ext3/ialloc.c  Tue May 23 17:46:31 2000
+++ linux-2.2.14-ext3e/fs/ext3/ialloc.c Wed May 24 15:32:53 2000
@@ -270,21 +270,6 @@
 }
 
 /*
- * This function increments the inode version number
- *
- * This may be used one day by the NFS server
- */
-static void inc_inode_version (struct inode * inode,
-                              struct ext3_group_desc *gdp,
-                              int mode)
-{
-       inode->u.ext3_i.i_version++;
-       mark_inode_dirty(inode);
-
-       return;
-}
-
-/*
  * There are two policies for allocating an inode.  If the new inode is
  * a directory, then a forward search is made for a block group with both
  * free space and a low directory-to-inode ratio; if that fails, then of
@@ -497,13 +482,15 @@
        inode->u.ext3_i.i_file_acl = 0;
        inode->u.ext3_i.i_dir_acl = 0;
        inode->u.ext3_i.i_dtime = 0;
+       INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
        inode->u.ext3_i.i_block_group = i;
        inode->i_op = NULL;
        if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL)
                inode->i_flags |= MS_SYNCHRONOUS;
        insert_inode_hash(inode);
+       inode->i_generation = inode_generation_count++;
+       inode->u.ext3_i.i_version = inode->i_generation;
        ext3_mark_inode_dirty(handle, inode);
-       inc_inode_version (inode, gdp, mode);
 
        unlock_super (sb);
        if(DQUOT_ALLOC_INODE(sb, inode)) {
diff -ru linux-2.2.14-ext3/fs/ext3/inode.c linux-2.2.14-ext3e/fs/ext3/inode.c
--- linux-2.2.14-ext3/fs/ext3/inode.c   Tue May 23 17:46:31 2000
+++ linux-2.2.14-ext3e/fs/ext3/inode.c  Fri May 26 10:27:16 2000
@@ -38,9 +38,43 @@
  */
 void ext3_put_inode (struct inode * inode)
 {
+       jfs_debug(5, "putting inode %ld\n", inode->i_ino);
        ext3_discard_prealloc (inode);
 }
 
+/* ext3_orphan_del() removes an unlinked-but-still-referenced inode from the
+ * list of such inodes stored on disk, because it is finally being deleted.
+ * The predecessor is sampled only once the super lock is held: until then
+ * another task may insert a new list head through ext3_orphan_add(). */
+static void ext3_orphan_del(handle_t *handle, struct inode *inode)
+{
+       struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb);
+       struct list_head *prev;
+       ino_t ino_next;
+
+       if (list_empty(&inode->u.ext3_i.i_orphan))
+               return;
+       jfs_debug(4, "removing orphan inode %ld\n", inode->i_ino);
+       lock_super(inode->i_sb);
+       prev = inode->u.ext3_i.i_orphan.prev;
+       ino_next = NEXT_ORPHAN(inode);
+       list_del(&inode->u.ext3_i.i_orphan);
+       if (prev == &sbi->s_orphan) {
+               jfs_debug(4, "superblock will point to %ld\n",ino_next);
+               journal_get_write_access(handle, sbi->s_sbh);
+               sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+               journal_dirty_metadata(handle, sbi->s_sbh);
+       } else {
+               struct inode *i_prev =
+                       list_entry(prev, struct inode, u.ext3_i.i_orphan);
+               jfs_debug(4, "orphan inode %ld will point to %ld\n",
+                         i_prev->i_ino, ino_next);
+               NEXT_ORPHAN(i_prev) = ino_next;
+               ext3_mark_inode_dirty(handle, i_prev);
+       }
+       unlock_super(inode->i_sb);
+}
+
 /*
  * Called at the last iput() if i_nlink is zero.
  */
@@ -51,11 +85,17 @@
        if (inode->i_ino == EXT3_ACL_IDX_INO ||
            inode->i_ino == EXT3_ACL_DATA_INO)
                return;
-       inode->u.ext3_i.i_dtime = CURRENT_TIME;
+
+       /* When we delete an inode, we increment its i_version. If it
+          is ever read in from disk again, it will have a different
+          i_version. */
+       inode->u.ext3_i.i_version++;
 
        handle = journal_start(EXT3_JOURNAL(inode), 
                               EXT3_DELETE_TRANS_BLOCKS);
-       
+
+       ext3_orphan_del(handle, inode);
+       inode->u.ext3_i.i_dtime = CURRENT_TIME;
        if (IS_SYNC(inode))
                handle->h_sync = 1;
        ext3_mark_inode_dirty(handle, inode);
@@ -555,6 +595,10 @@
        inode->i_ctime = le32_to_cpu(iloc.raw_inode->i_ctime);
        inode->i_mtime = le32_to_cpu(iloc.raw_inode->i_mtime);
        inode->u.ext3_i.i_dtime = le32_to_cpu(iloc.raw_inode->i_dtime);
+       if (inode->i_nlink == 0) {
+               jfs_debug(4, "orphan inode %ld points to next inode %d\n",
+                         inode->i_ino, NEXT_ORPHAN(inode));
+       }
        inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size (for stat), not the fs block size */
        inode->i_blocks = le32_to_cpu(iloc.raw_inode->i_blocks);
        inode->i_version = ++global_event;
@@ -580,9 +624,11 @@
 #endif
        }
        inode->u.ext3_i.i_version = le32_to_cpu(iloc.raw_inode->i_version);
+       inode->i_generation = inode->u.ext3_i.i_version;
        inode->u.ext3_i.i_block_group = iloc.block_group;
        inode->u.ext3_i.i_next_alloc_block = 0;
        inode->u.ext3_i.i_next_alloc_goal = 0;
+       INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
        if (inode->u.ext3_i.i_prealloc_count)
                ext3_error (inode->i_sb, "ext3_read_inode",
                            "New inode has non-zero prealloc count!");
diff -ru linux-2.2.14-ext3/fs/ext3/ioctl.c linux-2.2.14-ext3e/fs/ext3/ioctl.c
--- linux-2.2.14-ext3/fs/ext3/ioctl.c   Tue May 23 17:46:31 2000
+++ linux-2.2.14-ext3e/fs/ext3/ioctl.c  Tue May 23 15:41:20 2000
@@ -76,6 +76,7 @@
                        return -EROFS;
                if (get_user(inode->u.ext3_i.i_version, (int *) arg))
                        return -EFAULT; 
+               inode->i_generation = inode->u.ext2_i.i_version;
                inode->i_ctime = CURRENT_TIME;
                mark_inode_dirty(inode);
                return 0;
diff -ru linux-2.2.14-ext3/fs/ext3/namei.c linux-2.2.14-ext3e/fs/ext3/namei.c
--- linux-2.2.14-ext3/fs/ext3/namei.c   Tue May 23 17:46:31 2000
+++ linux-2.2.14-ext3e/fs/ext3/namei.c  Fri May 26 15:41:47 2000
@@ -438,6 +438,10 @@
                if (EXT3_HAS_INCOMPAT_FEATURE(dir->i_sb,
                                              EXT3_FEATURE_INCOMPAT_FILETYPE))
                        de->file_type = EXT3_FT_REG_FILE;
+       } else if (S_ISSOCK(inode->i_mode)) {
+               if (EXT3_HAS_INCOMPAT_FEATURE(dir->i_sb,
+                                             EXT3_FEATURE_INCOMPAT_FILETYPE))
+                       de->file_type = EXT3_FT_SOCK;
        } else if (S_ISCHR(inode->i_mode)) {
                inode->i_op = &chrdev_inode_operations;
                if (EXT3_HAS_INCOMPAT_FEATURE(dir->i_sb,
@@ -674,6 +678,43 @@
        return retval;
 }
 
+/* ext3_orphan_add() links an unlinked-but-still-referenced inode into a list
+ * of such inodes, starting at the superblock, in case we crash before the
+ * file is closed/deleted.  The new inode becomes the head: its i_dtime field
+ * (NEXT_ORPHAN) takes the old s_last_orphan, which then holds its number.
+ * At filesystem recovery time, we walk this list deleting these orphan inodes
+ * in ext3_orphan_cleanup().
+ */
+static void ext3_orphan_add(handle_t *handle, struct dentry *dentry)
+{
+       struct inode *inode = dentry->d_inode;
+       struct super_block *sb = inode->i_sb;
+
+       if (inode->i_nlink > 0 || !list_empty(&inode->u.ext3_i.i_orphan) ||
+           (dentry->d_count <= 1 && inode->i_count <= 1))
+               return;
+
+       lock_super(sb);
+       /* Re-check under the super lock: the inode may have been added to
+        * the list while we did not hold it.  The unlocked test above keeps
+        * ordinary unlinks from taking the super lock at all.
+        */
+       if (!list_empty(&inode->u.ext3_i.i_orphan)) {
+               unlock_super(sb);
+               return;
+       }
+       journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+       NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan);
+       EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+       journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+       list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan);
+       unlock_super(sb);
+
+       jfs_debug(4, "superblock will point to %ld\n", inode->i_ino);
+       jfs_debug(4, "orphan inode %ld will point to %d\n",
+                 inode->i_ino, NEXT_ORPHAN(inode));
+}
+
 int ext3_unlink(struct inode * dir, struct dentry *dentry)
 {
        int retval;
@@ -713,9 +754,11 @@
        dir->u.ext3_i.i_flags &= ~EXT3_BTREE_FL;
        ext3_mark_inode_dirty(handle, dir);
        inode->i_nlink--;
+       ext3_orphan_add(handle, dentry);
        ext3_mark_inode_dirty(handle, inode);
        inode->i_ctime = dir->i_ctime;
        retval = 0;
+
        d_delete(dentry);       /* This also frees the inode */
 
 end_unlink:
@@ -829,6 +872,8 @@
                        de->file_type = EXT3_FT_DIR;
                else if (S_ISLNK(inode->i_mode))
                        de->file_type = EXT3_FT_SYMLINK;
+               else if (S_ISSOCK(inode->i_mode))
+                       de->file_type = EXT3_FT_SOCK;
                else if (S_ISCHR(inode->i_mode))
                        de->file_type = EXT3_FT_CHRDEV;
                else if (S_ISBLK(inode->i_mode))
diff -ru linux-2.2.14-ext3/fs/ext3/super.c linux-2.2.14-ext3e/fs/ext3/super.c
--- linux-2.2.14-ext3/fs/ext3/super.c   Tue May 23 17:46:31 2000
+++ linux-2.2.14-ext3e/fs/ext3/super.c  Fri May 26 15:53:16 2000
@@ -136,6 +136,9 @@
                if (EXT3_SB(sb)->s_block_bitmap[i])
                        brelse (EXT3_SB(sb)->s_block_bitmap[i]);
        brelse (EXT3_SB(sb)->s_sbh);
+       /* @@ debugging of orphan list - should never happen */
+       if (!list_empty(&EXT3_SB(sb)->s_orphan))
+               ext3_warning(sb, __FUNCTION__, "orphan inodes in list???\n");
 
        MOD_DEC_USE_COUNT;
        return;
@@ -414,6 +417,63 @@
        return 1;
 }
 
+/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at
+ * the superblock) which were deleted from all directories, but held open by
+ * a process at the time of a crash.  We walk the list and try to delete these
+ * inodes at recovery time (only with a read-write filesystem).
+ *
+ * In order to keep the orphan inode chain consistent during traversal (in
+ * case of crash during recovery), we link each inode into the superblock
+ * orphan list_head and handle it the same way as an inode deletion during
+ * normal operation (which journals the operations for us).
+ *
+ * We only do an iget() and an iput() on each inode, which is very safe if we
+ * accidentally point at an in-use or already deleted inode.  The worst that
+ * can happen in this case is that we get a "bit already cleared" message from
+ * ext3_free_inode().  The only reason we would point at a wrong inode is if
+ * e2fsck was run on this filesystem, and it must have already done the orphan
+ * inode cleanup for us, so we can safely abort without any further action.
+ */
+static void ext3_orphan_cleanup( struct super_block * sb,
+                                struct ext3_super_block * es)
+{
+       ino_t max_ino;
+
+       if (!es->s_last_orphan)
+               return;
+
+       if (sb->s_flags & MS_RDONLY) {
+               printk("EXT3-fs: %s: deferred orphan cleanup on read-only fs\n",
+                      kdevname(sb->s_dev));
+               return;
+       }
+
+       max_ino = le32_to_cpu(es->s_inodes_count);
+       while (es->s_last_orphan) {
+               ino_t ino = le32_to_cpu(es->s_last_orphan);
+               struct inode *inode;
+
+               /* Implausible entry - assume e2fsck cleaned up; stop the walk */
+               if (!(inode = iget(sb, ino)) || is_bad_inode(inode) ||
+                   NEXT_ORPHAN(inode) > max_ino || inode->i_nlink > 0) {
+                       ext3_warning(sb, __FUNCTION__,
+                                    "bad orphan ino %ld!  e2fsck was run?\n",
+                                    ino);
+                       iput(inode);
+                       break;
+               }
+               jfs_debug(2, "deleting inode %ld\n", inode->i_ino);
+               list_add(&inode->u.ext3_i.i_orphan, &sb->u.ext3_sb.s_orphan);
+               iput(inode);
+       }
+
+       printk("EXT3-fs: %s: orphan inodes deleted\n", kdevname(sb->s_dev));
+       journal_flush(EXT3_SB(sb)->s_journal);
+
+       es->s_last_orphan = 0;
+       ext3_commit_super(sb, es, 1);
+}
+
 #define log2(n) ffz(~(n))
 
 struct super_block * ext3_read_super (struct super_block * sb, void * data,
@@ -670,6 +730,7 @@
         */
        sb->s_dev = dev;
        sb->s_op = &ext3_sops;
+       INIT_LIST_HEAD(&sb->u.ext3_sb.s_orphan); /* unlinked but open files */
        unlock_super (sb);
 
        err = 0;
@@ -700,6 +761,7 @@
        if (!sb->s_root) 
                goto error_out;
        ext3_setup_super (sb, es);
+       ext3_orphan_cleanup(sb, es);
        return sb;
 
  error_out:
@@ -755,7 +817,7 @@
         * can get read-write access to the device.
         */
 
-       if (es->s_feature_incompat & EXT3_FEATURE_INCOMPAT_RECOVER) {
+       if (es->s_feature_incompat & cpu_to_le32(EXT3_FEATURE_INCOMPAT_RECOVER)) {
                if (sb->s_flags & MS_RDONLY) {
                        printk(KERN_ERR "EXT3-fs: WARNING: recovery required on readonly filesystem.\n");
                        if (is_read_only(sb->s_dev)) {
@@ -868,17 +933,16 @@
 
 void ext3_write_super (struct super_block * sb)
 {
-       struct ext3_super_block * es;
-
        if (!(sb->s_flags & MS_RDONLY)) {
                journal_t *journal;
-               
+
                journal = EXT3_SB(sb)->s_journal;
-               es = sb->u.ext3_sb.s_es;
 
-               if (journal->j_running_transaction)
+               if (journal->j_running_transaction) {
+                       tid_t wait_tid = journal->j_running_transaction->t_tid;
                        log_start_commit(journal, journal->j_running_transaction);
-               if (journal->j_committing_transaction)
+                       log_wait_commit(journal, wait_tid);
+               } else if (journal->j_committing_transaction)
                        log_wait_commit(journal, journal->j_committing_transaction->t_tid);
        }
        sb->s_dirt = 0;
@@ -938,6 +1002,7 @@
                sb->u.ext3_sb.s_mount_state = le16_to_cpu(es->s_state);
                sb->s_flags &= ~MS_RDONLY;
                ext3_setup_super (sb, es);
+               ext3_orphan_cleanup(sb, es);
        }
        return 0;
 }
diff -ru linux-2.2.14-ext3/fs/jfs/journal.c linux-2.2.14-ext3e/fs/jfs/journal.c
--- linux-2.2.14-ext3/fs/jfs/journal.c  Tue May 23 17:46:31 2000
+++ linux-2.2.14-ext3e/fs/jfs/journal.c Tue May 23 15:41:20 2000
@@ -130,6 +130,11 @@
                    time_after_eq(jiffies, transaction->t_expires))
                        journal->j_commit_request = transaction->t_tid;
        }
+
+       if (journal->j_commit_timer_active) {
+               journal->j_commit_timer_active = 0;
+               del_timer(journal->j_commit_timer);
+       }
        
        journal->j_task = NULL;
        wake_up(&journal->j_wait_done_commit);
diff -ru linux-2.2.14-ext3/fs/jfs/transaction.c linux-2.2.14-ext3e/fs/jfs/transaction.c
--- linux-2.2.14-ext3/fs/jfs/transaction.c      Tue May 23 17:46:31 2000
+++ linux-2.2.14-ext3e/fs/jfs/transaction.c     Tue May 23 15:48:25 2000
@@ -232,6 +232,7 @@
                return NULL;
        
        if (current->j_handle) {
+               jfs_debug(4, "Using existing handle %p\n", current->j_handle);
                current->j_handle->h_ref++;
                return current->j_handle;
        }
diff -ru linux-2.2.14-ext3/include/linux/ext3_fs.h 
linux-2.2.14-ext3e/include/linux/ext3_fs.h
--- linux-2.2.14-ext3/include/linux/ext3_fs.h   Tue May 23 17:46:31 2000
+++ linux-2.2.14-ext3e/include/linux/ext3_fs.h  Fri May 26 15:44:24 2000
@@ -406,8 +406,10 @@
         */
        __u8    s_journal_uuid[16];     /* uuid of journal superblock */
        __u32   s_journal_inum;         /* inode number of journal file */
-       
-       __u32   s_reserved[199];        /* Padding to the end of the block */
+       __u32   s_journal_dev;          /* device number of journal file */
+       __u32   s_last_orphan;          /* start of list of inodes to delete */
+
+       __u32   s_reserved[197];        /* Padding to the end of the block */
 };
 
 #ifdef __KERNEL__
diff -ru linux-2.2.14-ext3/include/linux/ext3_fs_i.h 
linux-2.2.14-ext3e/include/linux/ext3_fs_i.h
--- linux-2.2.14-ext3/include/linux/ext3_fs_i.h Tue May 23 17:46:31 2000
+++ linux-2.2.14-ext3e/include/linux/ext3_fs_i.h        Wed May 24 16:20:19 2000
@@ -36,7 +36,9 @@
        __u32   i_prealloc_block;
        __u32   i_prealloc_count;
        __u32   i_high_size;
+       struct list_head i_orphan;      /* unlinked but open inodes */
        int     i_new_inode:1;  /* Is a freshly allocated inode */
 };
 
+#define NEXT_ORPHAN(inode) inode->u.ext3_i.i_dtime
 #endif /* _LINUX_EXT3_FS_I */
diff -ru linux-2.2.14-ext3/include/linux/ext3_fs_sb.h 
linux-2.2.14-ext3e/include/linux/ext3_fs_sb.h
--- linux-2.2.14-ext3/include/linux/ext3_fs_sb.h        Tue May 23 17:46:31 2000
+++ linux-2.2.14-ext3e/include/linux/ext3_fs_sb.h       Wed May 24 17:33:00 2000
@@ -65,6 +65,7 @@
        /* Journaling */
        struct inode * s_journal_inode;
        struct journal_s * s_journal;
+       struct list_head s_orphan;
 };
 
 #endif /* _LINUX_EXT3_FS_SB */
--- cut here ---
-- 
Andreas Dilger  \ "If a man ate a pound of pasta and a pound of antipasto,
                 \  would they cancel out, leaving him still hungry?"
http://www-mddsp.enel.ucalgary.ca/People/adilger/               -- Dogbert

Reply via email to