Hi, 

The following patch should fix the OOM deadlock condition caused by
prune_icache(), and improve its performance significantly.

The OOM deadlock can happen because prune_icache() tries to sync _all_
dirty inodes (under PF_MEMALLOC) on the system before trying to free a
portion of the clean unused inodes.

The patch also changes prune_icache() to free clean inodes first, and only
then sync _unused_ ones if needed. If (nr_free_pages < freepages.low), the
code writes one inode synchronously and then returns, to avoid the OOM
deadlock.

Patch against 2.4.4-pre1. 

Comments are welcome. 


--- fs/inode.c~ Thu Apr 12 21:16:35 2001
+++ fs/inode.c  Thu Apr 12 21:49:56 2001
@@ -13,6 +13,8 @@
 #include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/cache.h>
+#include <linux/swap.h>
+#include <linux/swapctl.h>
 
 /*
  * New inode.c implementation.
@@ -197,6 +199,34 @@
        inodes_stat.nr_unused--;
 }
 
+static inline void __sync_one(struct inode *inode, int sync)
+{
+       unsigned dirty;
+
+       list_del(&inode->i_list);
+       list_add(&inode->i_list, atomic_read(&inode->i_count)
+                                               ? &inode_in_use
+                                               : &inode_unused);
+
+       /* Set I_LOCK, reset I_DIRTY */
+       dirty = inode->i_state & I_DIRTY;
+       inode->i_state |= I_LOCK;
+       inode->i_state &= ~I_DIRTY;
+       spin_unlock(&inode_lock);
+
+       filemap_fdatasync(inode->i_mapping);
+
+       /* Don't write the inode if only I_DIRTY_PAGES was set */
+       if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC))
+               write_inode(inode, sync);
+
+       filemap_fdatawait(inode->i_mapping);
+
+       spin_lock(&inode_lock);
+       inode->i_state &= ~I_LOCK;
+       wake_up(&inode->i_wait);
+}
+
 static inline void sync_one(struct inode *inode, int sync)
 {
        if (inode->i_state & I_LOCK) {
@@ -206,29 +236,7 @@
                iput(inode);
                spin_lock(&inode_lock);
        } else {
-               unsigned dirty;
-
-               list_del(&inode->i_list);
-               list_add(&inode->i_list, atomic_read(&inode->i_count)
-                                                       ? &inode_in_use
-                                                       : &inode_unused);
-               /* Set I_LOCK, reset I_DIRTY */
-               dirty = inode->i_state & I_DIRTY;
-               inode->i_state |= I_LOCK;
-               inode->i_state &= ~I_DIRTY;
-               spin_unlock(&inode_lock);
-
-               filemap_fdatasync(inode->i_mapping);
-
-               /* Don't write the inode if only I_DIRTY_PAGES was set */
-               if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC))
-                       write_inode(inode, sync);
-
-               filemap_fdatawait(inode->i_mapping);
-
-               spin_lock(&inode_lock);
-               inode->i_state &= ~I_LOCK;
-               wake_up(&inode->i_wait);
+               __sync_one(inode, sync);
        }
 }
 
@@ -236,10 +244,42 @@
 {
        struct list_head * tmp;
 
-       while ((tmp = head->prev) != head)
+       while ((tmp = head->prev) != head) 
                sync_one(list_entry(tmp, struct inode, i_list), 0);
 }
 
+static inline int try_to_sync_unused_list(struct list_head *head)
+{
+       struct list_head *tmp = head;
+       struct inode *inode;
+
+       while ((tmp = tmp->prev) != head) {
+               inode = list_entry(tmp, struct inode, i_list);
+
+               if (!(inode->i_state & I_LOCK) 
+                               && !atomic_read(&inode->i_count)) {
+                       /* 
+                        * We're under PF_MEMALLOC here, and syncing the 
+                        * inode may have to allocate memory. To avoid
+                        * running into an OOM deadlock, we write one 
+                        * inode synchronously and stop syncing in case 
+                        * we're under freepages.low
+                        */
+
+                       int sync = nr_free_pages() < freepages.low;
+                       __sync_one(inode, sync);
+                       if (sync) 
+                               return 0;
+                       /* 
+                        * __sync_one moved the inode to another list,
+                        * so we have to start looking from the list head.
+                        */
+                       tmp = head;
+               }
+       }
+       return 1;
+}
+
 /**
  *     sync_inodes
  *     @dev: device to sync the inodes from.
@@ -273,13 +313,14 @@
 /*
  * Called with the spinlock already held..
  */
-static void sync_all_inodes(void)
+static void try_to_sync_unused_inodes(void)
 {
        struct super_block * sb = sb_entry(super_blocks.next);
        for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) {
                if (!sb->s_dev)
                        continue;
-               sync_list(&sb->s_dirty);
+               if (!try_to_sync_unused_list(&sb->s_dirty))
+                       break;
        }
 }
 
@@ -506,13 +547,12 @@
 {
        LIST_HEAD(list);
        struct list_head *entry, *freeable = &list;
-       int count = 0;
+       int count = 0, synced = 0;
        struct inode * inode;
 
        spin_lock(&inode_lock);
-       /* go simple and safe syncing everything before starting */
-       sync_all_inodes();
 
+free_unused:
        entry = inode_unused.prev;
        while (entry != &inode_unused)
        {
@@ -539,6 +579,20 @@
        spin_unlock(&inode_lock);
 
        dispose_list(freeable);
+
+       /* 
+        * If we freed enough clean inodes, avoid writing 
+        * dirty ones. Also give up if we have already
+        * tried to sync dirty inodes.
+        */
+       if (!goal || synced)
+               return;
+       
+       synced = 1;
+
+       spin_lock(&inode_lock);
+       try_to_sync_unused_inodes();
+       goto free_unused;
 }
 
 void shrink_icache_memory(int priority, int gfp_mask)


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to