Hi,

The attached patch seems to fix all the reported deadlock
problems with the new VM. Basically they fall into two
categories:

1) __GFP_IO related locking issues
2) something sleeps waiting for a free/clean/inactive page
   goal that nothing is actively working towards
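
To illustrate the first category: the classic pattern is a
task that already holds an IO lock when an allocation falls
into page reclaim. A simplified sketch (the caller and the
superblock here are made up for illustration; this is not
code from the patch):

	struct page *page;

	lock_super(sb);				/* task takes an IO lock */
	page = alloc_pages(GFP_KERNEL, 0);	/* GFP_KERNEL includes __GFP_IO */
	/*
	 * If memory is tight, the allocation recurses into
	 * try_to_free_pages(), which may try to flush dirty
	 * buffers through the very filesystem whose superblock
	 * we just locked, and block on lock_super(): deadlock.
	 *
	 * Category 2 is the converse: a task goes to sleep
	 * until some free/clean/inactive page goal is reached,
	 * but kswapd/bdflush were never told to work towards
	 * that goal, so the sleep never ends.
	 */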

The patch has survived some heavy stress testing on both
SMP and UP machines. I hope nobody will be able to find
a way to still crash this one ;)

A second change is a more dynamic free memory target
(now freepages.high + inactive_target / 3); this seems
to help a little in some workloads.
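
For example (numbers purely illustrative): with freepages.high
at 1024 pages and inactive_target at 3072 pages, the VM now
aims for 1024 + 3072/3 = 2048 freeable pages, so the target
scales with recent VM activity instead of staying constant.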

If your mailer messes up the patch, you can grab it from
http://www.surriel.com/patches/2.4.0-t9p7-vmpatch
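
(From the top of the unpacked kernel tree, applying it should
be something like "patch -p1 < 2.4.0-t9p7-vmpatch"; the -p1
strips the linux-2.4.0-test9-pre7/ prefix from the diff paths.)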

Linus, if this patch turns out to work fine for the people
testing it, could you please apply it to your tree?

regards,

Rik
--
"What you're running that piece of shit Gnome?!?!"
       -- Miguel de Icaza, UKUUG 2000

http://www.conectiva.com/               http://www.surriel.com/



--- linux-2.4.0-test9-pre7/fs/buffer.c.orig     Sat Sep 30 18:09:18 2000
+++ linux-2.4.0-test9-pre7/fs/buffer.c  Mon Oct  2 00:19:41 2000
@@ -706,7 +706,9 @@
 static void refill_freelist(int size)
 {
        if (!grow_buffers(size)) {
-               try_to_free_pages(GFP_BUFFER);
+               wakeup_bdflush(1);
+               current->policy |= SCHED_YIELD;
+               schedule();
        }
 }
 
@@ -859,6 +861,7 @@
 int balance_dirty_state(kdev_t dev)
 {
        unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
+       int shortage;
 
        dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
        tot = nr_free_buffer_pages();
@@ -869,21 +872,20 @@
 
        /* First, check for the "real" dirty limit. */
        if (dirty > soft_dirty_limit) {
-               if (dirty > hard_dirty_limit || inactive_shortage())
+               if (dirty > hard_dirty_limit)
                        return 1;
                return 0;
        }
 
        /*
-        * Then, make sure the number of inactive pages won't overwhelm
-        * page replacement ... this should avoid stalls.
+        * If we are about to get low on free pages and
+        * cleaning the inactive_dirty pages would help
+        * fix this, wake up bdflush.
         */
-       if (nr_inactive_dirty_pages >
-                               nr_free_pages() + nr_inactive_clean_pages()) {
-               if (free_shortage() > freepages.min)
-                       return 1;
+       if (free_shortage() && nr_inactive_dirty_pages > free_shortage() &&
+                       nr_inactive_dirty_pages > freepages.high)
                return 0;
-       }
+
        return -1;
 }
 
@@ -2663,9 +2665,8 @@
                CHECK_EMERGENCY_SYNC
 
                flushed = flush_dirty_buffers(0);
-               if (nr_inactive_dirty_pages > nr_free_pages() +
-                                               nr_inactive_clean_pages())
-                       flushed += page_launder(GFP_KSWAPD, 0);
+               if (free_shortage())
+                       flushed += page_launder(GFP_BUFFER, 0);
 
                /* If wakeup_bdflush will wakeup us
                   after our bdflush_done wakeup, then
--- linux-2.4.0-test9-pre7/mm/filemap.c.orig    Sat Sep 30 18:16:28 2000
+++ linux-2.4.0-test9-pre7/mm/filemap.c Mon Oct  2 00:11:45 2000
@@ -44,7 +44,6 @@
 atomic_t page_cache_size = ATOMIC_INIT(0);
 unsigned int page_hash_bits;
 struct page **page_hash_table;
-struct list_head lru_cache;
 
 spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
 /*
--- linux-2.4.0-test9-pre7/mm/page_alloc.c.orig Sat Sep 30 18:16:28 2000
+++ linux-2.4.0-test9-pre7/mm/page_alloc.c      Mon Oct  2 00:31:15 2000
@@ -258,13 +258,13 @@
                 */
                switch (limit) {
                        default:
-                       case 0:
+                       case PAGES_MIN:
                                water_mark = z->pages_min;
                                break;
-                       case 1:
+                       case PAGES_LOW:
                                water_mark = z->pages_low;
                                break;
-                       case 2:
+                       case PAGES_HIGH:
                                water_mark = z->pages_high;
                }
 
@@ -318,10 +318,19 @@
                direct_reclaim = 1;
 
        /*
-        * Are we low on inactive pages?
+        * If we are about to get low on free pages and we also have
+        * an inactive page shortage, wake up kswapd.
         */
        if (inactive_shortage() > inactive_target / 2 && free_shortage())
                wakeup_kswapd(0);
+       /*
+        * If we are about to get low on free pages and cleaning
+        * the inactive_dirty pages would fix the situation,
+        * wake up bdflush.
+        */
+       else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()
+                       && nr_inactive_dirty_pages > freepages.high)
+               wakeup_bdflush(0);
 
 try_again:
        /*
@@ -378,8 +387,23 @@
         *
         * We wake up kswapd, in the hope that kswapd will
         * resolve this situation before memory gets tight.
+        *
+        * We also yield the CPU, because that:
+        * - gives kswapd a chance to do something
+        * - slows down allocations, in particular the
+        *   allocations from the fast allocator that's
+        *   causing the problems ...
+        * - ... which minimises the impact the "bad guys"
+        *   have on the rest of the system
+        * - if we don't have __GFP_IO set, kswapd may be
+        *   able to free some memory we can't free ourselves
         */
        wakeup_kswapd(0);
+       if (gfp_mask & __GFP_WAIT) {
+               __set_current_state(TASK_RUNNING);
+               current->policy |= SCHED_YIELD;
+               schedule();
+       }
 
        /*
         * After waking up kswapd, we try to allocate a page
@@ -440,28 +464,43 @@
                 * up again. After that we loop back to the start.
                 *
                 * We have to do this because something else might eat
-                * the memory kswapd frees for us (interrupts, other
-                * processes, etc).
+                * the memory kswapd frees for us and we need to be
+                * reliable. Note that we don't loop back for higher
+                * order allocations since it is possible that kswapd
+                * simply cannot free a large enough contiguous area
+                * of memory *ever*.
                 */
-               if (gfp_mask & __GFP_WAIT) {
-                       /*
-                        * Give other processes a chance to run:
-                        */
-                       if (current->need_resched) {
-                               __set_current_state(TASK_RUNNING);
-                               schedule();
-                       }
+               if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {
+                       wakeup_kswapd(1);
+                       memory_pressure++;
+                       if (!order)
+                               goto try_again;
+               /*
+                * If __GFP_IO isn't set, we can't wait on kswapd because
+                * kswapd just might need some IO locks /we/ are holding ...
+                *
+                * SUBTLE: The scheduling point above makes sure that
+                * kswapd does get the chance to free memory we can't
+                * free ourselves...
+                */
+               } else if (gfp_mask & __GFP_WAIT) {
                        try_to_free_pages(gfp_mask);
                        memory_pressure++;
-                       goto try_again;
+                       if (!order)
+                               goto try_again;
                }
+
        }
 
        /*
         * Final phase: allocate anything we can!
         *
-        * This is basically reserved for PF_MEMALLOC and
-        * GFP_ATOMIC allocations...
+        * Higher order allocations, GFP_ATOMIC allocations and
+        * recursive allocations (PF_MEMALLOC) end up here.
+        *
+        * Only recursive allocations can use the very last pages
+        * in the system, otherwise it would be just too easy to
+        * deadlock the system...
         */
        zone = zonelist->zones;
        for (;;) {
@@ -472,8 +511,21 @@
                if (!z->size)
                        BUG();
 
+               /*
+                * SUBTLE: direct_reclaim is only possible if the task
+                * becomes PF_MEMALLOC while looping above. This will
+                * happen when the OOM killer selects this task for
+                * instant execution...
+                */
                if (direct_reclaim)
                        page = reclaim_page(z);
+               if (page)
+                       return page;
+
+               /* XXX: is pages_min/4 a good amount to reserve for this? */
+               if (z->free_pages < z->pages_min / 4 &&
+                               !(current->flags & PF_MEMALLOC))
+                       continue;
                if (!page)
                        page = rmqueue(z, order);
                if (page)
@@ -481,8 +533,7 @@
        }
 
        /* No luck.. */
-       if (!order)
-               show_free_areas();
+       printk(KERN_ERR "__alloc_pages: %d-order allocation failed.\n", order);
        return NULL;
 }
 
@@ -572,6 +623,13 @@
        sum = nr_free_pages();
        sum += nr_inactive_clean_pages();
        sum += nr_inactive_dirty_pages;
+
+       /*
+        * Keep our write behind queue filled, even if
+        * kswapd lags a bit right now.
+        */
+       if (sum < freepages.high + inactive_target)
+               sum = freepages.high + inactive_target;
        /*
         * We don't want dirty page writebehind to put too
         * much pressure on the working set, but we want it
--- linux-2.4.0-test9-pre7/mm/swap.c.orig       Sat Sep 30 18:16:28 2000
+++ linux-2.4.0-test9-pre7/mm/swap.c    Sat Sep 30 20:49:27 2000
@@ -100,6 +100,15 @@
                page->age = PAGE_AGE_MAX;
 }
 
+/*
+ * We use this (minimal) function in the case where we
+ * know we can't deactivate the page (yet).
+ */
+void age_page_down_ageonly(struct page * page)
+{
+       page->age /= 2;
+}
+
 void age_page_down_nolock(struct page * page)
 {
        /* The actual page aging bit */
@@ -155,30 +164,39 @@
  */
 void deactivate_page_nolock(struct page * page)
 {
+       /*
+        * One for the cache, one for the extra reference the
+        * caller has and (maybe) one for the buffers.
+        *
+        * This isn't perfect, but works for just about everything.
+        * Besides, as long as we don't move unfreeable pages to the
+        * inactive_clean list it doesn't need to be perfect...
+        */
+       int maxcount = (page->buffers ? 3 : 2);
        page->age = 0;
 
        /*
         * Don't touch it if it's not on the active list.
         * (some pages aren't on any list at all)
         */
-       if (PageActive(page) && (page_count(page) <= 2 || page->buffers) &&
+       if (PageActive(page) && page_count(page) <= maxcount &&
                        !page_ramdisk(page)) {
 
                /*
                 * We can move the page to the inactive_dirty list
-                * if we know there is backing store available.
+                * if we have a strong suspicion that it might
+                * become freeable in the near future.
                 *
-                * We also move pages here that we cannot free yet,
-                * but may be able to free later - because most likely
-                * we're holding an extra reference on the page which
-                * will be dropped right after deactivate_page().
+                * That is, the page has buffer heads attached (that
+                * need to be cleared away) and/or the function calling
+                * us has an extra reference count on the page.
                 */
                if (page->buffers || page_count(page) == 2) {
                        del_page_from_active_list(page);
                        add_page_to_inactive_dirty_list(page);
                /*
-                * If the page is clean and immediately reusable,
-                * we can move it to the inactive_clean list.
+                * Only if we are SURE the page is clean and immediately
+                * reusable do we move it to the inactive_clean list.
                 */
                } else if (page->mapping && !PageDirty(page) &&
                                                        !PageLocked(page)) {
@@ -215,6 +233,10 @@
                 * not to do anything.
                 */
        }
+
+       /* Make sure the page gets a fair chance at staying active. */
+       if (page->age < PAGE_AGE_START)
+               page->age = PAGE_AGE_START;
 }
 
 void activate_page(struct page * page)
--- linux-2.4.0-test9-pre7/mm/vmscan.c.orig     Sat Sep 30 18:16:28 2000
+++ linux-2.4.0-test9-pre7/mm/vmscan.c  Sun Oct  1 19:19:56 2000
@@ -74,7 +74,8 @@
                goto out_failed;
        }
        if (!onlist)
-               age_page_down(page);
+               /* The page is still mapped, so it can't be freeable... */
+               age_page_down_ageonly(page);
 
        /*
         * If the page is in active use by us, or if the page
@@ -419,7 +420,7 @@
                                continue;
                        /* Skip tasks which haven't slept long enough yet when idle-swapping. */
                        if (idle_time && !assign && (!(p->state & TASK_INTERRUPTIBLE) ||
-                                       time_before(p->sleep_time + idle_time * HZ, jiffies)))
+                                       time_after(p->sleep_time + idle_time * HZ, jiffies)))
                                continue;
                        found_task++;
                        /* Refresh swap_cnt? */
@@ -536,6 +537,7 @@
 found_page:
        del_page_from_inactive_clean_list(page);
        UnlockPage(page);
+       page->age = PAGE_AGE_START;
        if (page_count(page) != 1)
                printk("VM: reclaim_page, found page with count %d!\n",
                                page_count(page));
@@ -565,22 +567,24 @@
  * This code is heavily inspired by the FreeBSD source code. Thanks
  * go out to Matthew Dillon.
  */
-#define MAX_SYNC_LAUNDER       (1 << page_cluster)
-#define MAX_LAUNDER            (MAX_SYNC_LAUNDER * 4)
+#define MAX_LAUNDER            (4 * (1 << page_cluster))
 int page_launder(int gfp_mask, int sync)
 {
-       int synclaunder, launder_loop, maxscan, cleaned_pages, maxlaunder;
+       int launder_loop, maxscan, cleaned_pages, maxlaunder;
+       int can_get_io_locks;
        struct list_head * page_lru;
        struct page * page;
 
+       /*
+        * We can only grab the IO locks (eg. for flushing dirty
+        * buffers to disk) if __GFP_IO is set.
+        */
+       can_get_io_locks = gfp_mask & __GFP_IO;
+
        launder_loop = 0;
-       synclaunder = 0;
        maxlaunder = 0;
        cleaned_pages = 0;
 
-       if (!(gfp_mask & __GFP_IO))
-               return 0;
-
 dirty_page_rescan:
        spin_lock(&pagemap_lru_lock);
        maxscan = nr_inactive_dirty_pages;
@@ -638,7 +642,7 @@
                        spin_unlock(&pagemap_lru_lock);
 
                        /* Will we do (asynchronous) IO? */
-                       if (launder_loop && synclaunder-- > 0)
+                       if (launder_loop && maxlaunder == 0 && sync)
                        wait = 2;       /* Synchronous IO */
                        else if (launder_loop && maxlaunder-- > 0)
                                wait = 1;       /* Async IO */
@@ -725,10 +729,11 @@
         * loads, flush out the dirty pages before we have to wait on
         * IO.
         */
-       if (!launder_loop && free_shortage()) {
+       if (can_get_io_locks && !launder_loop && free_shortage()) {
                launder_loop = 1;
-               if (sync && !cleaned_pages)
-                       synclaunder = MAX_SYNC_LAUNDER;
+               /* If we cleaned pages, never do synchronous IO. */
+               if (cleaned_pages)
+                       sync = 0;
                /* We only do a few "out of order" flushes. */
                maxlaunder = MAX_LAUNDER;
                /* Kflushd takes care of the rest. */
@@ -774,8 +779,23 @@
                        age_page_up_nolock(page);
                        page_active = 1;
                } else {
-                       age_page_down_nolock(page);
-                       page_active = 0;
+                       age_page_down_ageonly(page);
+                       /*
+                        * Since we don't hold a reference on the page
+                        * ourselves, we have to test a bit more strictly
+                        * than deactivate_page() does. This is needed
+                        * since otherwise the system could hang shuffling
+                        * unfreeable pages from the active list to the
+                        * inactive_dirty list and back again...
+                        *
+                        * SUBTLE: we can have buffer pages with count 1.
+                        */
+                       if (page_count(page) <= (page->buffers ? 2 : 1)) {
+                               deactivate_page_nolock(page);
+                               page_active = 0;
+                       } else {
+                               page_active = 1;
+                       }
                }
                /*
                 * If the page is still on the active list, move it
@@ -805,14 +825,11 @@
        pg_data_t *pgdat = pgdat_list;
        int sum = 0;
        int freeable = nr_free_pages() + nr_inactive_clean_pages();
+       int freetarget = freepages.high + inactive_target / 3;
 
-       /* Are we low on truly free pages? */
-       if (nr_free_pages() < freepages.min)
-               return freepages.high - nr_free_pages();
-
-       /* Are we low on free pages over-all? */
-       if (freeable < freepages.high)
-               return freepages.high - freeable;
+       /* Are we low on free pages globally? */
+       if (freeable < freetarget)
+               return freetarget - freeable;
 
        /* If not, are we very low on any particular zone? */
        do {
@@ -1052,14 +1069,7 @@
                        /* Do we need to do some synchronous flushing? */
                        if (waitqueue_active(&kswapd_done))
                                wait = 1;
-                       if (!do_try_to_free_pages(GFP_KSWAPD, wait)) {
-                               /*
-                                * if (out_of_memory()) {
-                                *      try again a few times;
-                                *      oom_kill();
-                                * }
-                                */
-                       }
+                       do_try_to_free_pages(GFP_KSWAPD, wait);
                }
 
                /*
@@ -1096,6 +1106,10 @@
                 */
                if (!free_shortage() || !inactive_shortage())
                        interruptible_sleep_on_timeout(&kswapd_wait, HZ);
+               /*
+                * TODO: insert out of memory check & oom killer
+                * invocation in an else branch here.
+                */
        }
 }
 
--- linux-2.4.0-test9-pre7/include/linux/swap.h.orig    Sat Sep 30 20:42:01 2000
+++ linux-2.4.0-test9-pre7/include/linux/swap.h Sun Oct  1 15:35:03 2000
@@ -91,6 +91,7 @@
 extern void age_page_up_nolock(struct page *);
 extern void age_page_down(struct page *);
 extern void age_page_down_nolock(struct page *);
+extern void age_page_down_ageonly(struct page *);
 extern void deactivate_page(struct page *);
 extern void deactivate_page_nolock(struct page *);
 extern void activate_page(struct page *);
