Track non-resident pages through a simple hashing scheme.  This way
the space overhead is limited to one u32 per page, or roughly 0.1% of
memory, and a lookup costs a single cache miss.
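
For reference, a back-of-the-envelope check of that number, assuming
4kB pages and the 64 byte buckets used below:

	sizeof(u32) / PAGE_SIZE  =  4 / 4096  ~=  0.1% of RAM

and because a whole bucket fits in one L1 cache line, probing the hash
table touches only a single line.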

Aside from seeing whether or not a page was recently evicted, we can
also make a reasonable guess at how many other pages have been evicted
since this page was.
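
To illustrate how a caller could use that guess (a hypothetical sketch,
not part of this patch; inactive_target is a made-up variable standing
in for whatever size the inactive list is tuned to):

	/*
	 * If fewer pages were evicted after this one than would fit in
	 * the inactive list, a slightly larger inactive list would have
	 * kept the page resident, so promote it straight to active.
	 */
	int distance = recently_evicted(mapping, offset);

	if (distance >= 0 && distance < inactive_target)
		SetPageActive(page);

A return value of -1 means the page was not found in the non-resident
hash table; a successful lookup also clears the entry.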

Signed-off-by: Rik van Riel <[EMAIL PROTECTED]>

Index: linux-2.6.12-vm/include/linux/swap.h
===================================================================
--- linux-2.6.12-vm.orig/include/linux/swap.h
+++ linux-2.6.12-vm/include/linux/swap.h
@@ -153,6 +153,11 @@ extern void out_of_memory(unsigned int _
 /* linux/mm/memory.c */
 extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *);
 
+/* linux/mm/nonresident.c */
+extern int remember_page(struct address_space *, unsigned long);
+extern int recently_evicted(struct address_space *, unsigned long);
+extern void init_nonresident(void);
+
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
 extern unsigned long totalhigh_pages;
@@ -288,6 +293,11 @@ static inline swp_entry_t get_swap_page(
 #define grab_swap_token()  do { } while(0)
 #define has_swap_token(x) 0
 
+/* linux/mm/nonresident.c */
+#define init_nonresident()     do { } while (0)
+#define remember_page(x,y)     0
+#define recently_evicted(x,y)  0
+
 #endif /* CONFIG_SWAP */
 #endif /* __KERNEL__*/
 #endif /* _LINUX_SWAP_H */
Index: linux-2.6.12-vm/init/main.c
===================================================================
--- linux-2.6.12-vm.orig/init/main.c
+++ linux-2.6.12-vm/init/main.c
@@ -47,6 +47,7 @@
 #include <linux/rmap.h>
 #include <linux/mempolicy.h>
 #include <linux/key.h>
+#include <linux/swap.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -488,6 +489,7 @@ asmlinkage void __init start_kernel(void
        }
 #endif
        vfs_caches_init_early();
+       init_nonresident();
        mem_init();
        kmem_cache_init();
        numa_policy_init();
Index: linux-2.6.12-vm/mm/Makefile
===================================================================
--- linux-2.6.12-vm.orig/mm/Makefile
+++ linux-2.6.12-vm/mm/Makefile
@@ -12,7 +12,8 @@ obj-y                 := bootmem.o filemap.o mempool.o
                           readahead.o slab.o swap.o truncate.o vmscan.o \
                           prio_tree.o $(mmu-y)
 
-obj-$(CONFIG_SWAP)     += page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_SWAP)     += page_io.o swap_state.o swapfile.o thrash.o \
+                          nonresident.o
 obj-$(CONFIG_HUGETLBFS)        += hugetlb.o
 obj-$(CONFIG_NUMA)     += mempolicy.o
 obj-$(CONFIG_SHMEM) += shmem.o
Index: linux-2.6.12-vm/mm/nonresident.c
===================================================================
--- /dev/null
+++ linux-2.6.12-vm/mm/nonresident.c
@@ -0,0 +1,162 @@
+/*
+ * mm/nonresident.c
+ * (C) 2004,2005 Red Hat, Inc
+ * Written by Rik van Riel <[EMAIL PROTECTED]>
+ * Released under the GPL, see the file COPYING for details.
+ *
+ * Keeps track of whether a non-resident page was recently evicted
+ * and should be immediately promoted to the active list. This also
+ * helps automatically tune the inactive target.
+ *
+ * The pageout code stores a recently evicted page in this cache
+ * by calling remember_page(mapping, index)
+ * and can look it up in the cache by calling recently_evicted()
+ * with the same arguments.
+ *
+ * Note that there is no way to invalidate pages after e.g. truncate
+ * or exit; we let the pages fall out of the non-resident set through
+ * normal replacement.
+ */
+#include <linux/mm.h>
+#include <linux/cache.h>
+#include <linux/spinlock.h>
+#include <linux/bootmem.h>
+#include <linux/hash.h>
+#include <linux/prefetch.h>
+#include <linux/kernel.h>
+
+/* Number of non-resident pages per hash bucket. Never smaller than 15. */
+#if (L1_CACHE_BYTES < 64)
+#define NR_BUCKET_BYTES 64
+#else
+#define NR_BUCKET_BYTES L1_CACHE_BYTES
+#endif
+#define NUM_NR ((NR_BUCKET_BYTES - sizeof(atomic_t))/sizeof(u32))
+
+struct nr_bucket
+{
+       atomic_t hand;
+       u32 page[NUM_NR];
+} ____cacheline_aligned;
+
+/* The non-resident page hash table. */
+static struct nr_bucket * nonres_table;
+static unsigned int nonres_shift;
+static unsigned int nonres_mask;
+
+static struct nr_bucket * nr_hash(void * mapping, unsigned long index)
+{
+       unsigned long bucket;
+       unsigned long hash;
+
+       hash = hash_ptr(mapping, BITS_PER_LONG);
+       hash = 37 * hash + hash_long(index, BITS_PER_LONG);
+       bucket = hash & nonres_mask;
+
+       return nonres_table + bucket;
+}
+
+static u32 nr_cookie(struct address_space * mapping, unsigned long index)
+{
+       unsigned long cookie = hash_ptr(mapping, BITS_PER_LONG);
+       cookie = 37 * cookie + hash_long(index, BITS_PER_LONG);
+
+       if (mapping->host) {
+               cookie = 37 * cookie + hash_long(mapping->host->i_ino, BITS_PER_LONG);
+       }
+
+       return (u32)(cookie >> (BITS_PER_LONG - 32));
+}
+
+int recently_evicted(struct address_space * mapping, unsigned long index)
+{
+       struct nr_bucket * nr_bucket;
+       int distance;
+       u32 wanted;
+       int i;
+
+       prefetch(mapping->host);
+       nr_bucket = nr_hash(mapping, index);
+
+       prefetch(nr_bucket);
+       wanted = nr_cookie(mapping, index);
+
+       for (i = 0; i < NUM_NR; i++) {
+               if (nr_bucket->page[i] == wanted) {
+                       nr_bucket->page[i] = 0;
+                       /* Return the distance between entry and clock hand. */
+                       distance = atomic_read(&nr_bucket->hand) + NUM_NR - i;
+                       distance = (distance % NUM_NR) + 1;
+                       return distance * (1 << nonres_shift);
+               }
+       }
+
+       return -1;
+}
+
+int remember_page(struct address_space * mapping, unsigned long index)
+{
+       struct nr_bucket * nr_bucket;
+       u32 nrpage;
+       int i;
+
+       prefetch(mapping->host);
+       nr_bucket = nr_hash(mapping, index);
+
+       prefetchw(nr_bucket);
+       nrpage = nr_cookie(mapping, index);
+
+       /* Atomically find the next array index. */
+       preempt_disable();
+  retry:
+       i = atomic_inc_return(&nr_bucket->hand);
+       if (unlikely(i >= NUM_NR)) {
+               if (i == NUM_NR)
+                       atomic_set(&nr_bucket->hand, -1);
+               goto retry;
+       }
+       preempt_enable();
+
+       /* Statistics may want to know whether the entry was in use. */
+       return xchg(&nr_bucket->page[i], nrpage);
+}
+
+/*
+ * For interactive workloads, we remember about as many non-resident pages
+ * as we have actual memory pages.  For server workloads with large inter-
+ * reference distances we could benefit from remembering more.
+ */
+static __initdata unsigned long nonresident_factor = 1;
+void __init init_nonresident(void)
+{
+       int target;
+       int i;
+
+       /*
+        * Calculate the non-resident hash bucket target. Use a power of
+        * two for the division because alloc_large_system_hash rounds up.
+        */
+       target = nr_all_pages * nonresident_factor;
+       target /= (sizeof(struct nr_bucket) / sizeof(u32));
+
+       nonres_table = alloc_large_system_hash("Non-resident page tracking",
+                                       sizeof(struct nr_bucket),
+                                       target,
+                                       0,
+                                       HASH_EARLY | HASH_HIGHMEM,
+                                       &nonres_shift,
+                                       &nonres_mask,
+                                       0);
+
+       for (i = 0; i < (1 << nonres_shift); i++)
+               atomic_set(&nonres_table[i].hand, 0);
+}
+
+static int __init set_nonresident_factor(char * str)
+{
+       if (!str)
+               return 0;
+       nonresident_factor = simple_strtoul(str, &str, 0);
+       return 1;
+}
+__setup("nonresident_factor=", set_nonresident_factor);
Index: linux-2.6.12-vm/mm/vmscan.c
===================================================================
--- linux-2.6.12-vm.orig/mm/vmscan.c
+++ linux-2.6.12-vm/mm/vmscan.c
@@ -509,6 +509,7 @@ static int shrink_list(struct list_head 
 #ifdef CONFIG_SWAP
                if (PageSwapCache(page)) {
                        swp_entry_t swap = { .val = page->private };
+                       remember_page(&swapper_space, page->private);
                        __delete_from_swap_cache(page);
                        write_unlock_irq(&mapping->tree_lock);
                        swap_free(swap);
@@ -517,6 +518,7 @@ static int shrink_list(struct list_head 
                }
 #endif /* CONFIG_SWAP */
 
+               remember_page(page->mapping, page->index);
                __remove_from_page_cache(page);
                write_unlock_irq(&mapping->tree_lock);
                __put_page(page);
Index: linux-2.6.12-vm/mm/filemap.c
===================================================================
--- linux-2.6.12-vm.orig/mm/filemap.c
+++ linux-2.6.12-vm/mm/filemap.c
@@ -400,6 +400,7 @@ int add_to_page_cache_lru(struct page *p
                                pgoff_t offset, int gfp_mask)
 {
        int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
+       recently_evicted(mapping, offset);
        if (ret == 0)
                lru_cache_add(page);
        return ret;
Index: linux-2.6.12-vm/mm/swap_state.c
===================================================================
--- linux-2.6.12-vm.orig/mm/swap_state.c
+++ linux-2.6.12-vm/mm/swap_state.c
@@ -344,6 +344,8 @@ struct page *read_swap_cache_async(swp_e
                                break;          /* Out of memory */
                }
 
+               recently_evicted(&swapper_space, entry.val);
+
                /*
                 * Associate the page with swap entry in the swap cache.
                 * May fail (-ENOENT) if swap entry has been freed since

-- 
All Rights Reversed