Hi,
I'm working on a bit of code that vmaps/vunmaps a bit more often than
I'd like. I've implemented a frontend to cache commonly used mappings,
which solves most of the problem, but while looking at various other ways to
get the last bit of performance, I thought it might be generally helpful
to batch up vunmap-driven TLB flushes. So I'll just throw the idea
out there (is anyone else doing a lot of vmapping? I'd like to hear
from you!).
Anyway, the idea is just that we don't free up the virtual address space
immediately at vunmap-time, but instead collect a batch of dead areas, free
them all at once, and flush the TLBs only once per batch.
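
To make that a bit more concrete, here's a tiny standalone toy model of the
defer-then-purge flow. Everything below (TOY_LAZY_MAX, toy_area, toy_vunmap
and friends) is made up purely for illustration; it is not the code in the
patch further down:

/*
 * Toy model: "unmapping" an area only marks it dead and widens a
 * pending [start, end) range; the one expensive "TLB flush" is done
 * once per batch rather than once per unmap.
 */
#include <stdio.h>

#define TOY_LAZY_MAX	4

struct toy_area {
	unsigned long addr, size;
	int dead;		/* deferred; address space still reserved */
};

static struct toy_area areas[16];
static unsigned long pending_start = ~0UL, pending_end;
static unsigned int pending_nr;

static void toy_flush_range(unsigned long start, unsigned long end)
{
	/* stands in for flush_tlb_kernel_range() */
	printf("flush [0x%lx, 0x%lx)\n", start, end);
}

static void toy_purge(void)
{
	if (!pending_nr)
		return;
	/* one flush covers every area deferred since the last purge */
	toy_flush_range(pending_start, pending_end);
	pending_start = ~0UL;
	pending_end = 0;
	pending_nr = 0;
}

static void toy_vunmap(struct toy_area *a)
{
	a->dead = 1;
	if (a->addr < pending_start)
		pending_start = a->addr;
	if (a->addr + a->size > pending_end)
		pending_end = a->addr + a->size;
	if (++pending_nr > TOY_LAZY_MAX)
		toy_purge();	/* batch full: flush once, not N times */
}

int main(void)
{
	int i;

	for (i = 0; i < 16; i++) {
		areas[i].addr = 0x1000UL * (i + 1);
		areas[i].size = 0x1000;
		toy_vunmap(&areas[i]);	/* flushes only on every fifth call */
	}
	toy_purge();		/* pick up whatever is still pending */
	return 0;
}

The point is just that the unmap path stays cheap in the common case, and the
single flush is amortised over TOY_LAZY_MAX areas.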
We are able to free the pages at vfree-time, because although stale TLB
entries may still point to them, it would be a kernel bug to access the
pages through those entries at this stage (AFAIKS we still do need to flush
the cache at vunmap-time, however).
And we are able to flush at vmap-time if we run out of virtual address space.
So the cost is pretty small -- with 128 deferred regions sitting there,
it's maybe 8K worth of struct vm_structs.
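
In the same toy terms, the vmap side would look something like the fragment
below (toy_find_free_area() is a hypothetical placeholder for whatever scans
the address space for a hole, and this just extends the sketch above); the
extra purge and flush only ever happen when the space is actually exhausted:

/* hypothetical helper: scan for a free hole of the given size */
static struct toy_area *toy_find_free_area(unsigned long size);

static struct toy_area *toy_vmap(unsigned long size)
{
	struct toy_area *a;
	int retried = 0;

again:
	a = toy_find_free_area(size);
	if (!a && !retried && pending_nr) {
		/* out of space: reclaim the deferred areas, then retry once */
		toy_purge();
		retried = 1;
		goto again;
	}
	return a;		/* may still be NULL if genuinely full */
}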
Here is a rough hack. Comments?
--
Index: linux-2.6/mm/vmalloc.c
===================================================================
--- linux-2.6.orig/mm/vmalloc.c
+++ linux-2.6/mm/vmalloc.c
@@ -24,8 +24,13 @@
DEFINE_RWLOCK(vmlist_lock);
struct vm_struct *vmlist;
+#define LAZY_MAX 128
+static unsigned long lazy_start = -1UL, lazy_end = 0;
+static unsigned int lazy_nr;
+
static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
int node);
+static void __purge_vm_area_lazy(void);
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
{
@@ -68,23 +73,33 @@ static inline void vunmap_pud_range(pgd_
} while (pud++, addr = next, addr != end);
}
-void unmap_kernel_range(unsigned long addr, unsigned long size)
+/*
+ * This function does not flush pagetables itself.
+ */
+static void __unmap_kernel_range(unsigned long addr, unsigned long end)
{
pgd_t *pgd;
unsigned long next;
- unsigned long start = addr;
- unsigned long end = addr + size;
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
- flush_cache_vunmap(addr, end);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
vunmap_pud_range(pgd, addr, next);
} while (pgd++, addr = next, addr != end);
- flush_tlb_kernel_range(start, end);
+}
+
+void unmap_kernel_range(unsigned long addr, unsigned long size)
+{
+ unsigned long end = addr + size;
+
+ BUG_ON(addr >= end);
+
+ flush_cache_vunmap(addr, end);
+ __unmap_kernel_range(addr, end);
+ flush_tlb_kernel_range(addr, end);
}
static void unmap_vm_area(struct vm_struct *area)
@@ -200,6 +215,7 @@ static struct vm_struct *__get_vm_area_n
size += PAGE_SIZE;
write_lock(&vmlist_lock);
+retry:
for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) {
if ((unsigned long)tmp->addr < addr) {
if((unsigned long)tmp->addr + tmp->size >= addr)
@@ -215,7 +231,7 @@ static struct vm_struct *__get_vm_area_n
if (addr > end - size)
goto out;
}
-
+ /* XXX: should have addr > end - size check here */
found:
area->next = *p;
*p = area;
@@ -231,6 +247,11 @@ found:
return area;
out:
+ if (lazy_nr) {
+ __purge_vm_area_lazy();
+ addr = ALIGN(start, align);
+ goto retry;
+ }
write_unlock(&vmlist_lock);
kfree(area);
if (printk_ratelimit())
@@ -291,13 +312,64 @@ static struct vm_struct *__remove_vm_are
return NULL;
found:
+ BUG_ON(tmp->flags & VM_LAZYFREE);
unmap_vm_area(tmp);
*p = tmp->next;
- /*
- * Remove the guard page.
- */
- tmp->size -= PAGE_SIZE;
+ return tmp;
+}
+
+static void __purge_vm_area_lazy(void)
+{
+ struct vm_struct **p, *tmp;
+
+ p = &vmlist;
+ while ((tmp = *p) != NULL) {
+ if (tmp->flags & VM_LAZYFREE) {
+ unsigned long start = (unsigned long)tmp->addr;
+ unsigned long end = start + tmp->size;
+
+ BUG_ON(start < lazy_start);
+ BUG_ON(end > lazy_end);
+
+ *p = tmp->next;
+ __unmap_kernel_range(start, end);
+ kfree(tmp);
+ lazy_nr--;
+ } else
+ p = &tmp->next;
+ }
+ flush_tlb_kernel_range(lazy_start, lazy_end);
+ BUG_ON(lazy_nr != 0);
+
+ lazy_end = 0;
+ lazy_start = -1UL;
+}
+
+static struct vm_struct *__remove_vm_area_lazy(void *addr)
+{
+ struct vm_struct *tmp;
+
+ tmp = __find_vm_area(addr);
+ if (tmp) {
+ unsigned long start, end;
+
+ if (tmp->flags & VM_LAZYFREE)
+ return NULL; /* shouldn't happen */
+
+ start = (unsigned long)tmp->addr;
+ end = start + tmp->size;
+
+ flush_cache_vunmap(start, end);
+
+ tmp->flags |= VM_LAZYFREE;
+ if (start < lazy_start)
+ lazy_start = start;
+ if (end > lazy_end)
+ lazy_end = end;
+ lazy_nr++;
+ }
+
return tmp;
}
@@ -321,6 +393,8 @@ struct vm_struct *remove_vm_area(void *a
static void __vunmap(void *addr, int deallocate_pages)
{
struct vm_struct *area;
+ struct page **pages;
+ int nrpages, vpages;
if (!addr)
return;
@@ -331,32 +405,40 @@ static void __vunmap(void *addr, int dea
return;
}
- area = remove_vm_area(addr);
+ write_lock(&vmlist_lock);
+ area = __remove_vm_area_lazy(addr);
if (unlikely(!area)) {
+ write_unlock(&vmlist_lock);
printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
addr);
WARN_ON(1);
return;
}
- debug_check_no_locks_freed(addr, area->size);
+ debug_check_no_locks_freed(addr, area->size - PAGE_SIZE);
+
+ pages = area->pages;
+ nrpages = area->nr_pages;
+ vpages = area->flags & VM_VPAGES;
+
+ if (lazy_nr > LAZY_MAX)
+ __purge_vm_area_lazy();
+
+ write_unlock(&vmlist_lock);
if (deallocate_pages) {
int i;
- for (i = 0; i < area->nr_pages; i++) {
- BUG_ON(!area->pages[i]);
- __free_page(area->pages[i]);
+ for (i = 0; i < nrpages; i++) {
+ BUG_ON(!pages[i]);
+ __free_page(pages[i]);
}
- if (area->flags & VM_VPAGES)
- vfree(area->pages);
+ if (vpages)
+ vfree(pages);
else
- kfree(area->pages);
+ kfree(pages);
}
-
- kfree(area);
- return;
}
/**
Index: linux-2.6/include/linux/vmalloc.h
===================================================================
--- linux-2.6.orig/include/linux/vmalloc.h
+++ linux-2.6/include/linux/vmalloc.h
@@ -12,6 +12,7 @@ struct vm_area_struct;
#define VM_MAP 0x00000004 /* vmap()ed pages */
#define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */
#define VM_VPAGES 0x00000010 /* buffer for pages was vmalloc'ed */
+#define VM_LAZYFREE 0x00000020 /* area is unmapped lazily */
/* bits [20..32] reserved for arch specific ioremap internals */
/*