Re: [PATCH] mmu notifiers #v3

2008-01-22 Thread Hugh Dickins
On Tue, 22 Jan 2008, Andrea Arcangeli wrote:
> 
> Then I will have to update KVM so that it will free the kvm structure
> after waiting a quiescent point to avoid kernel crashing memory
> corruption after applying your changes to the mmu notifier.

It may not be suitable (I've not looked into your needs), but consider
SLAB_DESTROY_BY_RCU: it might give you the easiest way to do that.

Hugh
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mmu notifiers #v3

2008-01-22 Thread Christoph Lameter
On Tue, 22 Jan 2008, Peter Zijlstra wrote:

> I think we can get rid of this rwlock as I think this will seriously
> hurt larger machines.

Correct.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mmu notifiers #v3

2008-01-22 Thread Andrea Arcangeli
On Tue, Jan 22, 2008 at 08:28:47PM +0100, Peter Zijlstra wrote:
> I think we can get rid of this rwlock as I think this will seriously
> hurt larger machines.

Yep, I initially considered it, nevertheless given you solved part of
the complication I can add it now ;). The only technical reason for
not using RCU is if certain users of the notifiers are registering and
unregistering at high frequency through objects that may need to be
freed quickly.

I can tell the KVM usage of the mmu notifiers is sure fine to use RCU.
Then I will have to update KVM so that it will free the kvm structure
after waiting a quiescent point to avoid kernel crashing memory
corruption after applying your changes to the mmu notifier.

Thanks!
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mmu notifiers #v3

2008-01-22 Thread Peter Zijlstra

On Mon, 2008-01-21 at 13:52 +0100, Andrea Arcangeli wrote:

> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> new file mode 100644
> --- /dev/null
> +++ b/include/linux/mmu_notifier.h
> @@ -0,0 +1,79 @@
> +#ifndef _LINUX_MMU_NOTIFIER_H
> +#define _LINUX_MMU_NOTIFIER_H
> +
> +#include <linux/list.h>
> +#include <linux/spinlock.h>
> +
> +#ifdef CONFIG_MMU_NOTIFIER
> +
> +struct mmu_notifier;
> +
> +struct mmu_notifier_ops {
> + void (*release)(struct mmu_notifier *mn,
> + struct mm_struct *mm);
> + void (*age_page)(struct mmu_notifier *mn,
> +  struct mm_struct *mm,
> +  unsigned long address);
> + void (*invalidate_page)(struct mmu_notifier *mn,
> + struct mm_struct *mm,
> + unsigned long address);
> + void (*invalidate_range)(struct mmu_notifier *mn,
> +  struct mm_struct *mm,
> +  unsigned long start, unsigned long end);
> +};
> +
> +struct mmu_notifier_head {
> + struct hlist_head head;
> + rwlock_t lock;

spinlock_t lock;

I think we can get rid of this rwlock as I think this will seriously
hurt larger machines.

> +};
> +
> +struct mmu_notifier {
> + struct hlist_node hlist;
> + const struct mmu_notifier_ops *ops;
> +};
> +
> +#include <linux/mm_types.h>
> +
> +extern void mmu_notifier_register(struct mmu_notifier *mn,
> +   struct mm_struct *mm);
> +extern void mmu_notifier_unregister(struct mmu_notifier *mn,
> + struct mm_struct *mm);
> +extern void mmu_notifier_release(struct mm_struct *mm);
> +
> +static inline void mmu_notifier_head_init(struct mmu_notifier_head *mnh)
> +{
> + INIT_HLIST_HEAD(&mnh->head);
> + rwlock_init(&mnh->lock);
> +}
> +
> +#define mmu_notifier(function, mm, args...)  \
> + do {\
> + struct mmu_notifier *__mn;  \
> + struct hlist_node *__n; \
> + \
> + if (unlikely(!hlist_empty(&(mm)->mmu_notifier.head))) { \
> + read_lock(&(mm)->mmu_notifier.lock);\
rcu_read_lock();
> + hlist_for_each_entry(__mn, __n, \
hlist_for_each_entry_rcu
> +  &(mm)->mmu_notifier.head,  \
> +  hlist) \
> + if (__mn->ops->function)\
> + __mn->ops->function(__mn,   \
> + mm, \
> + args);  \
> + read_unlock(&(mm)->mmu_notifier.lock);  \
rcu_read_unlock();
> + }   \
> + } while (0)
> +
> +#else /* CONFIG_MMU_NOTIFIER */
> +
> +#define mmu_notifier_register(mn, mm) do {} while(0)
> +#define mmu_notifier_unregister(mn, mm) do {} while (0)
> +#define mmu_notifier_release(mm) do {} while (0)
> +#define mmu_notifier_head_init(mmh) do {} while (0)
> +
> +#define mmu_notifier(function, mm, args...)  \
> + do { } while (0)
> +
> +#endif /* CONFIG_MMU_NOTIFIER */
> +
> +#endif /* _LINUX_MMU_NOTIFIER_H */


> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> new file mode 100644
> --- /dev/null
> +++ b/mm/mmu_notifier.c
> @@ -0,0 +1,44 @@
> +/*
> + *  linux/mm/mmu_notifier.c
> + *
> + *  Copyright (C) 2008  Qumranet, Inc.
> + *
> + *  This work is licensed under the terms of the GNU GPL, version 2. See
> + *  the COPYING file in the top-level directory.
> + */
> +
> +#include <linux/mmu_notifier.h>
> +#include <linux/module.h>
> +
> +void mmu_notifier_release(struct mm_struct *mm)
> +{
> + struct mmu_notifier *mn;
> + struct hlist_node *n, *tmp;
> +
> + if (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
> + read_lock(&mm->mmu_notifier.lock);
rcu_read_lock();
> + hlist_for_each_entry_safe(mn, n, tmp,
hlist_for_each_entry_safe_rcu
> +   &mm->mmu_notifier.head, hlist) {
> + if (mn->ops->release)
> + mn->ops->release(mn, mm);
> + hlist_del(&mn->hlist);
hlist_del_rcu
> + }
> + read_unlock(&mm->mmu_notifier.lock);
rcu_read_unlock();
> + }
> +}
> +
> +void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
> +{
> + write_lock(&mm->mmu_notifier.lock);
spin_lock
> + hlist_add_head(&mn->hlist, &mm->mmu_notifier.head);
hlist_add_head_rcu
> + write_unlock(&mm->mmu_notifier.lock);
  

Re: [kvm-devel] [PATCH] mmu notifiers #v3

2008-01-22 Thread Andrea Arcangeli
On Tue, Jan 22, 2008 at 04:12:34PM +0200, Avi Kivity wrote:
> Andrea Arcangeli wrote:
>> diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
>> --- a/include/asm-generic/pgtable.h
>> +++ b/include/asm-generic/pgtable.h
>> @@ -44,8 +44,10 @@
>>  ({  \
>>  int __young;\
>>  __young = ptep_test_and_clear_young(__vma, __address, __ptep);  \
>> -if (__young)\
>> +if (__young) {  \
>>  flush_tlb_page(__vma, __address);   \
>> +mmu_notifier(age_page, (__vma)->vm_mm, __address);  \
>> +}   \
>>  __young;\
>>  })
>>   
>
> I think that unconditionally doing
>
>  __young |= mmu_notifier(test_and_clear_young, ...);
>
> allows hardware with accessed bits more control over what is going on.

Agreed, likely it'll have to be mmu_notifier_age_page().
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [kvm-devel] [PATCH] mmu notifiers #v3

2008-01-22 Thread Avi Kivity

Andrea Arcangeli wrote:

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -44,8 +44,10 @@
 ({ \
int __young;\
__young = ptep_test_and_clear_young(__vma, __address, __ptep);  \
-   if (__young)\
+   if (__young) {  \
flush_tlb_page(__vma, __address);   \
+   mmu_notifier(age_page, (__vma)->vm_mm, __address);   \
+   }   \
__young;\
 })
  


I think that unconditionally doing

 __young |= mmu_notifier(test_and_clear_young, ...);

allows hardware with accessed bits more control over what is going on.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [kvm-devel] [PATCH] mmu notifiers #v3

2008-01-22 Thread Avi Kivity

Andrea Arcangeli wrote:

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -44,8 +44,10 @@
 ({ \
int __young;\
__young = ptep_test_and_clear_young(__vma, __address, __ptep);  \
-   if (__young)\
+   if (__young) {  \
flush_tlb_page(__vma, __address);   \
+   mmu_notifier(age_page, (__vma)->vm_mm, __address);   \
+   }   \
__young;\
 })
  


I think that unconditionally doing

 __young |= mmu_notifier(test_and_clear_young, ...);

allows hardware with accessed bits more control over what is going on.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [kvm-devel] [PATCH] mmu notifiers #v3

2008-01-22 Thread Andrea Arcangeli
On Tue, Jan 22, 2008 at 04:12:34PM +0200, Avi Kivity wrote:
> Andrea Arcangeli wrote:
>> diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
>> --- a/include/asm-generic/pgtable.h
>> +++ b/include/asm-generic/pgtable.h
>> @@ -44,8 +44,10 @@
>>  ({  \
>>  int __young;\
>>  __young = ptep_test_and_clear_young(__vma, __address, __ptep);  \
>> -if (__young)\
>> +if (__young) {  \
>>  flush_tlb_page(__vma, __address);   \
>> +mmu_notifier(age_page, (__vma)->vm_mm, __address);  \
>> +}   \
>>  __young;\
>>  })
>>   
>
> I think that unconditionally doing
>
>  __young |= mmu_notifier(test_and_clear_young, ...);
>
> allows hardware with accessed bits more control over what is going on.

Agreed, likely it'll have to be mmu_notifier_age_page().
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mmu notifiers #v3

2008-01-22 Thread Peter Zijlstra

On Mon, 2008-01-21 at 13:52 +0100, Andrea Arcangeli wrote:

 diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
 new file mode 100644
 --- /dev/null
 +++ b/include/linux/mmu_notifier.h
 @@ -0,0 +1,79 @@
 +#ifndef _LINUX_MMU_NOTIFIER_H
 +#define _LINUX_MMU_NOTIFIER_H
 +
 +#include <linux/list.h>
 +#include <linux/spinlock.h>
 +
 +#ifdef CONFIG_MMU_NOTIFIER
 +
 +struct mmu_notifier;
 +
 +struct mmu_notifier_ops {
 + void (*release)(struct mmu_notifier *mn,
 + struct mm_struct *mm);
 + void (*age_page)(struct mmu_notifier *mn,
 +  struct mm_struct *mm,
 +  unsigned long address);
 + void (*invalidate_page)(struct mmu_notifier *mn,
 + struct mm_struct *mm,
 + unsigned long address);
 + void (*invalidate_range)(struct mmu_notifier *mn,
 +  struct mm_struct *mm,
 +  unsigned long start, unsigned long end);
 +};
 +
 +struct mmu_notifier_head {
 + struct hlist_head head;
 + rwlock_t lock;

spinlock_t lock;

I think we can get rid of this rwlock as I think this will seriously
hurt larger machines.

 +};
 +
 +struct mmu_notifier {
 + struct hlist_node hlist;
 + const struct mmu_notifier_ops *ops;
 +};
 +
 +#include <linux/mm_types.h>
 +
 +extern void mmu_notifier_register(struct mmu_notifier *mn,
 +   struct mm_struct *mm);
 +extern void mmu_notifier_unregister(struct mmu_notifier *mn,
 + struct mm_struct *mm);
 +extern void mmu_notifier_release(struct mm_struct *mm);
 +
 +static inline void mmu_notifier_head_init(struct mmu_notifier_head *mnh)
 +{
 + INIT_HLIST_HEAD(&mnh->head);
 + rwlock_init(&mnh->lock);
 +}
 +
 +#define mmu_notifier(function, mm, args...)  \
 + do {\
 + struct mmu_notifier *__mn;  \
 + struct hlist_node *__n; \
 + \
 + if (unlikely(!hlist_empty(&(mm)->mmu_notifier.head))) { \
 + read_lock(&(mm)->mmu_notifier.lock);\
rcu_read_lock();
 + hlist_for_each_entry(__mn, __n, \
hlist_for_each_entry_rcu
 +  &(mm)->mmu_notifier.head,  \
 +  hlist) \
 + if (__mn->ops->function)\
 + __mn->ops->function(__mn,   \
 + mm, \
 + args);  \
 + read_unlock(&(mm)->mmu_notifier.lock);  \
rcu_read_unlock();
 + }   \
 + } while (0)
 +
 +#else /* CONFIG_MMU_NOTIFIER */
 +
 +#define mmu_notifier_register(mn, mm) do {} while(0)
 +#define mmu_notifier_unregister(mn, mm) do {} while (0)
 +#define mmu_notifier_release(mm) do {} while (0)
 +#define mmu_notifier_head_init(mmh) do {} while (0)
 +
 +#define mmu_notifier(function, mm, args...)  \
 + do { } while (0)
 +
 +#endif /* CONFIG_MMU_NOTIFIER */
 +
 +#endif /* _LINUX_MMU_NOTIFIER_H */


 diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
 new file mode 100644
 --- /dev/null
 +++ b/mm/mmu_notifier.c
 @@ -0,0 +1,44 @@
 +/*
 + *  linux/mm/mmu_notifier.c
 + *
 + *  Copyright (C) 2008  Qumranet, Inc.
 + *
 + *  This work is licensed under the terms of the GNU GPL, version 2. See
 + *  the COPYING file in the top-level directory.
 + */
 +
 +#include <linux/mmu_notifier.h>
 +#include <linux/module.h>
 +
 +void mmu_notifier_release(struct mm_struct *mm)
 +{
 + struct mmu_notifier *mn;
 + struct hlist_node *n, *tmp;
 +
 + if (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
 + read_lock(&mm->mmu_notifier.lock);
rcu_read_lock();
 + hlist_for_each_entry_safe(mn, n, tmp,
hlist_for_each_entry_safe_rcu
 +   &mm->mmu_notifier.head, hlist) {
 + if (mn->ops->release)
 + mn->ops->release(mn, mm);
 + hlist_del(&mn->hlist);
hlist_del_rcu
 + }
 + read_unlock(&mm->mmu_notifier.lock);
rcu_read_unlock();
 + }
 +}
 +
 +void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
 +{
 + write_lock(&mm->mmu_notifier.lock);
spin_lock
 + hlist_add_head(&mn->hlist, &mm->mmu_notifier.head);
hlist_add_head_rcu
 + write_unlock(&mm->mmu_notifier.lock);
spin_unlock
 +}
 

Re: [PATCH] mmu notifiers #v3

2008-01-22 Thread Christoph Lameter
On Tue, 22 Jan 2008, Peter Zijlstra wrote:

> I think we can get rid of this rwlock as I think this will seriously
> hurt larger machines.

Correct.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mmu notifiers #v3

2008-01-22 Thread Andrea Arcangeli
On Tue, Jan 22, 2008 at 08:28:47PM +0100, Peter Zijlstra wrote:
> I think we can get rid of this rwlock as I think this will seriously
> hurt larger machines.

Yep, I initially considered it, nevertheless given you solved part of
the complication I can add it now ;). The only technical reason for
not using RCU is if certain users of the notifiers are registering and
unregistering at high frequency through objects that may need to be
freed quickly.

I can tell the KVM usage of the mmu notifiers is sure fine to use RCU.
Then I will have to update KVM so that it will free the kvm structure
after waiting a quiescent point to avoid kernel crashing memory
corruption after applying your changes to the mmu notifier.

Thanks!
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mmu notifiers #v3

2008-01-22 Thread Hugh Dickins
On Tue, 22 Jan 2008, Andrea Arcangeli wrote:
 
> Then I will have to update KVM so that it will free the kvm structure
> after waiting a quiescent point to avoid kernel crashing memory
> corruption after applying your changes to the mmu notifier.

It may not be suitable (I've not looked into your needs), but consider
SLAB_DESTROY_BY_RCU: it might give you the easiest way to do that.

Hugh
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mmu notifiers #v3

2008-01-21 Thread Rik van Riel
On Mon, 21 Jan 2008 13:52:04 +0100
Andrea Arcangeli <[EMAIL PROTECTED]> wrote:

> Signed-off-by: Andrea Arcangeli <[EMAIL PROTECTED]>

Reviewed-by: Rik van Riel <[EMAIL PROTECTED]>

-- 
All rights reversed.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] mmu notifiers #v3

2008-01-21 Thread Andrea Arcangeli
On Thu, Jan 17, 2008 at 08:32:52PM +0100, Andrea Arcangeli wrote:
> To make this work we still need notification from the VM about memory
> pressure [..]

Ok I thought some more at the aging issue of the hot kvm pages (to
prevent the guest-OS very-hot working set to be swapped out). So I now
hooked a age_page mmu notifier in the page_referenced mkold path. This
way when the linux pte is marked old, we can also drop the spte. This
way we give the guest-OS a whole round scan of the inactive list in
order to generate a vmexit minor fault by touching the hot page. The
very-lightweight vmexit will call into follow_page again that I
accordingly changed to mark the pte young (which is nicer because it
truly simulates what a regular access through the virtual address
would do). For direct-io it makes no difference and this way the next
time page_referenced runs it'll find the pte young again and it'll
mark the pte old again and in turn it'll call ->age_page again that
will drop the spte again, etc... So the working set will be sticky in
ram and it won't generate spurious swapouts (this is the theory at
least). It works well in practice so far but I don't have hard numbers
myself (I just implemented what I think is a quite effective aging
strategy to do a not random page replacement on the very hot guest-OS
working set).

In absence of memory pressure (or with little pressure) there will be
no age_page calls at all and the spte cache can grow freely without
any vmexit. This provides peak performance in absence of memory
pressure.

This keeps the VM aging decision in the VM instead of having an lru of
sptes to collect. The lru of sptes to collect would still be
interesting for the shrinker method though (similar to dcache/inode
lru etc..).

This update also adds some locking so multiple subsystems can
register/unregister for the notifiers at any time (something that had
to be handled by design with external serialization before and
effectively it was a bit fragile).

BTW, when MMU_NOTIFIER=n the kernel compile spawns a warning in
memory.c about two unused variables, not sure if it worth hiding it
given I suppose most people will have MMU_NOTIFIER=y. One easy way to
avoid the warning is to move the mmu_notifier call out of line and to
have one function per notifier (which was suggested by Christoph
already as an icache optimization). But this implementation keeps the
patch smaller and quicker to improve for now...

I'd like to know if this could be possibly merged soon and what I need
to change to make this happen. Thanks!

The kvm side of this can be found here:

http://marc.info/?l=kvm-devel&m=120091930324366&w=2
http://marc.info/?l=kvm-devel&m=120091906724000&w=2
http://marc.info/?l=kvm-devel&m=120091939024572&w=2

Signed-off-by: Andrea Arcangeli <[EMAIL PROTECTED]>

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -44,8 +44,10 @@
 ({ \
int __young;\
__young = ptep_test_and_clear_young(__vma, __address, __ptep);  \
-   if (__young)\
+   if (__young) {  \
flush_tlb_page(__vma, __address);   \
+   mmu_notifier(age_page, (__vma)->vm_mm, __address);  \
+   }   \
__young;\
 })
 #endif
@@ -86,6 +88,7 @@ do {  
\
pte_t __pte;\
__pte = ptep_get_and_clear((__vma)->vm_mm, __address, __ptep);  \
flush_tlb_page(__vma, __address);   \
+   mmu_notifier(invalidate_page, (__vma)->vm_mm, __address);   \
__pte;  \
 })
 #endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -10,6 +10,7 @@
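 #include <linux/rbtree.h>
 #include <linux/rwsem.h>
 #include <linux/completion.h>
+#include <linux/mmu_notifier.h>
 #include <asm/page.h>
 #include <asm/mmu.h>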
 
@@ -219,6 +220,10 @@ struct mm_struct {
/* aio bits */
rwlock_t ioctx_list_lock;
struct kioctx   *ioctx_list;
+
+#ifdef CONFIG_MMU_NOTIFIER
+   struct mmu_notifier_head mmu_notifier; /* MMU notifier list */
+#endif
 };
 
 #endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
new file mode 100644
--- /dev/null
+++ b/include/linux/mmu_notifier.h
@@ -0,0 +1,79 @@
+#ifndef _LINUX_MMU_NOTIFIER_H
+#define _LINUX_MMU_NOTIFIER_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#ifdef CONFIG_MMU_NOTIFIER
+
+struct mmu_notifier;
+
+struct mmu_notifier_ops {
+   void 

[PATCH] mmu notifiers #v3

2008-01-21 Thread Andrea Arcangeli
On Thu, Jan 17, 2008 at 08:32:52PM +0100, Andrea Arcangeli wrote:
> To make this work we still need notification from the VM about memory
> pressure [..]

Ok I thought some more at the aging issue of the hot kvm pages (to
prevent the guest-OS very-hot working set to be swapped out). So I now
hooked a age_page mmu notifier in the page_referenced mkold path. This
way when the linux pte is marked old, we can also drop the spte. This
way we give the guest-OS a whole round scan of the inactive list in
order to generate a vmexit minor fault by touching the hot page. The
very-lightweight vmexit will call into follow_page again that I
accordingly changed to mark the pte young (which is nicer because it
truly simulates what a regular access through the virtual address
would do). For direct-io it makes no difference and this way the next
time page_referenced runs it'll find the pte young again and it'll
mark the pte old again and in turn it'll call ->age_page again that
will drop the spte again, etc... So the working set will be sticky in
ram and it won't generate spurious swapouts (this is the theory at
least). It works well in practice so far but I don't have hard numbers
myself (I just implemented what I think is a quite effective aging
strategy to do a not random page replacement on the very hot guest-OS
working set).

In absence of memory pressure (or with little pressure) there will be
no age_page calls at all and the spte cache can grow freely without
any vmexit. This provides peak performance in absence of memory
pressure.

This keeps the VM aging decision in the VM instead of having an lru of
sptes to collect. The lru of sptes to collect would still be
interesting for the shrinker method though (similar to dcache/inode
lru etc..).

This update also adds some locking so multiple subsystems can
register/unregister for the notifiers at any time (something that had
to be handled by design with external serialization before and
effectively it was a bit fragile).

BTW, when MMU_NOTIFIER=n the kernel compile spawns a warning in
memory.c about two unused variables, not sure if it worth hiding it
given I suppose most people will have MMU_NOTIFIER=y. One easy way to
avoid the warning is to move the mmu_notifier call out of line and to
have one function per notifier (which was suggested by Christoph
already as an icache optimization). But this implementation keeps the
patch smaller and quicker to improve for now...

I'd like to know if this could be possibly merged soon and what I need
to change to make this happen. Thanks!

The kvm side of this can be found here:

http://marc.info/?l=kvm-devel&m=120091930324366&w=2
http://marc.info/?l=kvm-devel&m=120091906724000&w=2
http://marc.info/?l=kvm-devel&m=120091939024572&w=2

Signed-off-by: Andrea Arcangeli <[EMAIL PROTECTED]>

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -44,8 +44,10 @@
 ({ \
int __young;\
__young = ptep_test_and_clear_young(__vma, __address, __ptep);  \
-   if (__young)\
+   if (__young) {  \
flush_tlb_page(__vma, __address);   \
+   mmu_notifier(age_page, (__vma)->vm_mm, __address);  \
+   }   \
__young;\
 })
 #endif
@@ -86,6 +88,7 @@ do {  
\
pte_t __pte;\
__pte = ptep_get_and_clear((__vma)->vm_mm, __address, __ptep);  \
flush_tlb_page(__vma, __address);   \
+   mmu_notifier(invalidate_page, (__vma)->vm_mm, __address);   \
__pte;  \
 })
 #endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -10,6 +10,7 @@
 #include <linux/rbtree.h>
 #include <linux/rwsem.h>
 #include <linux/completion.h>
+#include <linux/mmu_notifier.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
 
@@ -219,6 +220,10 @@ struct mm_struct {
/* aio bits */
rwlock_t ioctx_list_lock;
struct kioctx   *ioctx_list;
+
+#ifdef CONFIG_MMU_NOTIFIER
+   struct mmu_notifier_head mmu_notifier; /* MMU notifier list */
+#endif
 };
 
 #endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
new file mode 100644
--- /dev/null
+++ b/include/linux/mmu_notifier.h
@@ -0,0 +1,79 @@
+#ifndef _LINUX_MMU_NOTIFIER_H
+#define _LINUX_MMU_NOTIFIER_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>

Re: [PATCH] mmu notifiers #v3

2008-01-21 Thread Rik van Riel
On Mon, 21 Jan 2008 13:52:04 +0100
Andrea Arcangeli <[EMAIL PROTECTED]> wrote:

> Signed-off-by: Andrea Arcangeli <[EMAIL PROTECTED]>

Reviewed-by: Rik van Riel <[EMAIL PROTECTED]>

-- 
All rights reversed.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/