Re: [kvm-devel] s390 kvm_virtio.c build error

2008-05-05 Thread Martin Schwidefsky
On Mon, 2008-05-05 at 16:00 +0300, Avi Kivity wrote:
 Christian Borntraeger wrote:
  Hmm... this should help:
 
  ---
   drivers/s390/kvm/kvm_virtio.c |   40 +++++++++++++++++++++++-----------------
  
   1 file changed, 23 insertions(+), 17 deletions(-)
  
 
  Thanks Heiko.
  I did a short test and it seems to work.
 
  Acked-by: Christian Borntraeger [EMAIL PROTECTED]
 
  This looks almost identical to Rusty's patch. Who is going to send this (or 
  Rustys) patch to Linus?

 
 I can, but tell me which one.  Also, the patch (Heiko's) needs a 
 changelog entry and a signoff.

I've added Heiko's patch to my patchqueue. But since this is
drivers/s390/kvm this should go in over the kvm.git. See patch below.

-- 
blue skies,
  Martin.

Reality continues to ruin my life. - Calvin.

---
Subject: [PATCH] kvm/s390 compile error

From: Heiko Carstens [EMAIL PROTECTED]

Fix kvm compile error:

Commit c45a6816c19dee67b8f725e6646d428901a6dc24
(virtio: explicit advertisement of driver features)
and commit e976a2b997fc4ad70ccc53acfe62811c4aaec851
(s390: KVM guest: virtio device support, and kvm hypercalls)
don't like each other:

  CC  drivers/s390/kvm/kvm_virtio.o
drivers/s390/kvm/kvm_virtio.c:224: error: unknown field 'feature' specified in 
initializer
drivers/s390/kvm/kvm_virtio.c:224: warning: initialization from incompatible 
pointer type
make[3]: *** [drivers/s390/kvm/kvm_virtio.o] Error 1

Cc: Adrian Bunk [EMAIL PROTECTED]
Signed-off-by: Heiko Carstens [EMAIL PROTECTED]
Signed-off-by: Martin Schwidefsky [EMAIL PROTECTED]
---

 drivers/s390/kvm/kvm_virtio.c |   40 +++-
 1 file changed, 23 insertions(+), 17 deletions(-)

diff -urpN linux-2.6/drivers/s390/kvm/kvm_virtio.c 
linux-2.6-patched/drivers/s390/kvm/kvm_virtio.c
--- linux-2.6/drivers/s390/kvm/kvm_virtio.c 2008-05-05 13:20:45.0 
+0200
+++ linux-2.6-patched/drivers/s390/kvm/kvm_virtio.c 2008-05-05 
13:20:48.0 +0200
@@ -78,27 +78,32 @@ static unsigned desc_size(const struct k
 		+ desc->config_len;
 }
 
-/*
- * This tests (and acknowleges) a feature bit.
- */
-static bool kvm_feature(struct virtio_device *vdev, unsigned fbit)
+/* This gets the device's feature bits. */
+static u32 kvm_get_features(struct virtio_device *vdev)
 {
+	unsigned int i;
+	u32 features = 0;
 	struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
-	u8 *features;
+	u8 *in_features = kvm_vq_features(desc);
 
-	if (fbit / 8 > desc->feature_len)
-		return false;
+	for (i = 0; i < min(desc->feature_len * 8, 32); i++)
+		if (in_features[i / 8] & (1 << (i % 8)))
+			features |= (1 << i);
+	return features;
+}
 
-	features = kvm_vq_features(desc);
-	if (!(features[fbit / 8] & (1 << (fbit % 8))))
-		return false;
+static void kvm_set_features(struct virtio_device *vdev, u32 features)
+{
+	unsigned int i;
+	struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+	/* Second half of bitmap is features we accept. */
+	u8 *out_features = kvm_vq_features(desc) + desc->feature_len;
 
-	/*
-	 * We set the matching bit in the other half of the bitmap to tell the
-	 * Host we want to use this feature.
-	 */
-	features[desc->feature_len + fbit / 8] |= (1 << (fbit % 8));
-	return true;
+	memset(out_features, 0, desc->feature_len);
+	for (i = 0; i < min(desc->feature_len * 8, 32); i++) {
+		if (features & (1 << i))
+			out_features[i / 8] |= (1 << (i % 8));
+	}
 }
 
 /*
@@ -221,7 +226,8 @@ static void kvm_del_vq(struct virtqueue 
  * The config ops structure as defined by virtio config
  */
 static struct virtio_config_ops kvm_vq_configspace_ops = {
-	.feature = kvm_feature,
+	.get_features = kvm_get_features,
+	.set_features = kvm_set_features,
 	.get = kvm_get,
 	.set = kvm_set,
 	.get_status = kvm_get_status,



-
This SF.net email is sponsored by the 2008 JavaOne(SM) Conference 
Don't miss this year's exciting event. There's still time to save $100. 
Use priority code J8TL2D2. 
http://ad.doubleclick.net/clk;198757673;13503038;p?http://java.sun.com/javaone
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [PATCH 03/04] kvm-s390: Improve pgste accesses

2008-04-04 Thread Martin Schwidefsky
On Fri, 2008-04-04 at 15:12 +0200, Carsten Otte wrote:
 Index: kvm/include/asm-s390/pgtable.h
 ===
 --- kvm.orig/include/asm-s390/pgtable.h
 +++ kvm/include/asm-s390/pgtable.h
 @@ -553,12 +553,12 @@ static inline void ptep_rcp_copy(pte_t *
 
 	skey = page_get_storage_key(page_to_phys(page));
 	if (skey & _PAGE_CHANGED)
 -		set_bit(RCP_GC_BIT, pgste);
 +		set_bit_simple(RCP_GC_BIT, pgste);
 	if (skey & _PAGE_REFERENCED)
 -		set_bit(RCP_GR_BIT, pgste);
 -	if (test_and_clear_bit(RCP_HC_BIT, pgste))
 +		set_bit_simple(RCP_GR_BIT, pgste);
 +	if (test_and_clear_bit_simple(RCP_HC_BIT, pgste))
 		SetPageDirty(page);
 -	if (test_and_clear_bit(RCP_HR_BIT, pgste))
 +	if (test_and_clear_bit_simple(RCP_HR_BIT, pgste))
 		SetPageReferenced(page);
 #endif
 }
 @@ -732,8 +732,8 @@ static inline int ptep_test_and_clear_yo
 	young = ((page_get_storage_key(physpage) & _PAGE_REFERENCED) != 0);
 	rcp_lock(ptep);
 	if (young)
 -		set_bit(RCP_GR_BIT, pgste);
 -	young |= test_and_clear_bit(RCP_HR_BIT, pgste);
 +		set_bit_simple(RCP_GR_BIT, pgste);
 +	young |= test_and_clear_bit_simple(RCP_HR_BIT, pgste);
 	rcp_unlock(ptep);
 	return young;
 #endif

Major formatting accident ?

-- 
blue skies,
  Martin.

Reality continues to ruin my life. - Calvin.



-
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


Re: [kvm-devel] [RFC/PATCH 01/15] preparation: provide hook to enable pgstes in user pagetable

2008-03-23 Thread Martin Schwidefsky
On Sun, 2008-03-23 at 12:15 +0200, Avi Kivity wrote:
  Can you convert the page tables at a later time without doing a
  wholesale replacement of the mm?  It should be a bit easier to keep
  people off the pagetables than keep their grubby mitts off the mm
  itself.
  
 
  Yes, as far as I can see you're right. And whatever we do in arch code,
  after all it's just a work around to avoid a new clone flag.
  If something like clone() with CLONE_KVM would be useful for more
  architectures than just s390 then maybe we should try to get a flag.
 
  Oh... there are just two unused clone flag bits left. Looks like the
  namespace changes ate up a lot of them lately.
 
  Well, we could still play dirty tricks like setting a bit in current
  via whatever mechanism which indicates child-wants-extended-page-tables
  and then just fork and be happy.

 
 How about taking mmap_sem for write and converting all page tables 
 in-place?  I'd rather avoid the need to fork() when creating a VM.

That was my initial approach as well. If all the page table allocations
can be fulfilled the code is not too complicated. To handle allocation
failures gets tricky. At this point I realized that dup_mmap already
does what we want to do. It walks all the page tables, allocates new
page tables and copies the ptes. In principle I would reinvent the wheel
if we can not use dup_mmap.

-- 
blue skies,
  Martin.

Reality continues to ruin my life. - Calvin.





-
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse012070mrt/direct/01/
___
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel


[kvm-devel] [patch 3/6] Guest page hinting: mlocked pages.

2007-06-28 Thread Martin Schwidefsky
From: Martin Schwidefsky [EMAIL PROTECTED]
From: Hubertus Franke [EMAIL PROTECTED]
From: Himanshu Raj [EMAIL PROTECTED]

Add code to get mlock() working with guest page hinting. The problem
with mlock is that locked pages may not be removed from page cache.
That means they need to be stable. page_make_volatile needs a way to
check if a page has been locked. To avoid traversing vma lists - which
would hurt performance a lot - a field is added in the struct
address_space. This field is set in mlock_fixup if a vma gets mlocked.
The bit never gets removed - once a file had an mlocked vma all future
pages added to it will stay stable.

The pages of an mlocked area are made present in the linux page table by
a call to make_pages_present which calls get_user_pages and follow_page.
The follow_page function is called for each page in the mlocked vma,
if the VM_LOCKED bit in the vma flags is set the page is made stable.

Signed-off-by: Martin Schwidefsky [EMAIL PROTECTED]
---

 include/linux/fs.h |   10 ++
 mm/memory.c|5 +++--
 mm/mlock.c |2 ++
 mm/page-states.c   |5 -
 mm/rmap.c  |   13 +++--
 5 files changed, 30 insertions(+), 5 deletions(-)

diff -urpN linux-2.6/include/linux/fs.h linux-2.6-patched/include/linux/fs.h
--- linux-2.6/include/linux/fs.h2007-06-25 09:18:27.0 +0200
+++ linux-2.6-patched/include/linux/fs.h2007-06-28 18:19:45.0 
+0200
@@ -450,6 +450,9 @@ struct address_space {
 	spinlock_t		private_lock;	/* for use by the address_space */
 	struct list_head	private_list;	/* ditto */
 	struct address_space	*assoc_mapping;	/* ditto */
+#ifdef CONFIG_PAGE_STATES
+	unsigned int		mlocked;	/* set if VM_LOCKED vmas present */
+#endif
 } __attribute__((aligned(sizeof(long))));
 	/*
 	 * On most architectures that alignment is already the case; but
@@ -457,6 +460,13 @@ struct address_space {
 	 * of struct page's mapping pointer be used for PAGE_MAPPING_ANON.
 	 */
 
+static inline void mapping_set_mlocked(struct address_space *mapping)
+{
+#ifdef CONFIG_PAGE_STATES
+	mapping->mlocked = 1;
+#endif
+}
+
+
 struct block_device {
dev_t   bd_dev;  /* not a kdev_t - it's a search key */
struct inode *  bd_inode;   /* will die */
diff -urpN linux-2.6/mm/memory.c linux-2.6-patched/mm/memory.c
--- linux-2.6/mm/memory.c   2007-06-28 18:19:45.0 +0200
+++ linux-2.6-patched/mm/memory.c   2007-06-28 18:19:45.0 +0200
@@ -981,9 +981,10 @@ struct page *follow_page(struct vm_area_
 	if (flags & FOLL_GET)
 		get_page(page);
 
-	if (flags & FOLL_GET) {
+	if ((flags & FOLL_GET) || (vma->vm_flags & VM_LOCKED)) {
 		/*
-		 * The page is made stable if a reference is acquired.
+		 * The page is made stable if a reference is acquired or
+		 * the vm area is locked.
 		 * If the caller does not get a reference it implies that
 		 * the caller can deal with page faults in case the page
 		 * is swapped out. In this case the caller can deal with
diff -urpN linux-2.6/mm/mlock.c linux-2.6-patched/mm/mlock.c
--- linux-2.6/mm/mlock.c2007-05-22 09:49:49.0 +0200
+++ linux-2.6-patched/mm/mlock.c2007-06-28 18:19:45.0 +0200
@@ -71,6 +71,8 @@ success:
 	 */
 	pages = (end - start) >> PAGE_SHIFT;
 	if (newflags & VM_LOCKED) {
+		if (vma->vm_file && vma->vm_file->f_mapping)
+			mapping_set_mlocked(vma->vm_file->f_mapping);
 		pages = -pages;
 		if (!(newflags & VM_IO))
 			ret = make_pages_present(start, end);
diff -urpN linux-2.6/mm/page-states.c linux-2.6-patched/mm/page-states.c
--- linux-2.6/mm/page-states.c  2007-06-28 18:19:45.0 +0200
+++ linux-2.6-patched/mm/page-states.c  2007-06-28 18:19:45.0 +0200
@@ -29,6 +29,8 @@
  */
 static inline int check_bits(struct page *page)
 {
+   struct address_space *mapping;
+
/*
 * There are several conditions that prevent a page from becoming
 * volatile. The first check is for the page bits.
@@ -44,7 +46,8 @@ static inline int check_bits(struct page
 * it volatile. It will be freed soon. And if the mapping ever
 * had locked pages all pages of the mapping will stay stable.
 */
-	return page_mapping(page) != NULL;
+	mapping = page_mapping(page);
+	return mapping && !mapping->mlocked;
 }
 
 /*
diff -urpN linux-2.6/mm/rmap.c linux-2.6-patched/mm/rmap.c
--- linux-2.6/mm/rmap.c 2007-06-28 18:19:45.0 +0200
+++ linux-2.6-patched/mm/rmap.c 2007-06-28 18:19:45.0 +0200
@@ -706,8 +706,17 @@ static int try_to_unmap_one(struct page 
 */
	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
(ptep_clear_flush_young

[kvm-devel] [patch 2/6] Guest page hinting: volatile swap cache.

2007-06-28 Thread Martin Schwidefsky
From: Martin Schwidefsky [EMAIL PROTECTED]
From: Hubertus Franke [EMAIL PROTECTED]
From: Himanshu Raj [EMAIL PROTECTED]

The volatile page state can be used for anonymous pages as well, if
they have been added to the swap cache and the swap write is finished.
The tricky bit is in free_swap_and_cache. The call to find_get_page
dead-locks with the discard handler. If the page has been discarded
find_get_page will try to remove it. To do that it needs the page table
lock of all mappers but one is held by the caller of free_swap_and_cache.
A special variant of find_get_page is needed that does not check the
page state and returns a page reference even if the page is discarded.
The second pitfall is that the page needs to be made stable before the
swap slot gets freed. If the page cannot be made stable because it has
been discarded the swap slot may not be freed because it is still
needed to reload the discarded page from the swap device.

Signed-off-by: Martin Schwidefsky [EMAIL PROTECTED]
---

 include/linux/pagemap.h |3 ++
 include/linux/swap.h|5 
 mm/filemap.c|   19 +
 mm/memory.c |   13 +++-
 mm/page-states.c|   26 
 mm/rmap.c   |   51 
 mm/swap_state.c |   25 ++-
 mm/swapfile.c   |   30 ++--
 mm/vmscan.c |3 ++
 9 files changed, 154 insertions(+), 21 deletions(-)

diff -urpN linux-2.6/include/linux/pagemap.h 
linux-2.6-patched/include/linux/pagemap.h
--- linux-2.6/include/linux/pagemap.h   2007-06-28 18:19:44.0 +0200
+++ linux-2.6-patched/include/linux/pagemap.h   2007-06-28 18:19:44.0 
+0200
@@ -61,8 +61,11 @@ static inline void mapping_set_gfp_mask(
 
 #define page_cache_get(page)   get_page(page)
 #ifdef CONFIG_PAGE_STATES
+extern struct page * find_get_page_nodiscard(struct address_space *mapping,
+unsigned long index);
 #define page_cache_release(page)   put_page_check(page)
 #else
+#define find_get_page_nodiscard(mapping, index) find_get_page(mapping, index)
 #define page_cache_release(page)   put_page(page)
 #endif
 void release_pages(struct page **pages, int nr, int cold);
diff -urpN linux-2.6/include/linux/swap.h linux-2.6-patched/include/linux/swap.h
--- linux-2.6/include/linux/swap.h  2007-02-12 12:09:06.0 +0100
+++ linux-2.6-patched/include/linux/swap.h  2007-06-28 18:19:44.0 
+0200
@@ -228,6 +228,7 @@ extern struct address_space swapper_spac
 extern void show_swap_cache_info(void);
 extern int add_to_swap(struct page *, gfp_t);
 extern void __delete_from_swap_cache(struct page *);
+extern void __delete_from_swap_cache_nocheck(struct page *);
 extern void delete_from_swap_cache(struct page *);
 extern int move_to_swap_cache(struct page *, swp_entry_t);
 extern int move_from_swap_cache(struct page *, unsigned long,
@@ -343,6 +344,10 @@ static inline void __delete_from_swap_ca
 {
 }
 
+static inline void __delete_from_swap_cache_nocheck(struct page *page)
+{
+}
+
 static inline void delete_from_swap_cache(struct page *page)
 {
 }
diff -urpN linux-2.6/mm/filemap.c linux-2.6-patched/mm/filemap.c
--- linux-2.6/mm/filemap.c  2007-06-28 18:19:44.0 +0200
+++ linux-2.6-patched/mm/filemap.c  2007-06-28 18:19:44.0 +0200
@@ -507,6 +507,25 @@ static int __sleep_on_page_lock(void *wo
return 0;
 }
 
+#ifdef CONFIG_PAGE_STATES
+
+struct page * find_get_page_nodiscard(struct address_space *mapping,
+				      unsigned long offset)
+{
+	struct page *page;
+
+	read_lock_irq(&mapping->tree_lock);
+	page = radix_tree_lookup(&mapping->page_tree, offset);
+	if (page)
+		page_cache_get(page);
+	read_unlock_irq(&mapping->tree_lock);
+	return page;
+}
+
+EXPORT_SYMBOL(find_get_page_nodiscard);
+
+#endif
+
 /*
  * In order to wait for pages to become available there must be
  * waitqueues associated with pages. By using a hash table of
diff -urpN linux-2.6/mm/memory.c linux-2.6-patched/mm/memory.c
--- linux-2.6/mm/memory.c   2007-06-28 18:19:44.0 +0200
+++ linux-2.6-patched/mm/memory.c   2007-06-28 18:19:44.0 +0200
@@ -500,7 +500,18 @@ out_discard_pte:
 * in page cache anymore. Do what try_to_unmap_one would do
 * if the copy_one_pte had taken place before page_discard.
 */
-	if (page->index != linear_page_index(vma, addr))
+	if (PageAnon(page)) {
+		swp_entry_t entry = { .val = page_private(page) };
+		swap_duplicate(entry);
+		if (list_empty(&dst_mm->mmlist)) {
+			spin_lock(&mmlist_lock);
+			if (list_empty(&dst_mm->mmlist))
+				list_add(&dst_mm->mmlist, &init_mm.mmlist);
+			spin_unlock(&mmlist_lock

[kvm-devel] [patch 4/6] Guest page hinting: writable page table entries.

2007-06-28 Thread Martin Schwidefsky
From: Martin Schwidefsky [EMAIL PROTECTED]
From: Hubertus Franke [EMAIL PROTECTED]
From: Himanshu Raj [EMAIL PROTECTED]

The volatile state for page cache and swap cache pages requires that
the host system needs to be able to determine if a volatile page is
dirty before removing it. This excludes almost all platforms from using
the scheme. What is needed is a way to distinguish between pages that
are purely read-only and pages that might get written to. This allows
platforms with per-pte dirty bits to use the scheme and platforms with
per-page dirty bits a small optimization.

Whenever a writable pte is created a check is added that allows to
move the page into the correct state. This needs to be done before
the writable pte is established. To avoid unnecessary state transitions
and the need for a counter, a new page flag PG_writable is added. Only
the creation of the first writable pte will do a page state change.
Even if all the writable ptes pointing to a page are removed again,
the page stays in the safe state until all read-only users of the page
have unmapped it as well. Only then is the PG_writable bit reset.

The state a page needs to have if a writable pte is present depends
on the platform. A platform with per-pte dirty bits wants to move the
page into stable state, a platform with per-page dirty bits like s390
can decide to move the page into a special state that requires the host
system to check the dirty bit before discarding a page.

Signed-off-by: Martin Schwidefsky [EMAIL PROTECTED]
---

 fs/exec.c   |7 +++--
 include/linux/page-flags.h  |6 
 include/linux/page-states.h |   27 +++-
 mm/fremap.c |1 
 mm/memory.c |5 +++
 mm/mprotect.c   |2 +
 mm/page-states.c|   58 ++--
 mm/page_alloc.c |3 +-
 mm/rmap.c   |1 
 9 files changed, 104 insertions(+), 6 deletions(-)

diff -urpN linux-2.6/fs/exec.c linux-2.6-patched/fs/exec.c
--- linux-2.6/fs/exec.c 2007-05-25 09:33:26.0 +0200
+++ linux-2.6-patched/fs/exec.c 2007-06-28 18:19:47.0 +0200
@@ -51,6 +51,7 @@
 #include <linux/cn_proc.h>
 #include <linux/audit.h>
 #include <linux/signalfd.h>
+#include <linux/page-states.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -313,6 +314,7 @@ void install_arg_page(struct vm_area_str
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t * pte;
+	pte_t pte_val;
 	spinlock_t *ptl;
 
 	if (unlikely(anon_vma_prepare(vma)))
@@ -328,8 +330,9 @@ void install_arg_page(struct vm_area_str
 	}
 	inc_mm_counter(mm, anon_rss);
 	lru_cache_add_active(page);
-	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
-			page, vma->vm_page_prot))));
+	pte_val = pte_mkdirty(pte_mkwrite(mk_pte(page, vma->vm_page_prot)));
+	page_check_writable(page, pte_val, 2);
+	set_pte_at(mm, address, pte, pte_val);
 	page_add_new_anon_rmap(page, vma, address);
 	pte_unmap_unlock(pte, ptl);
 
diff -urpN linux-2.6/include/linux/page-flags.h 
linux-2.6-patched/include/linux/page-flags.h
--- linux-2.6/include/linux/page-flags.h2007-06-28 18:19:44.0 
+0200
+++ linux-2.6-patched/include/linux/page-flags.h2007-06-28 
18:19:47.0 +0200
@@ -105,6 +105,7 @@
 #endif
 
 #define PG_discarded   20  /* Page discarded by the hypervisor. */
+#define PG_writable21  /* Page is mapped writable. */
 
 /*
  * Manipulation of page state flags
@@ -283,6 +284,11 @@ static inline void __ClearPageTail(struc
 #define TestSetPageDiscarded(page) 0
 #endif
 
+#define PageWritable(page) test_bit(PG_writable, &(page)->flags)
+#define TestSetPageWritable(page) \
+	test_and_set_bit(PG_writable, &(page)->flags)
+#define ClearPageWritable(page) clear_bit(PG_writable, &(page)->flags)
+
 struct page;   /* forward declaration */
 
 extern void cancel_dirty_page(struct page *page, unsigned int account_size);
diff -urpN linux-2.6/include/linux/page-states.h 
linux-2.6-patched/include/linux/page-states.h
--- linux-2.6/include/linux/page-states.h   2007-06-28 18:19:44.0 
+0200
+++ linux-2.6-patched/include/linux/page-states.h   2007-06-28 
18:19:47.0 +0200
@@ -55,6 +55,9 @@ extern void page_discard(struct page *pa
 extern int  __page_make_stable(struct page *page);
 extern void __page_make_volatile(struct page *page, int offset);
 extern void __pagevec_make_volatile(struct pagevec *pvec);
+extern void __page_check_writable(struct page *page, pte_t pte,
+ unsigned int offset);
+extern void __page_reset_writable(struct page *page);
 
 /*
  * Extended guest page hinting functions defined by using the
@@ -76,6 +79,12 @@ extern void __pagevec_make_volatile(stru
  * from the LRU list and the radix tree of its mapping.
  * page_discard uses

[kvm-devel] [patch 6/6] Guest page hinting: s390 support.

2007-06-28 Thread Martin Schwidefsky
From: Martin Schwidefsky [EMAIL PROTECTED]
From: Hubertus Franke [EMAIL PROTECTED]
From: Himanshu Raj [EMAIL PROTECTED]

s390 uses the milli-coded ESSA instruction to set the page state. The
page state is formed by four guest page states called block usage states
and three host page states called block content states.

The guest states are:
 - stable (S): there is essential content in the page
 - unused (U): there is no useful content and any access to the page will
   cause an addressing exception
 - volatile (V): there is useful content in the page. The host system is
   allowed to discard the content anytime, but has to deliver a discard
   fault with the absolute address of the page if the guest tries to
   access it.
 - potential volatile (P): the page has useful content. The host system
   is allowed to discard the content after it has checked the dirty bit
   of the page. It has to deliver a discard fault with the absolute
   address of the page if the guest tries to access it.

The host states are:
 - resident: the page is present in real memory.
 - preserved: the page is not present in real memory but the content is
   preserved elsewhere by the machine, e.g. on the paging device.
 - zero: the page is not present in real memory. The content of the page
   is logically-zero.

There are 12 combinations of guest and host state, currently only 8 are
valid page states:
 Sr: a stable, resident page.
 Sp: a stable, preserved page.
 Sz: a stable, logically zero page. A page filled with zeroes will be
 allocated on first access.
 Ur: an unused but resident page. The host could make it Uz anytime but
 it doesn't have to.
 Uz: an unused, logically zero page.
 Vr: a volatile, resident page. The guest can access it normally.
 Vz: a volatile, logically zero page. This is a discarded page. The host
 will deliver a discard fault for any access to the page.
 Pr: a potential volatile, resident page. The guest can access it normally.

The remaining 4 combinations can't occur:
 Up: an unused, preserved page. If the host tries to get rid of a Ur page
 it will remove it without writing the page content to disk and set
 the page to Uz.
 Vp: a volatile, preserved page. If the host picks a Vr page for eviction
 it will discard it and set the page state to Vz.
 Pp: a potential volatile, preserved page. There are two cases for page out:
 1) if the page is dirty then the host will preserve the page and set
 it to Sp or 2) if the page is clean then the host will discard it and
 set the page state to Vz.
 Pz: a potential volatile, logically zero page. The host system will always
 use Vz instead of Pz.

The state transitions (a diagram would be nicer but that is too hard
to do in ascii art...):
{Ur,Sr,Vr,Pr}: a resident page will change its block usage state if the
 guest requests it with page_set_{unused,stable,volatile}.
{Uz,Sz,Vz}: a logically zero page will change its block usage state if the
 guest requests it with page_set_{unused,stable,volatile}. The
 guest can't create the Pz state, the state will be Vz instead.
Ur -> Uz: the host system can remove an unused, resident page from memory
Sz -> Sr: on first access a stable, logically zero page will become resident
Sr -> Sp: the host system can swap a stable page to disk
Sp -> Sr: a guest access to a Sp page forces the host to retrieve it
Vr -> Vz: the host can discard a volatile page
Sp -> Uz: a page preserved by the host will be removed if the guest sets 
 the block usage state to unused.
Sp -> Vz: a page preserved by the host will be discarded if the guest sets
 the block usage state to volatile.
Pr -> Sp: the host can move a page from Pr to Sp if it discovers that the
 page is dirty while trying to discard the page. The page content is
 written to the paging device.
Pr -> Vz: the host can discard a Pr page. The Pz state is replaced by the
 Vz state.

The are some hazards the code has to deal with:
1) For potential volatile pages the transfer of the hardware dirty bit to
the software dirty bit needs to make sure that the page gets into the
stable state before the hardware dirty bit is cleared. Between the
page_test_dirty and the page_clear_dirty call a page_make_stable is
required.

2) Since the access of unused pages causes addressing exceptions we need
to take care with /dev/mem. The copy_{from_to}_user functions need to
be able to cope with addressing exceptions for the kernel address space.

3) The discard fault on a s390 machine delivers the absolute address of
the page that caused the fault instead of the virtual address. With the
virtual address we could have used the page table entry of the current
process to safely get a reference to the discarded page. We can get to
the struct page from the absolute page address but it is rather hard to
get to a proper page reference. The page that caused the fault could
already have been freed and reused for a different purpose. None of the
fields in the struct

[kvm-devel] [patch 5/6] Guest page hinting: minor fault optimization.

2007-06-28 Thread Martin Schwidefsky
From: Martin Schwidefsky [EMAIL PROTECTED]
From: Hubertus Franke [EMAIL PROTECTED]
From: Himanshu Raj [EMAIL PROTECTED]

On of the challenges of the guest page hinting scheme is the cost for
the state transitions. If the cost gets too high the whole concept of
page state information is in question. Therefore it is important to
avoid the state transitions when possible. One place where the state
transitions can be avoided are minor faults. Why change the page state
to stable in find_get_page and back in page_add_anon_rmap/
page_add_file_rmap if the discarded pages can be handled by the discard
fault handler? If the page is in page/swap cache just map it even if it
is already discarded. The first access to the page will cause a discard
fault which needs to be able to deal with this kind of situation anyway
because of other races in the memory management.

The special find_get_page_nodiscard variant introduced for volatile
swap cache is used which does not change the page state. The calls to
find_get_page in filemap_nopage and lookup_swap_cache are replaced with
find_get_page_nodiscard. By the use of this function a new race is
created. If a minor fault races with the discard of a page the page may
not get mapped to the page table because the discard handler removed
the page from the cache which removes the page-mapping that is needed
to find the page table entry. A check for the discarded bit is added to
do_swap_page and do_no_page. The page table lock for the pte takes care
of the synchronization.

That removes the state transitions on the minor fault path. A page that
has been mapped will eventually be unmapped again. On the unmap path
each page that has been removed from the page table is freed with a
call to page_cache_release. In general that causes an unnecessary page
state transition from volatile to volatile. To get rid of these state
transitions as well a special variants of page_cache_release is added
that does not attempt to make the page volatile.
page_cache_release_nocheck is then used in free_page_and_swap_cache
and release_pages. This makes the unmap of ptes state transitions free.

Signed-off-by: Martin Schwidefsky [EMAIL PROTECTED]
---

 include/linux/pagemap.h |1 +
 include/linux/swap.h|2 +-
 mm/filemap.c|4 ++--
 mm/fremap.c |2 ++
 mm/memory.c |4 ++--
 mm/rmap.c   |4 +---
 mm/shmem.c  |7 +++
 mm/swap_state.c |4 ++--
 8 files changed, 18 insertions(+), 10 deletions(-)

diff -urpN linux-2.6/include/linux/pagemap.h 
linux-2.6-patched/include/linux/pagemap.h
--- linux-2.6/include/linux/pagemap.h   2007-06-28 18:19:45.0 +0200
+++ linux-2.6-patched/include/linux/pagemap.h   2007-06-28 18:19:48.0 
+0200
@@ -68,6 +68,7 @@ extern struct page * find_get_page_nodis
 #define find_get_page_nodiscard(mapping, index) find_get_page(mapping, index)
 #define page_cache_release(page)   put_page(page)
 #endif
+#define page_cache_release_nocheck(page)   put_page(page)
 void release_pages(struct page **pages, int nr, int cold);
 
 #ifdef CONFIG_NUMA
diff -urpN linux-2.6/include/linux/swap.h linux-2.6-patched/include/linux/swap.h
--- linux-2.6/include/linux/swap.h  2007-06-28 18:19:45.0 +0200
+++ linux-2.6-patched/include/linux/swap.h  2007-06-28 18:19:48.0 
+0200
@@ -290,7 +290,7 @@ static inline void disable_swap_token(vo
 /* only sparc can not include linux/pagemap.h in this file
  * so leave page_cache_release and release_pages undeclared... */
 #define free_page_and_swap_cache(page) \
-   page_cache_release(page)
+   page_cache_release_nocheck(page)
 #define free_pages_and_swap_cache(pages, nr) \
release_pages((pages), (nr), 0);
 
diff -urpN linux-2.6/mm/filemap.c linux-2.6-patched/mm/filemap.c
--- linux-2.6/mm/filemap.c  2007-06-28 18:19:45.0 +0200
+++ linux-2.6-patched/mm/filemap.c  2007-06-28 18:19:48.0 +0200
@@ -1467,7 +1467,7 @@ retry_all:
 * Do we have something in the page cache already?
 */
 retry_find:
-   page = find_get_page(mapping, pgoff);
+   page = find_get_page_nodiscard(mapping, pgoff);
if (!page) {
unsigned long ra_pages;
 
@@ -1501,7 +1501,7 @@ retry_find:
start = pgoff - ra_pages / 2;
do_page_cache_readahead(mapping, file, start, ra_pages);
}
-   page = find_get_page(mapping, pgoff);
+   page = find_get_page_nodiscard(mapping, pgoff);
if (!page)
goto no_cached_page;
}
diff -urpN linux-2.6/mm/fremap.c linux-2.6-patched/mm/fremap.c
--- linux-2.6/mm/fremap.c   2007-06-28 18:19:48.0 +0200
+++ linux-2.6-patched/mm/fremap.c   2007-06-28 18:19:48.0 +0200
@@ -15,6 +15,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/page-states.h>
 
 #include asm