Functions to restore mm, VMAs and mm context are added.

Signed-off-by: Andrey Mirkin <[EMAIL PROTECTED]>
---
 checkpoint/Makefile      |    2 +-
 checkpoint/checkpoint.h  |    1 +
 checkpoint/cpt_image.h   |    5 +
 checkpoint/rst_mm.c      |  320 ++++++++++++++++++++++++++++++++++++++++++++++
 checkpoint/rst_process.c |    3 +-
 mm/mmap.c                |    1 +
 mm/mprotect.c            |    2 +
 7 files changed, 332 insertions(+), 2 deletions(-)
 create mode 100644 checkpoint/rst_mm.c

diff --git a/checkpoint/Makefile b/checkpoint/Makefile
index 689a0eb..19ca732 100644
--- a/checkpoint/Makefile
+++ b/checkpoint/Makefile
@@ -3,4 +3,4 @@ obj-y += sys_core.o
 obj-$(CONFIG_CHECKPOINT) += cptrst.o
 
 cptrst-objs := sys.o checkpoint.o cpt_process.o cpt_mm.o restart.o \
-              rst_process.o
+              rst_process.o rst_mm.o
diff --git a/checkpoint/checkpoint.h b/checkpoint/checkpoint.h
index 1d0ca49..195fdc6 100644
--- a/checkpoint/checkpoint.h
+++ b/checkpoint/checkpoint.h
@@ -65,3 +65,4 @@ int cpt_dump_mm(struct task_struct *tsk, struct cpt_context *ctx);
 int restart_container(struct cpt_context *ctx);
 int rst_get_object(int type, void *tmp, int size, struct cpt_context *ctx);
 int rst_restart_process(struct cpt_context *ctx);
+int rst_restore_mm(struct cpt_context *ctx);
diff --git a/checkpoint/cpt_image.h b/checkpoint/cpt_image.h
index 160cf85..e1fb483 100644
--- a/checkpoint/cpt_image.h
+++ b/checkpoint/cpt_image.h
@@ -233,6 +233,11 @@ struct cpt_x86_regs
        __u32   cpt_ss;
 } __attribute__ ((aligned (8)));
 
+/*
+ * Convert an address stored in the image as a 64-bit integer into a
+ * userspace pointer.  The value is narrowed through unsigned long, and the
+ * cast keeps the __user annotation so the result matches the declared
+ * return type.
+ */
+static inline void __user *cpt_ptr_import(__u64 ptr)
+{
+       return (void __user *)(unsigned long)ptr;
+}
+
 static inline __u64 cpt_timespec_export(struct timespec *tv)
 {
        return (((u64)tv->tv_sec) << 32) + tv->tv_nsec;
diff --git a/checkpoint/rst_mm.c b/checkpoint/rst_mm.c
new file mode 100644
index 0000000..fe53c45
--- /dev/null
+++ b/checkpoint/rst_mm.c
@@ -0,0 +1,320 @@
+/*
+ *  Copyright (C) 2008 Parallels, Inc.
+ *
+ *  Author: Andrey Mirkin <[EMAIL PROTECTED]>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation, version 2 of the
+ *  License.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/syscalls.h>
+
+#include "checkpoint.h"
+#include "cpt_image.h"
+
+/*
+ * Build the PROT_* mask for a VMA from the VM_* bits saved in its image.
+ * The read, write, exec, grows-down and grows-up bits are translated;
+ * every other bit of cpt_flags is ignored here.
+ */
+static unsigned long make_prot(struct cpt_vma_image *vmai)
+{
+       unsigned long mask;
+
+       mask  = (vmai->cpt_flags & VM_READ) ? PROT_READ : 0;
+       mask |= (vmai->cpt_flags & VM_WRITE) ? PROT_WRITE : 0;
+       mask |= (vmai->cpt_flags & VM_EXEC) ? PROT_EXEC : 0;
+       mask |= (vmai->cpt_flags & VM_GROWSDOWN) ? PROT_GROWSDOWN : 0;
+       mask |= (vmai->cpt_flags & VM_GROWSUP) ? PROT_GROWSUP : 0;
+
+       return mask;
+}
+
+/*
+ * Build the MAP_* flags used to re-create a VMA from its image.
+ *
+ * MAP_FIXED is always set.  The remaining flags are derived from the saved
+ * VM_* bits and from whether the image records a backing file.
+ */
+static unsigned long make_flags(struct cpt_vma_image *vmai)
+{
+       unsigned long map_flags = MAP_FIXED;
+
+       /* Either of the two sharing bits selects a shared mapping. */
+       map_flags |= (vmai->cpt_flags & (VM_SHARED | VM_MAYSHARE)) ?
+                       MAP_SHARED : MAP_PRIVATE;
+
+       /* No file reference in the image: map anonymously. */
+       if (vmai->cpt_file == CPT_NULL)
+               map_flags |= MAP_ANONYMOUS;
+
+       if (vmai->cpt_flags & VM_GROWSDOWN)
+               map_flags |= MAP_GROWSDOWN;
+#ifdef MAP_GROWSUP
+       if (vmai->cpt_flags & VM_GROWSUP)
+               map_flags |= MAP_GROWSUP;
+#endif
+       if (vmai->cpt_flags & VM_DENYWRITE)
+               map_flags |= MAP_DENYWRITE;
+       if (vmai->cpt_flags & VM_EXECUTABLE)
+               map_flags |= MAP_EXECUTABLE;
+
+       /* Areas saved without VM_ACCOUNT are mapped with MAP_NORESERVE. */
+       if (!(vmai->cpt_flags & VM_ACCOUNT))
+               map_flags |= MAP_NORESERVE;
+
+       return map_flags;
+}
+
+/*
+ * Restore one VMA of the current task from the image.
+ *
+ * Reads a CPT_OBJ_VMA object; for CPT_VMA_FILE areas also reads the file
+ * name that follows and opens that file.  The area is then mapped at its
+ * saved address with do_mmap_pgoff() and the vmai.cpt_page_num page blocks
+ * that follow in the image are written into it.
+ *
+ * Returns 0 on success, otherwise the error code of the step that failed.
+ */
+static int rst_restore_one_vma(struct cpt_context *ctx)
+{
+       int err;
+       int i;
+       unsigned long addr;
+       struct mm_struct *mm = current->mm;
+       struct cpt_vma_image vmai;
+       struct vm_area_struct *vma;
+       struct file *file = NULL;
+       unsigned long prot;
+
+       err = rst_get_object(CPT_OBJ_VMA, &vmai, sizeof(vmai), ctx);
+       if (err)
+               return err;
+
+       prot = make_prot(&vmai);
+
+       if (vmai.cpt_vma_type == CPT_VMA_FILE) {
+               struct cpt_object_hdr h;
+               int len;
+               char *path;
+
+               err = rst_get_object(CPT_OBJ_NAME, &h, sizeof(h), ctx);
+               if (err)
+                       goto out;
+               len = h.cpt_len - sizeof(h);
+               if (len < 0) {
+                       err = -EINVAL;
+                       goto out;
+               }
+               /* Reserve one extra byte: the name read from the image is
+                * not trusted to carry its own NUL terminator. */
+               path = kmalloc((size_t)len + 1, GFP_KERNEL);
+               if (!path) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+               err = ctx->read(path, len, ctx);
+               if (err) {
+                       kfree(path);
+                       goto out;
+               }
+               path[len] = '\0';
+
+               /* Just open file
+                  TODO: open with correct flags */
+               file = filp_open(path, O_RDONLY, 0);
+               kfree(path);
+               if (IS_ERR(file)) {
+                       err = PTR_ERR(file);
+                       /* The exit path must not fput() an error pointer. */
+                       file = NULL;
+                       goto out;
+               }
+       }
+
+       down_write(&mm->mmap_sem);
+       addr = do_mmap_pgoff(file, vmai.cpt_start,
+                            vmai.cpt_end - vmai.cpt_start,
+                            prot, make_flags(&vmai),
+                            vmai.cpt_pgoff);
+
+       if (addr != vmai.cpt_start) {
+               up_write(&mm->mmap_sem);
+
+               /* Either a real error value, or a mapping that landed at
+                * an address other than the saved one. */
+               err = -EINVAL;
+               if (IS_ERR((void*)addr))
+                       err = addr;
+               goto out;
+       }
+
+       vma = find_vma(mm, vmai.cpt_start);
+       if (vma == NULL) {
+               up_write(&mm->mmap_sem);
+               eprintk("cannot find mmapped vma\n");
+               err = -ESRCH;
+               goto out;
+       }
+
+       /* do_mmap_pgoff() can merge new area to previous one (not to the next,
+        * we mmap in order, the rest of mm is still unmapped). This can happen
+        * f.e. if flags are to be adjusted later, or if we had different
+        * anon_vma on two adjacent regions. Split it by brute force. */
+       if (vma->vm_start != vmai.cpt_start) {
+               err = split_vma(mm, vma, (unsigned long)vmai.cpt_start, 0);
+               if (err) {
+                       up_write(&mm->mmap_sem);
+                       eprintk("cannot split vma\n");
+                       goto out;
+               }
+       }
+       up_write(&mm->mmap_sem);
+
+       for (i = 0; i < vmai.cpt_page_num; i++) {
+               struct cpt_page_block pb;
+
+               err = rst_get_object(CPT_OBJ_PAGES, &pb, sizeof(pb), ctx);
+               if (err)
+                       goto out;
+               if (!(vmai.cpt_flags & VM_ACCOUNT) && !(prot & PROT_WRITE)) {
+                       /* Non-writable, unaccounted area that still has
+                        * page contents in the image.  The original author
+                        * guesses these come from get_user_pages(), f.e.
+                        * when gdb inserts breakpoints.  Fill them page by
+                        * page through get_user_pages() with write and
+                        * force set.
+                        */
+                       int j;
+
+                       for (j = 0;
+                            j < (pb.cpt_end - pb.cpt_start) / PAGE_SIZE;
+                            j++) {
+                               struct page *page;
+                               void *maddr;
+
+                               err = get_user_pages(current, current->mm,
+                                               (unsigned long)pb.cpt_start +
+                                               j * PAGE_SIZE,
+                                               1, 1, 1, &page, NULL);
+                               if (err == 0)
+                                       err = -EFAULT;
+                               if (err < 0) {
+                                       eprintk("get_user_pages: %d\n", err);
+                                       goto out;
+                               }
+                               err = 0;
+                               maddr = kmap(page);
+                               if (pb.cpt_content == CPT_CONTENT_VOID)
+                                       memset(maddr, 0, PAGE_SIZE);
+                               else if (pb.cpt_content == CPT_CONTENT_DATA)
+                                       err = ctx->read(maddr, PAGE_SIZE, ctx);
+                               else
+                                       err = -EINVAL;
+                               if (!err)
+                                       set_page_dirty_lock(page);
+                               kunmap(page);
+                               /* Drop the get_user_pages() reference on
+                                * every path, errors included. */
+                               page_cache_release(page);
+                               if (err)
+                                       goto out;
+                       }
+               } else {
+                       if (!(prot & PROT_WRITE)) {
+                               /* Make the area writable while it is being
+                                * filled through its user addresses. */
+                               err = sys_mprotect(vmai.cpt_start,
+                                               vmai.cpt_end - vmai.cpt_start,
+                                               prot | PROT_WRITE);
+                               if (err)
+                                       goto out;
+                       }
+                       if (pb.cpt_content == CPT_CONTENT_VOID) {
+                               unsigned long __user *dst;
+                               int j;
+
+                               dst = (unsigned long __user *)
+                                       (unsigned long)pb.cpt_start;
+                               /* Zero the block one long at a time. */
+                               for (j = 0;
+                                    j < (pb.cpt_end - pb.cpt_start) /
+                                        sizeof(unsigned long);
+                                    j++) {
+                                       err = __put_user(0UL, dst + j);
+                                       if (err) {
+                                               eprintk("__put_user 2 %d\n",
+                                                       err);
+                                               goto out;
+                                       }
+                               }
+                       } else if (pb.cpt_content == CPT_CONTENT_DATA) {
+                               err = ctx->read(cpt_ptr_import(pb.cpt_start),
+                                               pb.cpt_end - pb.cpt_start,
+                                               ctx);
+                               if (err)
+                                       goto out;
+                       } else {
+                               err = -EINVAL;
+                               goto out;
+                       }
+                       if (!(prot & PROT_WRITE)) {
+                               /* Put the saved protection back; failing
+                                * to do so would leave the area writable. */
+                               err = sys_mprotect(vmai.cpt_start,
+                                               vmai.cpt_end - vmai.cpt_start,
+                                               prot);
+                               if (err)
+                                       goto out;
+                       }
+               }
+       }
+
+out:
+       if (file)
+               fput(file);
+       return err;
+}
+
+/*
+ * Restore the LDT of the current task from the image.
+ *
+ * Reads a CPT_OBJ_BITS header followed by b.cpt_size bytes of table data,
+ * installs the new table in current->mm->context, loads it with load_LDT()
+ * and frees the table that was installed before.
+ *
+ * Returns 0 on success, -EINVAL if the stored size is not a whole number of
+ * LDT entries or exceeds LDT_ENTRIES entries, -ENOMEM if the new table
+ * cannot be allocated, or the error returned while reading the image.
+ */
+static int rst_restore_mm_context(struct cpt_context *ctx)
+{
+       struct cpt_obj_bits b;
+       struct mm_struct *mm = current->mm;
+       int oldsize = mm->context.size;
+       int err;
+       void *oldldt;
+       void *newldt;
+
+       err = rst_get_object(CPT_OBJ_BITS, &b, sizeof(b), ctx);
+       if (err)
+               return err;
+
+       /* The table is later freed with vfree() or kfree() depending on
+        * size * LDT_ENTRY_SIZE (see the old-table release below), so the
+        * byte count must be an exact multiple of the entry size to keep
+        * that choice consistent with the allocation made here.  Also
+        * refuse sizes beyond the maximum number of LDT entries. */
+       if ((b.cpt_size & (LDT_ENTRY_SIZE - 1)) ||
+           b.cpt_size > LDT_ENTRIES * LDT_ENTRY_SIZE)
+               return -EINVAL;
+
+       if (b.cpt_size > PAGE_SIZE)
+               newldt = vmalloc(b.cpt_size);
+       else
+               newldt = kmalloc(b.cpt_size, GFP_KERNEL);
+
+       if (!newldt)
+               return -ENOMEM;
+
+       err = ctx->read(newldt, b.cpt_size, ctx);
+       if (err) {
+               /* Do not leak the half-filled table. */
+               if (b.cpt_size > PAGE_SIZE)
+                       vfree(newldt);
+               else
+                       kfree(newldt);
+               return err;
+       }
+
+       oldldt = mm->context.ldt;
+       mm->context.ldt = newldt;
+       mm->context.size = b.cpt_size / LDT_ENTRY_SIZE;
+
+       load_LDT(&mm->context);
+
+       /* Release the previous table, if there was one, with the
+        * deallocator matching its size. */
+       if (oldsize) {
+               if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
+                       vfree(oldldt);
+               else
+                       kfree(oldldt);
+       }
+
+       return 0;
+}
+
+/*
+ * Restore the address space of the current task from the image.
+ *
+ * Reads the CPT_OBJ_MM object, unmaps [0, TASK_SIZE) from current->mm and
+ * copies the saved layout fields into the mm while holding mmap_sem for
+ * writing.  It then restores cpt_map_count VMAs and finally the mm context.
+ *
+ * Returns the error from reading the mm object, the first negative result
+ * of a VMA restore, or otherwise the result of rst_restore_mm_context().
+ */
+int rst_restore_mm(struct cpt_context *ctx)
+{
+       struct cpt_mm_image img;
+       struct mm_struct *mm = current->mm;
+       int rc;
+       int n;
+
+       rc = rst_get_object(CPT_OBJ_MM, &img, sizeof(img), ctx);
+       if (rc)
+               return rc;
+
+       down_write(&mm->mmap_sem);
+
+       /* Throw away whatever is currently mapped. */
+       do_munmap(mm, 0, TASK_SIZE);
+
+       /* Code and data segment boundaries. */
+       mm->start_code = img.cpt_start_code;
+       mm->end_code = img.cpt_end_code;
+       mm->start_data = img.cpt_start_data;
+       mm->end_data = img.cpt_end_data;
+
+       /* Heap and stack. */
+       mm->start_brk = img.cpt_start_brk;
+       mm->brk = img.cpt_brk;
+       mm->start_stack = img.cpt_start_stack;
+
+       /* Argument and environment areas. */
+       mm->arg_start = img.cpt_start_arg;
+       mm->arg_end = img.cpt_end_arg;
+       mm->env_start = img.cpt_start_env;
+       mm->env_end = img.cpt_end_env;
+
+       mm->def_flags = img.cpt_def_flags;
+       mm->flags = img.cpt_flags;
+
+       up_write(&mm->mmap_sem);
+
+       for (n = 0; n < img.cpt_map_count; n++) {
+               rc = rst_restore_one_vma(ctx);
+               if (rc < 0)
+                       return rc;
+       }
+
+       return rst_restore_mm_context(ctx);
+}
+
diff --git a/checkpoint/rst_process.c b/checkpoint/rst_process.c
index b9f745e..9e448b2 100644
--- a/checkpoint/rst_process.c
+++ b/checkpoint/rst_process.c
@@ -210,7 +210,8 @@ static int restart_thread(void *arg)
        err = rst_get_object(CPT_OBJ_TASK, ti, sizeof(*ti), ctx);
        if (!err)
                err = rst_restore_task_struct(current, ti, ctx);
-       /* Restore mm here */
+       if (!err)
+               err = rst_restore_mm(ctx);
        if (!err)
                err = rst_restore_fpustate(current, ti, ctx);
        if (!err)
diff --git a/mm/mmap.c b/mm/mmap.c
index 971d0ed..98d1ba9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1858,6 +1858,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 
        return 0;
 }
+EXPORT_SYMBOL(split_vma);
 
 /* Munmap is split into 2 main parts -- this part which finds
  * what needs doing, and the areas themselves, which do the
diff --git a/mm/mprotect.c b/mm/mprotect.c
index fded06f..47c7d75 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -22,6 +22,7 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/mmu_notifier.h>
+#include <linux/module.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
@@ -317,3 +318,4 @@ out:
        up_write(&current->mm->mmap_sem);
        return error;
 }
+EXPORT_SYMBOL(sys_mprotect);
-- 
1.5.6

_______________________________________________
Containers mailing list
[EMAIL PROTECTED]
https://lists.linux-foundation.org/mailman/listinfo/containers

_______________________________________________
Devel mailing list
[email protected]
https://openvz.org/mailman/listinfo/devel

Reply via email to