[Devel] Re: [RFC v2][PATCH 4/9] Memory management - dump state

Oren Laadan Sat, 23 Aug 2008 23:18:09 -0700


Dave Hansen wrote:
> On Wed, 2008-08-20 at 23:05 -0400, Oren Laadan wrote:
>> index 7ecafb3..0addb63 100644
>> --- a/checkpoint/ckpt.h
>> +++ b/checkpoint/ckpt.h
>> @@ -29,6 +29,9 @@ struct cr_ctx {
>>      void *hbuf;             /* header: to avoid many alloc/dealloc */
>>      int hpos;
>>
>> +    struct cr_pgarr *pgarr;
>> +    struct cr_pgarr *pgcur;
>> +
>>      struct path *vfsroot;   /* container root */
>>   };
> 
> These need much better variable names.  From this, I have no idea what
> they do.  Checkpoint Restart Pirates Go ARR!
> 
>> @@ -62,6 +65,9 @@ int cr_read_obj(struct cr_ctx *ctx, struct cr_hdr *h, void 
>> *buf, int n);
>>   int cr_read_obj_type(struct cr_ctx *ctx, void *buf, int n, int type);
>>   int cr_read_str(struct cr_ctx *ctx, void *str, int n);
>>
>> +int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t);
>> +int cr_read_mm(struct cr_ctx *ctx);
>> +
>>   int do_checkpoint(struct cr_ctx *ctx);
>>   int do_restart(struct cr_ctx *ctx);
>>
>> diff --git a/checkpoint/ckpt_arch.h b/checkpoint/ckpt_arch.h
>> index b7cc8c9..3b87a6f 100644
>> --- a/checkpoint/ckpt_arch.h
>> +++ b/checkpoint/ckpt_arch.h
>> @@ -2,6 +2,7 @@
>>
>>   int cr_write_thread(struct cr_ctx *ctx, struct task_struct *t);
>>   int cr_write_cpu(struct cr_ctx *ctx, struct task_struct *t);
>> +int cr_write_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int ptag);
>>
>>   int cr_read_thread(struct cr_ctx *ctx);
>>   int cr_read_cpu(struct cr_ctx *ctx);
>> diff --git a/checkpoint/ckpt_hdr.h b/checkpoint/ckpt_hdr.h
>> index a478b7c..a3919cf 100644
>> --- a/checkpoint/ckpt_hdr.h
>> +++ b/checkpoint/ckpt_hdr.h
>> @@ -30,6 +30,7 @@ struct cr_hdr {
>>      __u32 ptag;
>>   };
>>
>> +/* header types */
>>   enum {
>>      CR_HDR_HEAD = 1,
>>      CR_HDR_STR,
>> @@ -45,6 +46,12 @@ enum {
>>      CR_HDR_TAIL = 5001
>>   };
>>
>> +/* vma subtypes */
>> +enum {
>> +    CR_VMA_ANON = 1,
>> +    CR_VMA_FILE
>> +};
> 
> Is this really necessary, or can we infer it from the contents of the
> VMA?


This classification eventually simplifies both dump and restore. For
instance, it decides whether a file name follows or not.

There will be more, later:  CR_VMA_FILE_UNLINKED (mapped to an unlinked
file), CR_VMA_ANON_SHARED (shared anonymous), CR_VMA_ANON_SHARED_SKIP
(shared anonymous, had been sent before) and so on.

> 
>>   struct cr_hdr_head {
>>      __u64 magic;
>>
>> @@ -83,4 +90,28 @@ struct cr_hdr_task {
>>      char comm[TASK_COMM_LEN];
>>   } __attribute__ ((aligned (8)));
>>
>> +struct cr_hdr_mm {
>> +    __u32 tag;      /* sharing identifier */
> 
> If this really is a sharing identifier, we need a:
> 
> struct cr_shared_object_ref {
>       __u32 tag;
> }; 
> 
> And then one of *those* in the vma struct.  Make it much more idiot (aka
> Dave) proof.

I figured the use of 'tag' for the identifiers of shared objects is clear.
By using a __u32 the size of that field is immediately visible, while using
a structure will hide the actual size; in turn this is what we want visible
here (ABI), no ?

> 
>> +    __s16 map_count;
>> +    __s16 _padding;
>> +
>> +    __u64 start_code, end_code, start_data, end_data;
>> +    __u64 start_brk, brk, start_stack;
>> +    __u64 arg_start, arg_end, env_start, env_end;
>> +
>> +} __attribute__ ((aligned (8)));
>> +
>> +struct cr_hdr_vma {
>> +    __u32 how;
> 
> It is too bad that we can't actually use the enum type here.  It would
> make it *much* more obvious what this was.  I actually had to go look at
> the code below to figure it out.

Using __u32 instead of enum guarantees the size. I'll change the
name and move the enum nearby.

> 
>> +    __s16 npages;
> 
> Wow.  Linux only supports 256MB in a single VMA?  I didn't know that.
> Maybe we should make this type bigger.  :)
> 
> This also needs to get called something much more descriptive, like
> nr_present_pages.
> 
> 
> 
>>   #endif /* _CHECKPOINT_CKPT_HDR_H_ */
>> diff --git a/checkpoint/ckpt_mem.c b/checkpoint/ckpt_mem.c
>> new file mode 100644
>> index 0000000..a23aa29
>> --- /dev/null
>> +++ b/checkpoint/ckpt_mem.c
>> @@ -0,0 +1,382 @@
>> +/*
>> + *  Checkpoint memory contents
>> + *
>> + *  Copyright (C) 2008 Oren Laadan
>> + *
>> + *  This file is subject to the terms and conditions of the GNU General 
>> Public
>> + *  License.  See the file COPYING in the main directory of the Linux
>> + *  distribution for more details.
>> + */
>> +
>> +#include <linux/kernel.h>
>> +#include <linux/sched.h>
>> +#include <linux/slab.h>
>> +#include <linux/file.h>
>> +#include <linux/pagemap.h>
>> +#include <linux/mm_types.h>
>> +
>> +#include "ckpt.h"
>> +#include "ckpt_hdr.h"
>> +#include "ckpt_arch.h"
>> +#include "ckpt_mem.h"
>> +
>> +/*
>> + * utilities to alloc, free, and handle 'struct cr_pgarr'
>> + * (common to ckpt_mem.c and rstr_mem.c)
>> + */
>> +
>> +#define CR_PGARR_ORDER  0
>> +#define CR_PGARR_TOTAL  ((PAGE_SIZE << CR_PGARR_ORDER) / sizeof(void *))
> 
> Another allocator?  Really?
> 
>> +/* release pages referenced by a page-array */
>> +void _cr_pgarr_release(struct cr_ctx *ctx, struct cr_pgarr *pgarr)
>> +{
>> +    int n;
>> +
>> +    /* only checkpoint keeps references to pages */
>> +    if (ctx->flags & CR_CTX_CKPT) {
>> +            cr_debug("nused %d\n", pgarr->nused);
>> +            for (n = pgarr->nused; n--; )
>> +                    page_cache_release(pgarr->pages[n]);
>> +    }
>> +    pgarr->nused = 0;
>> +    pgarr->nleft = CR_PGARR_TOTAL;
>> +}
>> +
>> +/* release pages referenced by chain of page-arrays */
>> +void cr_pgarr_release(struct cr_ctx *ctx)
>> +{
>> +    struct cr_pgarr *pgarr;
>> +
>> +    for (pgarr = ctx->pgarr; pgarr; pgarr = pgarr->next)
>> +            _cr_pgarr_release(ctx, pgarr);
>> +}
>> +
>> +/* free a chain of page-arrays */
>> +void cr_pgarr_free(struct cr_ctx *ctx)
>> +{
>> +    struct cr_pgarr *pgarr, *pgnxt;
>> +
>> +    for (pgarr = ctx->pgarr; pgarr; pgarr = pgnxt) {
>> +            _cr_pgarr_release(ctx, pgarr);
>> +            free_pages((unsigned long) ctx->pgarr->addrs, CR_PGARR_ORDER);
>> +            free_pages((unsigned long) ctx->pgarr->pages, CR_PGARR_ORDER);
>> +            pgnxt = pgarr->next;
>> +            kfree(pgarr);
>> +    }
>> +}
>> +
>> +/* allocate and add a new page-array to chain */
>> +struct cr_pgarr *cr_pgarr_alloc(struct cr_ctx *ctx, struct cr_pgarr **pgnew)
>> +{
>> +    struct cr_pgarr *pgarr = ctx->pgcur;
>> +
>> +    if (pgarr && pgarr->next) {
>> +            ctx->pgcur = pgarr->next;
>> +            return pgarr->next;
>> +    }
>> +
>> +    if ((pgarr = kzalloc(sizeof(*pgarr), GFP_KERNEL))) {
> 
> This entire nested if(){} should be brought back to 1 tab.  Remember, no
> assignments in if() conditions.
> 
>> +            pgarr->nused = 0;
>> +            pgarr->nleft = CR_PGARR_TOTAL;
>> +            pgarr->addrs = (unsigned long *)
>> +                    __get_free_pages(GFP_KERNEL, CR_PGARR_ORDER);
>> +            pgarr->pages = (struct page **)
>> +                    __get_free_pages(GFP_KERNEL, CR_PGARR_ORDER);
>> +            if (likely(pgarr->addrs && pgarr->pages)) {
>> +                    *pgnew = pgarr;
>> +                    ctx->pgcur = pgarr;
>> +                    return pgarr;
>> +            } else if (pgarr->addrs)
>> +                    free_pages((unsigned long) pgarr->addrs,
>> +                               CR_PGARR_ORDER);
>> +            kfree(pgarr);
>> +    }
>> +
>> +    return NULL;
>> +}
>> +
>> +/* return current page-array (and allocate if needed) */
>> +struct cr_pgarr *cr_pgarr_prep(struct cr_ctx *ctx)
>> +{
>> +    struct cr_pgarr *pgarr = ctx->pgcur;
>> +
>> +    if (unlikely(!pgarr->nleft))
>> +            pgarr = cr_pgarr_alloc(ctx, &pgarr->next);
>> +    return pgarr;
>> +}
>> +
>> +/*
>> + * Checkpoint is outside the context of the checkpointee, so one cannot
>> + * simply read pages from user-space. Instead, we scan the address space
>> + * of the target to cherry-pick pages of interest. Selected pages are
>> + * enlisted in a page-array chain (attached to the checkpoint context).
>> + * To save their contents, each page is mapped to kernel memory and then
>> + * dumped to the file descriptor.
>> + */
>> +
>> +/**
>> + * cr_vma_fill_pgarr - fill a page-array with addr/page tuples for a vma
>> + * @ctx - checkpoint context
>> + * @pgarr - page-array to fill
>> + * @vma - vma to scan
>> + * @start - start address (updated)
>> + */
>> +static int cr_vma_fill_pgarr(struct cr_ctx *ctx, struct cr_pgarr *pgarr,
>> +                         struct vm_area_struct *vma, unsigned long *start)
>> +{
>> +    unsigned long end = vma->vm_end;
>> +    unsigned long addr = *start;
>> +    struct page **pagep;
>> +    unsigned long *addrp;
>> +    int cow, nr, ret = 0;
>> +
>> +    nr = pgarr->nleft;
>> +    pagep = &pgarr->pages[pgarr->nused];
>> +    addrp = &pgarr->addrs[pgarr->nused];
>> +    cow = !!vma->vm_file;
>> +
>> +    while (addr < end) {
>> +            struct page *page;
>> +
>> +            /* simplified version of get_user_pages(): already have vma,
>> +            * only need FOLL_TOUCH, and (for now) ignore fault stats */
>> +
>> +            cond_resched();
>> +            while (!(page = follow_page(vma, addr, FOLL_TOUCH))) {
>> +                    ret = handle_mm_fault(vma->vm_mm, vma, addr, 0);
>> +                    if (ret & VM_FAULT_ERROR) {
>> +                            if (ret & VM_FAULT_OOM)
>> +                                    ret = -ENOMEM;
>> +                            else if (ret & VM_FAULT_SIGBUS)
>> +                                    ret = -EFAULT;
>> +                            else
>> +                                    BUG();
>> +                            break;
>> +                    }
>> +                    cond_resched();
>> +            }
> 
> At best this needs to get folded back into its own function.  This is

This is almost identical to the original - see the preceding comment.

> pretty hard to read as-is.  Also, are there truly no in-kernel functions
> that can be used for this?

Can you suggest one ?

> 
>> +            if (IS_ERR(page)) {
>> +                    ret = PTR_ERR(page);
>> +                    break;
>> +            }
>> +
>> +            if (page == ZERO_PAGE(0))
>> +                    page = NULL;    /* zero page: ignore */
>> +            else if (cow && page_mapping(page) != NULL)
>> +                    page = NULL;    /* clean cow: ignore */
>> +            else {
> 
> Put the curly brackets in here.  It doesn't take up space.
> 
>> +                    get_page(page);
>> +                    *(addrp++) = addr;
>> +                    *(pagep++) = page;
>> +                    if (--nr == 0) {
>> +                            addr += PAGE_SIZE;
>> +                            break;
>> +                    }
>> +            }
>> +
>> +            addr += PAGE_SIZE;
>> +    }
>> +
>> +    if (unlikely(ret < 0)) {
>> +            nr = pgarr->nleft - nr;
>> +            while (nr--)
>> +                    page_cache_release(*(--pagep));
>> +            return ret;
>> +    }
>> +
>> +    *start = addr;
>> +    return (pgarr->nleft - nr);
>> +}
>> +
>> +/**
>> + * cr_vma_scan_pages - scan vma for pages that will need to be dumped
>> + * @ctx - checkpoint context
>> + * @vma - vma to scan
>> + *
>> + * a list of addr/page tuples is kept in ctx->pgarr page-array chain
>> + */
>> +static int cr_vma_scan_pages(struct cr_ctx *ctx, struct vm_area_struct *vma)
>> +{
>> +    unsigned long addr = vma->vm_start;
>> +    unsigned long end = vma->vm_end;
>> +    struct cr_pgarr *pgarr;
>> +    int nr, total = 0;
>> +
>> +    while (addr < end) {
>> +            if (!(pgarr = cr_pgarr_prep(ctx)))
>> +                    return -ENOMEM;
>> +            if ((nr = cr_vma_fill_pgarr(ctx, pgarr, vma, &addr)) < 0)
>> +                    return nr;
>> +            pgarr->nleft -= nr;
>> +            pgarr->nused += nr;
>> +            total += nr;
>> +    }
>> +
>> +    cr_debug("total %d\n", total);
>> +    return total;
>> +}
>> +
>> +/**
>> + * cr_vma_dump_pages - dump pages listed in the ctx page-array chain
>> + * @ctx - checkpoint context
>> + * @total - total number of pages
>> + */
>> +static int cr_vma_dump_pages(struct cr_ctx *ctx, int total)
>> +{
>> +    struct cr_pgarr *pgarr;
>> +    int ret;
>> +
>> +    if (!total)
>> +            return 0;
>> +
>> +    for (pgarr = ctx->pgarr; pgarr; pgarr = pgarr->next) {
>> +            ret = cr_kwrite(ctx, pgarr->addrs,
>> +                           pgarr->nused * sizeof(*pgarr->addrs));
>> +            if (ret < 0)
>> +                    return ret;
>> +    }
>> +
>> +    for (pgarr = ctx->pgarr; pgarr; pgarr = pgarr->next) {
>> +            struct page **pages = pgarr->pages;
>> +            int nr = pgarr->nused;
>> +            void *ptr;
>> +
>> +            while (nr--) {
>> +                    ptr = kmap(*pages);
>> +                    ret = cr_kwrite(ctx, ptr, PAGE_SIZE);
>> +                    kunmap(*pages);
> 
> Why not use kmap_atomic() here?

It is my understanding that the code must not sleep between kmap_atomic()
and kunmap_atomic().

> 
>> +                    if (ret < 0)
>> +                            return ret;
>> +                    pages++;
>> +            }
>> +    }
>> +
>> +    return total;
>> +}
>> +
>> +static int cr_write_vma(struct cr_ctx *ctx, struct vm_area_struct *vma)
>> +{
>> +    struct cr_hdr h;
>> +    struct cr_hdr_vma *hh = ctx->hbuf;
>> +    char *fname = NULL;
>> +    int flen = 0, how, nr, ret;
>> +
>> +    h.type = CR_HDR_VMA;
>> +    h.len = sizeof(*hh);
>> +    h.ptag = 0;
>> +
>> +    hh->vm_start = vma->vm_start;
>> +    hh->vm_end = vma->vm_end;
>> +    hh->vm_page_prot = vma->vm_page_prot.pgprot;
>> +    hh->vm_flags = vma->vm_flags;
>> +    hh->vm_pgoff = vma->vm_pgoff;
>> +
>> +    if (vma->vm_flags & (VM_SHARED | VM_IO | VM_HUGETLB | VM_NONLINEAR)) {
>> +            pr_warning("CR: unknown VMA %#lx\n", vma->vm_flags);
>> +            return -ETXTBSY;
>> +    }
> 
> Hmmm.  Interesting error code for VM_HUGETLB.  :)

:)  well, the usual EINVAL didn't seem suitable. Any better suggestions ?

> 
>> +    /* by default assume anon memory */
>> +    how = CR_VMA_ANON;
>> +
>> +    /* if there is a backing file, assume private-mapped */
>> +    /* (NEED: check if the file is unlinked) */
>> +    if (vma->vm_file) {
>> +            flen = PAGE_SIZE;
>> +            fname = cr_fill_fname(&vma->vm_file->f_path,
>> +                                  ctx->vfsroot, ctx->tbuf, &flen);
>> +            if (IS_ERR(fname))
>> +                    return PTR_ERR(fname);
>> +            how = CR_VMA_FILE;
>> +    }
>> +
>> +    hh->how = how;
>> +    hh->fname = !!fname;
>> +
>> +    /*
>> +     * it seems redundant now, but we do it in 3 steps for because:
>> +     * first, the logic is simpler when we how many pages before
>> +     * dumping them; second, a future optimization will defer the
>> +     * writeout (dump, and free) to a later step; in which case all
>> +     * the pages to be dumped will be aggregated on the checkpoint ctx
>> +     */
>> +
>> +    /* (1) scan: scan through the PTEs of the vma to count the pages
>> +     * to dump (and later make those pages COW), and keep the list of
>> +     * pages (and a reference to each page) on the checkpoint ctx */
>> +    nr = cr_vma_scan_pages(ctx, vma);
>> +    if (nr < 0)
>> +            return nr;
>> +
>> +    hh->npages = nr;
>> +    ret = cr_write_obj(ctx, &h, hh);
>> +
>> +    if (!ret && flen)
>> +            ret = cr_write_str(ctx, fname, flen);
>> +
>> +    if (ret < 0)
>> +            return ret;
>> +
>> +    /* (2) dump: write out the addresses of all pages in the list (on
>> +     * the checkpoint ctx) followed by the contents of all pages */
>> +    ret = cr_vma_dump_pages(ctx, nr);
>> +
>> +    /* (3) free: free the extra references to the pages in the list */
>> +    cr_pgarr_release(ctx);
>> +
>> +    return ret;
>> +}
> 
> This gets simpler-looking if you just defer the filename processing
> until you actually go to write it out.  

Yes. I'll encapsulate that in it's own function.

Oren.

_______________________________________________
Containers mailing list
[EMAIL PROTECTED]
https://lists.linux-foundation.org/mailman/listinfo/containers

_______________________________________________
Devel mailing list
Devel@openvz.org
https://openvz.org/mailman/listinfo/devel

[Devel] Re: [RFC v2][PATCH 4/9] Memory management - dump state

Reply via email to