On 15.04.2008 [11:17:40 -0700], Eric B Munson wrote:
> POC of kernel supported huge page backed stacks.  This simply enables
> it in kernel, to setup a huge page backed stack you need a user-space
> utility that sets the HUGE_PAGE_STACK personality flag for a process.
> This patch does not enable full huge page backed stack functionality,
> it will fail if a process stack exceeds one huge page as allocation
> and alignment across multiple huge pages have not been addressed.
> Stack randomization is not currently supported on with huge page
> stacks so when running with HUGE_PAGE_STACK set
> /proc/sys/kernel/randomize_va_space to 0

I think Andrew Hastings had some ideas on how to make hugepage stacks
work better with randomization on? It will be tough to make this work on
the distros, which I assume have that flag on by default, if we have
this requirement (globaly disabling of a security flag).

> Sample user-space utility to follow.
> 
> Based on 2.6.25-rc9
> 
> Signed-off-by: Eric Munson <[EMAIL PROTECTED]>
> 
> ---
> 
>  fs/binfmt_elf.c               |   17 +++++++++--
>  fs/exec.c                     |   63 
> ++++++++++++++++++++++++++++++++++++++----
>  fs/hugetlbfs/inode.c          |   22 ++++++++------
>  include/asm-ia64/page.h       |    1
>  include/asm-powerpc/page_64.h |    1
>  include/asm-sparc64/page.h    |    3 ++
>  include/asm-x86/page.h        |    3 ++
>  include/linux/hugetlb.h       |    4 +-
>  include/linux/personality.h   |    3 ++
>  ipc/shm.c                     |    2 -
>  10 files changed, 97 insertions(+), 22 deletions(-)
> 
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 5e1a4fb..4a0b3ef 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -518,12 +518,23 @@ static unsigned long randomize_stack_top(unsigned long 
> stack_top)
>       if ((current->flags & PF_RANDOMIZE) &&
>               !(current->personality & ADDR_NO_RANDOMIZE)) {
>               random_variable = get_random_int() & STACK_RND_MASK;
> -             random_variable <<= PAGE_SHIFT;
> +             
> +             if (get_personality & HUGE_PAGE_STACK) {
> +                     random_variable >>= (HPAGE_SHIFT - PAGE_SHIFT);
> +                     random_variable <<= HPAGE_SHIFT;
> +             } else
> +                     random_variable <<= PAGE_SHIFT;

Match the curly braces here.

>       }
>  #ifdef CONFIG_STACK_GROWSUP
> -     return PAGE_ALIGN(stack_top) + random_variable;
> +     if (get_personality & HUGE_PAGE_STACK)
> +             return HPAGE_ALIGN(stack_top) + random_variable;
> +     else
> +             return PAGE_ALIGN(stack_top) + random_variable;
>  #else
> -     return PAGE_ALIGN(stack_top) - random_variable;
> +     if (get_personality & HUGE_PAGE_STACK)
> +             return HPAGE_ALIGN(stack_top) - random_variable;
> +     else
> +             return PAGE_ALIGN(stack_top) - random_variable;
>  #endif
>  }

Could you add a hpagestack variable here, too? Also, would it make sense
to tie it into a structure somewhere that's passed to all of these
functions, rahter than checking to see if the stack should be
hugepage-backed everywhere? Not sure if such a function exists.

> diff --git a/fs/exec.c b/fs/exec.c
> index 54a0a55..aab2a27 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -51,6 +51,7 @@
>  #include <linux/tsacct_kern.h>
>  #include <linux/cn_proc.h>
>  #include <linux/audit.h>
> +#include <linux/hugetlb.h>
>  
>  #include <asm/uaccess.h>
>  #include <asm/mmu_context.h>
> @@ -224,11 +225,31 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
>       int err = -ENOMEM;
>       struct vm_area_struct *vma = NULL;
>       struct mm_struct *mm = bprm->mm;
> +     struct file *hugefile;
> +     char fname[64] = {0};
> +     char hpagestack = !((get_personality & HUGE_PAGE_STACK) == 0 );

No trailing space inside the parentheses.

>       bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
>       if (!vma)
>               goto err;
>  
> +     if (hpagestack) {
> +             sprintf(fname, "htlbfs%lu", current->pid);

Do we need to check that this succeeds at all? Also, how does this work
with containerization, could we have multiple processes (since pids are
no longer unique, necessarily) with the same stack filename? Does that
matter?

> +             hugefile = hugetlb_file_setup(fname, HPAGE_SIZE, 0);
> +             if (unlikely(IS_ERR_VALUE(hugefile))) {
> +                     printk(KERN_DEBUG "Huge page backed stack unavailable 
> for process %lu.\n", current->pid);
> +                     /*
> +                      * If huge pages are not available for this stack fall
> +                      * fall back to normal pages for execution instead of
> +                      * failing.
> +                      */
> +                     set_personality(get_personality & (~HUGE_PAGE_STACK));
> +                     hpagestack = 0;
> +             } else
> +                     vma->vm_file = hugefile;

Match the curly braces.

> +     }
> +
> +

Extraneous spaces.

>       down_write(&mm->mmap_sem);
>       vma->vm_mm = mm;
>  
> @@ -239,9 +260,17 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
>        * configured yet.
>        */
>       vma->vm_end = STACK_TOP_MAX;
> -     vma->vm_start = vma->vm_end - PAGE_SIZE;
> +     if (hpagestack)
> +             vma->vm_start = vma->vm_end - HPAGE_SIZE;
> +     else
> +             vma->vm_start = vma->vm_end - PAGE_SIZE;
>  
>       vma->vm_flags = VM_STACK_FLAGS;
> +     if (hpagestack) {
> +             vma->vm_flags |= VM_HUGETLB;
> +             /* Stack randomization is currently not supported on huge pages 
> */
> +             vma->vm_flags |= ADDR_NO_RANDOMIZE;

Nit: you've said this feature only works with global randomization off?
So what does it matter if this flag is set/not? I guess future-proofing
:)

> +     }
>       vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
>       err = insert_vm_struct(mm, vma);
>       if (err) {
> @@ -575,6 +604,8 @@ int setup_arg_pages(struct linux_binprm *bprm,
>       struct vm_area_struct *prev = NULL;
>       unsigned long vm_flags;
>       unsigned long stack_base;
> +     unsigned long stack_pad;
> +     char hpagestack = !((get_personality & HUGE_PAGE_STACK) == 0);
>  
>  #ifdef CONFIG_STACK_GROWSUP
>       /* Limit stack size to 1GB */
> @@ -586,16 +617,25 @@ int setup_arg_pages(struct linux_binprm *bprm,
>       if (vma->vm_end - vma->vm_start > stack_base)
>               return -ENOMEM;
>  
> -     stack_base = PAGE_ALIGN(stack_top - stack_base);
> +     if (hpagestack)
> +             stack_base = HPAGE_ALIGN(stack_top - stack_base);
> +     else
> +             stack_base = PAGE_ALIGN(stack_top - stack_base);
>  
>       stack_shift = vma->vm_start - stack_base;
> +     if (hpagestack)
> +             stack_shift = ALIGN(stack_shift, HPAGE_SIZE);
>       mm->arg_start = bprm->p - stack_shift;
>       bprm->p = vma->vm_end - stack_shift;
>  #else
>       stack_top = arch_align_stack(stack_top);
> -     stack_top = PAGE_ALIGN(stack_top);
> +     if (hpagestack)
> +             stack_top = HPAGE_ALIGN(stack_top);
> +     else
> +             stack_top = PAGE_ALIGN(stack_top);
>       stack_shift = vma->vm_end - stack_top;
> -

I think this blank line should still exist, but perhaps after the
following conditional?

> +     if (hpagestack)
> +             stack_shift = ALIGN(stack_shift, HPAGE_SIZE);
>       bprm->p -= stack_shift;
>       mm->arg_start = bprm->p;
>  #endif
> @@ -633,11 +673,22 @@ int setup_arg_pages(struct linux_binprm *bprm,
>               }
>       }
>  
> +     stack_pad = EXTRA_STACK_VM_PAGES * PAGE_SIZE;
> +     if (hpagestack)
> +             stack_pad = ALIGN(stack_pad, HPAGE_SIZE);
> +
>  #ifdef CONFIG_STACK_GROWSUP
> -     stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE;
> +     if (vma->vm_end - vma->vm_start < stack_pad)
> +             stack_base = vma->vm_end + stack_pad;
> +     else
> +             stack_base = vma->vm_end;
>  #else
> -     stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE;
> +     if (vma->vm_end - vma->vm_start < stack_pad)
> +             stack_base = vma->vm_start - stack_pad;
> +     else
> +             stack_base = vma->vm_start;
>  #endif
> +

Extraneous newline.

>       ret = expand_stack(vma, stack_base);
>       if (ret)
>               ret = -EFAULT;
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index 6846785..0ca7be2 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -488,7 +488,7 @@ out:
>  }
>  
>  static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, 
> -                                     gid_t gid, int mode, dev_t dev)
> +                                     gid_t gid, int mode, dev_t dev, char 
> shared)
>  {
>       struct inode *inode;
>  

This change seems like it might be best in a separate patch, to isolated
the stack changes.

Thanks,
Nish

-- 
Nishanth Aravamudan <[EMAIL PROTECTED]>
IBM Linux Technology Center

-------------------------------------------------------------------------
This SF.net email is sponsored by the 2008 JavaOne(SM) Conference 
Don't miss this year's exciting event. There's still time to save $100. 
Use priority code J8TL2D2. 
http://ad.doubleclick.net/clk;198757673;13503038;p?http://java.sun.com/javaone
_______________________________________________
Libhugetlbfs-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/libhugetlbfs-devel

Reply via email to