[uml-devel] SKAS3 for 2.6.23

2007-12-08 Thread Jeff Dike
A skas3 patch which works on 2.6.23 is below.  There were a couple of
problems that I fixed which are described in the changelog.  With
this on the host and a UML close to what's currently in -mm, I get ~85%
of native performance on a kernel build.  With skas0, the best I've
seen is ~75%.

Thanks to [EMAIL PROTECTED] for sending me a patch that patched
cleanly into 2.6.23.

2.6.24 is going to be even more interesting, given the x86 merge.

Jeff

-- 
Work email - jdike at linux dot intel dot com

commit 7d984ebee7c1263b24904f049e898a37bf85f522
Author: Jeff Dike <[EMAIL PROTECTED]>
Date:   Sat Dec 8 09:07:59 2007 -0500

Fixed skas3 patch for 2.6.23.

Brokenness in 2.6.23 included use of current->mm in the mmap path
causing new maps to be done in the UML kernel address space rather
than the process address space.

The -EINVAL that everyone started seeing with 2.6.23 was caused by
a change in procfs.  file->f_op was no longer &proc_mm_fops, but
proc_reg_ops, with proc_mm_fops hidden elsewhere.  This broke the
sanity checking in proc_mm_get_mm which made sure that it was getting
a /proc/mm descriptor by checking that file->f_op was &proc_mm_fops.

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 97b64d7..129ae08 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -612,6 +612,26 @@ config X86_PAE
  has the cost of more pagetable lookup overhead, and also
  consumes more pagetable space per process.
 
+config PROC_MM
+   bool "/proc/mm support"
+   default y
+
+config PROC_MM_DUMPABLE
+   bool "Make UML childs /proc/ completely browsable"
+   default n
+   help
+ If in doubt, say N.
+
+ This fiddles with some settings to make sure /proc/ is completely
+ browsable by who started UML, at the expense of some additional
+ locking (maybe this could slow down the runned UMLs of a few percents,
+ I've not tested this).
+
+ Also, if there is a bug in this feature, there is some little
+ possibility to do privilege escalation if you have UML installed
+ setuid (which you shouldn't have done) or if UML changes uid on
+ startup (which will be a good thing, when enabled) ...
+
 # Common NUMA Features
 config NUMA
bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)"
diff --git a/arch/i386/kernel/ldt.c b/arch/i386/kernel/ldt.c
index e0b2d17..dc80de4 100644
--- a/arch/i386/kernel/ldt.c
+++ b/arch/i386/kernel/ldt.c
@@ -27,11 +27,12 @@ static void flush_ldt(void *null)
 }
 #endif
 
-static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
+static int alloc_ldt(struct mm_struct *mm, int mincount, int reload)
 {
void *oldldt;
void *newldt;
int oldsize;
+   mm_context_t * pc = &mm->context;
 
if (mincount <= pc->size)
return 0;
@@ -58,13 +59,15 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
 #ifdef CONFIG_SMP
cpumask_t mask;
preempt_disable();
-   load_LDT(pc);
+   if (&current->active_mm->context == pc)
+   load_LDT(pc);
mask = cpumask_of_cpu(smp_processor_id());
-   if (!cpus_equal(current->mm->cpu_vm_mask, mask))
+   if (!cpus_equal(mm->cpu_vm_mask, mask))
smp_call_function(flush_ldt, NULL, 1, 1);
preempt_enable();
 #else
-   load_LDT(pc);
+   if (&current->active_mm->context == pc)
+   load_LDT(pc);
 #endif
}
if (oldsize) {
@@ -76,12 +79,12 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
return 0;
 }
 
-static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
+static inline int copy_ldt(struct mm_struct *new, struct mm_struct *old)
 {
-   int err = alloc_ldt(new, old->size, 0);
+   int err = alloc_ldt(new, old->context.size, 0);
if (err < 0)
return err;
-   memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
+   memcpy(new->context.ldt, old->context.ldt, old->context.size*LDT_ENTRY_SIZE);
return 0;
 }
 
@@ -89,22 +92,24 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
  * we do not have to muck with descriptors here, that is
  * done in switch_mm() as needed.
  */
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+int copy_context(struct mm_struct *mm, struct mm_struct *old_mm)
 {
-   struct mm_struct * old_mm;
int retval = 0;
 
-   init_MUTEX(&mm->context.sem);
-   mm->context.size = 0;
-   old_mm = current->mm;
if (old_mm && old_mm->context.size > 0) {
down(&old_mm->context.sem);
-   retval = copy_ldt(&mm->context, &old_mm->context);
+   retval = copy_ldt(mm, old_mm);
up(&old_mm->context.sem);
}
return ret

[uml-devel] Diff against old 2.6.23 SKAS3

2007-12-08 Thread Jeff Dike
And for anyone who's interested, below are the changes which I made to
the original skas patch.

Jeff

-- 
Work email - jdike at linux dot intel dot com

Index: linux-2.6-host/include/linux/mm.h
===
--- linux-2.6-host.orig/include/linux/mm.h  2007-12-07 00:37:20.0 -0500
+++ linux-2.6-host/include/linux/mm.h   2007-12-07 22:53:15.0 -0500
@@ -1077,8 +1077,8 @@ static inline unsigned long do_mmap_pgof
return __do_mmap_pgoff(current->mm, file, addr, len, prot, flag, pgoff);
 }
 
-extern unsigned long mmap_region(struct file *file, unsigned long addr,
-   unsigned long len, unsigned long flags,
+extern unsigned long mmap_region(struct mm_struct *mm, struct file *file,
+   unsigned long addr, unsigned long len, unsigned long flags,
unsigned int vm_flags, unsigned long pgoff,
int accountable);
 
Index: linux-2.6-host/mm/fremap.c
===
--- linux-2.6-host.orig/mm/fremap.c 2007-12-07 00:36:08.0 -0500
+++ linux-2.6-host/mm/fremap.c  2007-12-07 22:54:54.0 -0500
@@ -190,8 +190,9 @@ asmlinkage long sys_remap_file_pages(uns
unsigned long addr;
 
flags &= MAP_NONBLOCK;
-   addr = mmap_region(vma->vm_file, start, size,
-   flags, vma->vm_flags, pgoff, 1);
+   addr = mmap_region(current->mm, vma->vm_file, start,
+  size, flags, vma->vm_flags, pgoff,
+  1);
if (IS_ERR_VALUE(addr)) {
err = addr;
} else {
Index: linux-2.6-host/mm/mmap.c
===
--- linux-2.6-host.orig/mm/mmap.c   2007-12-07 00:37:20.0 -0500
+++ linux-2.6-host/mm/mmap.c  2007-12-07 22:55:26.0 -0500
@@ -1023,7 +1023,7 @@ unsigned long __do_mmap_pgoff(struct mm_
if (error)
return error;
 
-   return mmap_region(file, addr, len, flags, vm_flags, pgoff,
+   return mmap_region(mm, file, addr, len, flags, vm_flags, pgoff,
   accountable);
 }
 EXPORT_SYMBOL(__do_mmap_pgoff);
@@ -1062,12 +1062,12 @@ int vma_wants_writenotify(struct vm_area
 }
 
 
-unsigned long mmap_region(struct file *file, unsigned long addr,
+unsigned long mmap_region(struct mm_struct *mm,
+ struct file *file, unsigned long addr,
  unsigned long len, unsigned long flags,
  unsigned int vm_flags, unsigned long pgoff,
  int accountable)
 {
-   struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
int correct_wcount = 0;
int error;
Index: linux-2.6-host/mm/proc_mm.c
===
--- linux-2.6-host.orig/mm/proc_mm.c  2007-12-07 16:21:06.0 -0500
+++ linux-2.6-host/mm/proc_mm.c 2007-12-07 16:22:07.0 -0500
@@ -209,7 +209,7 @@ struct mm_struct *proc_mm_get_mm(int fd)
goto out;
 
ret = ERR_PTR(-EINVAL);
-   if(file->f_op != &proc_mm_fops)
+   if(PDE(file->f_path.dentry->d_inode)->proc_fops != &proc_mm_fops)
goto out_fput;
 
ret = file->private_data;

-
SF.Net email is sponsored by: 
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://sourceforge.net/services/buy/index.php
___
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel


Re: [uml-devel] SKAS3 for 2.6.23

2007-12-08 Thread Karol Swietlicki
On 08/12/2007, Jeff Dike <[EMAIL PROTECTED]> wrote:
> A skas3 patch which works on 2.6.23 is below.  There were a couple of
> problems that I fixed which are described in the changelog.  With
> this on the host and a UML close to what's currently in -mm, I get ~85%
> of native performance on a kernel build.  With skas0, the best I've
> seen is ~75%.
>
> Thanks to [EMAIL PROTECTED] for sending me a patch that patched
> cleanly into 2.6.23.
>
> 2.6.24 is going to be even more interesting, given the x86 merge.
>
> Jeff

This has just made my day. I just wish I had access to my main work machine now.
I can't wait to test this.

Karol Swietlicki

-
SF.Net email is sponsored by: 
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://sourceforge.net/services/buy/index.php
___
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel


[uml-devel] should there be os_clone() analogous to os_getpid() ?

2007-12-08 Thread John Reiser
In source file arch/um/os-Linux/process.c there is a warning:
-
/* Don't use the glibc version, which caches the result in TLS. It misses some
 * syscalls, and also breaks with clone(), which does not unshare the TLS.
 */

int os_getpid(void)
-

I see no os_clone(), yet the glibc clone() does the same caching of pid in
ThreadLocalStorage [TLS], and the TLS still may be shared.  If nobody reads
glibc's shared TLS slot for PID then an actual bug will be avoided.  However,
it is unsafe to leave such a tempting pitfall.  Also, if you are ptrace()ing
through a glibc clone(), then in many cases you will see syscall(__NR_getpid)
*from glibc* immediately following!  There is an "extra" getpid()
that the tracking logic might not expect.  So it seems to me that
there should be an os_clone() that refrains from fiddling with getpid.
[Unfortunately os_clone() is not so simple as os_getpid().]


The clone() we're talking about here is _not_ the bare syscall:
-
_syscall5(int, clone, int, flags, void *, child_stack,
int *, parent_tidptr, struct user_desc *, newtls,
int *, child_tidptr)
-
but rather the C-language interface:
-
   int clone(int (*fn)(void *), void *child_stack,
 int flags, void *arg, ...
 /* pid_t *pid, struct user_desc *tls, pid_t *ctid */ );
-
which is declared in arch/um/include/kern.h and referenced in
   arch/um/drivers/ubd_user.c
   arch/um/kernel/tt/tracer.c
   arch/um/os/tt.c
   arch/um/os/start_up.c
   arch/um/os/skas/process.c
This clone() is implemented by glibc, and at runtime lives in the shared
library /lib/libc.so.6.

Not only that, but some versions of glibc for x86 use "int $0x80" directly
only for the __NR_clone call.  They use "ENTER_KERNEL" for the getpid(),
which in some cases (such as Fedora 7 and 8, but not Ubuntu 7.04) expands to
"call *%gs:n" which points at "sysenter; ret".

-- 
John Reiser, [EMAIL PROTECTED]

-
SF.Net email is sponsored by: 
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://sourceforge.net/services/buy/index.php
___
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel