Re: [RFC][PATCH] Replacing the /proc//exe symlink code

2007-06-06 Thread Serge E. Hallyn
Quoting Matt Helsley ([EMAIL PROTECTED]):
> This patch avoids holding the mmap semaphore while walking VMAs in response to
> programs which read or follow the /proc//exe symlink. This also 
> allows
> us to merge mmu and nommu proc_exe_link() functions. The costs are holding the
> task lock, a separate reference to the executable file stored in the task
> struct, and increased code in fork, exec, and exit paths.
> 
> Changes:
> Clear exe_file field in exit path
> Use task_lock() to protect exe_file between write and read paths

This one does seem safe.

Though do you need to handle the #ifdef __alpha__ in
fs/exec.c:search_binary_handler?

thanks,
-serge

> Signed-off-by: Matt Helsley <[EMAIL PROTECTED]>
> ---
> 
>  fs/exec.c |7 +--
>  fs/proc/base.c|   21 +
>  fs/proc/internal.h|1 -
>  fs/proc/task_mmu.c|   34 --
>  fs/proc/task_nommu.c  |   34 --
>  include/linux/sched.h |3 ++-
>  kernel/exit.c |6 ++
>  kernel/fork.c |9 -
>  8 files changed, 42 insertions(+), 73 deletions(-)
> 
> Index: linux-2.6.22-rc2-mm1/include/linux/sched.h
> ===
> --- linux-2.6.22-rc2-mm1.orig/include/linux/sched.h
> +++ linux-2.6.22-rc2-mm1/include/linux/sched.h
> @@ -988,10 +988,11 @@ struct task_struct {
>   int oomkilladj; /* OOM kill score adjustment (bit shift). */
>   char comm[TASK_COMM_LEN]; /* executable name excluding path
>- access with [gs]et_task_comm (which lock
>  it with task_lock())
>- initialized normally by flush_old_exec */
> + struct file *exe_file;
>  /* file system info */
>   int link_count, total_link_count;
>  #ifdef CONFIG_SYSVIPC
>  /* ipc stuff */
>   struct sysv_sem sysvsem;
> @@ -1549,11 +1550,11 @@ static inline int thread_group_empty(str
> 
>  #define delay_group_leader(p) \
>   (thread_group_leader(p) && !thread_group_empty(p))
> 
>  /*
> - * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
> + * Protects ->fs, ->files, ->mm, ->group_info, ->comm, ->exe_file, keyring
>   * subscriptions and synchronises with wait4().  Also used in procfs.  Also
>   * pins the final release of task.io_context.  Also protects ->cpuset.
>   *
>   * Nests both inside and outside of read_lock(_lock).
>   * It must not be nested with write_lock_irq(_lock),
> Index: linux-2.6.22-rc2-mm1/fs/exec.c
> ===
> --- linux-2.6.22-rc2-mm1.orig/fs/exec.c
> +++ linux-2.6.22-rc2-mm1/fs/exec.c
> @@ -1106,12 +1106,15 @@ int search_binary_handler(struct linux_b
>   read_unlock(_lock);
>   retval = fn(bprm, regs);
>   if (retval >= 0) {
>   put_binfmt(fmt);
>   allow_write_access(bprm->file);
> - if (bprm->file)
> - fput(bprm->file);
> + task_lock(current);
> + if (current->exe_file)
> + fput(current->exe_file);
> + current->exe_file = bprm->file;
> + task_unlock(current);
>   bprm->file = NULL;
>   current->did_exec = 1;
>   proc_exec_connector(current);
>   return retval;
>   }
> Index: linux-2.6.22-rc2-mm1/fs/proc/base.c
> ===
> --- linux-2.6.22-rc2-mm1.orig/fs/proc/base.c
> +++ linux-2.6.22-rc2-mm1/fs/proc/base.c
> @@ -951,10 +951,31 @@ const struct file_operations proc_pid_sc
>   .write  = sched_write,
>   .llseek = seq_lseek,
>   .release= seq_release,
>  };
> 
> +static int proc_exe_link(struct inode *inode, struct dentry **dentry,
> +  struct vfsmount **mnt)
> +{
> + int error = -ENOENT;
> + struct task_struct *task;
> +
> + task = get_proc_task(inode);
> + if (!task)
> + return error;
> + task_lock(task);
> + if (!task->exe_file)
> + goto out;
> + *mnt = mntget(task->exe_file->f_path.mnt);
> + *dentry = dget(task->exe_file->f_path.dentry);
> + error = 0;
> +out:
> + task_unlock(task);
> + put_task_struct(task);
> + return error;
> +}
> +
>  static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata 
> *nd)
>  {
>   struct inode *inode = dentry->d_inode;
>   int error = -EACCES;
> 
> Index: linux-2.6.22-rc2-mm1/kernel/exit.c
> ===
> --- 

[RFC][PATCH] Replacing the /proc//exe symlink code

2007-06-01 Thread Matt Helsley
This patch avoids holding the mmap semaphore while walking VMAs in response to
programs which read or follow the /proc//exe symlink. This also allows
us to merge mmu and nommu proc_exe_link() functions. The costs are holding the
task lock, a separate reference to the executable file stored in the task
struct, and increased code in fork, exec, and exit paths.

Changes:
Clear exe_file field in exit path
Use task_lock() to protect exe_file between write and read paths

Signed-off-by: Matt Helsley <[EMAIL PROTECTED]>
---

 fs/exec.c |7 +--
 fs/proc/base.c|   21 +
 fs/proc/internal.h|1 -
 fs/proc/task_mmu.c|   34 --
 fs/proc/task_nommu.c  |   34 --
 include/linux/sched.h |3 ++-
 kernel/exit.c |6 ++
 kernel/fork.c |9 -
 8 files changed, 42 insertions(+), 73 deletions(-)

Index: linux-2.6.22-rc2-mm1/include/linux/sched.h
===
--- linux-2.6.22-rc2-mm1.orig/include/linux/sched.h
+++ linux-2.6.22-rc2-mm1/include/linux/sched.h
@@ -988,10 +988,11 @@ struct task_struct {
int oomkilladj; /* OOM kill score adjustment (bit shift). */
char comm[TASK_COMM_LEN]; /* executable name excluding path
 - access with [gs]et_task_comm (which lock
   it with task_lock())
 - initialized normally by flush_old_exec */
+   struct file *exe_file;
 /* file system info */
int link_count, total_link_count;
 #ifdef CONFIG_SYSVIPC
 /* ipc stuff */
struct sysv_sem sysvsem;
@@ -1549,11 +1550,11 @@ static inline int thread_group_empty(str
 
 #define delay_group_leader(p) \
(thread_group_leader(p) && !thread_group_empty(p))
 
 /*
- * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
+ * Protects ->fs, ->files, ->mm, ->group_info, ->comm, ->exe_file, keyring
  * subscriptions and synchronises with wait4().  Also used in procfs.  Also
  * pins the final release of task.io_context.  Also protects ->cpuset.
  *
  * Nests both inside and outside of read_lock(_lock).
  * It must not be nested with write_lock_irq(_lock),
Index: linux-2.6.22-rc2-mm1/fs/exec.c
===
--- linux-2.6.22-rc2-mm1.orig/fs/exec.c
+++ linux-2.6.22-rc2-mm1/fs/exec.c
@@ -1106,12 +1106,15 @@ int search_binary_handler(struct linux_b
read_unlock(_lock);
retval = fn(bprm, regs);
if (retval >= 0) {
put_binfmt(fmt);
allow_write_access(bprm->file);
-   if (bprm->file)
-   fput(bprm->file);
+   task_lock(current);
+   if (current->exe_file)
+   fput(current->exe_file);
+   current->exe_file = bprm->file;
+   task_unlock(current);
bprm->file = NULL;
current->did_exec = 1;
proc_exec_connector(current);
return retval;
}
Index: linux-2.6.22-rc2-mm1/fs/proc/base.c
===
--- linux-2.6.22-rc2-mm1.orig/fs/proc/base.c
+++ linux-2.6.22-rc2-mm1/fs/proc/base.c
@@ -951,10 +951,31 @@ const struct file_operations proc_pid_sc
.write  = sched_write,
.llseek = seq_lseek,
.release= seq_release,
 };
 
+static int proc_exe_link(struct inode *inode, struct dentry **dentry,
+struct vfsmount **mnt)
+{
+   int error = -ENOENT;
+   struct task_struct *task;
+
+   task = get_proc_task(inode);
+   if (!task)
+   return error;
+   task_lock(task);
+   if (!task->exe_file)
+   goto out;
+   *mnt = mntget(task->exe_file->f_path.mnt);
+   *dentry = dget(task->exe_file->f_path.dentry);
+   error = 0;
+out:
+   task_unlock(task);
+   put_task_struct(task);
+   return error;
+}
+
 static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
struct inode *inode = dentry->d_inode;
int error = -EACCES;
 
Index: linux-2.6.22-rc2-mm1/kernel/exit.c
===
--- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
+++ linux-2.6.22-rc2-mm1/kernel/exit.c
@@ -924,10 +924,16 @@ fastcall void do_exit(long code)
if (unlikely(tsk->audit_context))
audit_free(tsk);
 
taskstats_exit(tsk, group_dead);
 
+   task_lock(tsk);
+   if (tsk->exe_file) {
+   

Re: [RFC][PATCH] Replacing the /proc//exe symlink code

2007-06-01 Thread Matt Helsley
On Fri, 2007-06-01 at 17:31 -0500, Serge E. Hallyn wrote:
> Quoting Matt Helsley ([EMAIL PROTECTED]):
> > On Wed, 2007-05-30 at 13:09 -0500, Serge E. Hallyn wrote:
> > > Quoting Matt Helsley ([EMAIL PROTECTED]):
> > > > This patch avoids holding the mmap semaphore while walking VMAs in 
> > > > response to
> > > > programs which read or follow the /proc//exe symlink. This 
> > > > also allows us
> > > > to merge mmu and nommu proc_exe_link() functions. The costs are holding 
> > > > a separate
> > > > reference to the executable file stored in the task struct and 
> > > > increased code in
> > > > fork, exec, and exit paths.
> > > > 
> > > > Signed-off-by: Matt Helsley <[EMAIL PROTECTED]>
> > > > ---
> > > > 
> > > > Compiled and passed simple tests for regressions when patched against a 
> > > > 2.6.20
> > > > and 2.6.22-rc2-mm1 kernel.
> > > > 
> > > >  fs/exec.c |5 +++--
> > > >  fs/proc/base.c|   20 
> > > >  fs/proc/internal.h|1 -
> > > >  fs/proc/task_mmu.c|   34 --
> > > >  fs/proc/task_nommu.c  |   34 --
> > > >  include/linux/sched.h |1 +
> > > >  kernel/exit.c |2 ++
> > > >  kernel/fork.c |   10 +-
> > > >  8 files changed, 35 insertions(+), 72 deletions(-)
> > 
> > 
> > 
> > > > Index: linux-2.6.22-rc2-mm1/kernel/exit.c
> > > > ===
> > > > --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
> > > > +++ linux-2.6.22-rc2-mm1/kernel/exit.c
> > > > @@ -924,10 +924,12 @@ fastcall void do_exit(long code)
> > > > if (unlikely(tsk->audit_context))
> > > > audit_free(tsk);
> > > >  
> > > > taskstats_exit(tsk, group_dead);
> > > >  
> > > > +   if (tsk->exe_file)
> > > > +   fput(tsk->exe_file);
> > > 
> > > Hi,
> > > 
> > > just taking a cursory look so I may be missing something, but doesn't
> > > this leave the possibility that right here, with tsk->exe_file being
> > > put, another task would try to look at tsk's /proc/tsk->pid/exe?
> > > 
> > > thanks,
> > > -serge
> > >
> > >   exit_mm(tsk);
> > >
> >   
> > 
> > 
> > Good question. To be precise, I think the problem doesn't exist here but
> > after the exit_mm() because there's a VMA that holds a reference to the
> > same file.
> > 
> > The existing code appears to solve the race between
> > reading/following /proc/tsk->pid/exe and exit_mm() in the exit path by
> > returning -ENOENT for the case where there is no executable VMA with a
> > reference to the file backing it.
> > 
> > So I need to put NULL in the exe_file field and adjust the return value
> > to be -ENOENT instead of -ENOSYS.
> > 
> > Thanks for the review!
> 
> Ok, I had to think about this a bit, but so you're saying you set it to
> NULL in do_exit(), and anyone who has just dereferenced tsk->exe_file
> before the fput in do_exit() should be ok because the vma hasn't yet
> been put?

Yes

> Should the 
>   if (!task->exe_file)
>   goto out;
>   *mnt = mntget(task->exe_file->f_path.mnt);
>   *dentry = dget(task->exe_file->f_path.dentry);
> 
> also go inside an preempt_disable to prevent sleeping and maybe become

It needs some form of protection from concurrent access between write
and read. write happens during exec, fork, and exit. In the fork case
however it's not necessary because the new task isn't visible in /proc
yet and the value of current doesn't change anyway.

>   exef = task->exe_file;  /* to prevent task->exe_file being set
>   to NULL before we've grabbed the path */
>   if (!exef)
>   goto out;
>   get_file(exef);  /* to prevent the mm somehow being put before
>   we've grabbed the path? */
>   *mnt = mntget(task->exe_file->f_path.mnt);
>   *dentry = dget(task->exe_file->f_path.dentry);
>   put_file(exef);  /* ? */
> 
> ?
> 
> Or am I being overly paranoid?

No, you're right. In fact, because readlink() can be initiated by
another task I think disabling preemption really only fixes the problem
on uniprocessor systems. So I was thinking of using task_lock() to
protect exe_file.

Cheers,
-Matt Helsley

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC][PATCH] Replacing the /proc//exe symlink code

2007-06-01 Thread Serge E. Hallyn
Quoting Matt Helsley ([EMAIL PROTECTED]):
> On Wed, 2007-05-30 at 13:09 -0500, Serge E. Hallyn wrote:
> > Quoting Matt Helsley ([EMAIL PROTECTED]):
> > > This patch avoids holding the mmap semaphore while walking VMAs in 
> > > response to
> > > programs which read or follow the /proc//exe symlink. This also 
> > > allows us
> > > to merge mmu and nommu proc_exe_link() functions. The costs are holding a 
> > > separate
> > > reference to the executable file stored in the task struct and increased 
> > > code in
> > > fork, exec, and exit paths.
> > > 
> > > Signed-off-by: Matt Helsley <[EMAIL PROTECTED]>
> > > ---
> > > 
> > > Compiled and passed simple tests for regressions when patched against a 
> > > 2.6.20
> > > and 2.6.22-rc2-mm1 kernel.
> > > 
> > >  fs/exec.c |5 +++--
> > >  fs/proc/base.c|   20 
> > >  fs/proc/internal.h|1 -
> > >  fs/proc/task_mmu.c|   34 --
> > >  fs/proc/task_nommu.c  |   34 --
> > >  include/linux/sched.h |1 +
> > >  kernel/exit.c |2 ++
> > >  kernel/fork.c |   10 +-
> > >  8 files changed, 35 insertions(+), 72 deletions(-)
> 
> 
> 
> > > Index: linux-2.6.22-rc2-mm1/kernel/exit.c
> > > ===
> > > --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
> > > +++ linux-2.6.22-rc2-mm1/kernel/exit.c
> > > @@ -924,10 +924,12 @@ fastcall void do_exit(long code)
> > >   if (unlikely(tsk->audit_context))
> > >   audit_free(tsk);
> > >  
> > >   taskstats_exit(tsk, group_dead);
> > >  
> > > + if (tsk->exe_file)
> > > + fput(tsk->exe_file);
> > 
> > Hi,
> > 
> > just taking a cursory look so I may be missing something, but doesn't
> > this leave the possibility that right here, with tsk->exe_file being
> > put, another task would try to look at tsk's /proc/tsk->pid/exe?
> > 
> > thanks,
> > -serge
> >
> >   exit_mm(tsk);
> >
>   
> 
> 
> Good question. To be precise, I think the problem doesn't exist here but
> after the exit_mm() because there's a VMA that holds a reference to the
> same file.
> 
> The existing code appears to solve the race between
> reading/following /proc/tsk->pid/exe and exit_mm() in the exit path by
> returning -ENOENT for the case where there is no executable VMA with a
> reference to the file backing it.
> 
> So I need to put NULL in the exe_file field and adjust the return value
> to be -ENOENT instead of -ENOSYS.
> 
> Thanks for the review!

Ok, I had to think about this a bit, but so you're saying you set it to
NULL in do_exit(), and anyone who has just dereferenced tsk->exe_file
before the fput in do_exit() should be ok because the vma hasn't yet
been put?

Should the 
if (!task->exe_file)
goto out;
*mnt = mntget(task->exe_file->f_path.mnt);
*dentry = dget(task->exe_file->f_path.dentry);

also go inside an preempt_disable to prevent sleeping and maybe become

exef = task->exe_file;  /* to prevent task->exe_file being set
to NULL before we've grabbed the path */
if (!exef)
goto out;
get_file(exef);  /* to prevent the mm somehow being put before
we've grabbed the path? */
*mnt = mntget(task->exe_file->f_path.mnt);
*dentry = dget(task->exe_file->f_path.dentry);
put_file(exef);  /* ? */

?

Or am I being overly paranoid?

thanks,
-serge
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC][PATCH] Replacing the /proc//exe symlink code

2007-05-31 Thread Matt Helsley
On Wed, 2007-05-30 at 23:01 -0700, Chris Wright wrote:
> * Serge E. Hallyn ([EMAIL PROTECTED]) wrote:
> > > ===
> > > --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
> > > +++ linux-2.6.22-rc2-mm1/kernel/exit.c
> > > @@ -924,10 +924,12 @@ fastcall void do_exit(long code)
> > >   if (unlikely(tsk->audit_context))
> > >   audit_free(tsk);
> > >  
> > >   taskstats_exit(tsk, group_dead);
> > >  
> > > + if (tsk->exe_file)
> > > + fput(tsk->exe_file);
> > 
> > just taking a cursory look so I may be missing something, but doesn't
> > this leave the possibility that right here, with tsk->exe_file being
> > put, another task would try to look at tsk's /proc/tsk->pid/exe?
> 
> And I hit this one, so there's at least one issue.
> 
> [  110.296952] Unable to handle kernel NULL pointer dereference at 
> 0088 RIP: 
> [  110.299053]  [] d_path+0x1a/0x117
> [  110.301861] PGD 6d35a067 PUD 6d35e067 PMD 0 
> [  110.303509] Oops:  [1] SMP 
> [  110.304719] CPU 1 
> [  110.305493] Modules linked in: oprofile
> [  110.306969] Pid: 3983, comm: pidof Not tainted 2.6.22-rc3-g7f397dcd-dirty 
> #183
> [  110.309733] RIP: 0010:[]  [] 
> d_path+0x1a/0x117
> [  110.312635] RSP: 0018:810142335e38  EFLAGS: 00010292
> [  110.314667] RAX: 81006d58a000 RBX:  RCX: 
> 1000
> [  110.317397] RDX: 81006d58a000 RSI:  RDI: 
> 
> [  110.320127] RBP: 81006d58a000 R08: fff3 R09: 
> 0006be8b
> [  110.322857] R10:  R11: 0001 R12: 
> 
> [  110.325588] R13: 1000 R14: 81006d58a000 R15: 
> 0
> [  110.328319] FS:  2b033d578260() GS:81000106e480() 
> knlGS:
> [  110.331415] CS:  0010 DS:  ES:  CR0: 8005003b
> [  110.333613] CR2: 0088 CR3: 6cf54000 CR4: 
> 06e0
> [  110.336344] Process pidof (pid: 3983, threadinfo 810142334000, task 
> 8101421186c0)
> [  110.339472] Stack:  8101422a7268  81006d58a000 
> fff4
> [  110.342556]   1000 00678820 
> 802b7a54
> [  110.345404]     
> 
> [  110.348180] Call Trace:
> [  110.349188]  [] proc_pid_readlink+0x89/0xff
> [  110.351387]  [] sys_readlinkat+0x87/0xa9
> [  110.353487]  [] remove_vma+0x5d/0x64
> [  110.355455]  [] error_exit+0x0/0x84
> [  110.357389]  [] system_call+0x7e/0x83
> [  110.359388] 
> [  110.359958] 
> [  110.359958] Code: 48 8b 87 88 00 00 00 48 85 c0 74 20 48 8b 40 30 48 85 c0 
> 74 
> [  110.363381] RIP  [] d_path+0x1a/0x117
> [  110.365386]  RSP 
> [  110.366720] CR2: 0088

Hmm, at first blush this does appear to be related to the bug Serge
pointed out. I'm not certain though, so I'll see if I can spot/produce
any other problems that could explain this.

Thanks for testing my patch!

Cheers,
-Matt Helsley

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC][PATCH] Replacing the /proc//exe symlink code

2007-05-31 Thread Matt Helsley
On Wed, 2007-05-30 at 13:09 -0500, Serge E. Hallyn wrote:
> Quoting Matt Helsley ([EMAIL PROTECTED]):
> > This patch avoids holding the mmap semaphore while walking VMAs in response 
> > to
> > programs which read or follow the /proc//exe symlink. This also 
> > allows us
> > to merge mmu and nommu proc_exe_link() functions. The costs are holding a 
> > separate
> > reference to the executable file stored in the task struct and increased 
> > code in
> > fork, exec, and exit paths.
> > 
> > Signed-off-by: Matt Helsley <[EMAIL PROTECTED]>
> > ---
> > 
> > Compiled and passed simple tests for regressions when patched against a 
> > 2.6.20
> > and 2.6.22-rc2-mm1 kernel.
> > 
> >  fs/exec.c |5 +++--
> >  fs/proc/base.c|   20 
> >  fs/proc/internal.h|1 -
> >  fs/proc/task_mmu.c|   34 --
> >  fs/proc/task_nommu.c  |   34 --
> >  include/linux/sched.h |1 +
> >  kernel/exit.c |2 ++
> >  kernel/fork.c |   10 +-
> >  8 files changed, 35 insertions(+), 72 deletions(-)



> > Index: linux-2.6.22-rc2-mm1/kernel/exit.c
> > ===
> > --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
> > +++ linux-2.6.22-rc2-mm1/kernel/exit.c
> > @@ -924,10 +924,12 @@ fastcall void do_exit(long code)
> > if (unlikely(tsk->audit_context))
> > audit_free(tsk);
> >  
> > taskstats_exit(tsk, group_dead);
> >  
> > +   if (tsk->exe_file)
> > +   fput(tsk->exe_file);
> 
> Hi,
> 
> just taking a cursory look so I may be missing something, but doesn't
> this leave the possibility that right here, with tsk->exe_file being
> put, another task would try to look at tsk's /proc/tsk->pid/exe?
> 
> thanks,
> -serge
>
>   exit_mm(tsk);
>
  


Good question. To be precise, I think the problem doesn't exist here but
after the exit_mm() because there's a VMA that holds a reference to the
same file.

The existing code appears to solve the race between
reading/following /proc/tsk->pid/exe and exit_mm() in the exit path by
returning -ENOENT for the case where there is no executable VMA with a
reference to the file backing it.

So I need to put NULL in the exe_file field and adjust the return value
to be -ENOENT instead of -ENOSYS.

Thanks for the review!

Cheers,
-Matt Helsley

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC][PATCH] Replacing the /proc//exe symlink code

2007-05-31 Thread Chris Wright
* Serge E. Hallyn ([EMAIL PROTECTED]) wrote:
> > ===
> > --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
> > +++ linux-2.6.22-rc2-mm1/kernel/exit.c
> > @@ -924,10 +924,12 @@ fastcall void do_exit(long code)
> > if (unlikely(tsk->audit_context))
> > audit_free(tsk);
> >  
> > taskstats_exit(tsk, group_dead);
> >  
> > +   if (tsk->exe_file)
> > +   fput(tsk->exe_file);
> 
> just taking a cursory look so I may be missing something, but doesn't
> this leave the possibility that right here, with tsk->exe_file being
> put, another task would try to look at tsk's /proc/tsk->pid/exe?

And I hit this one, so there's at least one issue.

[  110.296952] Unable to handle kernel NULL pointer dereference at 
0088 RIP: 
[  110.299053]  [] d_path+0x1a/0x117
[  110.301861] PGD 6d35a067 PUD 6d35e067 PMD 0 
[  110.303509] Oops:  [1] SMP 
[  110.304719] CPU 1 
[  110.305493] Modules linked in: oprofile
[  110.306969] Pid: 3983, comm: pidof Not tainted 2.6.22-rc3-g7f397dcd-dirty 
#183
[  110.309733] RIP: 0010:[]  [] 
d_path+0x1a/0x117
[  110.312635] RSP: 0018:810142335e38  EFLAGS: 00010292
[  110.314667] RAX: 81006d58a000 RBX:  RCX: 1000
[  110.317397] RDX: 81006d58a000 RSI:  RDI: 
[  110.320127] RBP: 81006d58a000 R08: fff3 R09: 0006be8b
[  110.322857] R10:  R11: 0001 R12: 
[  110.325588] R13: 1000 R14: 81006d58a000 R15: 
0
[  110.328319] FS:  2b033d578260() GS:81000106e480() 
knlGS:
[  110.331415] CS:  0010 DS:  ES:  CR0: 8005003b
[  110.333613] CR2: 0088 CR3: 6cf54000 CR4: 06e0
[  110.336344] Process pidof (pid: 3983, threadinfo 810142334000, task 
8101421186c0)
[  110.339472] Stack:  8101422a7268  81006d58a000 
fff4
[  110.342556]   1000 00678820 
802b7a54
[  110.345404]     

[  110.348180] Call Trace:
[  110.349188]  [] proc_pid_readlink+0x89/0xff
[  110.351387]  [] sys_readlinkat+0x87/0xa9
[  110.353487]  [] remove_vma+0x5d/0x64
[  110.355455]  [] error_exit+0x0/0x84
[  110.357389]  [] system_call+0x7e/0x83
[  110.359388] 
[  110.359958] 
[  110.359958] Code: 48 8b 87 88 00 00 00 48 85 c0 74 20 48 8b 40 30 48 85 c0 
74 
[  110.363381] RIP  [] d_path+0x1a/0x117
[  110.365386]  RSP 
[  110.366720] CR2: 0088
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC][PATCH] Replacing the /proc//exe symlink code

2007-05-30 Thread Serge E. Hallyn
Quoting Matt Helsley ([EMAIL PROTECTED]):
> This patch avoids holding the mmap semaphore while walking VMAs in response to
> programs which read or follow the /proc//exe symlink. This also 
> allows us
> to merge mmu and nommu proc_exe_link() functions. The costs are holding a 
> separate
> reference to the executable file stored in the task struct and increased code 
> in
> fork, exec, and exit paths.
> 
> Signed-off-by: Matt Helsley <[EMAIL PROTECTED]>
> ---
> 
> Compiled and passed simple tests for regressions when patched against a 2.6.20
> and 2.6.22-rc2-mm1 kernel.
> 
>  fs/exec.c |5 +++--
>  fs/proc/base.c|   20 
>  fs/proc/internal.h|1 -
>  fs/proc/task_mmu.c|   34 --
>  fs/proc/task_nommu.c  |   34 --
>  include/linux/sched.h |1 +
>  kernel/exit.c |2 ++
>  kernel/fork.c |   10 +-
>  8 files changed, 35 insertions(+), 72 deletions(-)
> 
> Index: linux-2.6.22-rc2-mm1/include/linux/sched.h
> ===
> --- linux-2.6.22-rc2-mm1.orig/include/linux/sched.h
> +++ linux-2.6.22-rc2-mm1/include/linux/sched.h
> @@ -988,10 +988,11 @@ struct task_struct {
>   int oomkilladj; /* OOM kill score adjustment (bit shift). */
>   char comm[TASK_COMM_LEN]; /* executable name excluding path
>- access with [gs]et_task_comm (which lock
>  it with task_lock())
>- initialized normally by flush_old_exec */
> + struct file *exe_file;
>  /* file system info */
>   int link_count, total_link_count;
>  #ifdef CONFIG_SYSVIPC
>  /* ipc stuff */
>   struct sysv_sem sysvsem;
> Index: linux-2.6.22-rc2-mm1/fs/exec.c
> ===
> --- linux-2.6.22-rc2-mm1.orig/fs/exec.c
> +++ linux-2.6.22-rc2-mm1/fs/exec.c
> @@ -1106,12 +1106,13 @@ int search_binary_handler(struct linux_b
>   read_unlock(_lock);
>   retval = fn(bprm, regs);
>   if (retval >= 0) {
>   put_binfmt(fmt);
>   allow_write_access(bprm->file);
> - if (bprm->file)
> - fput(bprm->file);
> + if (current->exe_file)
> + fput(current->exe_file);
> + current->exe_file = bprm->file;
>   bprm->file = NULL;
>   current->did_exec = 1;
>   proc_exec_connector(current);
>   return retval;
>   }
> Index: linux-2.6.22-rc2-mm1/fs/proc/base.c
> ===
> --- linux-2.6.22-rc2-mm1.orig/fs/proc/base.c
> +++ linux-2.6.22-rc2-mm1/fs/proc/base.c
> @@ -951,10 +951,30 @@ const struct file_operations proc_pid_sc
>   .write  = sched_write,
>   .llseek = seq_lseek,
>   .release= seq_release,
>  };
>  
> +static int proc_exe_link(struct inode *inode, struct dentry **dentry,
> +  struct vfsmount **mnt)
> +{
> + int error;
> + struct task_struct *task;
> +
> + task = get_proc_task(inode);
> + if (!task)
> + return -ENOENT;
> + error = -ENOSYS;
> + if (!task->exe_file)
> + goto out;
> + *mnt = mntget(task->exe_file->f_path.mnt);
> + *dentry = dget(task->exe_file->f_path.dentry);
> + error = 0;
> +out:
> + put_task_struct(task);
> + return error;
> +}
> +
>  static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata 
> *nd)
>  {
>   struct inode *inode = dentry->d_inode;
>   int error = -EACCES;
>  
> Index: linux-2.6.22-rc2-mm1/kernel/exit.c
> ===
> --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
> +++ linux-2.6.22-rc2-mm1/kernel/exit.c
> @@ -924,10 +924,12 @@ fastcall void do_exit(long code)
>   if (unlikely(tsk->audit_context))
>   audit_free(tsk);
>  
>   taskstats_exit(tsk, group_dead);
>  
> + if (tsk->exe_file)
> + fput(tsk->exe_file);

Hi,

just taking a cursory look so I may be missing something, but doesn't
this leave the possibility that right here, with tsk->exe_file being
put, another task would try to look at tsk's /proc/tsk->pid/exe?

thanks,
-serge


>   exit_mm(tsk);
>  
>   if (group_dead)
>   acct_process();
>   exit_sem(tsk);
> Index: linux-2.6.22-rc2-mm1/kernel/fork.c
> ===
> --- linux-2.6.22-rc2-mm1.orig/kernel/fork.c
> +++ linux-2.6.22-rc2-mm1/kernel/fork.c
> @@ -1163,10 +1163,13 @@ static struct task_struct 

[RFC][PATCH] Replacing the /proc//exe symlink code

2007-05-29 Thread Matt Helsley
This patch avoids holding the mmap semaphore while walking VMAs in response to
programs which read or follow the /proc//exe symlink. This also 
allows us
to merge mmu and nommu proc_exe_link() functions. The costs are holding a 
separate
reference to the executable file stored in the task struct and increased code in
fork, exec, and exit paths.

Signed-off-by: Matt Helsley <[EMAIL PROTECTED]>
---

Compiled and passed simple tests for regressions when patched against a 2.6.20
and 2.6.22-rc2-mm1 kernel.

 fs/exec.c |5 +++--
 fs/proc/base.c|   20 
 fs/proc/internal.h|1 -
 fs/proc/task_mmu.c|   34 --
 fs/proc/task_nommu.c  |   34 --
 include/linux/sched.h |1 +
 kernel/exit.c |2 ++
 kernel/fork.c |   10 +-
 8 files changed, 35 insertions(+), 72 deletions(-)

Index: linux-2.6.22-rc2-mm1/include/linux/sched.h
===
--- linux-2.6.22-rc2-mm1.orig/include/linux/sched.h
+++ linux-2.6.22-rc2-mm1/include/linux/sched.h
@@ -988,10 +988,11 @@ struct task_struct {
int oomkilladj; /* OOM kill score adjustment (bit shift). */
char comm[TASK_COMM_LEN]; /* executable name excluding path
 - access with [gs]et_task_comm (which lock
   it with task_lock())
 - initialized normally by flush_old_exec */
+   struct file *exe_file;
 /* file system info */
int link_count, total_link_count;
 #ifdef CONFIG_SYSVIPC
 /* ipc stuff */
struct sysv_sem sysvsem;
Index: linux-2.6.22-rc2-mm1/fs/exec.c
===
--- linux-2.6.22-rc2-mm1.orig/fs/exec.c
+++ linux-2.6.22-rc2-mm1/fs/exec.c
@@ -1106,12 +1106,13 @@ int search_binary_handler(struct linux_b
read_unlock(_lock);
retval = fn(bprm, regs);
if (retval >= 0) {
put_binfmt(fmt);
allow_write_access(bprm->file);
-   if (bprm->file)
-   fput(bprm->file);
+   if (current->exe_file)
+   fput(current->exe_file);
+   current->exe_file = bprm->file;
bprm->file = NULL;
current->did_exec = 1;
proc_exec_connector(current);
return retval;
}
Index: linux-2.6.22-rc2-mm1/fs/proc/base.c
===
--- linux-2.6.22-rc2-mm1.orig/fs/proc/base.c
+++ linux-2.6.22-rc2-mm1/fs/proc/base.c
@@ -951,10 +951,30 @@ const struct file_operations proc_pid_sc
.write  = sched_write,
.llseek = seq_lseek,
.release= seq_release,
 };
 
+static int proc_exe_link(struct inode *inode, struct dentry **dentry,
+struct vfsmount **mnt)
+{
+   int error;
+   struct task_struct *task;
+
+   task = get_proc_task(inode);
+   if (!task)
+   return -ENOENT;
+   error = -ENOSYS;
+   if (!task->exe_file)
+   goto out;
+   *mnt = mntget(task->exe_file->f_path.mnt);
+   *dentry = dget(task->exe_file->f_path.dentry);
+   error = 0;
+out:
+   put_task_struct(task);
+   return error;
+}
+
 static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
struct inode *inode = dentry->d_inode;
int error = -EACCES;
 
Index: linux-2.6.22-rc2-mm1/kernel/exit.c
===
--- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
+++ linux-2.6.22-rc2-mm1/kernel/exit.c
@@ -924,10 +924,12 @@ fastcall void do_exit(long code)
if (unlikely(tsk->audit_context))
audit_free(tsk);
 
taskstats_exit(tsk, group_dead);
 
+   if (tsk->exe_file)
+   fput(tsk->exe_file);
exit_mm(tsk);
 
if (group_dead)
acct_process();
exit_sem(tsk);
Index: linux-2.6.22-rc2-mm1/kernel/fork.c
===
--- linux-2.6.22-rc2-mm1.orig/kernel/fork.c
+++ linux-2.6.22-rc2-mm1/kernel/fork.c
@@ -1163,10 +1163,13 @@ static struct task_struct *copy_process(
 
/* ok, now we should be set up.. */
p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & 
CSIGNAL);
p->pdeath_signal = 0;
p->exit_state = 0;
+   p->exe_file = current->exe_file;
+   if (p->exe_file)
+   get_file(p->exe_file);
 
/*
 * Ok, make it visible to the rest of the system.
 * We dont wake it up yet.