Re: [RFC][PATCH] Replacing the /proc/pid|self/exe symlink code

2007-06-06 Thread Serge E. Hallyn
Quoting Matt Helsley ([EMAIL PROTECTED]):
 This patch avoids holding the mmap semaphore while walking VMAs in response to
 programs which read or follow the /proc/pid|self/exe symlink. This also 
 allows
 us to merge mmu and nommu proc_exe_link() functions. The costs are holding the
 task lock, a separate reference to the executable file stored in the task
 struct, and increased code in fork, exec, and exit paths.
 
 Changes:
 Clear exe_file field in exit path
 Use task_lock() to protect exe_file between write and read paths

This one does seem safe.

Though do you need to handle the #ifdef __alpha__ in
fs/exec.c:search_binary_handler?

thanks,
-serge

 Signed-off-by: Matt Helsley [EMAIL PROTECTED]
 ---
 
  fs/exec.c |7 +--
  fs/proc/base.c|   21 +
  fs/proc/internal.h|1 -
  fs/proc/task_mmu.c|   34 --
  fs/proc/task_nommu.c  |   34 --
  include/linux/sched.h |3 ++-
  kernel/exit.c |6 ++
  kernel/fork.c |9 -
  8 files changed, 42 insertions(+), 73 deletions(-)
 
 Index: linux-2.6.22-rc2-mm1/include/linux/sched.h
 ===
 --- linux-2.6.22-rc2-mm1.orig/include/linux/sched.h
 +++ linux-2.6.22-rc2-mm1/include/linux/sched.h
 @@ -988,10 +988,11 @@ struct task_struct {
   int oomkilladj; /* OOM kill score adjustment (bit shift). */
   char comm[TASK_COMM_LEN]; /* executable name excluding path
- access with [gs]et_task_comm (which lock
  it with task_lock())
- initialized normally by flush_old_exec */
 + struct file *exe_file;
  /* file system info */
   int link_count, total_link_count;
  #ifdef CONFIG_SYSVIPC
  /* ipc stuff */
   struct sysv_sem sysvsem;
 @@ -1549,11 +1550,11 @@ static inline int thread_group_empty(str
 
  #define delay_group_leader(p) \
   (thread_group_leader(p)  !thread_group_empty(p))
 
  /*
 - * Protects -fs, -files, -mm, -group_info, -comm, keyring
 + * Protects -fs, -files, -mm, -group_info, -comm, -exe_file, keyring
   * subscriptions and synchronises with wait4().  Also used in procfs.  Also
   * pins the final release of task.io_context.  Also protects -cpuset.
   *
   * Nests both inside and outside of read_lock(tasklist_lock).
   * It must not be nested with write_lock_irq(tasklist_lock),
 Index: linux-2.6.22-rc2-mm1/fs/exec.c
 ===
 --- linux-2.6.22-rc2-mm1.orig/fs/exec.c
 +++ linux-2.6.22-rc2-mm1/fs/exec.c
 @@ -1106,12 +1106,15 @@ int search_binary_handler(struct linux_b
   read_unlock(binfmt_lock);
   retval = fn(bprm, regs);
   if (retval = 0) {
   put_binfmt(fmt);
   allow_write_access(bprm-file);
 - if (bprm-file)
 - fput(bprm-file);
 + task_lock(current);
 + if (current-exe_file)
 + fput(current-exe_file);
 + current-exe_file = bprm-file;
 + task_unlock(current);
   bprm-file = NULL;
   current-did_exec = 1;
   proc_exec_connector(current);
   return retval;
   }
 Index: linux-2.6.22-rc2-mm1/fs/proc/base.c
 ===
 --- linux-2.6.22-rc2-mm1.orig/fs/proc/base.c
 +++ linux-2.6.22-rc2-mm1/fs/proc/base.c
 @@ -951,10 +951,31 @@ const struct file_operations proc_pid_sc
   .write  = sched_write,
   .llseek = seq_lseek,
   .release= seq_release,
  };
 
 +static int proc_exe_link(struct inode *inode, struct dentry **dentry,
 +  struct vfsmount **mnt)
 +{
 + int error = -ENOENT;
 + struct task_struct *task;
 +
 + task = get_proc_task(inode);
 + if (!task)
 + return error;
 + task_lock(task);
 + if (!task-exe_file)
 + goto out;
 + *mnt = mntget(task-exe_file-f_path.mnt);
 + *dentry = dget(task-exe_file-f_path.dentry);
 + error = 0;
 +out:
 + task_unlock(task);
 + put_task_struct(task);
 + return error;
 +}
 +
  static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata 
 *nd)
  {
   struct inode *inode = dentry-d_inode;
   int error = -EACCES;
 
 Index: linux-2.6.22-rc2-mm1/kernel/exit.c
 ===
 --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
 +++ linux-2.6.22-rc2-mm1/kernel/exit.c
 @@ -924,10 +924,16 @@ fastcall void do_exit(long code)
   if 

Re: [RFC][PATCH] Replacing the /proc/pid|self/exe symlink code

2007-06-01 Thread Serge E. Hallyn
Quoting Matt Helsley ([EMAIL PROTECTED]):
 On Wed, 2007-05-30 at 13:09 -0500, Serge E. Hallyn wrote:
  Quoting Matt Helsley ([EMAIL PROTECTED]):
   This patch avoids holding the mmap semaphore while walking VMAs in 
   response to
   programs which read or follow the /proc/pid|self/exe symlink. This also 
   allows us
   to merge mmu and nommu proc_exe_link() functions. The costs are holding a 
   separate
   reference to the executable file stored in the task struct and increased 
   code in
   fork, exec, and exit paths.
   
   Signed-off-by: Matt Helsley [EMAIL PROTECTED]
   ---
   
   Compiled and passed simple tests for regressions when patched against a 
   2.6.20
   and 2.6.22-rc2-mm1 kernel.
   
fs/exec.c |5 +++--
fs/proc/base.c|   20 
fs/proc/internal.h|1 -
fs/proc/task_mmu.c|   34 --
fs/proc/task_nommu.c  |   34 --
include/linux/sched.h |1 +
kernel/exit.c |2 ++
kernel/fork.c |   10 +-
8 files changed, 35 insertions(+), 72 deletions(-)
 
 snip
 
   Index: linux-2.6.22-rc2-mm1/kernel/exit.c
   ===
   --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
   +++ linux-2.6.22-rc2-mm1/kernel/exit.c
   @@ -924,10 +924,12 @@ fastcall void do_exit(long code)
 if (unlikely(tsk-audit_context))
 audit_free(tsk);

 taskstats_exit(tsk, group_dead);

   + if (tsk-exe_file)
   + fput(tsk-exe_file);
  
  Hi,
  
  just taking a cursory look so I may be missing something, but doesn't
  this leave the possibility that right here, with tsk-exe_file being
  put, another task would try to look at tsk's /proc/tsk-pid/exe?
  
  thanks,
  -serge
 
exit_mm(tsk);
 
   
 snip
 
 Good question. To be precise, I think the problem doesn't exist here but
 after the exit_mm() because there's a VMA that holds a reference to the
 same file.
 
 The existing code appears to solve the race between
 reading/following /proc/tsk-pid/exe and exit_mm() in the exit path by
 returning -ENOENT for the case where there is no executable VMA with a
 reference to the file backing it.
 
 So I need to put NULL in the exe_file field and adjust the return value
 to be -ENOENT instead of -ENOSYS.
 
 Thanks for the review!

Ok, I had to think about this a bit, but so you're saying you set it to
NULL in do_exit(), and anyone who has just dereferenced tsk-exe_file
before the fput in do_exit() should be ok because the vma hasn't yet
been put?

Should the 
if (!task-exe_file)
goto out;
*mnt = mntget(task-exe_file-f_path.mnt);
*dentry = dget(task-exe_file-f_path.dentry);

also go inside an preempt_disable to prevent sleeping and maybe become

exef = task-exe_file;  /* to prevent task-exe_file being set
to NULL before we've grabbed the path */
if (!exef)
goto out;
get_file(exef);  /* to prevent the mm somehow being put before
we've grabbed the path? */
*mnt = mntget(task-exe_file-f_path.mnt);
*dentry = dget(task-exe_file-f_path.dentry);
put_file(exef);  /* ? */

?

Or am I being overly paranoid?

thanks,
-serge
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC][PATCH] Replacing the /proc/pid|self/exe symlink code

2007-06-01 Thread Matt Helsley
On Fri, 2007-06-01 at 17:31 -0500, Serge E. Hallyn wrote:
 Quoting Matt Helsley ([EMAIL PROTECTED]):
  On Wed, 2007-05-30 at 13:09 -0500, Serge E. Hallyn wrote:
   Quoting Matt Helsley ([EMAIL PROTECTED]):
This patch avoids holding the mmap semaphore while walking VMAs in 
response to
programs which read or follow the /proc/pid|self/exe symlink. This 
also allows us
to merge mmu and nommu proc_exe_link() functions. The costs are holding 
a separate
reference to the executable file stored in the task struct and 
increased code in
fork, exec, and exit paths.

Signed-off-by: Matt Helsley [EMAIL PROTECTED]
---

Compiled and passed simple tests for regressions when patched against a 
2.6.20
and 2.6.22-rc2-mm1 kernel.

 fs/exec.c |5 +++--
 fs/proc/base.c|   20 
 fs/proc/internal.h|1 -
 fs/proc/task_mmu.c|   34 --
 fs/proc/task_nommu.c  |   34 --
 include/linux/sched.h |1 +
 kernel/exit.c |2 ++
 kernel/fork.c |   10 +-
 8 files changed, 35 insertions(+), 72 deletions(-)
  
  snip
  
Index: linux-2.6.22-rc2-mm1/kernel/exit.c
===
--- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
+++ linux-2.6.22-rc2-mm1/kernel/exit.c
@@ -924,10 +924,12 @@ fastcall void do_exit(long code)
if (unlikely(tsk-audit_context))
audit_free(tsk);
 
taskstats_exit(tsk, group_dead);
 
+   if (tsk-exe_file)
+   fput(tsk-exe_file);
   
   Hi,
   
   just taking a cursory look so I may be missing something, but doesn't
   this leave the possibility that right here, with tsk-exe_file being
   put, another task would try to look at tsk's /proc/tsk-pid/exe?
   
   thanks,
   -serge
  
 exit_mm(tsk);
  

  snip
  
  Good question. To be precise, I think the problem doesn't exist here but
  after the exit_mm() because there's a VMA that holds a reference to the
  same file.
  
  The existing code appears to solve the race between
  reading/following /proc/tsk-pid/exe and exit_mm() in the exit path by
  returning -ENOENT for the case where there is no executable VMA with a
  reference to the file backing it.
  
  So I need to put NULL in the exe_file field and adjust the return value
  to be -ENOENT instead of -ENOSYS.
  
  Thanks for the review!
 
 Ok, I had to think about this a bit, but so you're saying you set it to
 NULL in do_exit(), and anyone who has just dereferenced tsk-exe_file
 before the fput in do_exit() should be ok because the vma hasn't yet
 been put?

Yes

 Should the 
   if (!task-exe_file)
   goto out;
   *mnt = mntget(task-exe_file-f_path.mnt);
   *dentry = dget(task-exe_file-f_path.dentry);
 
 also go inside an preempt_disable to prevent sleeping and maybe become

It needs some form of protection from concurrent access between write
and read. write happens during exec, fork, and exit. In the fork case
however it's not necessary because the new task isn't visible in /proc
yet and the value of current doesn't change anyway.

   exef = task-exe_file;  /* to prevent task-exe_file being set
   to NULL before we've grabbed the path */
   if (!exef)
   goto out;
   get_file(exef);  /* to prevent the mm somehow being put before
   we've grabbed the path? */
   *mnt = mntget(task-exe_file-f_path.mnt);
   *dentry = dget(task-exe_file-f_path.dentry);
   put_file(exef);  /* ? */
 
 ?
 
 Or am I being overly paranoid?

No, you're right. In fact, because readlink() can be initiated by
another task I think disabling preemption really only fixes the problem
on uniprocessor systems. So I was thinking of using task_lock() to
protect exe_file.

Cheers,
-Matt Helsley

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC][PATCH] Replacing the /proc/pid|self/exe symlink code

2007-06-01 Thread Matt Helsley
This patch avoids holding the mmap semaphore while walking VMAs in response to
programs which read or follow the /proc/pid|self/exe symlink. This also allows
us to merge mmu and nommu proc_exe_link() functions. The costs are holding the
task lock, a separate reference to the executable file stored in the task
struct, and increased code in fork, exec, and exit paths.

Changes:
Clear exe_file field in exit path
Use task_lock() to protect exe_file between write and read paths

Signed-off-by: Matt Helsley [EMAIL PROTECTED]
---

 fs/exec.c |7 +--
 fs/proc/base.c|   21 +
 fs/proc/internal.h|1 -
 fs/proc/task_mmu.c|   34 --
 fs/proc/task_nommu.c  |   34 --
 include/linux/sched.h |3 ++-
 kernel/exit.c |6 ++
 kernel/fork.c |9 -
 8 files changed, 42 insertions(+), 73 deletions(-)

Index: linux-2.6.22-rc2-mm1/include/linux/sched.h
===
--- linux-2.6.22-rc2-mm1.orig/include/linux/sched.h
+++ linux-2.6.22-rc2-mm1/include/linux/sched.h
@@ -988,10 +988,11 @@ struct task_struct {
int oomkilladj; /* OOM kill score adjustment (bit shift). */
char comm[TASK_COMM_LEN]; /* executable name excluding path
 - access with [gs]et_task_comm (which lock
   it with task_lock())
 - initialized normally by flush_old_exec */
+   struct file *exe_file;
 /* file system info */
int link_count, total_link_count;
 #ifdef CONFIG_SYSVIPC
 /* ipc stuff */
struct sysv_sem sysvsem;
@@ -1549,11 +1550,11 @@ static inline int thread_group_empty(str
 
 #define delay_group_leader(p) \
(thread_group_leader(p)  !thread_group_empty(p))
 
 /*
- * Protects -fs, -files, -mm, -group_info, -comm, keyring
+ * Protects -fs, -files, -mm, -group_info, -comm, -exe_file, keyring
  * subscriptions and synchronises with wait4().  Also used in procfs.  Also
  * pins the final release of task.io_context.  Also protects -cpuset.
  *
  * Nests both inside and outside of read_lock(tasklist_lock).
  * It must not be nested with write_lock_irq(tasklist_lock),
Index: linux-2.6.22-rc2-mm1/fs/exec.c
===
--- linux-2.6.22-rc2-mm1.orig/fs/exec.c
+++ linux-2.6.22-rc2-mm1/fs/exec.c
@@ -1106,12 +1106,15 @@ int search_binary_handler(struct linux_b
read_unlock(binfmt_lock);
retval = fn(bprm, regs);
if (retval = 0) {
put_binfmt(fmt);
allow_write_access(bprm-file);
-   if (bprm-file)
-   fput(bprm-file);
+   task_lock(current);
+   if (current-exe_file)
+   fput(current-exe_file);
+   current-exe_file = bprm-file;
+   task_unlock(current);
bprm-file = NULL;
current-did_exec = 1;
proc_exec_connector(current);
return retval;
}
Index: linux-2.6.22-rc2-mm1/fs/proc/base.c
===
--- linux-2.6.22-rc2-mm1.orig/fs/proc/base.c
+++ linux-2.6.22-rc2-mm1/fs/proc/base.c
@@ -951,10 +951,31 @@ const struct file_operations proc_pid_sc
.write  = sched_write,
.llseek = seq_lseek,
.release= seq_release,
 };
 
+static int proc_exe_link(struct inode *inode, struct dentry **dentry,
+struct vfsmount **mnt)
+{
+   int error = -ENOENT;
+   struct task_struct *task;
+
+   task = get_proc_task(inode);
+   if (!task)
+   return error;
+   task_lock(task);
+   if (!task-exe_file)
+   goto out;
+   *mnt = mntget(task-exe_file-f_path.mnt);
+   *dentry = dget(task-exe_file-f_path.dentry);
+   error = 0;
+out:
+   task_unlock(task);
+   put_task_struct(task);
+   return error;
+}
+
 static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
struct inode *inode = dentry-d_inode;
int error = -EACCES;
 
Index: linux-2.6.22-rc2-mm1/kernel/exit.c
===
--- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
+++ linux-2.6.22-rc2-mm1/kernel/exit.c
@@ -924,10 +924,16 @@ fastcall void do_exit(long code)
if (unlikely(tsk-audit_context))
audit_free(tsk);
 
taskstats_exit(tsk, group_dead);
 
+   task_lock(tsk);
+   if (tsk-exe_file) {
+   fput(tsk-exe_file);

Re: [RFC][PATCH] Replacing the /proc/pid|self/exe symlink code

2007-05-31 Thread Chris Wright
* Serge E. Hallyn ([EMAIL PROTECTED]) wrote:
  ===
  --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
  +++ linux-2.6.22-rc2-mm1/kernel/exit.c
  @@ -924,10 +924,12 @@ fastcall void do_exit(long code)
  if (unlikely(tsk-audit_context))
  audit_free(tsk);
   
  taskstats_exit(tsk, group_dead);
   
  +   if (tsk-exe_file)
  +   fput(tsk-exe_file);
 
 just taking a cursory look so I may be missing something, but doesn't
 this leave the possibility that right here, with tsk-exe_file being
 put, another task would try to look at tsk's /proc/tsk-pid/exe?

And I hit this one, so there's at least one issue.

[  110.296952] Unable to handle kernel NULL pointer dereference at 
0088 RIP: 
[  110.299053]  [80293fca] d_path+0x1a/0x117
[  110.301861] PGD 6d35a067 PUD 6d35e067 PMD 0 
[  110.303509] Oops:  [1] SMP 
[  110.304719] CPU 1 
[  110.305493] Modules linked in: oprofile
[  110.306969] Pid: 3983, comm: pidof Not tainted 2.6.22-rc3-g7f397dcd-dirty 
#183
[  110.309733] RIP: 0010:[80293fca]  [80293fca] 
d_path+0x1a/0x117
[  110.312635] RSP: 0018:810142335e38  EFLAGS: 00010292
[  110.314667] RAX: 81006d58a000 RBX:  RCX: 1000
[  110.317397] RDX: 81006d58a000 RSI:  RDI: 
[  110.320127] RBP: 81006d58a000 R08: fff3 R09: 0006be8b
[  110.322857] R10:  R11: 0001 R12: 
[  110.325588] R13: 1000 R14: 81006d58a000 R15: 
0
[  110.328319] FS:  2b033d578260() GS:81000106e480() 
knlGS:
[  110.331415] CS:  0010 DS:  ES:  CR0: 8005003b
[  110.333613] CR2: 0088 CR3: 6cf54000 CR4: 06e0
[  110.336344] Process pidof (pid: 3983, threadinfo 810142334000, task 
8101421186c0)
[  110.339472] Stack:  8101422a7268  81006d58a000 
fff4
[  110.342556]   1000 00678820 
802b7a54
[  110.345404]     

[  110.348180] Call Trace:
[  110.349188]  [802b7a54] proc_pid_readlink+0x89/0xff
[  110.351387]  [80285e55] sys_readlinkat+0x87/0xa9
[  110.353487]  [8026d4dc] remove_vma+0x5d/0x64
[  110.355455]  [80596acd] error_exit+0x0/0x84
[  110.357389]  [8020935e] system_call+0x7e/0x83
[  110.359388] 
[  110.359958] 
[  110.359958] Code: 48 8b 87 88 00 00 00 48 85 c0 74 20 48 8b 40 30 48 85 c0 
74 
[  110.363381] RIP  [80293fca] d_path+0x1a/0x117
[  110.365386]  RSP 810142335e38
[  110.366720] CR2: 0088
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC][PATCH] Replacing the /proc/pid|self/exe symlink code

2007-05-31 Thread Matt Helsley
On Wed, 2007-05-30 at 13:09 -0500, Serge E. Hallyn wrote:
 Quoting Matt Helsley ([EMAIL PROTECTED]):
  This patch avoids holding the mmap semaphore while walking VMAs in response 
  to
  programs which read or follow the /proc/pid|self/exe symlink. This also 
  allows us
  to merge mmu and nommu proc_exe_link() functions. The costs are holding a 
  separate
  reference to the executable file stored in the task struct and increased 
  code in
  fork, exec, and exit paths.
  
  Signed-off-by: Matt Helsley [EMAIL PROTECTED]
  ---
  
  Compiled and passed simple tests for regressions when patched against a 
  2.6.20
  and 2.6.22-rc2-mm1 kernel.
  
   fs/exec.c |5 +++--
   fs/proc/base.c|   20 
   fs/proc/internal.h|1 -
   fs/proc/task_mmu.c|   34 --
   fs/proc/task_nommu.c  |   34 --
   include/linux/sched.h |1 +
   kernel/exit.c |2 ++
   kernel/fork.c |   10 +-
   8 files changed, 35 insertions(+), 72 deletions(-)

snip

  Index: linux-2.6.22-rc2-mm1/kernel/exit.c
  ===
  --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
  +++ linux-2.6.22-rc2-mm1/kernel/exit.c
  @@ -924,10 +924,12 @@ fastcall void do_exit(long code)
  if (unlikely(tsk-audit_context))
  audit_free(tsk);
   
  taskstats_exit(tsk, group_dead);
   
  +   if (tsk-exe_file)
  +   fput(tsk-exe_file);
 
 Hi,
 
 just taking a cursory look so I may be missing something, but doesn't
 this leave the possibility that right here, with tsk-exe_file being
 put, another task would try to look at tsk's /proc/tsk-pid/exe?
 
 thanks,
 -serge

   exit_mm(tsk);

  
snip

Good question. To be precise, I think the problem doesn't exist here but
after the exit_mm() because there's a VMA that holds a reference to the
same file.

The existing code appears to solve the race between
reading/following /proc/tsk-pid/exe and exit_mm() in the exit path by
returning -ENOENT for the case where there is no executable VMA with a
reference to the file backing it.

So I need to put NULL in the exe_file field and adjust the return value
to be -ENOENT instead of -ENOSYS.

Thanks for the review!

Cheers,
-Matt Helsley

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC][PATCH] Replacing the /proc/pid|self/exe symlink code

2007-05-31 Thread Matt Helsley
On Wed, 2007-05-30 at 23:01 -0700, Chris Wright wrote:
 * Serge E. Hallyn ([EMAIL PROTECTED]) wrote:
   ===
   --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
   +++ linux-2.6.22-rc2-mm1/kernel/exit.c
   @@ -924,10 +924,12 @@ fastcall void do_exit(long code)
 if (unlikely(tsk-audit_context))
 audit_free(tsk);

 taskstats_exit(tsk, group_dead);

   + if (tsk-exe_file)
   + fput(tsk-exe_file);
  
  just taking a cursory look so I may be missing something, but doesn't
  this leave the possibility that right here, with tsk-exe_file being
  put, another task would try to look at tsk's /proc/tsk-pid/exe?
 
 And I hit this one, so there's at least one issue.
 
 [  110.296952] Unable to handle kernel NULL pointer dereference at 
 0088 RIP: 
 [  110.299053]  [80293fca] d_path+0x1a/0x117
 [  110.301861] PGD 6d35a067 PUD 6d35e067 PMD 0 
 [  110.303509] Oops:  [1] SMP 
 [  110.304719] CPU 1 
 [  110.305493] Modules linked in: oprofile
 [  110.306969] Pid: 3983, comm: pidof Not tainted 2.6.22-rc3-g7f397dcd-dirty 
 #183
 [  110.309733] RIP: 0010:[80293fca]  [80293fca] 
 d_path+0x1a/0x117
 [  110.312635] RSP: 0018:810142335e38  EFLAGS: 00010292
 [  110.314667] RAX: 81006d58a000 RBX:  RCX: 
 1000
 [  110.317397] RDX: 81006d58a000 RSI:  RDI: 
 
 [  110.320127] RBP: 81006d58a000 R08: fff3 R09: 
 0006be8b
 [  110.322857] R10:  R11: 0001 R12: 
 
 [  110.325588] R13: 1000 R14: 81006d58a000 R15: 
 0
 [  110.328319] FS:  2b033d578260() GS:81000106e480() 
 knlGS:
 [  110.331415] CS:  0010 DS:  ES:  CR0: 8005003b
 [  110.333613] CR2: 0088 CR3: 6cf54000 CR4: 
 06e0
 [  110.336344] Process pidof (pid: 3983, threadinfo 810142334000, task 
 8101421186c0)
 [  110.339472] Stack:  8101422a7268  81006d58a000 
 fff4
 [  110.342556]   1000 00678820 
 802b7a54
 [  110.345404]     
 
 [  110.348180] Call Trace:
 [  110.349188]  [802b7a54] proc_pid_readlink+0x89/0xff
 [  110.351387]  [80285e55] sys_readlinkat+0x87/0xa9
 [  110.353487]  [8026d4dc] remove_vma+0x5d/0x64
 [  110.355455]  [80596acd] error_exit+0x0/0x84
 [  110.357389]  [8020935e] system_call+0x7e/0x83
 [  110.359388] 
 [  110.359958] 
 [  110.359958] Code: 48 8b 87 88 00 00 00 48 85 c0 74 20 48 8b 40 30 48 85 c0 
 74 
 [  110.363381] RIP  [80293fca] d_path+0x1a/0x117
 [  110.365386]  RSP 810142335e38
 [  110.366720] CR2: 0088

Hmm, at first blush this does appear to be related to the bug Serge
pointed out. I'm not certain though, so I'll see if I can spot/produce
any other problems that could explain this.

Thanks for testing my patch!

Cheers,
-Matt Helsley

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC][PATCH] Replacing the /proc/pid|self/exe symlink code

2007-05-30 Thread Serge E. Hallyn
Quoting Matt Helsley ([EMAIL PROTECTED]):
 This patch avoids holding the mmap semaphore while walking VMAs in response to
 programs which read or follow the /proc/pid|self/exe symlink. This also 
 allows us
 to merge mmu and nommu proc_exe_link() functions. The costs are holding a 
 separate
 reference to the executable file stored in the task struct and increased code 
 in
 fork, exec, and exit paths.
 
 Signed-off-by: Matt Helsley [EMAIL PROTECTED]
 ---
 
 Compiled and passed simple tests for regressions when patched against a 2.6.20
 and 2.6.22-rc2-mm1 kernel.
 
  fs/exec.c |5 +++--
  fs/proc/base.c|   20 
  fs/proc/internal.h|1 -
  fs/proc/task_mmu.c|   34 --
  fs/proc/task_nommu.c  |   34 --
  include/linux/sched.h |1 +
  kernel/exit.c |2 ++
  kernel/fork.c |   10 +-
  8 files changed, 35 insertions(+), 72 deletions(-)
 
 Index: linux-2.6.22-rc2-mm1/include/linux/sched.h
 ===
 --- linux-2.6.22-rc2-mm1.orig/include/linux/sched.h
 +++ linux-2.6.22-rc2-mm1/include/linux/sched.h
 @@ -988,10 +988,11 @@ struct task_struct {
   int oomkilladj; /* OOM kill score adjustment (bit shift). */
   char comm[TASK_COMM_LEN]; /* executable name excluding path
- access with [gs]et_task_comm (which lock
  it with task_lock())
- initialized normally by flush_old_exec */
 + struct file *exe_file;
  /* file system info */
   int link_count, total_link_count;
  #ifdef CONFIG_SYSVIPC
  /* ipc stuff */
   struct sysv_sem sysvsem;
 Index: linux-2.6.22-rc2-mm1/fs/exec.c
 ===
 --- linux-2.6.22-rc2-mm1.orig/fs/exec.c
 +++ linux-2.6.22-rc2-mm1/fs/exec.c
 @@ -1106,12 +1106,13 @@ int search_binary_handler(struct linux_b
   read_unlock(binfmt_lock);
   retval = fn(bprm, regs);
   if (retval = 0) {
   put_binfmt(fmt);
   allow_write_access(bprm-file);
 - if (bprm-file)
 - fput(bprm-file);
 + if (current-exe_file)
 + fput(current-exe_file);
 + current-exe_file = bprm-file;
   bprm-file = NULL;
   current-did_exec = 1;
   proc_exec_connector(current);
   return retval;
   }
 Index: linux-2.6.22-rc2-mm1/fs/proc/base.c
 ===
 --- linux-2.6.22-rc2-mm1.orig/fs/proc/base.c
 +++ linux-2.6.22-rc2-mm1/fs/proc/base.c
 @@ -951,10 +951,30 @@ const struct file_operations proc_pid_sc
   .write  = sched_write,
   .llseek = seq_lseek,
   .release= seq_release,
  };
  
 +static int proc_exe_link(struct inode *inode, struct dentry **dentry,
 +  struct vfsmount **mnt)
 +{
 + int error;
 + struct task_struct *task;
 +
 + task = get_proc_task(inode);
 + if (!task)
 + return -ENOENT;
 + error = -ENOSYS;
 + if (!task-exe_file)
 + goto out;
 + *mnt = mntget(task-exe_file-f_path.mnt);
 + *dentry = dget(task-exe_file-f_path.dentry);
 + error = 0;
 +out:
 + put_task_struct(task);
 + return error;
 +}
 +
  static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata 
 *nd)
  {
   struct inode *inode = dentry-d_inode;
   int error = -EACCES;
  
 Index: linux-2.6.22-rc2-mm1/kernel/exit.c
 ===
 --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
 +++ linux-2.6.22-rc2-mm1/kernel/exit.c
 @@ -924,10 +924,12 @@ fastcall void do_exit(long code)
   if (unlikely(tsk-audit_context))
   audit_free(tsk);
  
   taskstats_exit(tsk, group_dead);
  
 + if (tsk-exe_file)
 + fput(tsk-exe_file);

Hi,

just taking a cursory look so I may be missing something, but doesn't
this leave the possibility that right here, with tsk-exe_file being
put, another task would try to look at tsk's /proc/tsk-pid/exe?

thanks,
-serge


   exit_mm(tsk);
  
   if (group_dead)
   acct_process();
   exit_sem(tsk);
 Index: linux-2.6.22-rc2-mm1/kernel/fork.c
 ===
 --- linux-2.6.22-rc2-mm1.orig/kernel/fork.c
 +++ linux-2.6.22-rc2-mm1/kernel/fork.c
 @@ -1163,10 +1163,13 @@ static struct task_struct *copy_process(
  
   /* ok, now we should be set up.. */
   p-exit_signal = (clone_flags  CLONE_THREAD) ? -1 : (clone_flags  
 

[RFC][PATCH] Replacing the /proc/pid|self/exe symlink code

2007-05-29 Thread Matt Helsley
This patch avoids holding the mmap semaphore while walking VMAs in response to
programs which read or follow the /proc/pid|self/exe symlink. This also 
allows us
to merge mmu and nommu proc_exe_link() functions. The costs are holding a 
separate
reference to the executable file stored in the task struct and increased code in
fork, exec, and exit paths.

Signed-off-by: Matt Helsley [EMAIL PROTECTED]
---

Compiled and passed simple tests for regressions when patched against a 2.6.20
and 2.6.22-rc2-mm1 kernel.

 fs/exec.c |5 +++--
 fs/proc/base.c|   20 
 fs/proc/internal.h|1 -
 fs/proc/task_mmu.c|   34 --
 fs/proc/task_nommu.c  |   34 --
 include/linux/sched.h |1 +
 kernel/exit.c |2 ++
 kernel/fork.c |   10 +-
 8 files changed, 35 insertions(+), 72 deletions(-)

Index: linux-2.6.22-rc2-mm1/include/linux/sched.h
===
--- linux-2.6.22-rc2-mm1.orig/include/linux/sched.h
+++ linux-2.6.22-rc2-mm1/include/linux/sched.h
@@ -988,10 +988,11 @@ struct task_struct {
int oomkilladj; /* OOM kill score adjustment (bit shift). */
char comm[TASK_COMM_LEN]; /* executable name excluding path
 - access with [gs]et_task_comm (which lock
   it with task_lock())
 - initialized normally by flush_old_exec */
+   struct file *exe_file;
 /* file system info */
int link_count, total_link_count;
 #ifdef CONFIG_SYSVIPC
 /* ipc stuff */
struct sysv_sem sysvsem;
Index: linux-2.6.22-rc2-mm1/fs/exec.c
===
--- linux-2.6.22-rc2-mm1.orig/fs/exec.c
+++ linux-2.6.22-rc2-mm1/fs/exec.c
@@ -1106,12 +1106,13 @@ int search_binary_handler(struct linux_b
read_unlock(binfmt_lock);
retval = fn(bprm, regs);
if (retval = 0) {
put_binfmt(fmt);
allow_write_access(bprm-file);
-   if (bprm-file)
-   fput(bprm-file);
+   if (current-exe_file)
+   fput(current-exe_file);
+   current-exe_file = bprm-file;
bprm-file = NULL;
current-did_exec = 1;
proc_exec_connector(current);
return retval;
}
Index: linux-2.6.22-rc2-mm1/fs/proc/base.c
===
--- linux-2.6.22-rc2-mm1.orig/fs/proc/base.c
+++ linux-2.6.22-rc2-mm1/fs/proc/base.c
@@ -951,10 +951,30 @@ const struct file_operations proc_pid_sc
.write  = sched_write,
.llseek = seq_lseek,
.release= seq_release,
 };
 
+static int proc_exe_link(struct inode *inode, struct dentry **dentry,
+struct vfsmount **mnt)
+{
+   int error;
+   struct task_struct *task;
+
+   task = get_proc_task(inode);
+   if (!task)
+   return -ENOENT;
+   error = -ENOSYS;
+   if (!task-exe_file)
+   goto out;
+   *mnt = mntget(task-exe_file-f_path.mnt);
+   *dentry = dget(task-exe_file-f_path.dentry);
+   error = 0;
+out:
+   put_task_struct(task);
+   return error;
+}
+
 static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
struct inode *inode = dentry-d_inode;
int error = -EACCES;
 
Index: linux-2.6.22-rc2-mm1/kernel/exit.c
===
--- linux-2.6.22-rc2-mm1.orig/kernel/exit.c
+++ linux-2.6.22-rc2-mm1/kernel/exit.c
@@ -924,10 +924,12 @@ fastcall void do_exit(long code)
if (unlikely(tsk-audit_context))
audit_free(tsk);
 
taskstats_exit(tsk, group_dead);
 
+   if (tsk-exe_file)
+   fput(tsk-exe_file);
exit_mm(tsk);
 
if (group_dead)
acct_process();
exit_sem(tsk);
Index: linux-2.6.22-rc2-mm1/kernel/fork.c
===
--- linux-2.6.22-rc2-mm1.orig/kernel/fork.c
+++ linux-2.6.22-rc2-mm1/kernel/fork.c
@@ -1163,10 +1163,13 @@ static struct task_struct *copy_process(
 
/* ok, now we should be set up.. */
p-exit_signal = (clone_flags  CLONE_THREAD) ? -1 : (clone_flags  
CSIGNAL);
p-pdeath_signal = 0;
p-exit_state = 0;
+   p-exe_file = current-exe_file;
+   if (p-exe_file)
+   get_file(p-exe_file);
 
/*
 * Ok, make it visible to the rest of the system.
 * We dont wake it up yet.
 */
@@