Re: [RFC][PATCH] Replacing the /proc//exe symlink code
Quoting Matt Helsley ([EMAIL PROTECTED]): > This patch avoids holding the mmap semaphore while walking VMAs in response to > programs which read or follow the /proc//exe symlink. This also > allows > us to merge mmu and nommu proc_exe_link() functions. The costs are holding the > task lock, a separate reference to the executable file stored in the task > struct, and increased code in fork, exec, and exit paths. > > Changes: > Clear exe_file field in exit path > Use task_lock() to protect exe_file between write and read paths This one does seem safe. Though do you need to handle the #ifdef __alpha__ in fs/exec.c:search_binary_handler? thanks, -serge > Signed-off-by: Matt Helsley <[EMAIL PROTECTED]> > --- > > fs/exec.c |7 +-- > fs/proc/base.c| 21 + > fs/proc/internal.h|1 - > fs/proc/task_mmu.c| 34 -- > fs/proc/task_nommu.c | 34 -- > include/linux/sched.h |3 ++- > kernel/exit.c |6 ++ > kernel/fork.c |9 - > 8 files changed, 42 insertions(+), 73 deletions(-) > > Index: linux-2.6.22-rc2-mm1/include/linux/sched.h > === > --- linux-2.6.22-rc2-mm1.orig/include/linux/sched.h > +++ linux-2.6.22-rc2-mm1/include/linux/sched.h > @@ -988,10 +988,11 @@ struct task_struct { > int oomkilladj; /* OOM kill score adjustment (bit shift). */ > char comm[TASK_COMM_LEN]; /* executable name excluding path >- access with [gs]et_task_comm (which lock > it with task_lock()) >- initialized normally by flush_old_exec */ > + struct file *exe_file; > /* file system info */ > int link_count, total_link_count; > #ifdef CONFIG_SYSVIPC > /* ipc stuff */ > struct sysv_sem sysvsem; > @@ -1549,11 +1550,11 @@ static inline int thread_group_empty(str > > #define delay_group_leader(p) \ > (thread_group_leader(p) && !thread_group_empty(p)) > > /* > - * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring > + * Protects ->fs, ->files, ->mm, ->group_info, ->comm, ->exe_file, keyring > * subscriptions and synchronises with wait4(). Also used in procfs. Also > * pins the final release of task.io_context. Also protects ->cpuset. > * > * Nests both inside and outside of read_lock(&tasklist_lock). > * It must not be nested with write_lock_irq(&tasklist_lock), > Index: linux-2.6.22-rc2-mm1/fs/exec.c > === > --- linux-2.6.22-rc2-mm1.orig/fs/exec.c > +++ linux-2.6.22-rc2-mm1/fs/exec.c > @@ -1106,12 +1106,15 @@ int search_binary_handler(struct linux_b > read_unlock(&binfmt_lock); > retval = fn(bprm, regs); > if (retval >= 0) { > put_binfmt(fmt); > allow_write_access(bprm->file); > - if (bprm->file) > - fput(bprm->file); > + task_lock(current); > + if (current->exe_file) > + fput(current->exe_file); > + current->exe_file = bprm->file; > + task_unlock(current); > bprm->file = NULL; > current->did_exec = 1; > proc_exec_connector(current); > return retval; > } > Index: linux-2.6.22-rc2-mm1/fs/proc/base.c > === > --- linux-2.6.22-rc2-mm1.orig/fs/proc/base.c > +++ linux-2.6.22-rc2-mm1/fs/proc/base.c > @@ -951,10 +951,31 @@ const struct file_operations proc_pid_sc > .write = sched_write, > .llseek = seq_lseek, > .release= seq_release, > }; > > +static int proc_exe_link(struct inode *inode, struct dentry **dentry, > + struct vfsmount **mnt) > +{ > + int error = -ENOENT; > + struct task_struct *task; > + > + task = get_proc_task(inode); > + if (!task) > + return error; > + task_lock(task); > + if (!task->exe_file) > + goto out; > + *mnt = mntget(task->exe_file->f_path.mnt); > + *dentry = dget(task->exe_file->f_path.dentry); > + error = 0; > +out: > + task_unlock(task); > + put_task_struct(task); > + return error; > +} > + > static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata > *nd) > { > struct inode *inode = dentry->d_inode; > int error = -EACCES; > > Index: linux-2.6.22-rc2-mm1/kernel/exit.c > === > --- linux-2.6.
[RFC][PATCH] Replacing the /proc//exe symlink code
This patch avoids holding the mmap semaphore while walking VMAs in response to programs which read or follow the /proc//exe symlink. This also allows us to merge mmu and nommu proc_exe_link() functions. The costs are holding the task lock, a separate reference to the executable file stored in the task struct, and increased code in fork, exec, and exit paths. Changes: Clear exe_file field in exit path Use task_lock() to protect exe_file between write and read paths Signed-off-by: Matt Helsley <[EMAIL PROTECTED]> --- fs/exec.c |7 +-- fs/proc/base.c| 21 + fs/proc/internal.h|1 - fs/proc/task_mmu.c| 34 -- fs/proc/task_nommu.c | 34 -- include/linux/sched.h |3 ++- kernel/exit.c |6 ++ kernel/fork.c |9 - 8 files changed, 42 insertions(+), 73 deletions(-) Index: linux-2.6.22-rc2-mm1/include/linux/sched.h === --- linux-2.6.22-rc2-mm1.orig/include/linux/sched.h +++ linux-2.6.22-rc2-mm1/include/linux/sched.h @@ -988,10 +988,11 @@ struct task_struct { int oomkilladj; /* OOM kill score adjustment (bit shift). */ char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock it with task_lock()) - initialized normally by flush_old_exec */ + struct file *exe_file; /* file system info */ int link_count, total_link_count; #ifdef CONFIG_SYSVIPC /* ipc stuff */ struct sysv_sem sysvsem; @@ -1549,11 +1550,11 @@ static inline int thread_group_empty(str #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) /* - * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring + * Protects ->fs, ->files, ->mm, ->group_info, ->comm, ->exe_file, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also * pins the final release of task.io_context. Also protects ->cpuset. * * Nests both inside and outside of read_lock(&tasklist_lock). * It must not be nested with write_lock_irq(&tasklist_lock), Index: linux-2.6.22-rc2-mm1/fs/exec.c === --- linux-2.6.22-rc2-mm1.orig/fs/exec.c +++ linux-2.6.22-rc2-mm1/fs/exec.c @@ -1106,12 +1106,15 @@ int search_binary_handler(struct linux_b read_unlock(&binfmt_lock); retval = fn(bprm, regs); if (retval >= 0) { put_binfmt(fmt); allow_write_access(bprm->file); - if (bprm->file) - fput(bprm->file); + task_lock(current); + if (current->exe_file) + fput(current->exe_file); + current->exe_file = bprm->file; + task_unlock(current); bprm->file = NULL; current->did_exec = 1; proc_exec_connector(current); return retval; } Index: linux-2.6.22-rc2-mm1/fs/proc/base.c === --- linux-2.6.22-rc2-mm1.orig/fs/proc/base.c +++ linux-2.6.22-rc2-mm1/fs/proc/base.c @@ -951,10 +951,31 @@ const struct file_operations proc_pid_sc .write = sched_write, .llseek = seq_lseek, .release= seq_release, }; +static int proc_exe_link(struct inode *inode, struct dentry **dentry, +struct vfsmount **mnt) +{ + int error = -ENOENT; + struct task_struct *task; + + task = get_proc_task(inode); + if (!task) + return error; + task_lock(task); + if (!task->exe_file) + goto out; + *mnt = mntget(task->exe_file->f_path.mnt); + *dentry = dget(task->exe_file->f_path.dentry); + error = 0; +out: + task_unlock(task); + put_task_struct(task); + return error; +} + static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; int error = -EACCES; Index: linux-2.6.22-rc2-mm1/kernel/exit.c === --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c +++ linux-2.6.22-rc2-mm1/kernel/exit.c @@ -924,10 +924,16 @@ fastcall void do_exit(long code) if (unlikely(tsk->audit_context)) audit_free(tsk); taskstats_exit(tsk, group_dead); + task_lock(tsk); + if (tsk->exe_file) { +
Re: [RFC][PATCH] Replacing the /proc//exe symlink code
On Fri, 2007-06-01 at 17:31 -0500, Serge E. Hallyn wrote: > Quoting Matt Helsley ([EMAIL PROTECTED]): > > On Wed, 2007-05-30 at 13:09 -0500, Serge E. Hallyn wrote: > > > Quoting Matt Helsley ([EMAIL PROTECTED]): > > > > This patch avoids holding the mmap semaphore while walking VMAs in > > > > response to > > > > programs which read or follow the /proc//exe symlink. This > > > > also allows us > > > > to merge mmu and nommu proc_exe_link() functions. The costs are holding > > > > a separate > > > > reference to the executable file stored in the task struct and > > > > increased code in > > > > fork, exec, and exit paths. > > > > > > > > Signed-off-by: Matt Helsley <[EMAIL PROTECTED]> > > > > --- > > > > > > > > Compiled and passed simple tests for regressions when patched against a > > > > 2.6.20 > > > > and 2.6.22-rc2-mm1 kernel. > > > > > > > > fs/exec.c |5 +++-- > > > > fs/proc/base.c| 20 > > > > fs/proc/internal.h|1 - > > > > fs/proc/task_mmu.c| 34 -- > > > > fs/proc/task_nommu.c | 34 -- > > > > include/linux/sched.h |1 + > > > > kernel/exit.c |2 ++ > > > > kernel/fork.c | 10 +- > > > > 8 files changed, 35 insertions(+), 72 deletions(-) > > > > > > > > > > Index: linux-2.6.22-rc2-mm1/kernel/exit.c > > > > === > > > > --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c > > > > +++ linux-2.6.22-rc2-mm1/kernel/exit.c > > > > @@ -924,10 +924,12 @@ fastcall void do_exit(long code) > > > > if (unlikely(tsk->audit_context)) > > > > audit_free(tsk); > > > > > > > > taskstats_exit(tsk, group_dead); > > > > > > > > + if (tsk->exe_file) > > > > + fput(tsk->exe_file); > > > > > > Hi, > > > > > > just taking a cursory look so I may be missing something, but doesn't > > > this leave the possibility that right here, with tsk->exe_file being > > > put, another task would try to look at tsk's /proc/tsk->pid/exe? > > > > > > thanks, > > > -serge > > > > > > exit_mm(tsk); > > > > > > > > > > > Good question. To be precise, I think the problem doesn't exist here but > > after the exit_mm() because there's a VMA that holds a reference to the > > same file. > > > > The existing code appears to solve the race between > > reading/following /proc/tsk->pid/exe and exit_mm() in the exit path by > > returning -ENOENT for the case where there is no executable VMA with a > > reference to the file backing it. > > > > So I need to put NULL in the exe_file field and adjust the return value > > to be -ENOENT instead of -ENOSYS. > > > > Thanks for the review! > > Ok, I had to think about this a bit, but so you're saying you set it to > NULL in do_exit(), and anyone who has just dereferenced tsk->exe_file > before the fput in do_exit() should be ok because the vma hasn't yet > been put? Yes > Should the > if (!task->exe_file) > goto out; > *mnt = mntget(task->exe_file->f_path.mnt); > *dentry = dget(task->exe_file->f_path.dentry); > > also go inside an preempt_disable to prevent sleeping and maybe become It needs some form of protection from concurrent access between write and read. write happens during exec, fork, and exit. In the fork case however it's not necessary because the new task isn't visible in /proc yet and the value of current doesn't change anyway. > exef = task->exe_file; /* to prevent task->exe_file being set > to NULL before we've grabbed the path */ > if (!exef) > goto out; > get_file(exef); /* to prevent the mm somehow being put before > we've grabbed the path? */ > *mnt = mntget(task->exe_file->f_path.mnt); > *dentry = dget(task->exe_file->f_path.dentry); > put_file(exef); /* ? */ > > ? > > Or am I being overly paranoid? No, you're right. In fact, because readlink() can be initiated by another task I think disabling preemption really only fixes the problem on uniprocessor systems. So I was thinking of using task_lock() to protect exe_file. Cheers, -Matt Helsley - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH] Replacing the /proc//exe symlink code
Quoting Matt Helsley ([EMAIL PROTECTED]): > On Wed, 2007-05-30 at 13:09 -0500, Serge E. Hallyn wrote: > > Quoting Matt Helsley ([EMAIL PROTECTED]): > > > This patch avoids holding the mmap semaphore while walking VMAs in > > > response to > > > programs which read or follow the /proc//exe symlink. This also > > > allows us > > > to merge mmu and nommu proc_exe_link() functions. The costs are holding a > > > separate > > > reference to the executable file stored in the task struct and increased > > > code in > > > fork, exec, and exit paths. > > > > > > Signed-off-by: Matt Helsley <[EMAIL PROTECTED]> > > > --- > > > > > > Compiled and passed simple tests for regressions when patched against a > > > 2.6.20 > > > and 2.6.22-rc2-mm1 kernel. > > > > > > fs/exec.c |5 +++-- > > > fs/proc/base.c| 20 > > > fs/proc/internal.h|1 - > > > fs/proc/task_mmu.c| 34 -- > > > fs/proc/task_nommu.c | 34 -- > > > include/linux/sched.h |1 + > > > kernel/exit.c |2 ++ > > > kernel/fork.c | 10 +- > > > 8 files changed, 35 insertions(+), 72 deletions(-) > > > > > > Index: linux-2.6.22-rc2-mm1/kernel/exit.c > > > === > > > --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c > > > +++ linux-2.6.22-rc2-mm1/kernel/exit.c > > > @@ -924,10 +924,12 @@ fastcall void do_exit(long code) > > > if (unlikely(tsk->audit_context)) > > > audit_free(tsk); > > > > > > taskstats_exit(tsk, group_dead); > > > > > > + if (tsk->exe_file) > > > + fput(tsk->exe_file); > > > > Hi, > > > > just taking a cursory look so I may be missing something, but doesn't > > this leave the possibility that right here, with tsk->exe_file being > > put, another task would try to look at tsk's /proc/tsk->pid/exe? > > > > thanks, > > -serge > > > > exit_mm(tsk); > > > > > > Good question. To be precise, I think the problem doesn't exist here but > after the exit_mm() because there's a VMA that holds a reference to the > same file. > > The existing code appears to solve the race between > reading/following /proc/tsk->pid/exe and exit_mm() in the exit path by > returning -ENOENT for the case where there is no executable VMA with a > reference to the file backing it. > > So I need to put NULL in the exe_file field and adjust the return value > to be -ENOENT instead of -ENOSYS. > > Thanks for the review! Ok, I had to think about this a bit, but so you're saying you set it to NULL in do_exit(), and anyone who has just dereferenced tsk->exe_file before the fput in do_exit() should be ok because the vma hasn't yet been put? Should the if (!task->exe_file) goto out; *mnt = mntget(task->exe_file->f_path.mnt); *dentry = dget(task->exe_file->f_path.dentry); also go inside an preempt_disable to prevent sleeping and maybe become exef = task->exe_file; /* to prevent task->exe_file being set to NULL before we've grabbed the path */ if (!exef) goto out; get_file(exef); /* to prevent the mm somehow being put before we've grabbed the path? */ *mnt = mntget(task->exe_file->f_path.mnt); *dentry = dget(task->exe_file->f_path.dentry); put_file(exef); /* ? */ ? Or am I being overly paranoid? thanks, -serge - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH] Replacing the /proc//exe symlink code
On Wed, 2007-05-30 at 23:01 -0700, Chris Wright wrote: > * Serge E. Hallyn ([EMAIL PROTECTED]) wrote: > > > === > > > --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c > > > +++ linux-2.6.22-rc2-mm1/kernel/exit.c > > > @@ -924,10 +924,12 @@ fastcall void do_exit(long code) > > > if (unlikely(tsk->audit_context)) > > > audit_free(tsk); > > > > > > taskstats_exit(tsk, group_dead); > > > > > > + if (tsk->exe_file) > > > + fput(tsk->exe_file); > > > > just taking a cursory look so I may be missing something, but doesn't > > this leave the possibility that right here, with tsk->exe_file being > > put, another task would try to look at tsk's /proc/tsk->pid/exe? > > And I hit this one, so there's at least one issue. > > [ 110.296952] Unable to handle kernel NULL pointer dereference at > 0088 RIP: > [ 110.299053] [] d_path+0x1a/0x117 > [ 110.301861] PGD 6d35a067 PUD 6d35e067 PMD 0 > [ 110.303509] Oops: [1] SMP > [ 110.304719] CPU 1 > [ 110.305493] Modules linked in: oprofile > [ 110.306969] Pid: 3983, comm: pidof Not tainted 2.6.22-rc3-g7f397dcd-dirty > #183 > [ 110.309733] RIP: 0010:[] [] > d_path+0x1a/0x117 > [ 110.312635] RSP: 0018:810142335e38 EFLAGS: 00010292 > [ 110.314667] RAX: 81006d58a000 RBX: RCX: > 1000 > [ 110.317397] RDX: 81006d58a000 RSI: RDI: > > [ 110.320127] RBP: 81006d58a000 R08: fff3 R09: > 0006be8b > [ 110.322857] R10: R11: 0001 R12: > > [ 110.325588] R13: 1000 R14: 81006d58a000 R15: > 0 > [ 110.328319] FS: 2b033d578260() GS:81000106e480() > knlGS: > [ 110.331415] CS: 0010 DS: ES: CR0: 8005003b > [ 110.333613] CR2: 0088 CR3: 6cf54000 CR4: > 06e0 > [ 110.336344] Process pidof (pid: 3983, threadinfo 810142334000, task > 8101421186c0) > [ 110.339472] Stack: 8101422a7268 81006d58a000 > fff4 > [ 110.342556] 1000 00678820 > 802b7a54 > [ 110.345404] > > [ 110.348180] Call Trace: > [ 110.349188] [] proc_pid_readlink+0x89/0xff > [ 110.351387] [] sys_readlinkat+0x87/0xa9 > [ 110.353487] [] remove_vma+0x5d/0x64 > [ 110.355455] [] error_exit+0x0/0x84 > [ 110.357389] [] system_call+0x7e/0x83 > [ 110.359388] > [ 110.359958] > [ 110.359958] Code: 48 8b 87 88 00 00 00 48 85 c0 74 20 48 8b 40 30 48 85 c0 > 74 > [ 110.363381] RIP [] d_path+0x1a/0x117 > [ 110.365386] RSP > [ 110.366720] CR2: 0088 Hmm, at first blush this does appear to be related to the bug Serge pointed out. I'm not certain though, so I'll see if I can spot/produce any other problems that could explain this. Thanks for testing my patch! Cheers, -Matt Helsley - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH] Replacing the /proc//exe symlink code
On Wed, 2007-05-30 at 13:09 -0500, Serge E. Hallyn wrote: > Quoting Matt Helsley ([EMAIL PROTECTED]): > > This patch avoids holding the mmap semaphore while walking VMAs in response > > to > > programs which read or follow the /proc//exe symlink. This also > > allows us > > to merge mmu and nommu proc_exe_link() functions. The costs are holding a > > separate > > reference to the executable file stored in the task struct and increased > > code in > > fork, exec, and exit paths. > > > > Signed-off-by: Matt Helsley <[EMAIL PROTECTED]> > > --- > > > > Compiled and passed simple tests for regressions when patched against a > > 2.6.20 > > and 2.6.22-rc2-mm1 kernel. > > > > fs/exec.c |5 +++-- > > fs/proc/base.c| 20 > > fs/proc/internal.h|1 - > > fs/proc/task_mmu.c| 34 -- > > fs/proc/task_nommu.c | 34 -- > > include/linux/sched.h |1 + > > kernel/exit.c |2 ++ > > kernel/fork.c | 10 +- > > 8 files changed, 35 insertions(+), 72 deletions(-) > > Index: linux-2.6.22-rc2-mm1/kernel/exit.c > > === > > --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c > > +++ linux-2.6.22-rc2-mm1/kernel/exit.c > > @@ -924,10 +924,12 @@ fastcall void do_exit(long code) > > if (unlikely(tsk->audit_context)) > > audit_free(tsk); > > > > taskstats_exit(tsk, group_dead); > > > > + if (tsk->exe_file) > > + fput(tsk->exe_file); > > Hi, > > just taking a cursory look so I may be missing something, but doesn't > this leave the possibility that right here, with tsk->exe_file being > put, another task would try to look at tsk's /proc/tsk->pid/exe? > > thanks, > -serge > > exit_mm(tsk); > Good question. To be precise, I think the problem doesn't exist here but after the exit_mm() because there's a VMA that holds a reference to the same file. The existing code appears to solve the race between reading/following /proc/tsk->pid/exe and exit_mm() in the exit path by returning -ENOENT for the case where there is no executable VMA with a reference to the file backing it. So I need to put NULL in the exe_file field and adjust the return value to be -ENOENT instead of -ENOSYS. Thanks for the review! Cheers, -Matt Helsley - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH] Replacing the /proc//exe symlink code
* Serge E. Hallyn ([EMAIL PROTECTED]) wrote: > > === > > --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c > > +++ linux-2.6.22-rc2-mm1/kernel/exit.c > > @@ -924,10 +924,12 @@ fastcall void do_exit(long code) > > if (unlikely(tsk->audit_context)) > > audit_free(tsk); > > > > taskstats_exit(tsk, group_dead); > > > > + if (tsk->exe_file) > > + fput(tsk->exe_file); > > just taking a cursory look so I may be missing something, but doesn't > this leave the possibility that right here, with tsk->exe_file being > put, another task would try to look at tsk's /proc/tsk->pid/exe? And I hit this one, so there's at least one issue. [ 110.296952] Unable to handle kernel NULL pointer dereference at 0088 RIP: [ 110.299053] [] d_path+0x1a/0x117 [ 110.301861] PGD 6d35a067 PUD 6d35e067 PMD 0 [ 110.303509] Oops: [1] SMP [ 110.304719] CPU 1 [ 110.305493] Modules linked in: oprofile [ 110.306969] Pid: 3983, comm: pidof Not tainted 2.6.22-rc3-g7f397dcd-dirty #183 [ 110.309733] RIP: 0010:[] [] d_path+0x1a/0x117 [ 110.312635] RSP: 0018:810142335e38 EFLAGS: 00010292 [ 110.314667] RAX: 81006d58a000 RBX: RCX: 1000 [ 110.317397] RDX: 81006d58a000 RSI: RDI: [ 110.320127] RBP: 81006d58a000 R08: fff3 R09: 0006be8b [ 110.322857] R10: R11: 0001 R12: [ 110.325588] R13: 1000 R14: 81006d58a000 R15: 0 [ 110.328319] FS: 2b033d578260() GS:81000106e480() knlGS: [ 110.331415] CS: 0010 DS: ES: CR0: 8005003b [ 110.333613] CR2: 0088 CR3: 6cf54000 CR4: 06e0 [ 110.336344] Process pidof (pid: 3983, threadinfo 810142334000, task 8101421186c0) [ 110.339472] Stack: 8101422a7268 81006d58a000 fff4 [ 110.342556] 1000 00678820 802b7a54 [ 110.345404] [ 110.348180] Call Trace: [ 110.349188] [] proc_pid_readlink+0x89/0xff [ 110.351387] [] sys_readlinkat+0x87/0xa9 [ 110.353487] [] remove_vma+0x5d/0x64 [ 110.355455] [] error_exit+0x0/0x84 [ 110.357389] [] system_call+0x7e/0x83 [ 110.359388] [ 110.359958] [ 110.359958] Code: 48 8b 87 88 00 00 00 48 85 c0 74 20 48 8b 40 30 48 85 c0 74 [ 110.363381] RIP [] d_path+0x1a/0x117 [ 110.365386] RSP [ 110.366720] CR2: 0088 - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH] Replacing the /proc//exe symlink code
Quoting Matt Helsley ([EMAIL PROTECTED]): > This patch avoids holding the mmap semaphore while walking VMAs in response to > programs which read or follow the /proc//exe symlink. This also > allows us > to merge mmu and nommu proc_exe_link() functions. The costs are holding a > separate > reference to the executable file stored in the task struct and increased code > in > fork, exec, and exit paths. > > Signed-off-by: Matt Helsley <[EMAIL PROTECTED]> > --- > > Compiled and passed simple tests for regressions when patched against a 2.6.20 > and 2.6.22-rc2-mm1 kernel. > > fs/exec.c |5 +++-- > fs/proc/base.c| 20 > fs/proc/internal.h|1 - > fs/proc/task_mmu.c| 34 -- > fs/proc/task_nommu.c | 34 -- > include/linux/sched.h |1 + > kernel/exit.c |2 ++ > kernel/fork.c | 10 +- > 8 files changed, 35 insertions(+), 72 deletions(-) > > Index: linux-2.6.22-rc2-mm1/include/linux/sched.h > === > --- linux-2.6.22-rc2-mm1.orig/include/linux/sched.h > +++ linux-2.6.22-rc2-mm1/include/linux/sched.h > @@ -988,10 +988,11 @@ struct task_struct { > int oomkilladj; /* OOM kill score adjustment (bit shift). */ > char comm[TASK_COMM_LEN]; /* executable name excluding path >- access with [gs]et_task_comm (which lock > it with task_lock()) >- initialized normally by flush_old_exec */ > + struct file *exe_file; > /* file system info */ > int link_count, total_link_count; > #ifdef CONFIG_SYSVIPC > /* ipc stuff */ > struct sysv_sem sysvsem; > Index: linux-2.6.22-rc2-mm1/fs/exec.c > === > --- linux-2.6.22-rc2-mm1.orig/fs/exec.c > +++ linux-2.6.22-rc2-mm1/fs/exec.c > @@ -1106,12 +1106,13 @@ int search_binary_handler(struct linux_b > read_unlock(&binfmt_lock); > retval = fn(bprm, regs); > if (retval >= 0) { > put_binfmt(fmt); > allow_write_access(bprm->file); > - if (bprm->file) > - fput(bprm->file); > + if (current->exe_file) > + fput(current->exe_file); > + current->exe_file = bprm->file; > bprm->file = NULL; > current->did_exec = 1; > proc_exec_connector(current); > return retval; > } > Index: linux-2.6.22-rc2-mm1/fs/proc/base.c > === > --- linux-2.6.22-rc2-mm1.orig/fs/proc/base.c > +++ linux-2.6.22-rc2-mm1/fs/proc/base.c > @@ -951,10 +951,30 @@ const struct file_operations proc_pid_sc > .write = sched_write, > .llseek = seq_lseek, > .release= seq_release, > }; > > +static int proc_exe_link(struct inode *inode, struct dentry **dentry, > + struct vfsmount **mnt) > +{ > + int error; > + struct task_struct *task; > + > + task = get_proc_task(inode); > + if (!task) > + return -ENOENT; > + error = -ENOSYS; > + if (!task->exe_file) > + goto out; > + *mnt = mntget(task->exe_file->f_path.mnt); > + *dentry = dget(task->exe_file->f_path.dentry); > + error = 0; > +out: > + put_task_struct(task); > + return error; > +} > + > static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata > *nd) > { > struct inode *inode = dentry->d_inode; > int error = -EACCES; > > Index: linux-2.6.22-rc2-mm1/kernel/exit.c > === > --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c > +++ linux-2.6.22-rc2-mm1/kernel/exit.c > @@ -924,10 +924,12 @@ fastcall void do_exit(long code) > if (unlikely(tsk->audit_context)) > audit_free(tsk); > > taskstats_exit(tsk, group_dead); > > + if (tsk->exe_file) > + fput(tsk->exe_file); Hi, just taking a cursory look so I may be missing something, but doesn't this leave the possibility that right here, with tsk->exe_file being put, another task would try to look at tsk's /proc/tsk->pid/exe? thanks, -serge > exit_mm(tsk); > > if (group_dead) > acct_process(); > exit_sem(tsk); > Index: linux-2.6.22-rc2-mm1/kernel/fork.c > === > --- linux-2.6.22-rc2-mm1.orig/kernel/fork.c > +++ linux-2.6.22-rc2-mm1/kernel/fork.c > @@ -1163,10 +1163,13 @@ static struct task_struct *
[RFC][PATCH] Replacing the /proc//exe symlink code
This patch avoids holding the mmap semaphore while walking VMAs in response to programs which read or follow the /proc//exe symlink. This also allows us to merge mmu and nommu proc_exe_link() functions. The costs are holding a separate reference to the executable file stored in the task struct and increased code in fork, exec, and exit paths. Signed-off-by: Matt Helsley <[EMAIL PROTECTED]> --- Compiled and passed simple tests for regressions when patched against a 2.6.20 and 2.6.22-rc2-mm1 kernel. fs/exec.c |5 +++-- fs/proc/base.c| 20 fs/proc/internal.h|1 - fs/proc/task_mmu.c| 34 -- fs/proc/task_nommu.c | 34 -- include/linux/sched.h |1 + kernel/exit.c |2 ++ kernel/fork.c | 10 +- 8 files changed, 35 insertions(+), 72 deletions(-) Index: linux-2.6.22-rc2-mm1/include/linux/sched.h === --- linux-2.6.22-rc2-mm1.orig/include/linux/sched.h +++ linux-2.6.22-rc2-mm1/include/linux/sched.h @@ -988,10 +988,11 @@ struct task_struct { int oomkilladj; /* OOM kill score adjustment (bit shift). */ char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock it with task_lock()) - initialized normally by flush_old_exec */ + struct file *exe_file; /* file system info */ int link_count, total_link_count; #ifdef CONFIG_SYSVIPC /* ipc stuff */ struct sysv_sem sysvsem; Index: linux-2.6.22-rc2-mm1/fs/exec.c === --- linux-2.6.22-rc2-mm1.orig/fs/exec.c +++ linux-2.6.22-rc2-mm1/fs/exec.c @@ -1106,12 +1106,13 @@ int search_binary_handler(struct linux_b read_unlock(&binfmt_lock); retval = fn(bprm, regs); if (retval >= 0) { put_binfmt(fmt); allow_write_access(bprm->file); - if (bprm->file) - fput(bprm->file); + if (current->exe_file) + fput(current->exe_file); + current->exe_file = bprm->file; bprm->file = NULL; current->did_exec = 1; proc_exec_connector(current); return retval; } Index: linux-2.6.22-rc2-mm1/fs/proc/base.c === --- linux-2.6.22-rc2-mm1.orig/fs/proc/base.c +++ linux-2.6.22-rc2-mm1/fs/proc/base.c @@ -951,10 +951,30 @@ const struct file_operations proc_pid_sc .write = sched_write, .llseek = seq_lseek, .release= seq_release, }; +static int proc_exe_link(struct inode *inode, struct dentry **dentry, +struct vfsmount **mnt) +{ + int error; + struct task_struct *task; + + task = get_proc_task(inode); + if (!task) + return -ENOENT; + error = -ENOSYS; + if (!task->exe_file) + goto out; + *mnt = mntget(task->exe_file->f_path.mnt); + *dentry = dget(task->exe_file->f_path.dentry); + error = 0; +out: + put_task_struct(task); + return error; +} + static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; int error = -EACCES; Index: linux-2.6.22-rc2-mm1/kernel/exit.c === --- linux-2.6.22-rc2-mm1.orig/kernel/exit.c +++ linux-2.6.22-rc2-mm1/kernel/exit.c @@ -924,10 +924,12 @@ fastcall void do_exit(long code) if (unlikely(tsk->audit_context)) audit_free(tsk); taskstats_exit(tsk, group_dead); + if (tsk->exe_file) + fput(tsk->exe_file); exit_mm(tsk); if (group_dead) acct_process(); exit_sem(tsk); Index: linux-2.6.22-rc2-mm1/kernel/fork.c === --- linux-2.6.22-rc2-mm1.orig/kernel/fork.c +++ linux-2.6.22-rc2-mm1/kernel/fork.c @@ -1163,10 +1163,13 @@ static struct task_struct *copy_process( /* ok, now we should be set up.. */ p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); p->pdeath_signal = 0; p->exit_state = 0; + p->exe_file = current->exe_file; + if (p->exe_file) + get_file(p->exe_file); /* * Ok, make it visible to the rest of the system. * We dont wake it u