> I don't understand how this is supposed to work.  Consider
> 
> CPU1                                  CPU2
> 
> atomic_inc(&dp->pde_users);
> if (dp->proc_fops)
>                                       de->proc_fops = NULL;
>       use_proc_fops <= BOOM
>                                       if (atomic_read(&de->pde_users) > 0) {
> 
> what prevents dereference of a NULL proc_fops value?

I was wrong: what prevents it is that proc_fops isn't used at all in these
paths!  However it is used in proc_get_inode thusly:

                if (de->proc_fops)
                        inode->i_fop = de->proc_fops;

What if proc_fops is set to NULL between these two?  Putting it into a
local variable should take care of this one, but since I'm not sure if
it's really needed I'll leave that to you!

Otherwise, here's a patch that adds memory barriers (maybe these can be
SMP memory barriers) since the atomic ops you are using do not imply
such barriers according to Documentation/atomic_ops.txt.  The memory
barriers are needed, since you need to know that if you saw a non-NULL
proc_fops, then remove_proc_entry saw your increased pde_users count.
If the proc_fops write was re-ordered after the pde_users read, or the
proc_fops read was re-ordered before the pde_users write, then this may
not be true.

Also, I removed the early checks for NULL proc_fops.  True, they avoid
an atomic operation and a memory barrier, however they only avoid them
in the error path, when -EIO is going to be returned, which is hardly a
fast path.  In the common case, they represent a pointless check.

Ciao,

Duncan.

PS: Compiles, but otherwise untested.


Index: Linux/fs/proc/generic.c
===================================================================
--- Linux.orig/fs/proc/generic.c        2006-12-09 17:18:32.000000000 +0100
+++ Linux/fs/proc/generic.c     2007-02-01 10:54:36.000000000 +0100
@@ -19,6 +19,7 @@
 #include <linux/idr.h>
 #include <linux/namei.h>
 #include <linux/bitops.h>
+#include <linux/delay.h>
 #include <linux/spinlock.h>
 #include <asm/uaccess.h>
 
@@ -76,6 +77,19 @@
        if (!(page = (char*) __get_free_page(GFP_KERNEL)))
                return -ENOMEM;
 
+       /*
+        * We are going to call into module's code via ->get_info or
+        * ->read_proc. Bump refcount so that remove_proc_entry() will
+        * wait for read to complete.
+        */
+       atomic_inc(&dp->pde_users);
+       mb();
+       if (!dp->proc_fops)
+               /*
+                * remove_proc_entry() marked PDE as "going away". Obey.
+                */
+               goto out_dec;
+
        while ((nbytes > 0) && !eof) {
                count = min_t(size_t, PROC_BLOCK_SIZE, nbytes);
 
@@ -195,6 +209,8 @@
                buf += n;
                retval += n;
        }
+out_dec:
+       atomic_dec(&dp->pde_users);
        free_page((unsigned long) page);
        return retval;
 }
@@ -205,14 +221,28 @@
 {
        struct inode *inode = file->f_path.dentry->d_inode;
        struct proc_dir_entry * dp;
+       ssize_t rv;
        
        dp = PDE(inode);
 
        if (!dp->write_proc)
                return -EIO;
 
-       /* FIXME: does this routine need ppos?  probably... */
-       return dp->write_proc(file, buffer, count, dp->data);
+       rv = -EIO;
+       /*
+        * We are going to call into module's code via ->write_proc .
+        * Bump refcount so that module won't dissapear while ->write_proc
+        * sleeps in copy_from_user(). remove_proc_entry() will wait for
+        * write to complete.
+        */
+       atomic_inc(&dp->pde_users);
+       mb();
+       if (dp->proc_fops)
+               /* PDE is ready, refcount bumped, call into module. */
+               /* FIXME: does this routine need ppos?  probably... */
+               rv = dp->write_proc(file, buffer, count, dp->data);
+       atomic_dec(&dp->pde_users);
+       return rv;
 }
 
 
@@ -717,12 +747,26 @@
        if (!parent && xlate_proc_name(name, &parent, &fn) != 0)
                goto out;
        len = strlen(fn);
-
+again:
        spin_lock(&proc_subdir_lock);
        for (p = &parent->subdir; *p; p=&(*p)->next ) {
                if (!proc_match(len, fn, *p))
                        continue;
                de = *p;
+
+               /*
+                * Stop accepting new readers/writers. If you're dynamically
+                * allocating ->proc_fops, save a pointer somewhere.
+                */
+               de->proc_fops = NULL;
+               mb();
+               /* Wait until all readers/writers are done. */
+               if (atomic_read(&de->pde_users) > 0) {
+                       spin_unlock(&proc_subdir_lock);
+                       msleep(1);
+                       goto again;
+               }
+
                *p = de->next;
                de->next = NULL;
                if (S_ISDIR(de->mode))
Index: Linux/include/linux/proc_fs.h
===================================================================
--- Linux.orig/include/linux/proc_fs.h  2006-10-03 15:39:31.000000000 +0200
+++ Linux/include/linux/proc_fs.h       2007-02-01 10:42:07.000000000 +0100
@@ -56,6 +56,19 @@
        gid_t gid;
        loff_t size;
        struct inode_operations * proc_iops;
+       /*
+        * NULL ->proc_fops means "PDE is going away RSN" or
+        * "PDE is just created". In either case ->get_info, ->read_proc,
+        * ->write_proc won't be called because it's too late or too early,
+        * respectively.
+        *
+        * Valid ->proc_fops means "use this file_operations" or
+        * "->data is setup, it's safe to call ->read_proc, ->write_proc which
+        * dereference it".
+        *
+        * If you're allocating ->proc_fops dynamically, save a pointer
+        * somewhere.
+        */
        const struct file_operations * proc_fops;
        get_info_t *get_info;
        struct module *owner;
@@ -66,6 +79,8 @@
        atomic_t count;         /* use count */
        int deleted;            /* delete flag */
        void *set;
+       atomic_t pde_users;     /* number of readers + number of writers via
+                                * ->read_proc, ->write_proc, ->get_info */
 };
 
 struct kcore_list {
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to