"Paul E. McKenney" <paul...@linux.vnet.ibm.com> writes:

> On Tue, Dec 01, 2015 at 12:21:31AM +0100, Nicolai Stange wrote:
>> Nothing prevents a dentry found by path lookup before a return of
>> __debugfs_remove() to actually get opened after that return. Now, after
>> the return of __debugfs_remove(), there are no guarantees whatsoever
>> regarding the memory the corresponding inode's file_operations object
>> had been kept in.
>> 
>> Since __debugfs_remove() is seldomly invoked, usually from module exit
>> handlers only, the race is hard to trigger and the impact is very low.
>> 
>> A discussion of the problem outlined above as well as a suggested
>> solution can be found in the (sub-)thread rooted at
>> 
>>   http://lkml.kernel.org/g/20130401203445.ga20...@zeniv.linux.org.uk
>>   ("Yet another pipe related oops.")
>> 
>> Basically, Greg KH suggests to introduce an intermediate fops and
>> Al Viro points out that a pointer to the original ones may be stored in
>> ->d_fsdata.
>> 
>> Follow this line of reasoning:
>> - Add SRCU as a reverse dependency of DEBUG_FS.
>> - Introduce a srcu_struct object for the debugfs subsystem.
>> - In debugfs_create_file(), store a pointer to the original
>>   file_operations object in ->d_fsdata.
>> - In __debugfs_remove(), clear out that dentry->d_fsdata member again.
>>   debugfs_remove() and debugfs_remove_recursive() wait for a SRCU grace
>>   period before returning to their caller.
>> - Introduce an intermediate file_operations object named
>>   "debugfs_proxy_file_operations". It's ->open() functions checks,
>>   under the protection of a SRCU read lock, whether the "original"
>>   file_operations pointed to by ->d_fsdata is still valid and if so,
>>   tries to acquire a reference on the owning module. On success, it sets
>>   the file object's ->f_op to the original file_operations and forwards
>>   the ongoing open() call to the original ->open().
>> - For clarity, rename the former debugfs_file_operations to
>>   debugfs_noop_file_operations -- they are in no way canonical.
>> 
>> The choice of SRCU over "normal" RCU is justified by the fact, that the
>> former may also be used to protect ->i_private data from going away
>> during the execution of a file's readers and writers which may (and do)
>> sleep.
>> 
>> Signed-off-by: Nicolai Stange <nicsta...@gmail.com>
>
> One question below.  Other than that, looks good from an RCU perspective.
>
>                                               Thanx, Paul
>
>> ---
>>  Applicable to the Linus tree.
>>  The second of the two patches depends on the first one
>> 
>>  fs/debugfs/file.c       | 29 ++++++++++++++++++++++++++++-
>>  fs/debugfs/inode.c      | 16 +++++++++++++---
>>  include/linux/debugfs.h |  6 +++++-
>>  lib/Kconfig.debug       |  1 +
>>  4 files changed, 47 insertions(+), 5 deletions(-)
>> 
>> diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
>> index d2ba12e..67df2c9 100644
>> --- a/fs/debugfs/file.c
>> +++ b/fs/debugfs/file.c
>> @@ -35,13 +35,40 @@ static ssize_t default_write_file(struct file *file, 
>> const char __user *buf,
>>      return count;
>>  }
>> 
>> -const struct file_operations debugfs_file_operations = {
>> +const struct file_operations debugfs_noop_file_operations = {
>>      .read =         default_read_file,
>>      .write =        default_write_file,
>>      .open =         simple_open,
>>      .llseek =       noop_llseek,
>>  };
>> 
>> +static int proxy_open(struct inode *inode, struct file *filp)
>> +{
>> +    struct dentry * const dentry = filp->f_path.dentry;
>> +    const struct file_operations __rcu *rcu_real_fops;
>> +    const struct file_operations *real_fops;
>> +    int srcu_idx;
>> +
>> +    srcu_idx = srcu_read_lock(&debugfs_srcu);
>> +    rcu_real_fops = (const struct file_operations __rcu *)dentry->d_fsdata;
>> +    real_fops = srcu_dereference(rcu_real_fops, &debugfs_srcu);
>> +    real_fops = fops_get(real_fops);
>> +    srcu_read_unlock(&debugfs_srcu, srcu_idx);
>> +
>> +    if (!real_fops)
>> +            return -ENOENT;
>> +
>> +    replace_fops(filp, real_fops);
>> +    if (filp->f_op->open)
>> +            return filp->f_op->open(inode, filp);
>> +
>> +    return 0;
>> +}
>> +
>> +const struct file_operations debugfs_proxy_file_operations = {
>> +    .open = proxy_open,
>> +};
>> +
>>  static struct dentry *debugfs_create_mode(const char *name, umode_t mode,
>>                                        struct dentry *parent, void *value,
>>                                        const struct file_operations *fops,
>> diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
>> index b7fcc0d..8ae2e1a 100644
>> --- a/fs/debugfs/inode.c
>> +++ b/fs/debugfs/inode.c
>> @@ -30,6 +30,8 @@
>> 
>>  #define DEBUGFS_DEFAULT_MODE        0700
>> 
>> +DEFINE_SRCU(debugfs_srcu);
>> +
>>  static struct vfsmount *debugfs_mount;
>>  static int debugfs_mount_count;
>>  static bool debugfs_registered;
>> @@ -340,8 +342,12 @@ struct dentry *debugfs_create_file(const char *name, 
>> umode_t mode,
>>              return failed_creating(dentry);
>> 
>>      inode->i_mode = mode;
>> -    inode->i_fop = fops ? fops : &debugfs_file_operations;
>>      inode->i_private = data;
>> +
>> +    inode->i_fop = fops ? &debugfs_proxy_file_operations
>> +            : &debugfs_noop_file_operations;
>> +    rcu_assign_pointer(dentry->d_fsdata, (void *)fops);
>> +
>>      d_instantiate(dentry, inode);
>>      fsnotify_create(d_inode(dentry->d_parent), dentry);
>>      return end_creating(dentry);
>> @@ -523,10 +529,12 @@ static int __debugfs_remove(struct dentry *dentry, 
>> struct dentry *parent)
>> 
>>      if (simple_positive(dentry)) {
>>              dget(dentry);
>> -            if (d_is_dir(dentry))
>> +            if (d_is_dir(dentry)) {
>>                      ret = simple_rmdir(d_inode(parent), dentry);
>> -            else
>> +            } else {
>>                      simple_unlink(d_inode(parent), dentry);
>> +                    rcu_assign_pointer(dentry->d_fsdata, NULL);
>> +            }
>>              if (!ret)
>>                      d_delete(dentry);
>>              dput(dentry);
>> @@ -565,6 +573,7 @@ void debugfs_remove(struct dentry *dentry)
>>      mutex_unlock(&d_inode(parent)->i_mutex);
>>      if (!ret)
>>              simple_release_fs(&debugfs_mount, &debugfs_mount_count);
>> +    synchronize_srcu(&debugfs_srcu);
>
> So the reason that this is OK from a performance perspective is that
> people removing large quantities of debugfs entries are supposed to
> use debugfs_remove_recursive()?
>

First of all, thank you very much for your feedback!

And yes, that's the idea behind not putting a synchronize_srcu() at the
end of __debugfs_remove() but at its callers debugfs_remove() and
debugfs_remove_recursive(): amortizing a single SRCU grace period among
multiple removals in the latter case.

I did not systematically investigate whether debugfs_remove_recursive()
is consistently used in preference of multiple debugfs_remove() calls
across the tree though. My subjective impression is that most debugfs
users I looked at are using the *_recursive() variant already.

> If there are valid use cases with lots of debugfs_remove() calls
> in quick succession,

The only thing I'm concerned about is system shutdown, provided module
exit handlers get called there. However, in my tests I did not encounter
any perceptible delays.

> and if this results in excessive latency due to
> lots of synchronize_srcu() calls, then one approach would be to have a
> debugfs_remove_nowait() or some such that omitted the final
> synchronize_srcu().  The last call could use debugfs_remove() to clean
> things up.  There are of course a number of alternative approaches.

There is the not yet exported __debugfs_remove() already. If the need
arises, we could simply export that one.

> But if there is no performance issue with real workloads, then there
> is of course no need to do anything.
>
>>  }
>>  EXPORT_SYMBOL_GPL(debugfs_remove);
>> 
>> @@ -642,6 +651,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
>>      if (!__debugfs_remove(child, parent))
>>              simple_release_fs(&debugfs_mount, &debugfs_mount_count);
>>      mutex_unlock(&d_inode(parent)->i_mutex);
>> +    synchronize_srcu(&debugfs_srcu);
>>  }
>>  EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
>> 
>> diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
>> index 19c066d..f8c7494 100644
>> --- a/include/linux/debugfs.h
>> +++ b/include/linux/debugfs.h
>> @@ -19,6 +19,7 @@
>>  #include <linux/seq_file.h>
>> 
>>  #include <linux/types.h>
>> +#include <linux/srcu.h>
>> 
>>  struct device;
>>  struct file_operations;
>> @@ -44,7 +45,10 @@ extern struct dentry *arch_debugfs_dir;
>>  #if defined(CONFIG_DEBUG_FS)
>> 
>>  /* declared over in file.c */
>> -extern const struct file_operations debugfs_file_operations;
>> +extern const struct file_operations debugfs_noop_file_operations;
>> +extern const struct file_operations debugfs_proxy_file_operations;
>> +
>> +extern struct srcu_struct debugfs_srcu;
>> 
>>  struct dentry *debugfs_create_file(const char *name, umode_t mode,
>>                                 struct dentry *parent, void *data,
>> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
>> index 8c15b29..3929878 100644
>> --- a/lib/Kconfig.debug
>> +++ b/lib/Kconfig.debug
>> @@ -257,6 +257,7 @@ config PAGE_OWNER
>> 
>>  config DEBUG_FS
>>      bool "Debug Filesystem"
>> +    select SRCU
>>      help
>>        debugfs is a virtual file system that kernel developers use to put
>>        debugging files into.  Enable this option to be able to read and
>> -- 
>> 2.6.3
>> 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to