On Fri, 2017-10-06 at 16:49 +0100, David Howells wrote:
> Provide an fsopen() system call that starts the process of preparing to
> mount, using an fd as a context handle.  fsopen() is given the name of the
> filesystem that will be used:
> 
>       int mfd = fsopen(const char *fsname, int open_flags,

Can we make open_flags unsigned?

>                        void *reserved3, void *reserved4,
>                        void *reserved5);
> 
> where open_flags can be 0 or O_CLOEXEC and reserved* should all be NULL for
> the moment.
> 
> For example:
> 
>       mfd = fsopen("ext4", O_CLOEXEC, NULL, NULL, NULL);

While I understand the appeal of reusing O_CLOEXEC, I think we'd be
better off with a completely new set of flags here. It's not a "real"
open.

You can define FSO_CLOEXEC and then you have another 31 bits to play
with later should you need to do so.

>       write(mfd, "s /dev/sdb1"); // note I'm ignoring write's length arg
>       write(mfd, "o noatime");
>       write(mfd, "o acl");
>       write(mfd, "o user_attr");
>       write(mfd, "o iversion");
>       write(mfd, "o ");
>       write(mfd, "r /my/container"); // root inside the fs
>       write(mfd, "x create"); // create the superblock
>       fsmount(mfd, container_fd, "/mnt", AT_NO_FOLLOW);
> 
>       mfd = fsopen("afs", -1);
>       write(mfd, "s %grand.central.org:root.cell");
>       write(mfd, "o cell=grand.central.org");
>       write(mfd, "r /");
>       write(mfd, "x create");
>       fsmount(mfd, AT_FDCWD, "/mnt", 0);
> 

We chatted a bit about this on IRC, but I'll reply here too for public
consumption:

I think you may need some other stuff to fully emulate what we call bind
mounting today:

1) a way to attach a new fs_context to an existing superblock. Maybe a
mntopen() syscall? Or maybe we can use a new FSO_* flag in conjunction
with a string in one of the reserved fields?

2) a way to walk down to a particular dentry inside the superblock and
mount it instead of the actual root. For the interface you could just
define a new "d /path/inside/superblock" command. Then, do a pathwalk
from the existing root dentry and replace the fscontext root dentry with
it.

> If an error is reported at any step, an error message may be available to be
> read() back (ENODATA will be reported if there isn't an error available) in
> the form:
> 
>       "e <subsys>:<problem>"
>       "e SELinux:Mount on mountpoint not permitted"
> 
> Once fsmount() has been called, further write() calls will incur EBUSY,
> even if the fsmount() fails.  read() is still possible to retrieve error
> information.
> 
> The fsopen() syscall creates a mount context and hangs it off the fd that it
> returns.
> 
> Netlink is not used because it is optional.
> 
> Signed-off-by: David Howells <[email protected]>
> ---
> 
>  arch/x86/entry/syscalls/syscall_32.tbl |    1 
>  arch/x86/entry/syscalls/syscall_64.tbl |    1 
>  fs/Makefile                            |    2 
>  fs/fsopen.c                            |  273 
> ++++++++++++++++++++++++++++++++
>  include/linux/fs_context.h             |    1 
>  include/linux/syscalls.h               |    2 
>  include/uapi/linux/magic.h             |    1 
>  kernel/sys_ni.c                        |    3 
>  8 files changed, 283 insertions(+), 1 deletion(-)
>  create mode 100644 fs/fsopen.c
> 
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl 
> b/arch/x86/entry/syscalls/syscall_32.tbl
> index 448ac2161112..9bf8d4c62f85 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -391,3 +391,4 @@
>  382  i386    pkey_free               sys_pkey_free
>  383  i386    statx                   sys_statx
>  384  i386    arch_prctl              sys_arch_prctl                  compat_sys_arch_prctl
> +385  i386    fsopen                  sys_fsopen
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
> b/arch/x86/entry/syscalls/syscall_64.tbl
> index 5aef183e2f85..9b198c5fc412 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -339,6 +339,7 @@
>  330  common  pkey_alloc              sys_pkey_alloc
>  331  common  pkey_free               sys_pkey_free
>  332  common  statx                   sys_statx
> +333  common  fsopen                  sys_fsopen
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/fs/Makefile b/fs/Makefile
> index ffe728cc15e1..c42d1d9351a6 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -12,7 +12,7 @@ obj-y :=    open.o read_write.o file_table.o super.o \
>               seq_file.o xattr.o libfs.o fs-writeback.o \
>               pnode.o splice.o sync.o utimes.o \
>               stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
> -             fs_context.o
> +             fs_context.o fsopen.o
>  
>  ifeq ($(CONFIG_BLOCK),y)
>  obj-y +=     buffer.o block_dev.o direct-io.o mpage.o
> diff --git a/fs/fsopen.c b/fs/fsopen.c
> new file mode 100644
> index 000000000000..6ca7e1979273
> --- /dev/null
> +++ b/fs/fsopen.c
> @@ -0,0 +1,273 @@
> +/* Filesystem access-by-fd.
> + *
> + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
> + * Written by David Howells ([email protected])
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public Licence
> + * as published by the Free Software Foundation; either version
> + * 2 of the Licence, or (at your option) any later version.
> + */
> +
> +#include <linux/fs_context.h>
> +#include <linux/mount.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +#include <linux/file.h>
> +#include <linux/magic.h>
> +#include <linux/syscalls.h>
> +
> +static struct vfsmount *fs_fs_mnt __read_mostly;
> +
> +static int fs_fs_release(struct inode *inode, struct file *file)
> +{
> +     struct fs_context *fc = file->private_data;
> +
> +     file->private_data = NULL;
> +
> +     put_fs_context(fc);
> +     return 0;
> +}
> +
> +/*
> + * Userspace writes configuration data and commands to the fd and we parse it
> + * here.  For the moment, we assume a single option or command per write.  Each
> + * line written is of the form
> + *
> + *   <option_type><space><stuff...>
> + *
> + *   d /dev/sda1                             -- Device name
> + *   o noatime                               -- Option without value
> + *   o cell=grand.central.org                -- Option with value
> + *   r /                                     -- Dir within device to mount
> + *   x create                                -- Create a superblock
> + */
> +static ssize_t fs_fs_write(struct file *file,
> +                        const char __user *_buf, size_t len, loff_t *pos)
> +{
> +     struct fs_context *fc = file->private_data;
> +     struct inode *inode = file_inode(file);
> +     char opt[2], *data;
> +     ssize_t ret;
> +
> +     if (len < 3 || len > 4095)
> +             return -EINVAL;
> +
> +     if (copy_from_user(opt, _buf, 2) != 0)
> +             return -EFAULT;
> +     switch (opt[0]) {
> +     case 's':
> +     case 'o':
> +     case 'x':
> +             break;
> +     default:
> +             goto err_bad_cmd;
> +     }
> +     if (opt[1] != ' ')
> +             goto err_bad_cmd;
> +
> +     data = memdup_user_nul(_buf + 2, len - 2);
> +     if (IS_ERR(data))
> +             return PTR_ERR(data);
> +
> +     /* From this point onwards we need to lock the fd against someone
> +      * trying to mount it.
> +      */
> +     ret = inode_lock_killable(inode);
> +     if (ret < 0)
> +             goto err_free;
> +
> +     ret = -EINVAL;
> +     switch (opt[0]) {
> +     case 's':
> +             ret = vfs_set_fs_source(fc, data, len - 2);
> +             if (ret < 0)
> +                     goto err_unlock;
> +             data = NULL;
> +             break;
> +
> +     case 'o':
> +             ret = vfs_parse_mount_option(fc, data);
> +             if (ret < 0)
> +                     goto err_unlock;
> +             break;
> +
> +     case 'x':
> +             if (strcmp(data, "create") == 0) {
> +                     ret = vfs_get_tree(fc);
> +             } else {
> +                     ret = -EOPNOTSUPP;
> +             }
> +             if (ret < 0)
> +                     goto err_unlock;
> +             break;
> +
> +     default:
> +             goto err_unlock;
> +     }
> +
> +     ret = len;
> +err_unlock:
> +     inode_unlock(inode);
> +err_free:
> +     kfree(data);
> +     return ret;
> +err_bad_cmd:
> +     return -EINVAL;
> +}
> +
> +const struct file_operations fs_fs_fops = {
> +     .write          = fs_fs_write,
> +     .release        = fs_fs_release,
> +     .llseek         = no_llseek,
> +};
> +
> +/*
> + * Indicate the name we want to display the filesystem file as.
> + */
> +static char *fs_fs_dname(struct dentry *dentry, char *buffer, int buflen)
> +{
> +     return dynamic_dname(dentry, buffer, buflen, "fs:[%lu]",
> +                          d_inode(dentry)->i_ino);
> +}
> +
> +static const struct dentry_operations fs_fs_dentry_operations = {
> +     .d_dname        = fs_fs_dname,
> +};
> +
> +/*
> + * Create a file that can be used to configure a new mount.
> + */
> +static struct file *create_fs_file(struct fs_context *fc)
> +{
> +     struct inode *inode;
> +     struct file *f;
> +     struct path path;
> +     int ret;
> +
> +     inode = alloc_anon_inode(fs_fs_mnt->mnt_sb);
> +     if (!inode)
> +             return ERR_PTR(-ENFILE);
> +     inode->i_fop = &fs_fs_fops;
> +
> +     ret = -ENOMEM;
> +     path.dentry = d_alloc_pseudo(fs_fs_mnt->mnt_sb, &empty_name);
> +     if (!path.dentry)
> +             goto err_inode;
> +     path.mnt = mntget(fs_fs_mnt);
> +
> +     d_instantiate(path.dentry, inode);
> +
> +     f = alloc_file(&path, FMODE_READ | FMODE_WRITE, &fs_fs_fops);
> +     if (IS_ERR(f)) {
> +             ret = PTR_ERR(f);
> +             goto err_file;
> +     }
> +
> +     f->private_data = fc;
> +     return f;
> +
> +err_file:
> +     path_put(&path);
> +     return ERR_PTR(ret);
> +
> +err_inode:
> +     iput(inode);
> +     return ERR_PTR(ret);
> +}
> +
> + const struct super_operations fs_fs_ops = {
> +     .drop_inode     = generic_delete_inode,
> +     .destroy_inode  = free_inode_nonrcu,
> +     .statfs         = simple_statfs,
> +};
> +
> +static struct dentry *fs_fs_mount(struct file_system_type *fs_type,
> +                               int flags, const char *dev_name,
> +                               void *data)
> +{
> +     return mount_pseudo(fs_type, "fs_fs:", &fs_fs_ops,
> +                         &fs_fs_dentry_operations, FS_FS_MAGIC);
> +}
> +
> +static struct file_system_type fs_fs_type = {
> +     .name           = "fs_fs",
> +     .mount          = fs_fs_mount,
> +     .kill_sb        = kill_anon_super,
> +};
> +
> +static int __init init_fs_fs(void)
> +{
> +     int ret;
> +
> +     ret = register_filesystem(&fs_fs_type);
> +     if (ret < 0)
> +             panic("Cannot register fs_fs\n");
> +
> +     fs_fs_mnt = kern_mount(&fs_fs_type);
> +     if (IS_ERR(fs_fs_mnt))
> +             panic("Cannot mount fs_fs: %ld\n", PTR_ERR(fs_fs_mnt));
> +     return 0;
> +}
> +
> +fs_initcall(init_fs_fs);
> +
> +/*
> + * Open a filesystem by name so that it can be configured for mounting.
> + *
> + * We are allowed to specify a container in which the filesystem will be
> + * opened, thereby indicating which namespaces will be used (notably, which
> + * network namespace will be used for network filesystems).
> + */
> +SYSCALL_DEFINE5(fsopen, const char __user *, _fs_name, unsigned int, flags,
> +             void *, reserved3, void *, reserved4, void *, reserved5)
> +{
> +     struct file_system_type *fs_type;
> +     struct fs_context *fc;
> +     struct file *file;
> +     const char *fs_name;
> +     int fd, ret;
> +
> +     if (flags & ~O_CLOEXEC || reserved3 || reserved4 || reserved5)
> +             return -EINVAL;
> +
> +     fs_name = strndup_user(_fs_name, PAGE_SIZE);
> +     if (IS_ERR(fs_name))
> +             return PTR_ERR(fs_name);
> +
> +     fs_type = get_fs_type(fs_name);
> +     kfree(fs_name);
> +     if (!fs_type)
> +             return -ENODEV;
> +
> +     fc = vfs_new_fs_context(fs_type, NULL, 0, FS_CONTEXT_FOR_USER_MOUNT);
> +     put_filesystem(fs_type);
> +     if (IS_ERR(fc))
> +             return PTR_ERR(fc);
> +
> +     ret = -ENOTSUPP;
> +     if (!fc->ops)
> +             goto err_fc;
> +
> +     file = create_fs_file(fc);
> +     if (IS_ERR(file)) {
> +             ret = PTR_ERR(file);
> +             goto err_fc;
> +     }
> +
> +     ret = get_unused_fd_flags(flags & O_CLOEXEC);
> +     if (ret < 0)
> +             goto err_file;
> +
> +     fd = ret;
> +     fd_install(fd, file);
> +     return fd;
> +
> +err_file:
> +     fput(file);
> +     return ret;
> +
> +err_fc:
> +     put_fs_context(fc);
> +     return ret;
> +}
> diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
> index 8af6ff0e869e..3244b231ede0 100644
> --- a/include/linux/fs_context.h
> +++ b/include/linux/fs_context.h
> @@ -101,4 +101,5 @@ extern int vfs_get_super(struct fs_context *fc,
>                        int (*fill_super)(struct super_block *sb,
>                                          struct fs_context *fc));
>  
> +extern const struct file_operations fs_fs_fops;
>  #endif /* _LINUX_FS_CONTEXT_H */
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index a78186d826d7..7cd1b65a4152 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -940,5 +940,7 @@ asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
>  asmlinkage long sys_pkey_free(int pkey);
>  asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
>                         unsigned mask, struct statx __user *buffer);
> +asmlinkage long sys_fsopen(const char *fs_name, unsigned int flags,
> +                        void *reserved3, void *reserved4, void *reserved5);
>  
>  #endif
> diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
> index e439565df838..722bf42f9564 100644
> --- a/include/uapi/linux/magic.h
> +++ b/include/uapi/linux/magic.h
> @@ -87,5 +87,6 @@
>  #define UDF_SUPER_MAGIC              0x15013346
>  #define BALLOON_KVM_MAGIC    0x13661366
>  #define ZSMALLOC_MAGIC               0x58295829
> +#define FS_FS_MAGIC          0x66736673
>  
>  #endif /* __LINUX_MAGIC_H__ */
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 8acef8576ce9..de1dc63e7e47 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -258,3 +258,6 @@ cond_syscall(sys_membarrier);
>  cond_syscall(sys_pkey_mprotect);
>  cond_syscall(sys_pkey_alloc);
>  cond_syscall(sys_pkey_free);
> +
> +/* fd-based mount */
> +cond_syscall(sys_fsopen);
> 

-- 
Jeff Layton <[email protected]>

Reply via email to