On Wed, Apr 05, 2023 at 09:58:44PM +0000, Ackerley Tng wrote:
> 
> Thanks again for your review!
> 
> Christian Brauner <brau...@kernel.org> writes:
> > On Tue, Apr 04, 2023 at 03:53:13PM +0200, Christian Brauner wrote:
> > > On Fri, Mar 31, 2023 at 11:50:39PM +0000, Ackerley Tng wrote:
> > > >
> > > > ...
> > > >
> > > > -SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
> > > > +static int restrictedmem_create(struct vfsmount *mount)
> > > >  {
> > > >         struct file *file, *restricted_file;
> > > >         int fd, err;
> > > >
> > > > -       if (flags)
> > > > -               return -EINVAL;
> > > > -
> > > >         fd = get_unused_fd_flags(0);
> 
> > > Any reasons the file descriptors aren't O_CLOEXEC by default? I don't
> > > see any reasons why we should introduce new fdtypes that aren't
> > > O_CLOEXEC by default. The "don't mix-and-match" train has already left
> > > the station anyway as we do have seccomp noitifer fds and pidfds both of
> > > which are O_CLOEXEC by default.
> 
> 
> Thanks for pointing this out. I agree with using O_CLOEXEC, but didn’t
> notice this before. Let us discuss this under the original series at
> [1].
> 
> > > >         if (fd < 0)
> > > >                 return fd;
> > > >
> > > > -       file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
> > > > +       if (mount)
> > > > +               file = shmem_file_setup_with_mnt(mount, 
> > > > "memfd:restrictedmem",
> > > 0, VM_NORESERVE);
> > > > +       else
> > > > +               file = shmem_file_setup("memfd:restrictedmem", 0, 
> > > > VM_NORESERVE);
> > > > +
> > > >         if (IS_ERR(file)) {
> > > >                 err = PTR_ERR(file);
> > > >                 goto err_fd;
> > > > @@ -223,6 +225,66 @@ SYSCALL_DEFINE1(memfd_restricted, unsigned
> > > int, flags)
> > > >         return err;
> > > >  }
> > > >
> > > > +static bool is_shmem_mount(struct vfsmount *mnt)
> > > > +{
> > > > +       return mnt && mnt->mnt_sb && mnt->mnt_sb->s_magic == 
> > > > TMPFS_MAGIC;
> 
> > > This can just be if (mnt->mnt_sb->s_magic == TMPFS_MAGIC).
> 
> 
> Will simplify this in the next revision.
> 
> > > > +}
> > > > +
> > > > +static bool is_mount_root(struct file *file)
> > > > +{
> > > > +       return file->f_path.dentry == file->f_path.mnt->mnt_root;
> 
> > > mount -t tmpfs tmpfs /mnt
> > > touch /mnt/bla
> > > touch /mnt/ble
> > > mount --bind /mnt/bla /mnt/ble
> > > fd = open("/mnt/ble")
> > > fd_restricted = memfd_restricted(fd)
> 
> > > IOW, this doesn't restrict it to the tmpfs root. It only restricts it to
> > > paths that refer to the root of any tmpfs mount. To exclude bind-mounts
> > > that aren't bind-mounts of the whole filesystem you want:
> 
> > > path->dentry == path->mnt->mnt_root &&
> > > path->mnt->mnt_root == path->mnt->mnt_sb->s_root
> 
> 
> Will adopt this in the next revision and add a selftest to check
> this. Thanks for pointing this out!
> 
> > > > +}
> > > > +
> > > > +static int restrictedmem_create_on_user_mount(int mount_fd)
> > > > +{
> > > > +       int ret;
> > > > +       struct fd f;
> > > > +       struct vfsmount *mnt;
> > > > +
> > > > +       f = fdget_raw(mount_fd);
> > > > +       if (!f.file)
> > > > +               return -EBADF;
> > > > +
> > > > +       ret = -EINVAL;
> > > > +       if (!is_mount_root(f.file))
> > > > +               goto out;
> > > > +
> > > > +       mnt = f.file->f_path.mnt;
> > > > +       if (!is_shmem_mount(mnt))
> > > > +               goto out;
> > > > +
> > > > +       ret = file_permission(f.file, MAY_WRITE | MAY_EXEC);
> 
> > > With the current semantics you're asking whether you have write
> > > permissions on the /mnt/ble file in order to get answer to the question
> > > whether you're allowed to create an unlinked restricted memory file.
> > > That doesn't make much sense afaict.
> 
> 
> That's true. Since mnt_want_write() already checks for write permissions
> and this syscall creates an unlinked file on the mount, we don't have to
> check permissions on the file then. Will remove this in the next
> revision!
> 
> > > > +       if (ret)
> > > > +               goto out;
> > > > +
> > > > +       ret = mnt_want_write(mnt);
> > > > +       if (unlikely(ret))
> > > > +               goto out;
> > > > +
> > > > +       ret = restrictedmem_create(mnt);
> > > > +
> > > > +       mnt_drop_write(mnt);
> > > > +out:
> > > > +       fdput(f);
> > > > +
> > > > +       return ret;
> > > > +}
> > > > +
> > > > +SYSCALL_DEFINE2(memfd_restricted, unsigned int, flags, int, mount_fd)
> > > > +{
> > > > +       if (flags & ~RMFD_USERMNT)
> > > > +               return -EINVAL;
> > > > +
> > > > +       if (flags == RMFD_USERMNT) {
> 
> > > Why do you even need this flag? It seems that @mount_fd being < 0 is
> > > sufficient to indicate that a new restricted memory fd is supposed to be
> > > created in the system instance.
> 
> 
> I'm hoping to have this patch series merged after Chao's patch series
> introduces the memfd_restricted() syscall [1].

I'm curious, is there an LSFMM session for this?

Reply via email to