On Wed, Feb 21, 2024 at 03:24:37PM -0600, Seth Forshee (DigitalOcean) wrote:
> To pass around vfs_caps instead of raw xattr data we will need to
> convert between the two representations near userspace and disk
> boundaries. We already convert xattrs from disks to vfs_caps, so move
> that code into a helper, and change get_vfs_caps_from_disk() to use the
> helper.
> 
> When converting vfs_caps to xattrs we have different considerations
> depending on the destination of the xattr data. For xattrs which will be
> written to disk we need to reject the xattr if the rootid does not map
> into the filesystem's user namespace, whereas xattrs read by userspace
> may need to undergo a conversion from v3 to v2 format when the rootid
> does not map. So this helper is split into an internal and an external
> interface. The internal interface does not return an error if the rootid
> has no mapping in the target user namespace and will be used for
> conversions targeting userspace. The external interface returns
> EOVERFLOW if the rootid has no mapping and will be used for all other
> conversions.
> 
> Signed-off-by: Seth Forshee (DigitalOcean) <sfors...@kernel.org>
> ---
>  include/linux/capability.h |  10 ++
>  security/commoncap.c       | 228 
> +++++++++++++++++++++++++++++++++++----------
>  2 files changed, 187 insertions(+), 51 deletions(-)
> 
> diff --git a/include/linux/capability.h b/include/linux/capability.h
> index eb46d346bbbc..a0893ac4664b 100644
> --- a/include/linux/capability.h
> +++ b/include/linux/capability.h
> @@ -209,6 +209,16 @@ static inline bool checkpoint_restore_ns_capable(struct 
> user_namespace *ns)
>               ns_capable(ns, CAP_SYS_ADMIN);
>  }
>  
> +/* helpers to convert between xattr and in-kernel representations */
> +int vfs_caps_from_xattr(struct mnt_idmap *idmap,
> +                     struct user_namespace *src_userns,
> +                     struct vfs_caps *vfs_caps,
> +                     const void *data, size_t size);
> +ssize_t vfs_caps_to_xattr(struct mnt_idmap *idmap,
> +                       struct user_namespace *dest_userns,
> +                       const struct vfs_caps *vfs_caps,
> +                       void *data, size_t size);
> +
>  /* audit system wants to get cap info from files as well */
>  int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
>                          const struct dentry *dentry,
> diff --git a/security/commoncap.c b/security/commoncap.c
> index a0b5c9740759..7531c9634997 100644
> --- a/security/commoncap.c
> +++ b/security/commoncap.c
> @@ -619,54 +619,41 @@ static inline int bprm_caps_from_vfs_caps(struct 
> vfs_caps *caps,
>  }
>  
>  /**
> - * get_vfs_caps_from_disk - retrieve vfs caps from disk
> + * vfs_caps_from_xattr - convert raw caps xattr data to vfs_caps
>   *
> - * @idmap:   idmap of the mount the inode was found from
> - * @dentry:  dentry from which @inode is retrieved
> - * @cpu_caps:        vfs capabilities
> + * @idmap:      idmap of the mount the inode was found from
> + * @src_userns: user namespace for ids in xattr data
> + * @vfs_caps:   destination buffer for vfs_caps data
> + * @data:       rax xattr caps data
> + * @size:       size of xattr data
>   *
> - * Extract the on-exec-apply capability sets for an executable file.
> + * Converts a raw security.capability xattr into the kernel-internal
> + * capabilities format.
>   *
> - * If the inode has been found through an idmapped mount the idmap of
> - * the vfsmount must be passed through @idmap. This function will then
> - * take care to map the inode according to @idmap before checking
> - * permissions. On non-idmapped mounts or if permission checking is to be
> - * performed on the raw inode simply pass @nop_mnt_idmap.
> + * If the xattr is being read or written through an idmapped mount the
> + * idmap of the vfsmount must be passed through @idmap. This function
> + * will then take care to map the rootid according to @idmap.
> + *
> + * Return: On success, return 0; on error, return < 0.
>   */
> -int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
> -                        const struct dentry *dentry,
> -                        struct vfs_caps *cpu_caps)
> +int vfs_caps_from_xattr(struct mnt_idmap *idmap,
> +                     struct user_namespace *src_userns,
> +                     struct vfs_caps *vfs_caps,
> +                     const void *data, size_t size)
>  {
> -     struct inode *inode = d_backing_inode(dentry);
>       __u32 magic_etc;
> -     int size;
> -     struct vfs_ns_cap_data data, *nscaps = &data;
> -     struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
> +     const struct vfs_ns_cap_data *ns_caps = data;
> +     struct vfs_cap_data *caps = (struct vfs_cap_data *)ns_caps;
>       kuid_t rootkuid;
> -     vfsuid_t rootvfsuid;
> -     struct user_namespace *fs_ns;
> -
> -     memset(cpu_caps, 0, sizeof(struct vfs_caps));
> -
> -     if (!inode)
> -             return -ENODATA;
>  
> -     fs_ns = inode->i_sb->s_user_ns;
> -     size = __vfs_getxattr((struct dentry *)dentry, inode,
> -                           XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
> -     if (size == -ENODATA || size == -EOPNOTSUPP)
> -             /* no data, that's ok */
> -             return -ENODATA;
> -
> -     if (size < 0)
> -             return size;
> +     memset(vfs_caps, 0, sizeof(*vfs_caps));
>  
>       if (size < sizeof(magic_etc))
>               return -EINVAL;
>  
> -     cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
> +     vfs_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
>  
> -     rootkuid = make_kuid(fs_ns, 0);
> +     rootkuid = make_kuid(src_userns, 0);
>       switch (magic_etc & VFS_CAP_REVISION_MASK) {
>       case VFS_CAP_REVISION_1:
>               if (size != XATTR_CAPS_SZ_1)
> @@ -679,39 +666,178 @@ int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
>       case VFS_CAP_REVISION_3:
>               if (size != XATTR_CAPS_SZ_3)
>                       return -EINVAL;
> -             rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
> +             rootkuid = make_kuid(src_userns, le32_to_cpu(ns_caps->rootid));
>               break;
>  
>       default:
>               return -EINVAL;
>       }
>  
> -     rootvfsuid = make_vfsuid(idmap, fs_ns, rootkuid);
> -     if (!vfsuid_valid(rootvfsuid))
> -             return -ENODATA;
> +     vfs_caps->rootid = make_vfsuid(idmap, src_userns, rootkuid);
> +     if (!vfsuid_valid(vfs_caps->rootid))
> +             return -EOVERFLOW;
>  
> -     /* Limit the caps to the mounter of the filesystem
> -      * or the more limited uid specified in the xattr.
> +     vfs_caps->permitted.val = le32_to_cpu(caps->data[0].permitted);
> +     vfs_caps->inheritable.val = le32_to_cpu(caps->data[0].inheritable);
> +
> +     /*
> +      * Rev1 had just a single 32-bit word, later expanded
> +      * to a second one for the high bits
>        */
> -     if (!rootid_owns_currentns(rootvfsuid))
> -             return -ENODATA;
> +     if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) {
> +             vfs_caps->permitted.val += 
> (u64)le32_to_cpu(caps->data[1].permitted) << 32;
> +             vfs_caps->inheritable.val += 
> (u64)le32_to_cpu(caps->data[1].inheritable) << 32;

That + makes this even more difficult to read. This should be rewritten.

> +     }
> +
> +     vfs_caps->permitted.val &= CAP_VALID_MASK;
> +     vfs_caps->inheritable.val &= CAP_VALID_MASK;
> +
> +     return 0;
> +}
> +
> +/*
> + * Inner implementation of vfs_caps_to_xattr() which does not return an
> + * error if the rootid does not map into @dest_userns.
> + */
> +static ssize_t __vfs_caps_to_xattr(struct mnt_idmap *idmap,
> +                                struct user_namespace *dest_userns,
> +                                const struct vfs_caps *vfs_caps,
> +                                void *data, size_t size)
> +{
> +     struct vfs_ns_cap_data *ns_caps = data;
> +     struct vfs_cap_data *caps = (struct vfs_cap_data *)ns_caps;
> +     kuid_t rootkuid;
> +     uid_t rootid;
> +
> +     memset(ns_caps, 0, size);
> +
> +     rootid = 0;
> +     switch (vfs_caps->magic_etc & VFS_CAP_REVISION_MASK) {
> +     case VFS_CAP_REVISION_1:
> +             if (size < XATTR_CAPS_SZ_1)
> +                     return -EINVAL;
> +             size = XATTR_CAPS_SZ_1;
> +             break;
> +     case VFS_CAP_REVISION_2:
> +             if (size < XATTR_CAPS_SZ_2)
> +                     return -EINVAL;
> +             size = XATTR_CAPS_SZ_2;
> +             break;
> +     case VFS_CAP_REVISION_3:
> +             if (size < XATTR_CAPS_SZ_3)
> +                     return -EINVAL;
> +             size = XATTR_CAPS_SZ_3;
> +             rootkuid = from_vfsuid(idmap, dest_userns, vfs_caps->rootid);
> +             rootid = from_kuid(dest_userns, rootkuid);
> +             ns_caps->rootid = cpu_to_le32(rootid);
> +             break;
>  
> -     cpu_caps->permitted.val = le32_to_cpu(caps->data[0].permitted);
> -     cpu_caps->inheritable.val = le32_to_cpu(caps->data[0].inheritable);
> +     default:
> +             return -EINVAL;
> +     }
> +
> +     caps->magic_etc = cpu_to_le32(vfs_caps->magic_etc);
> +
> +     caps->data[0].permitted = 
> cpu_to_le32(lower_32_bits(vfs_caps->permitted.val));
> +     caps->data[0].inheritable = 
> cpu_to_le32(lower_32_bits(vfs_caps->inheritable.val));
>  
>       /*
>        * Rev1 had just a single 32-bit word, later expanded
>        * to a second one for the high bits
>        */
> -     if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) {
> -             cpu_caps->permitted.val += 
> (u64)le32_to_cpu(caps->data[1].permitted) << 32;
> -             cpu_caps->inheritable.val += 
> (u64)le32_to_cpu(caps->data[1].inheritable) << 32;
> +     if ((vfs_caps->magic_etc & VFS_CAP_REVISION_MASK) != 
> VFS_CAP_REVISION_1) {
> +             caps->data[1].permitted =
> +                     cpu_to_le32(upper_32_bits(vfs_caps->permitted.val));
> +             caps->data[1].inheritable =
> +                     cpu_to_le32(upper_32_bits(vfs_caps->inheritable.val));
>       }
>  
> -     cpu_caps->permitted.val &= CAP_VALID_MASK;
> -     cpu_caps->inheritable.val &= CAP_VALID_MASK;
> +     return size;
> +}
> +
> +
> +/**
> + * vfs_caps_to_xattr - convert vfs_caps to raw caps xattr data
> + *
> + * @idmap:       idmap of the mount the inode was found from
> + * @dest_userns: user namespace for ids in xattr data
> + * @vfs_caps:    source vfs_caps data
> + * @data:        destination buffer for rax xattr caps data
> + * @size:        size of the @data buffer
> + *
> + * Converts a kernel-internal capability into the raw security.capability
> + * xattr format.
> + *
> + * If the xattr is being read or written through an idmapped mount the
> + * idmap of the vfsmount must be passed through @idmap. This function
> + * will then take care to map the rootid according to @idmap.
> + *
> + * Return: On success, return the size of the xattr data. On error,
> + * return < 0.
> + */
> +ssize_t vfs_caps_to_xattr(struct mnt_idmap *idmap,
> +                       struct user_namespace *dest_userns,
> +                       const struct vfs_caps *vfs_caps,
> +                       void *data, size_t size)
> +{
> +     struct vfs_ns_cap_data *caps = data;
> +     int ret;

This should very likely be ssize_t ret.

> +
> +     ret = __vfs_caps_to_xattr(idmap, dest_userns, vfs_caps, data, size);
> +     if (ret > 0 &&
> +         (vfs_caps->magic_etc & VFS_CAP_REVISION_MASK) == VFS_CAP_REVISION_3 
> &&
> +          le32_to_cpu(caps->rootid) == (uid_t)-1)
> +             return -EOVERFLOW;
> +     return ret;
> +}
> +
> +/**
> + * get_vfs_caps_from_disk - retrieve vfs caps from disk
> + *
> + * @idmap:   idmap of the mount the inode was found from
> + * @dentry:  dentry from which @inode is retrieved
> + * @cpu_caps:        vfs capabilities
> + *
> + * Extract the on-exec-apply capability sets for an executable file.
> + *
> + * If the inode has been found through an idmapped mount the idmap of
> + * the vfsmount must be passed through @idmap. This function will then
> + * take care to map the inode according to @idmap before checking
> + * permissions. On non-idmapped mounts or if permission checking is to be
> + * performed on the raw inode simply pass @nop_mnt_idmap.
> + */
> +int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
> +                        const struct dentry *dentry,
> +                        struct vfs_caps *cpu_caps)
> +{
> +     struct inode *inode = d_backing_inode(dentry);
> +     int size, ret;
> +     struct vfs_ns_cap_data data, *nscaps = &data;
> +
> +     if (!inode)
> +             return -ENODATA;
>  
> -     cpu_caps->rootid = rootvfsuid;
> +     size = __vfs_getxattr((struct dentry *)dentry, inode,
> +                           XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
> +     if (size == -ENODATA || size == -EOPNOTSUPP)
> +             /* no data, that's ok */
> +             return -ENODATA;
> +
> +     if (size < 0)
> +             return size;
> +
> +     ret = vfs_caps_from_xattr(idmap, inode->i_sb->s_user_ns,
> +                               cpu_caps, nscaps, size);
> +     if (ret == -EOVERFLOW)
> +             return -ENODATA;
> +     if (ret)
> +             return ret;
> +
> +     /* Limit the caps to the mounter of the filesystem
> +      * or the more limited uid specified in the xattr.
> +      */
> +     if (!rootid_owns_currentns(cpu_caps->rootid))
> +             return -ENODATA;
>  
>       return 0;
>  }
> 
> -- 
> 2.43.0
> 

Reply via email to