Re: [PATCH v5 01/13] mm/memfd: Introduce MFD_INACCESSIBLE flag
On Fri, Apr 22, 2022 at 10:43:50PM -0700, Vishal Annapurve wrote: > On Thu, Mar 10, 2022 at 6:09 AM Chao Peng wrote: > > > > From: "Kirill A. Shutemov" > > > > Introduce a new memfd_create() flag indicating the content of the > > created memfd is inaccessible from userspace through ordinary MMU > > access (e.g., read/write/mmap). However, the file content can be > > accessed via a different mechanism (e.g. KVM MMU) indirectly. > > > > It provides semantics required for KVM guest private memory support > > that a file descriptor with this flag set is going to be used as the > > source of guest memory in confidential computing environments such > > as Intel TDX/AMD SEV but may not be accessible from host userspace. > > > > Since page migration/swapping is not yet supported for such usages > > so these pages are currently marked as UNMOVABLE and UNEVICTABLE > > which makes them behave like long-term pinned pages. > > > > The flag can not coexist with MFD_ALLOW_SEALING, future sealing is > > also impossible for a memfd created with this flag. > > > > At this time only shmem implements this flag. > > > > Signed-off-by: Kirill A. Shutemov > > Signed-off-by: Chao Peng > > --- > > include/linux/shmem_fs.h | 7 + > > include/uapi/linux/memfd.h | 1 + > > mm/memfd.c | 26 +++-- > > mm/shmem.c | 57 ++ > > 4 files changed, 88 insertions(+), 3 deletions(-) > > > > diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h > > index e65b80ed09e7..2dde843f28ef 100644 > > --- a/include/linux/shmem_fs.h > > +++ b/include/linux/shmem_fs.h > > @@ -12,6 +12,9 @@ > > > > /* inode in-kernel data */ > > > > +/* shmem extended flags */ > > +#define SHM_F_INACCESSIBLE 0x0001 /* prevent ordinary MMU access > > (e.g. 
read/write/mmap) to file content */ > > + > > struct shmem_inode_info { > > spinlock_t lock; > > unsigned intseals; /* shmem seals */ > > @@ -24,6 +27,7 @@ struct shmem_inode_info { > > struct shared_policypolicy; /* NUMA memory alloc policy > > */ > > struct simple_xattrsxattrs; /* list of xattrs */ > > atomic_tstop_eviction; /* hold when working on > > inode */ > > + unsigned intxflags; /* shmem extended flags */ > > struct inodevfs_inode; > > }; > > > > @@ -61,6 +65,9 @@ extern struct file *shmem_file_setup(const char *name, > > loff_t size, unsigned long flags); > > extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, > > unsigned long flags); > > +extern struct file *shmem_file_setup_xflags(const char *name, loff_t size, > > + unsigned long flags, > > + unsigned int xflags); > > extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, > > const char *name, loff_t size, unsigned long flags); > > extern int shmem_zero_setup(struct vm_area_struct *); > > diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h > > index 7a8a26751c23..48750474b904 100644 > > --- a/include/uapi/linux/memfd.h > > +++ b/include/uapi/linux/memfd.h > > @@ -8,6 +8,7 @@ > > #define MFD_CLOEXEC0x0001U > > #define MFD_ALLOW_SEALING 0x0002U > > #define MFD_HUGETLB0x0004U > > +#define MFD_INACCESSIBLE 0x0008U > > > > /* > > * Huge page size encoding when MFD_HUGETLB is specified, and a huge page > > diff --git a/mm/memfd.c b/mm/memfd.c > > index 9f80f162791a..74d45a26cf5d 100644 > > --- a/mm/memfd.c > > +++ b/mm/memfd.c > > @@ -245,16 +245,20 @@ long memfd_fcntl(struct file *file, unsigned int cmd, > > unsigned long arg) > > #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) > > #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) > > > > -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) > > +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | \ > > + MFD_INACCESSIBLE) > > > > 
SYSCALL_DEFINE2(memfd_create, > > const char __user *, uname, > > unsigned int, flags) > > { > > + struct address_space *mapping; > > unsigned int *file_seals; > > + unsigned int xflags; > > struct file *file; > > int fd, error; > > char *name; > > + gfp_t gfp; > > long len; > > > > if (!(flags & MFD_HUGETLB)) { > > @@ -267,6 +271,10 @@ SYSCALL_DEFINE2(memfd_create, > > return -EINVAL; > > } > > > > + /* Disallow sealing when MFD_INACCESSIBLE is set. */ > > + if (flags & MFD_INACCESSIBLE && flags & MFD_ALLOW_SEALING) > > + return -EINVAL; > > + > > /* length includes terminating zero */ > >
Re: [PATCH v5 01/13] mm/memfd: Introduce MFD_INACCESSIBLE flag
On Thu, Mar 10, 2022 at 6:09 AM Chao Peng wrote: > > From: "Kirill A. Shutemov" > > Introduce a new memfd_create() flag indicating the content of the > created memfd is inaccessible from userspace through ordinary MMU > access (e.g., read/write/mmap). However, the file content can be > accessed via a different mechanism (e.g. KVM MMU) indirectly. > > It provides semantics required for KVM guest private memory support > that a file descriptor with this flag set is going to be used as the > source of guest memory in confidential computing environments such > as Intel TDX/AMD SEV but may not be accessible from host userspace. > > Since page migration/swapping is not yet supported for such usages > so these pages are currently marked as UNMOVABLE and UNEVICTABLE > which makes them behave like long-term pinned pages. > > The flag can not coexist with MFD_ALLOW_SEALING, future sealing is > also impossible for a memfd created with this flag. > > At this time only shmem implements this flag. > > Signed-off-by: Kirill A. Shutemov > Signed-off-by: Chao Peng > --- > include/linux/shmem_fs.h | 7 + > include/uapi/linux/memfd.h | 1 + > mm/memfd.c | 26 +++-- > mm/shmem.c | 57 ++ > 4 files changed, 88 insertions(+), 3 deletions(-) > > diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h > index e65b80ed09e7..2dde843f28ef 100644 > --- a/include/linux/shmem_fs.h > +++ b/include/linux/shmem_fs.h > @@ -12,6 +12,9 @@ > > /* inode in-kernel data */ > > +/* shmem extended flags */ > +#define SHM_F_INACCESSIBLE 0x0001 /* prevent ordinary MMU access (e.g. 
> read/write/mmap) to file content */ > + > struct shmem_inode_info { > spinlock_t lock; > unsigned intseals; /* shmem seals */ > @@ -24,6 +27,7 @@ struct shmem_inode_info { > struct shared_policypolicy; /* NUMA memory alloc policy */ > struct simple_xattrsxattrs; /* list of xattrs */ > atomic_tstop_eviction; /* hold when working on inode > */ > + unsigned intxflags; /* shmem extended flags */ > struct inodevfs_inode; > }; > > @@ -61,6 +65,9 @@ extern struct file *shmem_file_setup(const char *name, > loff_t size, unsigned long flags); > extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, > unsigned long flags); > +extern struct file *shmem_file_setup_xflags(const char *name, loff_t size, > + unsigned long flags, > + unsigned int xflags); > extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, > const char *name, loff_t size, unsigned long flags); > extern int shmem_zero_setup(struct vm_area_struct *); > diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h > index 7a8a26751c23..48750474b904 100644 > --- a/include/uapi/linux/memfd.h > +++ b/include/uapi/linux/memfd.h > @@ -8,6 +8,7 @@ > #define MFD_CLOEXEC0x0001U > #define MFD_ALLOW_SEALING 0x0002U > #define MFD_HUGETLB0x0004U > +#define MFD_INACCESSIBLE 0x0008U > > /* > * Huge page size encoding when MFD_HUGETLB is specified, and a huge page > diff --git a/mm/memfd.c b/mm/memfd.c > index 9f80f162791a..74d45a26cf5d 100644 > --- a/mm/memfd.c > +++ b/mm/memfd.c > @@ -245,16 +245,20 @@ long memfd_fcntl(struct file *file, unsigned int cmd, > unsigned long arg) > #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) > #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) > > -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) > +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | \ > + MFD_INACCESSIBLE) > > SYSCALL_DEFINE2(memfd_create, > const char __user *, uname, > unsigned int, flags) > { > + struct address_space 
*mapping; > unsigned int *file_seals; > + unsigned int xflags; > struct file *file; > int fd, error; > char *name; > + gfp_t gfp; > long len; > > if (!(flags & MFD_HUGETLB)) { > @@ -267,6 +271,10 @@ SYSCALL_DEFINE2(memfd_create, > return -EINVAL; > } > > + /* Disallow sealing when MFD_INACCESSIBLE is set. */ > + if (flags & MFD_INACCESSIBLE && flags & MFD_ALLOW_SEALING) > + return -EINVAL; > + > /* length includes terminating zero */ > len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); > if (len <= 0) > @@ -301,8 +309,11 @@ SYSCALL_DEFINE2(memfd_create, > HUGETLB_ANONHUGE_INODE, > (flags >> MFD_HUGE_SHIFT) & >
Re: [PATCH v5 01/13] mm/memfd: Introduce MFD_INACCESSIBLE flag
On Mon, Apr 11, 2022 at 06:10:23PM +0300, Kirill A. Shutemov wrote: > On Thu, Mar 10, 2022 at 10:08:59PM +0800, Chao Peng wrote: > > From: "Kirill A. Shutemov" > > > > Introduce a new memfd_create() flag indicating the content of the > > created memfd is inaccessible from userspace through ordinary MMU > > access (e.g., read/write/mmap). However, the file content can be > > accessed via a different mechanism (e.g. KVM MMU) indirectly. > > > > It provides semantics required for KVM guest private memory support > > that a file descriptor with this flag set is going to be used as the > > source of guest memory in confidential computing environments such > > as Intel TDX/AMD SEV but may not be accessible from host userspace. > > > > Since page migration/swapping is not yet supported for such usages > > so these pages are currently marked as UNMOVABLE and UNEVICTABLE > > which makes them behave like long-term pinned pages. > > > > The flag can not coexist with MFD_ALLOW_SEALING, future sealing is > > also impossible for a memfd created with this flag. > > > > At this time only shmem implements this flag. > > > > Signed-off-by: Kirill A. Shutemov > > Signed-off-by: Chao Peng > > --- > > include/linux/shmem_fs.h | 7 + > > include/uapi/linux/memfd.h | 1 + > > mm/memfd.c | 26 +++-- > > mm/shmem.c | 57 ++ > > 4 files changed, 88 insertions(+), 3 deletions(-) > > > > diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h > > index e65b80ed09e7..2dde843f28ef 100644 > > --- a/include/linux/shmem_fs.h > > +++ b/include/linux/shmem_fs.h > > @@ -12,6 +12,9 @@ > > > > /* inode in-kernel data */ > > > > +/* shmem extended flags */ > > +#define SHM_F_INACCESSIBLE 0x0001 /* prevent ordinary MMU access (e.g. 
> > read/write/mmap) to file content */ > > + > > struct shmem_inode_info { > > spinlock_t lock; > > unsigned int seals; /* shmem seals */ > > @@ -24,6 +27,7 @@ struct shmem_inode_info { > > struct shared_policy policy; /* NUMA memory alloc policy */ > > struct simple_xattrs xattrs; /* list of xattrs */ > > atomic_t stop_eviction; /* hold when working on inode */ > > + unsigned int xflags; /* shmem extended flags */ > > struct inode vfs_inode; > > }; > > > > AFAICS, only two bits of 'flags' are used. And that's very strange that > VM_ flags are used for the purpose. My guess that someone was lazy to > introduce new constants for this. > > I think we should fix this: introduce SHM_F_LOCKED and SHM_F_NORESERVE > alongside with SHM_F_INACCESSIBLE and stuff them all into info->flags. > It also makes shmem_file_setup_xflags() go away. I did a quick search, and it sounds like we only use SHM_F_LOCKED/SHM_F_NORESERVE, and those definitely don't have to be VM_ flags. Chao > > -- > Kirill A. Shutemov
Re: [PATCH v5 01/13] mm/memfd: Introduce MFD_INACCESSIBLE flag
On Thu, Mar 10, 2022 at 10:08:59PM +0800, Chao Peng wrote: > From: "Kirill A. Shutemov" > > Introduce a new memfd_create() flag indicating the content of the > created memfd is inaccessible from userspace through ordinary MMU > access (e.g., read/write/mmap). However, the file content can be > accessed via a different mechanism (e.g. KVM MMU) indirectly. > > It provides semantics required for KVM guest private memory support > that a file descriptor with this flag set is going to be used as the > source of guest memory in confidential computing environments such > as Intel TDX/AMD SEV but may not be accessible from host userspace. > > Since page migration/swapping is not yet supported for such usages > so these pages are currently marked as UNMOVABLE and UNEVICTABLE > which makes them behave like long-term pinned pages. > > The flag can not coexist with MFD_ALLOW_SEALING, future sealing is > also impossible for a memfd created with this flag. > > At this time only shmem implements this flag. > > Signed-off-by: Kirill A. Shutemov > Signed-off-by: Chao Peng > --- > include/linux/shmem_fs.h | 7 + > include/uapi/linux/memfd.h | 1 + > mm/memfd.c | 26 +++-- > mm/shmem.c | 57 ++ > 4 files changed, 88 insertions(+), 3 deletions(-) > > diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h > index e65b80ed09e7..2dde843f28ef 100644 > --- a/include/linux/shmem_fs.h > +++ b/include/linux/shmem_fs.h > @@ -12,6 +12,9 @@ > > /* inode in-kernel data */ > > +/* shmem extended flags */ > +#define SHM_F_INACCESSIBLE 0x0001 /* prevent ordinary MMU access (e.g. 
> read/write/mmap) to file content */ > + > struct shmem_inode_info { > spinlock_t lock; > unsigned int seals; /* shmem seals */ > @@ -24,6 +27,7 @@ struct shmem_inode_info { > struct shared_policy policy; /* NUMA memory alloc policy */ > struct simple_xattrs xattrs; /* list of xattrs */ > atomic_t stop_eviction; /* hold when working on inode */ > + unsigned int xflags; /* shmem extended flags */ > struct inode vfs_inode; > }; > AFAICS, only two bits of 'flags' are used. And it's very strange that VM_ flags are used for the purpose. My guess is that someone was too lazy to introduce new constants for this. I think we should fix this: introduce SHM_F_LOCKED and SHM_F_NORESERVE alongside SHM_F_INACCESSIBLE and stuff them all into info->flags. It also makes shmem_file_setup_xflags() go away. -- Kirill A. Shutemov
[PATCH v5 01/13] mm/memfd: Introduce MFD_INACCESSIBLE flag
From: "Kirill A. Shutemov" Introduce a new memfd_create() flag indicating that the content of the created memfd is inaccessible from userspace through ordinary MMU access (e.g., read/write/mmap). However, the file content can be accessed indirectly via a different mechanism (e.g., KVM MMU). It provides the semantics required for KVM guest private memory support: a file descriptor with this flag set is going to be used as the source of guest memory in confidential computing environments such as Intel TDX/AMD SEV, but may not be accessible from host userspace. Since page migration/swapping is not yet supported for such usages, these pages are currently marked as UNMOVABLE and UNEVICTABLE, which makes them behave like long-term pinned pages. The flag cannot coexist with MFD_ALLOW_SEALING; future sealing is also impossible for a memfd created with this flag. At this time only shmem implements this flag. Signed-off-by: Kirill A. Shutemov Signed-off-by: Chao Peng --- include/linux/shmem_fs.h | 7 + include/uapi/linux/memfd.h | 1 + mm/memfd.c | 26 +++-- mm/shmem.c | 57 ++ 4 files changed, 88 insertions(+), 3 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index e65b80ed09e7..2dde843f28ef 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -12,6 +12,9 @@ /* inode in-kernel data */ +/* shmem extended flags */ +#define SHM_F_INACCESSIBLE 0x0001 /* prevent ordinary MMU access (e.g. 
read/write/mmap) to file content */ + struct shmem_inode_info { spinlock_t lock; unsigned intseals; /* shmem seals */ @@ -24,6 +27,7 @@ struct shmem_inode_info { struct shared_policypolicy; /* NUMA memory alloc policy */ struct simple_xattrsxattrs; /* list of xattrs */ atomic_tstop_eviction; /* hold when working on inode */ + unsigned intxflags; /* shmem extended flags */ struct inodevfs_inode; }; @@ -61,6 +65,9 @@ extern struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags); +extern struct file *shmem_file_setup_xflags(const char *name, loff_t size, + unsigned long flags, + unsigned int xflags); extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags); extern int shmem_zero_setup(struct vm_area_struct *); diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h index 7a8a26751c23..48750474b904 100644 --- a/include/uapi/linux/memfd.h +++ b/include/uapi/linux/memfd.h @@ -8,6 +8,7 @@ #define MFD_CLOEXEC0x0001U #define MFD_ALLOW_SEALING 0x0002U #define MFD_HUGETLB0x0004U +#define MFD_INACCESSIBLE 0x0008U /* * Huge page size encoding when MFD_HUGETLB is specified, and a huge page diff --git a/mm/memfd.c b/mm/memfd.c index 9f80f162791a..74d45a26cf5d 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -245,16 +245,20 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | \ + MFD_INACCESSIBLE) SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) { + struct address_space *mapping; unsigned int *file_seals; + unsigned int xflags; struct file *file; int fd, error; char *name; + gfp_t 
gfp; long len; if (!(flags & MFD_HUGETLB)) { @@ -267,6 +271,10 @@ SYSCALL_DEFINE2(memfd_create, return -EINVAL; } + /* Disallow sealing when MFD_INACCESSIBLE is set. */ + if (flags & MFD_INACCESSIBLE && flags & MFD_ALLOW_SEALING) + return -EINVAL; + /* length includes terminating zero */ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); if (len <= 0) @@ -301,8 +309,11 @@ SYSCALL_DEFINE2(memfd_create, HUGETLB_ANONHUGE_INODE, (flags >> MFD_HUGE_SHIFT) & MFD_HUGE_MASK); - } else - file = shmem_file_setup(name, 0, VM_NORESERVE); + } else { + xflags = flags & MFD_INACCESSIBLE ? SHM_F_INACCESSIBLE : 0; + file = shmem_file_setup_xflags(name, 0, VM_NORESERVE,