Re: [PATCH v5 01/13] mm/memfd: Introduce MFD_INACCESSIBLE flag

2022-04-24 Thread Chao Peng
On Fri, Apr 22, 2022 at 10:43:50PM -0700, Vishal Annapurve wrote:
> On Thu, Mar 10, 2022 at 6:09 AM Chao Peng  wrote:
> >
> > From: "Kirill A. Shutemov" 
> >
> > Introduce a new memfd_create() flag indicating the content of the
> > created memfd is inaccessible from userspace through ordinary MMU
> > access (e.g., read/write/mmap). However, the file content can be
> > accessed via a different mechanism (e.g. KVM MMU) indirectly.
> >
> > It provides semantics required for KVM guest private memory support
> > that a file descriptor with this flag set is going to be used as the
> > source of guest memory in confidential computing environments such
> > as Intel TDX/AMD SEV but may not be accessible from host userspace.
> >
> > Since page migration/swapping is not yet supported for such usages
> > so these pages are currently marked as UNMOVABLE and UNEVICTABLE
> > which makes them behave like long-term pinned pages.
> >
> > The flag can not coexist with MFD_ALLOW_SEALING, future sealing is
> > also impossible for a memfd created with this flag.
> >
> > At this time only shmem implements this flag.
> >
> > Signed-off-by: Kirill A. Shutemov 
> > Signed-off-by: Chao Peng 
> > ---
> >  include/linux/shmem_fs.h   |  7 +
> >  include/uapi/linux/memfd.h |  1 +
> >  mm/memfd.c | 26 +++--
> >  mm/shmem.c | 57 ++
> >  4 files changed, 88 insertions(+), 3 deletions(-)
> >
> > diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> > index e65b80ed09e7..2dde843f28ef 100644
> > --- a/include/linux/shmem_fs.h
> > +++ b/include/linux/shmem_fs.h
> > @@ -12,6 +12,9 @@
> >
> >  /* inode in-kernel data */
> >
> > +/* shmem extended flags */
> > +#define SHM_F_INACCESSIBLE 0x0001  /* prevent ordinary MMU access 
> > (e.g. read/write/mmap) to file content */
> > +
> >  struct shmem_inode_info {
> > spinlock_t  lock;
> > unsigned intseals;  /* shmem seals */
> > @@ -24,6 +27,7 @@ struct shmem_inode_info {
> > struct shared_policypolicy; /* NUMA memory alloc policy 
> > */
> > struct simple_xattrsxattrs; /* list of xattrs */
> > atomic_tstop_eviction;  /* hold when working on 
> > inode */
> > +   unsigned intxflags; /* shmem extended flags */
> > struct inodevfs_inode;
> >  };
> >
> > @@ -61,6 +65,9 @@ extern struct file *shmem_file_setup(const char *name,
> > loff_t size, unsigned long flags);
> >  extern struct file *shmem_kernel_file_setup(const char *name, loff_t size,
> > unsigned long flags);
> > +extern struct file *shmem_file_setup_xflags(const char *name, loff_t size,
> > +   unsigned long flags,
> > +   unsigned int xflags);
> >  extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt,
> > const char *name, loff_t size, unsigned long flags);
> >  extern int shmem_zero_setup(struct vm_area_struct *);
> > diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
> > index 7a8a26751c23..48750474b904 100644
> > --- a/include/uapi/linux/memfd.h
> > +++ b/include/uapi/linux/memfd.h
> > @@ -8,6 +8,7 @@
> >  #define MFD_CLOEXEC0x0001U
> >  #define MFD_ALLOW_SEALING  0x0002U
> >  #define MFD_HUGETLB0x0004U
> > +#define MFD_INACCESSIBLE   0x0008U
> >
> >  /*
> >   * Huge page size encoding when MFD_HUGETLB is specified, and a huge page
> > diff --git a/mm/memfd.c b/mm/memfd.c
> > index 9f80f162791a..74d45a26cf5d 100644
> > --- a/mm/memfd.c
> > +++ b/mm/memfd.c
> > @@ -245,16 +245,20 @@ long memfd_fcntl(struct file *file, unsigned int cmd, 
> > unsigned long arg)
> >  #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
> >  #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
> >
> > -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
> > +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | \
> > +  MFD_INACCESSIBLE)
> >
> >  SYSCALL_DEFINE2(memfd_create,
> > const char __user *, uname,
> > unsigned int, flags)
> >  {
> > +   struct address_space *mapping;
> > unsigned int *file_seals;
> > +   unsigned int xflags;
> > struct file *file;
> > int fd, error;
> > char *name;
> > +   gfp_t gfp;
> > long len;
> >
> > if (!(flags & MFD_HUGETLB)) {
> > @@ -267,6 +271,10 @@ SYSCALL_DEFINE2(memfd_create,
> > return -EINVAL;
> > }
> >
> > +   /* Disallow sealing when MFD_INACCESSIBLE is set. */
> > +   if (flags & MFD_INACCESSIBLE && flags & MFD_ALLOW_SEALING)
> > +   return -EINVAL;
> > +
> > /* length includes terminating zero */
> > 

Re: [PATCH v5 01/13] mm/memfd: Introduce MFD_INACCESSIBLE flag

2022-04-22 Thread Vishal Annapurve
On Thu, Mar 10, 2022 at 6:09 AM Chao Peng  wrote:
>
> From: "Kirill A. Shutemov" 
>
> Introduce a new memfd_create() flag indicating the content of the
> created memfd is inaccessible from userspace through ordinary MMU
> access (e.g., read/write/mmap). However, the file content can be
> accessed via a different mechanism (e.g. KVM MMU) indirectly.
>
> It provides semantics required for KVM guest private memory support
> that a file descriptor with this flag set is going to be used as the
> source of guest memory in confidential computing environments such
> as Intel TDX/AMD SEV but may not be accessible from host userspace.
>
> Since page migration/swapping is not yet supported for such usages
> so these pages are currently marked as UNMOVABLE and UNEVICTABLE
> which makes them behave like long-term pinned pages.
>
> The flag can not coexist with MFD_ALLOW_SEALING, future sealing is
> also impossible for a memfd created with this flag.
>
> At this time only shmem implements this flag.
>
> Signed-off-by: Kirill A. Shutemov 
> Signed-off-by: Chao Peng 
> ---
>  include/linux/shmem_fs.h   |  7 +
>  include/uapi/linux/memfd.h |  1 +
>  mm/memfd.c | 26 +++--
>  mm/shmem.c | 57 ++
>  4 files changed, 88 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> index e65b80ed09e7..2dde843f28ef 100644
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -12,6 +12,9 @@
>
>  /* inode in-kernel data */
>
> +/* shmem extended flags */
> +#define SHM_F_INACCESSIBLE 0x0001  /* prevent ordinary MMU access (e.g. 
> read/write/mmap) to file content */
> +
>  struct shmem_inode_info {
> spinlock_t  lock;
> unsigned intseals;  /* shmem seals */
> @@ -24,6 +27,7 @@ struct shmem_inode_info {
> struct shared_policypolicy; /* NUMA memory alloc policy */
> struct simple_xattrsxattrs; /* list of xattrs */
> atomic_tstop_eviction;  /* hold when working on inode 
> */
> +   unsigned intxflags; /* shmem extended flags */
> struct inodevfs_inode;
>  };
>
> @@ -61,6 +65,9 @@ extern struct file *shmem_file_setup(const char *name,
> loff_t size, unsigned long flags);
>  extern struct file *shmem_kernel_file_setup(const char *name, loff_t size,
> unsigned long flags);
> +extern struct file *shmem_file_setup_xflags(const char *name, loff_t size,
> +   unsigned long flags,
> +   unsigned int xflags);
>  extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt,
> const char *name, loff_t size, unsigned long flags);
>  extern int shmem_zero_setup(struct vm_area_struct *);
> diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
> index 7a8a26751c23..48750474b904 100644
> --- a/include/uapi/linux/memfd.h
> +++ b/include/uapi/linux/memfd.h
> @@ -8,6 +8,7 @@
>  #define MFD_CLOEXEC0x0001U
>  #define MFD_ALLOW_SEALING  0x0002U
>  #define MFD_HUGETLB0x0004U
> +#define MFD_INACCESSIBLE   0x0008U
>
>  /*
>   * Huge page size encoding when MFD_HUGETLB is specified, and a huge page
> diff --git a/mm/memfd.c b/mm/memfd.c
> index 9f80f162791a..74d45a26cf5d 100644
> --- a/mm/memfd.c
> +++ b/mm/memfd.c
> @@ -245,16 +245,20 @@ long memfd_fcntl(struct file *file, unsigned int cmd, 
> unsigned long arg)
>  #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
>  #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
>
> -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
> +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | \
> +  MFD_INACCESSIBLE)
>
>  SYSCALL_DEFINE2(memfd_create,
> const char __user *, uname,
> unsigned int, flags)
>  {
> +   struct address_space *mapping;
> unsigned int *file_seals;
> +   unsigned int xflags;
> struct file *file;
> int fd, error;
> char *name;
> +   gfp_t gfp;
> long len;
>
> if (!(flags & MFD_HUGETLB)) {
> @@ -267,6 +271,10 @@ SYSCALL_DEFINE2(memfd_create,
> return -EINVAL;
> }
>
> +   /* Disallow sealing when MFD_INACCESSIBLE is set. */
> +   if (flags & MFD_INACCESSIBLE && flags & MFD_ALLOW_SEALING)
> +   return -EINVAL;
> +
> /* length includes terminating zero */
> len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
> if (len <= 0)
> @@ -301,8 +309,11 @@ SYSCALL_DEFINE2(memfd_create,
> HUGETLB_ANONHUGE_INODE,
> (flags >> MFD_HUGE_SHIFT) &
> 

Re: [PATCH v5 01/13] mm/memfd: Introduce MFD_INACCESSIBLE flag

2022-04-12 Thread Chao Peng
On Mon, Apr 11, 2022 at 06:10:23PM +0300, Kirill A. Shutemov wrote:
> On Thu, Mar 10, 2022 at 10:08:59PM +0800, Chao Peng wrote:
> > From: "Kirill A. Shutemov" 
> > 
> > Introduce a new memfd_create() flag indicating the content of the
> > created memfd is inaccessible from userspace through ordinary MMU
> > access (e.g., read/write/mmap). However, the file content can be
> > accessed via a different mechanism (e.g. KVM MMU) indirectly.
> > 
> > It provides semantics required for KVM guest private memory support
> > that a file descriptor with this flag set is going to be used as the
> > source of guest memory in confidential computing environments such
> > as Intel TDX/AMD SEV but may not be accessible from host userspace.
> > 
> > Since page migration/swapping is not yet supported for such usages
> > so these pages are currently marked as UNMOVABLE and UNEVICTABLE
> > which makes them behave like long-term pinned pages.
> > 
> > The flag can not coexist with MFD_ALLOW_SEALING, future sealing is
> > also impossible for a memfd created with this flag.
> > 
> > At this time only shmem implements this flag.
> > 
> > Signed-off-by: Kirill A. Shutemov 
> > Signed-off-by: Chao Peng 
> > ---
> >  include/linux/shmem_fs.h   |  7 +
> >  include/uapi/linux/memfd.h |  1 +
> >  mm/memfd.c | 26 +++--
> >  mm/shmem.c | 57 ++
> >  4 files changed, 88 insertions(+), 3 deletions(-)
> > 
> > diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> > index e65b80ed09e7..2dde843f28ef 100644
> > --- a/include/linux/shmem_fs.h
> > +++ b/include/linux/shmem_fs.h
> > @@ -12,6 +12,9 @@
> >  
> >  /* inode in-kernel data */
> >  
> > +/* shmem extended flags */
> > +#define SHM_F_INACCESSIBLE 0x0001  /* prevent ordinary MMU access (e.g. 
> > read/write/mmap) to file content */
> > +
> >  struct shmem_inode_info {
> > spinlock_t  lock;
> > unsigned intseals;  /* shmem seals */
> > @@ -24,6 +27,7 @@ struct shmem_inode_info {
> > struct shared_policypolicy; /* NUMA memory alloc policy */
> > struct simple_xattrsxattrs; /* list of xattrs */
> > atomic_tstop_eviction;  /* hold when working on inode */
> > +   unsigned intxflags; /* shmem extended flags */
> > struct inodevfs_inode;
> >  };
> >  
> 
> AFAICS, only two bits of 'flags' are used. And that's very strange that
> VM_ flags are used for the purpose. My guess that someone was lazy to
> introduce new constants for this.
> 
> I think we should fix this: introduce SHM_F_LOCKED and SHM_F_NORESERVE
> alongside with SHM_F_INACCESSIBLE and stuff them all into info->flags.
> It also makes shmem_file_setup_xflags() go away.

Did a quick search, and it sounds like we only use SHM_F_LOCKED/SHM_F_NORESERVE,
and those definitely don't have to be VM_ flags.

Chao
> 
> -- 
>  Kirill A. Shutemov



Re: [PATCH v5 01/13] mm/memfd: Introduce MFD_INACCESSIBLE flag

2022-04-11 Thread Kirill A. Shutemov
On Thu, Mar 10, 2022 at 10:08:59PM +0800, Chao Peng wrote:
> From: "Kirill A. Shutemov" 
> 
> Introduce a new memfd_create() flag indicating the content of the
> created memfd is inaccessible from userspace through ordinary MMU
> access (e.g., read/write/mmap). However, the file content can be
> accessed via a different mechanism (e.g. KVM MMU) indirectly.
> 
> It provides semantics required for KVM guest private memory support
> that a file descriptor with this flag set is going to be used as the
> source of guest memory in confidential computing environments such
> as Intel TDX/AMD SEV but may not be accessible from host userspace.
> 
> Since page migration/swapping is not yet supported for such usages
> so these pages are currently marked as UNMOVABLE and UNEVICTABLE
> which makes them behave like long-term pinned pages.
> 
> The flag can not coexist with MFD_ALLOW_SEALING, future sealing is
> also impossible for a memfd created with this flag.
> 
> At this time only shmem implements this flag.
> 
> Signed-off-by: Kirill A. Shutemov 
> Signed-off-by: Chao Peng 
> ---
>  include/linux/shmem_fs.h   |  7 +
>  include/uapi/linux/memfd.h |  1 +
>  mm/memfd.c | 26 +++--
>  mm/shmem.c | 57 ++
>  4 files changed, 88 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> index e65b80ed09e7..2dde843f28ef 100644
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -12,6 +12,9 @@
>  
>  /* inode in-kernel data */
>  
> +/* shmem extended flags */
> +#define SHM_F_INACCESSIBLE   0x0001  /* prevent ordinary MMU access (e.g. 
> read/write/mmap) to file content */
> +
>  struct shmem_inode_info {
>   spinlock_t  lock;
>   unsigned intseals;  /* shmem seals */
> @@ -24,6 +27,7 @@ struct shmem_inode_info {
>   struct shared_policypolicy; /* NUMA memory alloc policy */
>   struct simple_xattrsxattrs; /* list of xattrs */
>   atomic_tstop_eviction;  /* hold when working on inode */
> + unsigned intxflags; /* shmem extended flags */
>   struct inodevfs_inode;
>  };
>  

AFAICS, only two bits of 'flags' are used. And it's very strange that
VM_ flags are used for the purpose. My guess is that someone was too lazy
to introduce new constants for this.

I think we should fix this: introduce SHM_F_LOCKED and SHM_F_NORESERVE
alongside SHM_F_INACCESSIBLE and stuff them all into info->flags.
It also makes shmem_file_setup_xflags() go away.

-- 
 Kirill A. Shutemov



[PATCH v5 01/13] mm/memfd: Introduce MFD_INACCESSIBLE flag

2022-03-10 Thread Chao Peng
From: "Kirill A. Shutemov" 

Introduce a new memfd_create() flag indicating the content of the
created memfd is inaccessible from userspace through ordinary MMU
access (e.g., read/write/mmap). However, the file content can be
accessed via a different mechanism (e.g. KVM MMU) indirectly.

It provides semantics required for KVM guest private memory support
that a file descriptor with this flag set is going to be used as the
source of guest memory in confidential computing environments such
as Intel TDX/AMD SEV but may not be accessible from host userspace.

Since page migration/swapping is not yet supported for such usages,
these pages are currently marked as UNMOVABLE and UNEVICTABLE,
which makes them behave like long-term pinned pages.

The flag cannot coexist with MFD_ALLOW_SEALING; future sealing is
also impossible for a memfd created with this flag.

At this time only shmem implements this flag.

Signed-off-by: Kirill A. Shutemov 
Signed-off-by: Chao Peng 
---
 include/linux/shmem_fs.h   |  7 +
 include/uapi/linux/memfd.h |  1 +
 mm/memfd.c | 26 +++--
 mm/shmem.c | 57 ++
 4 files changed, 88 insertions(+), 3 deletions(-)

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index e65b80ed09e7..2dde843f28ef 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -12,6 +12,9 @@
 
 /* inode in-kernel data */
 
+/* shmem extended flags */
+#define SHM_F_INACCESSIBLE 0x0001  /* prevent ordinary MMU access (e.g. read/write/mmap) to file content */
+
 struct shmem_inode_info {
spinlock_t  lock;
unsigned intseals;  /* shmem seals */
@@ -24,6 +27,7 @@ struct shmem_inode_info {
struct shared_policypolicy; /* NUMA memory alloc policy */
struct simple_xattrsxattrs; /* list of xattrs */
atomic_tstop_eviction;  /* hold when working on inode */
+   unsigned intxflags; /* shmem extended flags */
struct inodevfs_inode;
 };
 
@@ -61,6 +65,9 @@ extern struct file *shmem_file_setup(const char *name,
loff_t size, unsigned long flags);
 extern struct file *shmem_kernel_file_setup(const char *name, loff_t size,
unsigned long flags);
+extern struct file *shmem_file_setup_xflags(const char *name, loff_t size,
+   unsigned long flags,
+   unsigned int xflags);
 extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt,
const char *name, loff_t size, unsigned long flags);
 extern int shmem_zero_setup(struct vm_area_struct *);
diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
index 7a8a26751c23..48750474b904 100644
--- a/include/uapi/linux/memfd.h
+++ b/include/uapi/linux/memfd.h
@@ -8,6 +8,7 @@
 #define MFD_CLOEXEC0x0001U
 #define MFD_ALLOW_SEALING  0x0002U
 #define MFD_HUGETLB0x0004U
+#define MFD_INACCESSIBLE   0x0008U
 
 /*
  * Huge page size encoding when MFD_HUGETLB is specified, and a huge page
diff --git a/mm/memfd.c b/mm/memfd.c
index 9f80f162791a..74d45a26cf5d 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -245,16 +245,20 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
 #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
 
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | \
+  MFD_INACCESSIBLE)
 
 SYSCALL_DEFINE2(memfd_create,
const char __user *, uname,
unsigned int, flags)
 {
+   struct address_space *mapping;
unsigned int *file_seals;
+   unsigned int xflags;
struct file *file;
int fd, error;
char *name;
+   gfp_t gfp;
long len;
 
if (!(flags & MFD_HUGETLB)) {
@@ -267,6 +271,10 @@ SYSCALL_DEFINE2(memfd_create,
return -EINVAL;
}
 
+   /* Disallow sealing when MFD_INACCESSIBLE is set. */
+   if (flags & MFD_INACCESSIBLE && flags & MFD_ALLOW_SEALING)
+   return -EINVAL;
+
/* length includes terminating zero */
len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
if (len <= 0)
@@ -301,8 +309,11 @@ SYSCALL_DEFINE2(memfd_create,
HUGETLB_ANONHUGE_INODE,
(flags >> MFD_HUGE_SHIFT) &
MFD_HUGE_MASK);
-   } else
-   file = shmem_file_setup(name, 0, VM_NORESERVE);
+   } else {
+   xflags = flags & MFD_INACCESSIBLE ? SHM_F_INACCESSIBLE : 0;
+   file = shmem_file_setup_xflags(name, 0, VM_NORESERVE,