Module Name:    src
Committed By:   christos
Date:           Mon Jul 10 02:31:55 UTC 2023

Modified Files:
        src/distrib/sets/lists/comp: mi
        src/lib/libc/sys: Makefile.inc fcntl.2
        src/sys/compat/linux/arch/amd64: syscalls.master
        src/sys/compat/linux/common: linux_fcntl.h linux_file.c linux_misc.c
            linux_sysctl.c
        src/sys/kern: sys_descrip.c vfs_syscalls.c vfs_vnops.c
        src/sys/sys: fcntl.h file.h mman.h
        src/usr.bin/fstat: fstat.c misc.c
Added Files:
        src/lib/libc/sys: memfd_create.2
        src/sys/kern: sys_memfd.c

Log Message:
Add memfd_create(2) from GSoC 2023 by Theodore Preduta


To generate a diff of this commit:
cvs rdiff -u -r1.2436 -r1.2437 src/distrib/sets/lists/comp/mi
cvs rdiff -u -r1.250 -r1.251 src/lib/libc/sys/Makefile.inc
cvs rdiff -u -r1.49 -r1.50 src/lib/libc/sys/fcntl.2
cvs rdiff -u -r0 -r1.1 src/lib/libc/sys/memfd_create.2
cvs rdiff -u -r1.67 -r1.68 src/sys/compat/linux/arch/amd64/syscalls.master
cvs rdiff -u -r1.20 -r1.21 src/sys/compat/linux/common/linux_fcntl.h
cvs rdiff -u -r1.122 -r1.123 src/sys/compat/linux/common/linux_file.c
cvs rdiff -u -r1.256 -r1.257 src/sys/compat/linux/common/linux_misc.c
cvs rdiff -u -r1.47 -r1.48 src/sys/compat/linux/common/linux_sysctl.c
cvs rdiff -u -r1.47 -r1.48 src/sys/kern/sys_descrip.c
cvs rdiff -u -r0 -r1.1 src/sys/kern/sys_memfd.c
cvs rdiff -u -r1.559 -r1.560 src/sys/kern/vfs_syscalls.c
cvs rdiff -u -r1.241 -r1.242 src/sys/kern/vfs_vnops.c
cvs rdiff -u -r1.54 -r1.55 src/sys/sys/fcntl.h
cvs rdiff -u -r1.92 -r1.93 src/sys/sys/file.h
cvs rdiff -u -r1.62 -r1.63 src/sys/sys/mman.h
cvs rdiff -u -r1.117 -r1.118 src/usr.bin/fstat/fstat.c
cvs rdiff -u -r1.24 -r1.25 src/usr.bin/fstat/misc.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/distrib/sets/lists/comp/mi
diff -u src/distrib/sets/lists/comp/mi:1.2436 src/distrib/sets/lists/comp/mi:1.2437
--- src/distrib/sets/lists/comp/mi:1.2436	Sat Jul  8 13:43:13 2023
+++ src/distrib/sets/lists/comp/mi	Sun Jul  9 22:31:54 2023
@@ -1,4 +1,4 @@
-#	$NetBSD: mi,v 1.2436 2023/07/08 17:43:13 christos Exp $
+#	$NetBSD: mi,v 1.2437 2023/07/10 02:31:54 christos Exp $
 #
 # Note: don't delete entries from here - mark them as "obsolete" instead.
 ./etc/mtree/set.comp				comp-sys-root
@@ -4826,6 +4826,7 @@
 ./usr/share/man/cat2/lutimes.0			comp-c-catman		.cat
 ./usr/share/man/cat2/m68k_sync_icache.0		comp-c-catman		.cat
 ./usr/share/man/cat2/madvise.0			comp-c-catman		.cat
+./usr/share/man/cat2/memfd_create.0		comp-c-catman		.cat
 ./usr/share/man/cat2/mincore.0			comp-c-catman		.cat
 ./usr/share/man/cat2/minherit.0			comp-c-catman		.cat
 ./usr/share/man/cat2/mkdir.0			comp-c-catman		.cat
@@ -13315,6 +13316,7 @@
 ./usr/share/man/html2/lutimes.html		comp-c-htmlman		html
 ./usr/share/man/html2/m68k_sync_icache.html	comp-c-htmlman		html
 ./usr/share/man/html2/madvise.html		comp-c-htmlman		html
+./usr/share/man/html2/memfd_create.html		comp-c-htmlman		html
 ./usr/share/man/html2/mincore.html		comp-c-htmlman		html
 ./usr/share/man/html2/minherit.html		comp-c-htmlman		html
 ./usr/share/man/html2/mkdir.html		comp-c-htmlman		html
@@ -21632,6 +21634,7 @@
 ./usr/share/man/man2/lutimes.2			comp-c-man		.man
 ./usr/share/man/man2/m68k_sync_icache.2		comp-c-man		.man
 ./usr/share/man/man2/madvise.2			comp-c-man		.man
+./usr/share/man/man2/memfd_create.2		comp-c-man		.man
 ./usr/share/man/man2/mincore.2			comp-c-man		.man
 ./usr/share/man/man2/minherit.2			comp-c-man		.man
 ./usr/share/man/man2/mkdir.2			comp-c-man		.man

Index: src/lib/libc/sys/Makefile.inc
diff -u src/lib/libc/sys/Makefile.inc:1.250 src/lib/libc/sys/Makefile.inc:1.251
--- src/lib/libc/sys/Makefile.inc:1.250	Mon Nov  1 01:53:45 2021
+++ src/lib/libc/sys/Makefile.inc	Sun Jul  9 22:31:54 2023
@@ -1,4 +1,4 @@
-#	$NetBSD: Makefile.inc,v 1.250 2021/11/01 05:53:45 thorpej Exp $
+#	$NetBSD: Makefile.inc,v 1.251 2023/07/10 02:31:54 christos Exp $
 #	@(#)Makefile.inc	8.3 (Berkeley) 10/24/94
 
 # sys sources
@@ -134,7 +134,8 @@ ASM=\
 		_lwp_unpark_all.S _lwp_suspend.S _lwp_continue.S \
 		_lwp_wakeup.S _lwp_detach.S _lwp_setprivate.S \
 		_lwp_setname.S _lwp_getname.S _lwp_ctl.S \
-	madvise.S mincore.S minherit.S mkdir.S mkdirat.S mkfifo.S mkfifoat.S \
+	madvise.S memfd_create.S mincore.S minherit.S mkdir.S mkdirat.S \
+		mkfifo.S mkfifoat.S \
 		__mknod50.S mlock.S mlockall.S modctl.S __mount50.S \
 		mprotect.S __msgctl50.S msgget.S munlock.S munlockall.S \
 		munmap.S \
@@ -275,7 +276,7 @@ MAN+=	accept.2 access.2 acct.2 adjtime.2
 	_lwp_suspend.2 _lwp_wakeup.2 _lwp_wait.2 _lwp_kill.2 \
 	_lwp_getname.2 _lwp_getprivate.2 \
 	_lwp_park.2 _lwp_unpark.2 _lwp_unpark_all.2 \
-	mkdir.2 mkfifo.2 mknod.2 \
+	memfd_create.2 mkdir.2 mkfifo.2 mknod.2 \
 	madvise.2 mincore.2 minherit.2 mlock.2 mlockall.2 mmap.2 modctl.2 \
 	mount.2 \
 	mprotect.2 mremap.2 msgctl.2 msgget.2 msgrcv.2 msgsnd.2 msync.2 \

Index: src/lib/libc/sys/fcntl.2
diff -u src/lib/libc/sys/fcntl.2:1.49 src/lib/libc/sys/fcntl.2:1.50
--- src/lib/libc/sys/fcntl.2:1.49	Sun Dec  4 14:01:19 2022
+++ src/lib/libc/sys/fcntl.2	Sun Jul  9 22:31:54 2023
@@ -1,4 +1,4 @@
-.\"	$NetBSD: fcntl.2,v 1.49 2022/12/04 19:01:19 uwe Exp $
+.\"	$NetBSD: fcntl.2,v 1.50 2023/07/10 02:31:54 christos Exp $
 .\"
 .\" Copyright (c) 1983, 1993
 .\"	The Regents of the University of California.  All rights reserved.
@@ -29,7 +29,7 @@
 .\"
 .\"     @(#)fcntl.2	8.2 (Berkeley) 1/12/94
 .\"
-.Dd September 26, 2019
+.Dd July 5, 2023
 .Dt FCNTL 2
 .Os
 .Sh NAME
@@ -162,6 +162,24 @@ in the buffer pointed to by
 .Fa arg
 should be pointing to a buffer of at least
 .Dv MAXPATHLEN .
+.It Dv F_ADD_SEALS
+Add seals specified in
+.Fa arg
+to
+.Fa fd
+to restrict possible operations on
+.Fa fd
+as described below.
+Like flags, multiple seals can be specified at once.
+Additionally, specifying seals that are already associated with
+.Fa fd
+is a no-op.
+.It Dv F_GET_SEALS
+Get the seals currently associated with
+.Fa fd
+as described below
+.Fa ( arg
+is ignored).
 .El
 .Pp
 The set of valid flags for the
@@ -324,13 +342,44 @@ or an
 request fails or blocks respectively when another process has existing
 locks on bytes in the specified region and the type of any of those
 locks conflicts with the type specified in the request.
+.Pp
+Possible seals are:
+.Bl -tag -width F_SEAL_FUTURE_WRITE
+.It Dv F_SEAL_SEAL
+Prevent any further seals from being added to
+.Fa fd .
+.It Dv F_SEAL_SHRINK
+Prevent the size of
+.Fa fd
+from decreasing.
+.It Dv F_SEAL_GROW
+Prevent the size of
+.Fa fd
+from increasing.
+.It Dv F_SEAL_WRITE
+Prevent any write operations to
+.Fa fd .
+.Dv F_SEAL_WRITE
+cannot be applied if
+.Fa fd
+has any memory mappings.
+.It Dv F_SEAL_FUTURE_WRITE
+Like
+.Dv F_SEAL_WRITE
+but allow any current memory mappings of
+.Fa fd
+to remain open, including those with
+.Dv PROT_WRITE .
+.El
 .Sh NOTES
-The
-.Dv F_GETPATH
-functionality is implemented using the reverse
+For
+.Dv F_GETPATH :
+.Bl -bullet -compact
+.It
+For vnodes, functionality is implemented using the reverse
 .Xr namei 9
 cache.
-The implications of this are:
+The implications of this are:
 .Bl -bullet -compact
 .It
 For hard links where the file descriptor can resolve to multiple pathnames,
@@ -341,16 +390,25 @@ may fail if the corresponding entry has 
 .Xr namei 9
 cache and return
 .Er ENOENT .
+.El
 .It
-File descriptors that don't point to vnodes are not handled, as
-well as symbolic links since there is currently no way to obtain
-a file descriptor pointing to a symbolic link.
+For a file descriptor created by
+.Xr memfd_create 2 ,
+the name provided at
+.Fa fd
+creation, with the prefix
+.Dq memfd: ,
+is used.
+.It
+Other types of file descriptors are not handled, as well as symbolic
+links since there is currently no way to obtain a file descriptor
+pointing to a symbolic link.
 .El
 .Sh RETURN VALUES
 Upon successful completion, the value returned depends on
 .Fa cmd
 as follows:
-.Bl -tag -width F_GETOWNX -offset indent
+.Bl -tag -width F_GET_SEALS -offset indent
 .It Dv F_DUPFD
 A new file descriptor.
 .It Dv F_GETFD
@@ -361,6 +419,9 @@ Value of flags.
 Value of file descriptor owner.
 .It Dv F_MAXFD
 Value of the highest file descriptor open by the process.
+.It Dv F_GET_SEALS
+Value of the seals currently associated with
+.Fa fd .
 .It other
 Value other than \-1.
 .El
@@ -473,6 +534,18 @@ is an exclusive lock
 and
 .Fa fildes
 is not a valid file descriptor open for writing.
+.It Bq Er EBUSY
+The argument
+.Fa cmd
+is
+.Dv F_ADD_SEALS ,
+.Fa arg
+contains
+.Dv F_SEAL_WRITE
+and
+.Fa fd
+is currently mapped by
+.Xr mmap 2 .
 .It Bq Er EDEADLK
 The argument
 .Fa cmd
@@ -512,6 +585,24 @@ and the data to which
 points is not valid, or
 .Fa fildes
 refers to a file that does not support locking.
+.Pp
+The argument
+.Fa cmd
+is
+.Dv F_ADD_SEALS
+or
+.Dv F_GET_SEALS
+and
+.Fa fd
+does not support seals.
+.Pp
+The argument
+.Fa cmd
+is
+.Dv F_ADD_SEALS
+and
+.Fa arg
+contains set bits for unsupported seals.
 .It Bq Er EMFILE
 The argument
 .Fa cmd
@@ -562,6 +653,15 @@ has been reached.
 It can be modified using the
 .Li kern.maxfiles
 .Xr sysctl 7 .
+.It Bq Er EPERM
+The argument
+.Fa cmd
+is
+.Dv F_ADD_SEALS
+and
+.Fa fd
+already has
+.Dv F_SEAL_SEAL .
 .It Bq Er ERANGE
 The argument
 .Fa cmd

Index: src/sys/compat/linux/arch/amd64/syscalls.master
diff -u src/sys/compat/linux/arch/amd64/syscalls.master:1.67 src/sys/compat/linux/arch/amd64/syscalls.master:1.68
--- src/sys/compat/linux/arch/amd64/syscalls.master:1.67	Wed Dec  1 23:29:48 2021
+++ src/sys/compat/linux/arch/amd64/syscalls.master	Sun Jul  9 22:31:55 2023
@@ -1,4 +1,4 @@
-	$NetBSD: syscalls.master,v 1.67 2021/12/02 04:29:48 ryo Exp $
+	$NetBSD: syscalls.master,v 1.68 2023/07/10 02:31:55 christos Exp $
 
 ;	@(#)syscalls.master	8.1 (Berkeley) 7/19/93
 
@@ -567,6 +567,145 @@
 311	UNIMPL		process_vm_writev
 312	UNIMPL		kcmp
 313	UNIMPL		finit_module
+314	UNIMPL		sched_setattr
+315	UNIMPL		sched_getattr
+316	UNIMPL		renameat2
+317	UNIMPL		seccomp
+318	NOARGS		{ ssize_t|sys||getrandom(void *buf, size_t buflen, \
+			    unsigned int flags); }
+319	STD		{ int|linux_sys||memfd_create(const char *name, \
+			    unsigned int flags); }
+320	UNIMPL		kexec_file_load
+321	UNIMPL		bpf
+322	UNIMPL		execveat
+323	UNIMPL		userfaultfd
+324	UNIMPL		membarrier
+325	UNIMPL		mlock2
+326	UNIMPL		copy_file_range
+327	UNIMPL		preadv2
+328	UNIMPL		pwritev2
+329	UNIMPL		pkey_mprotect
+330	UNIMPL		pkey_alloc
+331	UNIMPL		pkey_free
+332	UNIMPL		statx
+333	UNIMPL		io_pgetevents
+334	UNIMPL		rseq
+335	UNIMPL
+336	UNIMPL
+337	UNIMPL
+338	UNIMPL
+339	UNIMPL
+340	UNIMPL
+341	UNIMPL
+342	UNIMPL
+343	UNIMPL
+344	UNIMPL
+345	UNIMPL
+346	UNIMPL
+347	UNIMPL
+348	UNIMPL
+349	UNIMPL
+350	UNIMPL
+351	UNIMPL
+352	UNIMPL
+353	UNIMPL
+354	UNIMPL
+355	UNIMPL
+356	UNIMPL
+357	UNIMPL
+358	UNIMPL
+359	UNIMPL
+360	UNIMPL
+361	UNIMPL
+362	UNIMPL
+363	UNIMPL
+364	UNIMPL
+365	UNIMPL
+366	UNIMPL
+367	UNIMPL
+368	UNIMPL
+369	UNIMPL
+370	UNIMPL
+371	UNIMPL
+372	UNIMPL
+373	UNIMPL
+374	UNIMPL
+375	UNIMPL
+376	UNIMPL
+377	UNIMPL
+378	UNIMPL
+379	UNIMPL
+380	UNIMPL
+381	UNIMPL
+382	UNIMPL
+383	UNIMPL
+384	UNIMPL
+385	UNIMPL
+386	UNIMPL
+387	UNIMPL
+388	UNIMPL
+389	UNIMPL
+390	UNIMPL
+391	UNIMPL
+392	UNIMPL
+393	UNIMPL
+394	UNIMPL
+395	UNIMPL
+396	UNIMPL
+397	UNIMPL
+398	UNIMPL
+399	UNIMPL
+400	UNIMPL
+401	UNIMPL
+402	UNIMPL
+403	UNIMPL
+404	UNIMPL
+405	UNIMPL
+406	UNIMPL
+407	UNIMPL
+408	UNIMPL
+409	UNIMPL
+410	UNIMPL
+411	UNIMPL
+412	UNIMPL
+413	UNIMPL
+414	UNIMPL
+415	UNIMPL
+416	UNIMPL
+417	UNIMPL
+418	UNIMPL
+419	UNIMPL
+420	UNIMPL
+421	UNIMPL
+422	UNIMPL
+423	UNIMPL
+424	UNIMPL		pidfd_send_signal
+425	UNIMPL		io_uring_setup
+426	UNIMPL		io_uring_enter
+427	UNIMPL		io_uring_register
+428	UNIMPL		open_tree
+429	UNIMPL		move_mount
+430	UNIMPL		fsopen
+431	UNIMPL		fsconfig
+432	UNIMPL		fsmount
+433	UNIMPL		fspick
+434	UNIMPL		pidfd_open
+435	UNIMPL		clone3
+436	UNIMPL		close_range
+437	UNIMPL		openat2
+438	UNIMPL		pidfd_getfd
+439	UNIMPL		faccessat2
+440	UNIMPL		process_madvise
+441	UNIMPL		epoll_pwait2
+442	UNIMPL		mount_setattr
+443	UNIMPL		quotactl_fd
+444	UNIMPL		landlock_create_ruleset
+445	UNIMPL		landlock_add_rule
+446	UNIMPL		landlock_restrict_self
+447	UNIMPL		memfd_secret
+448	UNIMPL		process_mrelease
+449	UNIMPL		futex_waitv
+450	UNIMPL		set_mempolicy_home_node
 
 ; we want a "nosys" syscall, we'll just add an extra entry for it.
-314	STD		{ int|linux_sys||nosys(void); }
+451	STD		{ int|linux_sys||nosys(void); }

Index: src/sys/compat/linux/common/linux_fcntl.h
diff -u src/sys/compat/linux/common/linux_fcntl.h:1.20 src/sys/compat/linux/common/linux_fcntl.h:1.21
--- src/sys/compat/linux/common/linux_fcntl.h:1.20	Wed Nov 24 21:27:08 2021
+++ src/sys/compat/linux/common/linux_fcntl.h	Sun Jul  9 22:31:55 2023
@@ -1,4 +1,4 @@
-/*	$NetBSD: linux_fcntl.h,v 1.20 2021/11/25 02:27:08 ryo Exp $	*/
+/*	$NetBSD: linux_fcntl.h,v 1.21 2023/07/10 02:31:55 christos Exp $	*/
 
 /*-
  * Copyright (c) 1995, 1998 The NetBSD Foundation, Inc.
@@ -100,6 +100,8 @@ struct linux_flock64 {
 #define	LINUX_F_DUPFD_CLOEXEC 	(LINUX_F_SPECIFIC_BASE + 6)
 #define	LINUX_F_SETPIPE_SZ 	(LINUX_F_SPECIFIC_BASE + 7)
 #define	LINUX_F_GETPIPE_SZ 	(LINUX_F_SPECIFIC_BASE + 8)
+#define	LINUX_F_ADD_SEALS	(LINUX_F_SPECIFIC_BASE + 9)
+#define	LINUX_F_GET_SEALS	(LINUX_F_SPECIFIC_BASE + 10)
 
 /*
  * We have to have 4 copies of the code that converts linux fcntl() file

Index: src/sys/compat/linux/common/linux_file.c
diff -u src/sys/compat/linux/common/linux_file.c:1.122 src/sys/compat/linux/common/linux_file.c:1.123
--- src/sys/compat/linux/common/linux_file.c:1.122	Wed Nov 24 22:08:04 2021
+++ src/sys/compat/linux/common/linux_file.c	Sun Jul  9 22:31:55 2023
@@ -1,4 +1,4 @@
-/*	$NetBSD: linux_file.c,v 1.122 2021/11/25 03:08:04 ryo Exp $	*/
+/*	$NetBSD: linux_file.c,v 1.123 2023/07/10 02:31:55 christos Exp $	*/
 
 /*-
  * Copyright (c) 1995, 1998, 2008 The NetBSD Foundation, Inc.
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: linux_file.c,v 1.122 2021/11/25 03:08:04 ryo Exp $");
+__KERNEL_RCSID(0, "$NetBSD: linux_file.c,v 1.123 2023/07/10 02:31:55 christos Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -440,6 +440,14 @@ linux_sys_fcntl(struct lwp *l, const str
 		cmd = F_DUPFD_CLOEXEC;
 		break;
 
+	case LINUX_F_ADD_SEALS:
+		cmd = F_ADD_SEALS;
+		break;
+
+	case LINUX_F_GET_SEALS:
+		cmd = F_GET_SEALS;
+		break;
+
 	default:
 		return EOPNOTSUPP;
 	}

Index: src/sys/compat/linux/common/linux_misc.c
diff -u src/sys/compat/linux/common/linux_misc.c:1.256 src/sys/compat/linux/common/linux_misc.c:1.257
--- src/sys/compat/linux/common/linux_misc.c:1.256	Wed Dec  1 23:29:48 2021
+++ src/sys/compat/linux/common/linux_misc.c	Sun Jul  9 22:31:55 2023
@@ -1,4 +1,4 @@
-/*	$NetBSD: linux_misc.c,v 1.256 2021/12/02 04:29:48 ryo Exp $	*/
+/*	$NetBSD: linux_misc.c,v 1.257 2023/07/10 02:31:55 christos Exp $	*/
 
 /*-
  * Copyright (c) 1995, 1998, 1999, 2008 The NetBSD Foundation, Inc.
@@ -57,7 +57,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: linux_misc.c,v 1.256 2021/12/02 04:29:48 ryo Exp $");
+__KERNEL_RCSID(0, "$NetBSD: linux_misc.c,v 1.257 2023/07/10 02:31:55 christos Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -169,9 +169,9 @@ const struct linux_mnttypes linux_fstype
 const int linux_fstypes_cnt = sizeof(linux_fstypes) / sizeof(linux_fstypes[0]);
 
 # ifdef DEBUG_LINUX
-#define DPRINTF(a)	uprintf a
+#define	DPRINTF(a)	uprintf a
 # else
-#define DPRINTF(a)
+#define	DPRINTF(a)
 # endif
 
 /* Local linux_misc.c functions: */
@@ -1681,3 +1681,66 @@ linux_sys_eventfd2(struct lwp *l, const 
 	return linux_do_eventfd2(l, SCARG(uap, initval), SCARG(uap, flags),
 				 retval);
 }
+
+#define	LINUX_MFD_CLOEXEC	0x0001U
+#define	LINUX_MFD_ALLOW_SEALING	0x0002U
+#define	LINUX_MFD_HUGETLB	0x0004U
+#define	LINUX_MFD_NOEXEC_SEAL	0x0008U
+#define	LINUX_MFD_EXEC		0x0010U
+#define	LINUX_MFD_HUGE_FLAGS	(0x3fU << 26)	/* U: result uses bit 31 */
+/* ALL: flags accepted from Linux; KNOWN: the subset handed on to NetBSD. */
+#define	LINUX_MFD_ALL_FLAGS	(LINUX_MFD_CLOEXEC|LINUX_MFD_ALLOW_SEALING \
+				|LINUX_MFD_HUGETLB|LINUX_MFD_NOEXEC_SEAL \
+				|LINUX_MFD_EXEC|LINUX_MFD_HUGE_FLAGS)
+#define	LINUX_MFD_KNOWN_FLAGS	(LINUX_MFD_CLOEXEC|LINUX_MFD_ALLOW_SEALING)
+/* Linux's limit on the name length, not counting the terminating NUL. */
+#define	LINUX_MFD_NAME_MAX	249
+
+/*
+ * memfd_create(2).  Reject unknown or inconsistent flags and over-long
+ * names (EINVAL), then call NetBSD's version with the flags it knows.
+ */
+int
+linux_sys_memfd_create(struct lwp *l,
+    const struct linux_sys_memfd_create_args *uap, register_t *retval)
+{
+	/* {
+		syscallarg(const char *) name;
+		syscallarg(unsigned int) flags;
+	} */
+	int error;
+	char *pbuf;
+	struct sys_memfd_create_args muap;
+	const unsigned int lflags = SCARG(uap, flags);
+
+	/* Both operands are constants: check when compiling, not running. */
+	CTASSERT(LINUX_MFD_NAME_MAX < NAME_MAX);
+
+	if (lflags & ~LINUX_MFD_ALL_FLAGS)
+		return EINVAL;
+	if ((lflags & LINUX_MFD_HUGE_FLAGS) != 0 &&
+	    (lflags & LINUX_MFD_HUGETLB) == 0)
+		return EINVAL;
+	if ((lflags & LINUX_MFD_HUGETLB) && (lflags & LINUX_MFD_ALLOW_SEALING))
+		return EINVAL;
+
+	/*
+	 * Linux has a stricter limit for name size.  The copy only serves
+	 * to measure the name; sys_memfd_create() is given the user pointer.
+	 */
+	pbuf = PNBUF_GET();
+	error = copyinstr(SCARG(uap, name), pbuf, LINUX_MFD_NAME_MAX+1, NULL);
+	PNBUF_PUT(pbuf);
+	if (error != 0)
+		return error == ENAMETOOLONG ? EINVAL : error;
+
+	if (lflags & ~LINUX_MFD_KNOWN_FLAGS) {
+		DPRINTF(("linux_sys_memfd_create: ignored flags %x\n",
+		    lflags & ~LINUX_MFD_KNOWN_FLAGS));
+	}
+
+	SCARG(&muap, name) = SCARG(uap, name);
+	SCARG(&muap, flags) = lflags & LINUX_MFD_KNOWN_FLAGS;
+
+	return sys_memfd_create(l, &muap, retval);
+}

Index: src/sys/compat/linux/common/linux_sysctl.c
diff -u src/sys/compat/linux/common/linux_sysctl.c:1.47 src/sys/compat/linux/common/linux_sysctl.c:1.48
--- src/sys/compat/linux/common/linux_sysctl.c:1.47	Thu Sep 23 02:56:27 2021
+++ src/sys/compat/linux/common/linux_sysctl.c	Sun Jul  9 22:31:55 2023
@@ -1,4 +1,4 @@
-/*	$NetBSD: linux_sysctl.c,v 1.47 2021/09/23 06:56:27 ryo Exp $	*/
+/*	$NetBSD: linux_sysctl.c,v 1.48 2023/07/10 02:31:55 christos Exp $	*/
 
 /*-
  * Copyright (c) 2003, 2008 The NetBSD Foundation, Inc.
@@ -34,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: linux_sysctl.c,v 1.47 2021/09/23 06:56:27 ryo Exp $");
+__KERNEL_RCSID(0, "$NetBSD: linux_sysctl.c,v 1.48 2023/07/10 02:31:55 christos Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -57,8 +57,8 @@ __KERNEL_RCSID(0, "$NetBSD: linux_sysctl
 #include <compat/linux/common/linux_machdep.h>
 
 char linux_sysname[128] = "Linux";
-char linux_release[128] = "3.11.6";
-char linux_version[128] = "#1 SMP PREEMPT Thu Oct 24 16:23:02 UTC 2013";
+char linux_release[128] = "6.3.10";
+char linux_version[128] = "#1 SMP PREEMPT_DYNAMIC Wed Jun 28 18:34:30 UTC 2023";
 
 struct sysctlnode linux_sysctl_root = {
 	.sysctl_flags = SYSCTL_VERSION|

Index: src/sys/kern/sys_descrip.c
diff -u src/sys/kern/sys_descrip.c:1.47 src/sys/kern/sys_descrip.c:1.48
--- src/sys/kern/sys_descrip.c:1.47	Sun May 14 05:29:58 2023
+++ src/sys/kern/sys_descrip.c	Sun Jul  9 22:31:55 2023
@@ -1,4 +1,4 @@
-/*	$NetBSD: sys_descrip.c,v 1.47 2023/05/14 09:29:58 riastradh Exp $	*/
+/*	$NetBSD: sys_descrip.c,v 1.48 2023/07/10 02:31:55 christos Exp $	*/
 
 /*-
  * Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
@@ -67,7 +67,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: sys_descrip.c,v 1.47 2023/05/14 09:29:58 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: sys_descrip.c,v 1.48 2023/07/10 02:31:55 christos Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -315,26 +315,6 @@ out:	if (fp)
 	return error;
 }
 
-static int
-do_fcntl_getpath(struct lwp *l, file_t *fp, char *upath)
-{
-	char *kpath;
-	int error;
-
-	if (fp->f_type != DTYPE_VNODE)
-		return EOPNOTSUPP;
-
-	kpath = PNBUF_GET();
-
-	error = vnode_to_path(kpath, MAXPATHLEN, fp->f_vnode, l, l->l_proc);
-	if (!error)
-		error = copyoutstr(kpath, upath, MAXPATHLEN, NULL);
-
-	PNBUF_PUT(kpath);
-
-	return error;
-}
-
 /*
  * The file control system call.
  */
@@ -350,6 +330,7 @@ sys_fcntl(struct lwp *l, const struct sy
 	filedesc_t *fdp;
 	fdtab_t *dt;
 	file_t *fp;
+	char *kpath;
 	struct flock fl;
 	bool cloexec = false;
 
@@ -486,7 +467,30 @@ sys_fcntl(struct lwp *l, const struct sy
 		break;
 
 	case F_GETPATH:
-		error = do_fcntl_getpath(l, fp, SCARG(uap, arg));
+		kpath = PNBUF_GET();
+
+		/* vnodes need extra context, so are handled separately */
+		if (fp->f_type == DTYPE_VNODE)
+			error = vnode_to_path(kpath, MAXPATHLEN, fp->f_vnode,
+			    l, l->l_proc);
+		else
+			error = (*fp->f_ops->fo_fcntl)(fp, F_GETPATH, kpath);
+
+		if (error == 0)
+			error = copyoutstr(kpath, SCARG(uap, arg), MAXPATHLEN,
+			    NULL);
+
+		PNBUF_PUT(kpath);
+		break;
+
+	case F_ADD_SEALS:
+		tmp = (int)(uintptr_t) SCARG(uap, arg);
+		error = (*fp->f_ops->fo_fcntl)(fp, F_ADD_SEALS, &tmp);
+		break;
+
+	case F_GET_SEALS:
+		error = (*fp->f_ops->fo_fcntl)(fp, F_GET_SEALS, &tmp);
+		*retval = tmp;
 		break;
 
 	default:

Index: src/sys/kern/vfs_syscalls.c
diff -u src/sys/kern/vfs_syscalls.c:1.559 src/sys/kern/vfs_syscalls.c:1.560
--- src/sys/kern/vfs_syscalls.c:1.559	Sat Apr 29 02:34:20 2023
+++ src/sys/kern/vfs_syscalls.c	Sun Jul  9 22:31:55 2023
@@ -1,4 +1,4 @@
-/*	$NetBSD: vfs_syscalls.c,v 1.559 2023/04/29 06:34:20 riastradh Exp $	*/
+/*	$NetBSD: vfs_syscalls.c,v 1.560 2023/07/10 02:31:55 christos Exp $	*/
 
 /*-
  * Copyright (c) 2008, 2009, 2019, 2020 The NetBSD Foundation, Inc.
@@ -70,7 +70,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.559 2023/04/29 06:34:20 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.560 2023/07/10 02:31:55 christos Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_fileassoc.h"
@@ -4149,34 +4149,19 @@ sys_ftruncate(struct lwp *l, const struc
 		syscallarg(int) pad;
 		syscallarg(off_t) length;
 	} */
-	struct vattr vattr;
-	struct vnode *vp;
 	file_t *fp;
-	int error;
+	int error, fd = SCARG(uap, fd);
 
-	if (SCARG(uap, length) < 0)
-		return EINVAL;
+	fp = fd_getfile(fd);
+	if (fp == NULL)
+		return EBADF;
+	if (fp->f_ops->fo_truncate == NULL)
+		error = EOPNOTSUPP;
+	else
+		error = (*fp->f_ops->fo_truncate)(fp, SCARG(uap, length));
 
-	/* fd_getvnode() will use the descriptor for us */
-	if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
-		return (error);
-	if ((fp->f_flag & FWRITE) == 0) {
-		error = EINVAL;
-		goto out;
-	}
-	vp = fp->f_vnode;
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-	if (vp->v_type == VDIR)
-		error = EISDIR;
-	else if ((error = vn_writechk(vp)) == 0) {
-		vattr_null(&vattr);
-		vattr.va_size = SCARG(uap, length);
-		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
-	}
-	VOP_UNLOCK(vp);
- out:
-	fd_putfile(SCARG(uap, fd));
-	return (error);
+	fd_putfile(fd);
+	return error;
 }
 
 /*

Index: src/sys/kern/vfs_vnops.c
diff -u src/sys/kern/vfs_vnops.c:1.241 src/sys/kern/vfs_vnops.c:1.242
--- src/sys/kern/vfs_vnops.c:1.241	Sat Apr 22 09:53:02 2023
+++ src/sys/kern/vfs_vnops.c	Sun Jul  9 22:31:55 2023
@@ -1,4 +1,4 @@
-/*	$NetBSD: vfs_vnops.c,v 1.241 2023/04/22 13:53:02 riastradh Exp $	*/
+/*	$NetBSD: vfs_vnops.c,v 1.242 2023/07/10 02:31:55 christos Exp $	*/
 
 /*-
  * Copyright (c) 2009 The NetBSD Foundation, Inc.
@@ -66,7 +66,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.241 2023/04/22 13:53:02 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.242 2023/07/10 02:31:55 christos Exp $");
 
 #include "veriexec.h"
 
@@ -125,6 +125,7 @@ static int vn_seek(struct file *, off_t,
 static int vn_advlock(struct file *, void *, int, struct flock *, int);
 static int vn_fpathconf(struct file *, int, register_t *);
 static int vn_posix_fadvise(struct file *, off_t, off_t, int);
+static int vn_truncate(file_t *, off_t);
 
 const struct fileops vnops = {
 	.fo_name = "vn",
@@ -142,6 +143,7 @@ const struct fileops vnops = {
 	.fo_advlock = vn_advlock,
 	.fo_fpathconf = vn_fpathconf,
 	.fo_posix_fadvise = vn_posix_fadvise,
+	.fo_truncate = vn_truncate,
 };
 
 /*
@@ -1331,6 +1333,33 @@ vn_posix_fadvise(struct file *fp, off_t 
 	return error;
 }
 
+/*
+ * fo_truncate for vnodes: set the size of the file behind fp to length.
+ * EINVAL for a negative length or a descriptor without FWRITE, EISDIR
+ * for a directory, otherwise the vn_writechk()/VOP_SETATTR() result.
+ */
+static int
+vn_truncate(file_t *fp, off_t length)
+{
+	struct vnode *vp = fp->f_vnode;
+	struct vattr va;
+	int error;
+
+	if (length < 0 || (fp->f_flag & FWRITE) == 0)
+		return EINVAL;
+
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	if (vp->v_type == VDIR) {
+		error = EISDIR;
+	} else if ((error = vn_writechk(vp)) == 0) {
+		vattr_null(&va);
+		va.va_size = length;
+		error = VOP_SETATTR(vp, &va, fp->f_cred);
+	}
+	VOP_UNLOCK(vp);
+	return error;
+}
+
 /*
  * Check that the vnode is still valid, and if so
  * acquire requested lock.

Index: src/sys/sys/fcntl.h
diff -u src/sys/sys/fcntl.h:1.54 src/sys/sys/fcntl.h:1.55
--- src/sys/sys/fcntl.h:1.54	Mon Mar 30 16:17:42 2020
+++ src/sys/sys/fcntl.h	Sun Jul  9 22:31:55 2023
@@ -1,4 +1,4 @@
-/*	$NetBSD: fcntl.h,v 1.54 2020/03/30 20:17:42 kamil Exp $	*/
+/*	$NetBSD: fcntl.h,v 1.55 2023/07/10 02:31:55 christos Exp $	*/
 
 /*-
  * Copyright (c) 1983, 1990, 1993
@@ -200,6 +200,8 @@
 #define	F_GETNOSIGPIPE	13		/* get SIGPIPE disposition */
 #define	F_SETNOSIGPIPE	14		/* set SIGPIPE disposition */
 #define	F_GETPATH	15		/* get pathname associated with fd */
+#define	F_ADD_SEALS	16		/* add seals */
+#define	F_GET_SEALS	17		/* get seals */
 #endif
 
 /* file descriptor flags (F_GETFD, F_SETFD) */
@@ -215,6 +217,15 @@
 #define	F_POSIX		0x040	 	/* Use POSIX semantics for lock */
 #endif
 
+/* types of seals (F_ADD_SEALS, F_GET_SEALS) */
+#if defined(_NETBSD_SOURCE)
+#define	F_SEAL_SEAL		0x0001	/* prevent further seals from being set */
+#define	F_SEAL_SHRINK		0x0002	/* prevent file from shrinking */
+#define	F_SEAL_GROW		0x0004	/* prevent file from growing */
+#define	F_SEAL_WRITE		0x0008	/* prevent writes */
+#define	F_SEAL_FUTURE_WRITE	0x0010	/* prevent future writes while mapped */
+#endif
+
 /* Constants for fcntl's passed to the underlying fs - like ioctl's. */
 #if defined(_NETBSD_SOURCE)
 #define	F_PARAM_MASK	0xfff

Index: src/sys/sys/file.h
diff -u src/sys/sys/file.h:1.92 src/sys/sys/file.h:1.93
--- src/sys/sys/file.h:1.92	Sat Apr 22 09:53:02 2023
+++ src/sys/sys/file.h	Sun Jul  9 22:31:55 2023
@@ -1,4 +1,4 @@
-/*	$NetBSD: file.h,v 1.92 2023/04/22 13:53:02 riastradh Exp $	*/
+/*	$NetBSD: file.h,v 1.93 2023/07/10 02:31:55 christos Exp $	*/
 
 /*-
  * Copyright (c) 2009 The NetBSD Foundation, Inc.
@@ -103,6 +103,7 @@ struct fileops {
 	int	(*fo_fpathconf)	(struct file *, int, register_t *);
 	int	(*fo_posix_fadvise)
 				(struct file *, off_t, off_t, int);
+	int	(*fo_truncate)	(struct file *, off_t);
 };
 
 union file_data {
@@ -121,6 +122,7 @@ union file_data {
 	struct mqueue *fd_mq;		// DTYPE_MQUEUE
 	struct ksem *fd_ks;		// DTYPE_SEM
 	struct iscsifd *fd_iscsi;	// DTYPE_MISC (iscsi)
+	struct memfd *fd_memfd;		// DTYPE_MEMFD
 };
 
 /*
@@ -160,6 +162,7 @@ struct file {
 #define f_ksem		f_undata.fd_ks
 #define f_eventfd	f_undata.fd_eventfd
 #define f_timerfd	f_undata.fd_timerfd
+#define f_memfd		f_undata.fd_memfd
 
 #define f_rndctx	f_undata.fd_rndctx
 #define f_audioctx	f_undata.fd_audioctx
@@ -184,10 +187,11 @@ struct file {
 #define	DTYPE_SEM	8		/* semaphore */
 #define	DTYPE_EVENTFD	9		/* eventfd */
 #define	DTYPE_TIMERFD	10		/* timerfd */
+#define	DTYPE_MEMFD	11		/* memfd */
 
 #define DTYPE_NAMES	\
     "0", "file", "socket", "pipe", "kqueue", "misc", "crypto", "mqueue", \
-    "semaphore", "eventfd", "timerfd"
+    "semaphore", "eventfd", "timerfd", "memfd"
 
 #ifdef _KERNEL
 

Index: src/sys/sys/mman.h
diff -u src/sys/sys/mman.h:1.62 src/sys/sys/mman.h:1.63
--- src/sys/sys/mman.h:1.62	Fri Dec  6 14:37:43 2019
+++ src/sys/sys/mman.h	Sun Jul  9 22:31:55 2023
@@ -1,4 +1,4 @@
-/*	$NetBSD: mman.h,v 1.62 2019/12/06 19:37:43 christos Exp $	*/
+/*	$NetBSD: mman.h,v 1.63 2023/07/10 02:31:55 christos Exp $	*/
 
 /*-
  * Copyright (c) 1982, 1986, 1993
@@ -212,7 +212,13 @@ typedef	__off_t		off_t;		/* file offset 
 					   implemented in UVM */
 #define	MAP_INHERIT_ZERO	4	/* zero in child */
 #define	MAP_INHERIT_DEFAULT	MAP_INHERIT_COPY
-#endif
+
+/*
+ * Flags to memfd_create
+ */
+#define	MFD_CLOEXEC		0x1U
+#define	MFD_ALLOW_SEALING	0x2U
+#endif /* _NETBSD_SOURCE */
 
 #ifndef _KERNEL
 
@@ -234,12 +240,31 @@ int	madvise(void *, size_t, int);
 int	mincore(void *, size_t, char *);
 int	minherit(void *, size_t, int);
 void *	mremap(void *, size_t, void *, size_t, int);
+int	memfd_create(const char *, unsigned int);
 #endif
 int	posix_madvise(void *, size_t, int);
 int	shm_open(const char *, int, mode_t);
 int	shm_unlink(const char *);
 __END_DECLS
 
+#else
+
+#include <sys/syslimits.h>	/* for NAME_MAX */
+#include <sys/timespec.h>	/* for struct timespec */
+#include <sys/mutex.h>		/* for kmutex_t */
+
+struct memfd {				/* data behind a DTYPE_MEMFD file */
+	char			mfd_name[NAME_MAX+1];
+	struct uvm_object	*mfd_uobj;	/* presumably the data */
+	size_t			mfd_size;	/* presumably in bytes */
+	int			mfd_seals;	/* F_SEAL_* bits */
+	kmutex_t		mfd_lock;	/* for truncate */
+	/* NOTE(review): presumably birth, access, modification times. */
+	struct timespec		mfd_btime;
+	struct timespec		mfd_atime;
+	struct timespec		mfd_mtime;
+};
+
 #endif /* !_KERNEL */
 
 #endif /* !_SYS_MMAN_H_ */

Index: src/usr.bin/fstat/fstat.c
diff -u src/usr.bin/fstat/fstat.c:1.117 src/usr.bin/fstat/fstat.c:1.118
--- src/usr.bin/fstat/fstat.c:1.117	Fri Oct 28 01:27:16 2022
+++ src/usr.bin/fstat/fstat.c	Sun Jul  9 22:31:55 2023
@@ -1,4 +1,4 @@
-/*	$NetBSD: fstat.c,v 1.117 2022/10/28 05:27:16 ozaki-r Exp $	*/
+/*	$NetBSD: fstat.c,v 1.118 2023/07/10 02:31:55 christos Exp $	*/
 
 /*-
  * Copyright (c) 1988, 1993
@@ -39,7 +39,7 @@ __COPYRIGHT("@(#) Copyright (c) 1988, 19
 #if 0
 static char sccsid[] = "@(#)fstat.c	8.3 (Berkeley) 5/2/95";
 #else
-__RCSID("$NetBSD: fstat.c,v 1.117 2022/10/28 05:27:16 ozaki-r Exp $");
+__RCSID("$NetBSD: fstat.c,v 1.118 2023/07/10 02:31:55 christos Exp $");
 #endif
 #endif /* not lint */
 
@@ -548,6 +548,7 @@ ftrans(fdfile_t *fp, int i)
 	case DTYPE_CRYPTO:
 	case DTYPE_MQUEUE:
 	case DTYPE_SEM:
+	case DTYPE_MEMFD:
 		if (checkfile == 0)
 			misctrans(&file, i);
 		break;

Index: src/usr.bin/fstat/misc.c
diff -u src/usr.bin/fstat/misc.c:1.24 src/usr.bin/fstat/misc.c:1.25
--- src/usr.bin/fstat/misc.c:1.24	Sun Sep 13 00:14:48 2020
+++ src/usr.bin/fstat/misc.c	Sun Jul  9 22:31:55 2023
@@ -1,4 +1,4 @@
-/*	$NetBSD: misc.c,v 1.24 2020/09/13 04:14:48 isaki Exp $	*/
+/*	$NetBSD: misc.c,v 1.25 2023/07/10 02:31:55 christos Exp $	*/
 
 /*-
  * Copyright (c) 2008 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: misc.c,v 1.24 2020/09/13 04:14:48 isaki Exp $");
+__RCSID("$NetBSD: misc.c,v 1.25 2023/07/10 02:31:55 christos Exp $");
 
 #include <stdbool.h>
 #include <sys/param.h>
@@ -56,6 +56,9 @@ __RCSID("$NetBSD: misc.c,v 1.24 2020/09/
 #undef _KERNEL
 #include <sys/cprng.h>
 #include <sys/vnode.h>
+#define _KERNEL
+#include <sys/mman.h>
+#undef _KERNEL
 #include <sys/mount.h>
 
 #include <net/bpfdesc.h>
@@ -110,7 +113,9 @@ static struct nlist nl[] = {
     { .n_name = "audio_fileops" },
 #define NL_PAD		19
     { .n_name = "pad_fileops" },
-#define NL_MAX		20
+#define NL_MEMFD	20
+    { .n_name = "memfd_fileops" },
+#define NL_MAX		21
     { .n_name = NULL }
 };
 
@@ -263,6 +268,40 @@ p_audio(struct file *f)
 	return 0;
 }
 
+static int	/* 1 once any seal name has been printed, else 0 */
+p_memfd_seal(int seen, int all, int target, const char *name)
+{
+	if ((all & target) == 0)
+		return seen != 0;	/* seal not set: print nothing */
+	(void)printf("%s%s", seen ? "|" : "", name);	/* "|" joins names */
+	return 1;
+}
+
+/* Print the name and the seals of the memfd behind f; always returns 0. */
+static int
+p_memfd(struct file *f)
+{
+	struct memfd mfd;
+	int seals, printed = 0;
+
+	if (!KVM_READ(f->f_data, &mfd, sizeof(mfd))) {
+		dprintf("can't read memfd at %p for pid %d", f->f_data, Pid);
+		return 0;
+	}
+	seals = mfd.mfd_seals;
+	(void)printf("* %s, seals=", mfd.mfd_name);
+	if (seals == 0)
+		(void)printf("0");	/* the calls below then print nothing */
+	printed = p_memfd_seal(printed, seals, F_SEAL_SEAL, "F_SEAL_SEAL");
+	printed = p_memfd_seal(printed, seals, F_SEAL_SHRINK, "F_SEAL_SHRINK");
+	printed = p_memfd_seal(printed, seals, F_SEAL_GROW, "F_SEAL_GROW");
+	printed = p_memfd_seal(printed, seals, F_SEAL_WRITE, "F_SEAL_WRITE");
+	printed = p_memfd_seal(printed, seals, F_SEAL_FUTURE_WRITE,
+	    "F_SEAL_FUTURE_WRITE");
+	oprint(f, "\n");
+	return 0;
+}
+
 int
 pmisc(struct file *f, const char *name)
 {
@@ -310,6 +349,8 @@ pmisc(struct file *f, const char *name)
 	case NL_PAD:
 		printf("* pad %p", f->f_data);
 		break;
+	case NL_MEMFD:
+		return p_memfd(f);
 	case NL_MAX:
 		printf("* %s ops=%p %p", name, f->f_ops, f->f_data);
 		break;

Added files:

Index: src/lib/libc/sys/memfd_create.2
diff -u /dev/null src/lib/libc/sys/memfd_create.2:1.1
--- /dev/null	Sun Jul  9 22:31:55 2023
+++ src/lib/libc/sys/memfd_create.2	Sun Jul  9 22:31:54 2023
@@ -0,0 +1,125 @@
+.\"	$NetBSD: memfd_create.2,v 1.1 2023/07/10 02:31:54 christos Exp $
+.\"
+.\" Copyright (c) 2023 The NetBSD Foundation, Inc.
+.\" All rights reserved.
+.\"
+.\" This code is derived from software contributed to The NetBSD Foundation
+.\" by Theodore Preduta.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+.\" PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+.\" POSSIBILITY OF SUCH DAMAGE.
+.\"
+.Dd July 5, 2023
+.Dt MEMFD_CREATE 2
+.Os
+.Sh NAME
+.Nm memfd_create
+.Nd create anonymous files
+.Sh LIBRARY
+.Lb libc
+.Sh SYNOPSIS
+.In sys/mman.h
+.Ft int
+.Fn memfd_create "const char *name" "unsigned int flags"
+.Sh DESCRIPTION
+The
+.Fn memfd_create
+system call returns a file descriptor to a file named
+.Fa name
+backed only by RAM.
+Initially, the size of the file is zero.
+.Pp
+The length of
+.Fa name
+must not exceed
+.Dv NAME_MAX-6
+characters, to allow for the prefix
+.Dq memfd:
+to be added.
+Since the file does not live on disk,
+.Fa name
+does not have to be unique.
+.Fa name
+is only intended to be used for debugging purposes and commands like
+.Xr fstat 1 .
+.Pp
+Additionally, any of the following may be specified as the
+.Fa flags :
+.Bl -tag -width MFD_ALLOW_SEALING
+.It Dv MFD_CLOEXEC
+Set the
+.Xr close 2
+on
+.Xr exec 3
+flag.
+.It Dv MFD_ALLOW_SEALING
+Allow adding seals to the file descriptor using the
+.Xr fcntl 2
+.Dv F_ADD_SEALS
+command.
+.El
+.Pp
+Otherwise, the returned file descriptor behaves the same as a regular file,
+including the ability to be mapped by
+.Xr mmap 2 .
+.Sh RETURN VALUES
+If successful, the
+.Fn memfd_create
+system call returns a non-negative integer.
+On failure -1 is returned and
+.Va errno
+is set to indicate the error.
+.Sh ERRORS
+.Fn memfd_create
+will fail if:
+.Bl -tag -width Er
+.It Bq Er EFAULT
+The argument
+.Fa name
+is
+.Dv NULL
+or points to invalid memory.
+.It Bq Er EINVAL
+The argument
+.Fa flags
+has any bits set other than
+.Dv MFD_CLOEXEC
+or
+.Dv MFD_ALLOW_SEALING .
+.It Bq Er ENAMETOOLONG
+The length of
+.Fa name
+appended with the prefix
+.Dq memfd:
+would exceed
+.Dv NAME_MAX .
+.It Bq Er ENFILE
+The system file table is full.
+.El
+.Sh SEE ALSO
+.Xr fcntl 2 ,
+.Xr mmap 2 ,
+.Xr shmget 2 ,
+.Xr shm_open 3
+.Sh HISTORY
+.Fn memfd_create
+is compatible with the Linux system call of the same name that first appeared in
+Linux 3.17.

Index: src/sys/kern/sys_memfd.c
diff -u /dev/null src/sys/kern/sys_memfd.c:1.1
--- /dev/null	Sun Jul  9 22:31:55 2023
+++ src/sys/kern/sys_memfd.c	Sun Jul  9 22:31:55 2023
@@ -0,0 +1,408 @@
+/*	$NetBSD: sys_memfd.c,v 1.1 2023/07/10 02:31:55 christos Exp $	*/
+
+/*-
+ * Copyright (c) 2023 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Theodore Preduta.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: sys_memfd.c,v 1.1 2023/07/10 02:31:55 christos Exp $");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/mman.h>
+#include <sys/syscallargs.h>
+
+#include <uvm/uvm_extern.h>
+#include <uvm/uvm_object.h>
+
+#define F_SEAL_ANY_WRITE	(F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)
+#define MFD_KNOWN_SEALS		(F_SEAL_SEAL|F_SEAL_SHRINK|F_SEAL_GROW \
+				|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)
+
+static const char memfd_prefix[] = "memfd:";
+
+static int memfd_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
+static int memfd_write(file_t *, off_t *, struct uio *, kauth_cred_t, int);
+static int memfd_ioctl(file_t *, u_long, void *);
+static int memfd_fcntl(file_t *, u_int, void *);
+static int memfd_stat(file_t *, struct stat *);
+static int memfd_close(file_t *);
+static int memfd_mmap(file_t *, off_t *, size_t, int, int *, int *,
+    struct uvm_object **, int *);
+static int memfd_seek(file_t *, off_t, int, off_t *, int);
+static int memfd_truncate(file_t *, off_t);
+
+static const struct fileops memfd_fileops = {	/* installed as f_ops by sys_memfd_create() */
+	.fo_name = "memfd",
+	.fo_read = memfd_read,
+	.fo_write = memfd_write,
+	.fo_ioctl = memfd_ioctl,	/* always returns EINVAL */
+	.fo_fcntl = memfd_fcntl,	/* F_ADD_SEALS and F_GET_SEALS */
+	.fo_poll = fnullop_poll,
+	.fo_stat = memfd_stat,
+	.fo_close = memfd_close,	/* releases the uvm object and the struct memfd */
+	.fo_kqfilter = fnullop_kqfilter,
+	.fo_restart = fnullop_restart,
+	.fo_mmap = memfd_mmap,
+	.fo_seek = memfd_seek,
+	.fo_fpathconf = (void *)eopnotsupp,
+	.fo_posix_fadvise = (void *)eopnotsupp,
+	.fo_truncate = memfd_truncate,	/* also used by memfd_write() to grow the file */
+};
+
+/*
+ * memfd_create(2).  Create a file descriptor associated with anonymous
+ * memory.
+ *
+ * The name copied in from userland is stored behind the "memfd:" prefix.
+ * Unless MFD_ALLOW_SEALING is given, the new object starts out with
+ * F_SEAL_SEAL set so that no seals can be added later.
+ *
+ * Returns 0 and stores the new descriptor in *retval, or returns an
+ * errno value: EINVAL for unknown flags, otherwise whatever copyinstr()
+ * or fd_allocfile() reported.
+ */
+int
+sys_memfd_create(struct lwp *l, const struct sys_memfd_create_args *uap,
+    register_t *retval)
+{
+	/* {
+		syscallarg(const char *) name;
+		syscallarg(unsigned int) flags;
+	} */
+	int error, fd;
+	file_t *fp;
+	struct memfd *mfd;
+	struct proc *p = l->l_proc;
+	const unsigned int flags = SCARG(uap, flags);
+
+	/*
+	 * Sanity check: there must be room for a name after the prefix.
+	 * Compare directly instead of subtracting; the difference is
+	 * unsigned and so could never be seen as negative.
+	 */
+	KASSERT(NAME_MAX > sizeof(memfd_prefix));
+
+	if (flags & ~(MFD_CLOEXEC|MFD_ALLOW_SEALING))
+		return EINVAL;
+
+	mfd = kmem_zalloc(sizeof(*mfd), KM_SLEEP);
+	mfd->mfd_size = 0;
+	mfd->mfd_uobj = uao_create(INT64_MAX - PAGE_SIZE, 0); /* same as tmpfs */
+	mutex_init(&mfd->mfd_lock, MUTEX_DEFAULT, IPL_NONE);
+
+	/* Build "memfd:" followed by the user-supplied name. */
+	strcpy(mfd->mfd_name, memfd_prefix);
+	error = copyinstr(SCARG(uap, name),
+	    &mfd->mfd_name[sizeof(memfd_prefix) - 1],
+	    sizeof(mfd->mfd_name) - sizeof(memfd_prefix), NULL);
+	if (error != 0)
+		goto leave;
+
+	getnanotime(&mfd->mfd_btime);
+
+	/* Without MFD_ALLOW_SEALING the set of seals is frozen from the start. */
+	if ((flags & MFD_ALLOW_SEALING) == 0)
+		mfd->mfd_seals |= F_SEAL_SEAL;
+
+	error = fd_allocfile(&fp, &fd);
+	if (error != 0)
+		goto leave;
+
+	fp->f_flag = FREAD|FWRITE;
+	fp->f_type = DTYPE_MEMFD;
+	fp->f_ops = &memfd_fileops;
+	fp->f_memfd = mfd;
+	fd_set_exclose(l, fd, (flags & MFD_CLOEXEC) != 0);
+	fd_affix(p, fp, fd);
+
+	*retval = fd;
+	return 0;
+
+leave:
+	/*
+	 * Undo everything set up above.  The mutex is always initialised
+	 * before the first jump here, so it must be destroyed as well.
+	 */
+	uao_detach(mfd->mfd_uobj);
+	mutex_destroy(&mfd->mfd_lock);
+	kmem_free(mfd, sizeof(*mfd));
+	return error;
+}
+
+/*
+ * Read from the memfd starting at *offp.
+ *
+ * When offp is the file's own offset, fp->f_lock is held across the
+ * transfer.  A negative offset fails with EINVAL; reading at or past
+ * the end of the file transfers nothing and succeeds.  The access time
+ * is updated on every call.
+ */
+static int
+memfd_read(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
+    int flags)
+{
+	int error;
+	vsize_t todo;
+	struct memfd *mfd = fp->f_memfd;
+
+	if (offp == &fp->f_offset)
+		mutex_enter(&fp->f_lock);
+
+	if (*offp < 0) {
+		error = EINVAL;
+		goto leave;
+	}
+
+	/* Trying to read past the end does nothing. */
+	if (*offp >= mfd->mfd_size) {
+		error = 0;
+		goto leave;
+	}
+
+	/* Never hand out more than what lies between *offp and the end. */
+	uio->uio_offset = *offp;
+	todo = MIN(uio->uio_resid, mfd->mfd_size - *offp);
+	error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
+	    UBC_READ|UBC_PARTIALOK);
+
+	/*
+	 * Propagate the position reached by the transfer (uiomove
+	 * advances uio_offset) when the caller asked for it; otherwise
+	 * consecutive read(2) calls would keep returning the same bytes.
+	 */
+	if (flags & FOF_UPDATE_OFFSET)
+		*offp = uio->uio_offset;
+
+leave:
+	if (offp == &fp->f_offset)
+		mutex_exit(&fp->f_lock);
+
+	getnanotime(&mfd->mfd_atime);
+
+	return error;
+}
+
+/*
+ * Write to the memfd starting at *offp.
+ *
+ * Fails with EPERM if either write seal is set.  With F_SEAL_GROW the
+ * write is clipped to the current size (EPERM if it starts at or past
+ * the end); otherwise the file is grown first to fit the request.
+ * When offp is the file's own offset, fp->f_lock is held across the
+ * transfer.  The modification time is updated after the transfer.
+ */
+static int
+memfd_write(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
+    int flags)
+{
+	int error;
+	vsize_t todo;
+	struct memfd *mfd = fp->f_memfd;
+
+	if (mfd->mfd_seals & F_SEAL_ANY_WRITE)
+		return EPERM;
+
+	if (offp == &fp->f_offset)
+		mutex_enter(&fp->f_lock);
+
+	if (*offp < 0) {
+		error = EINVAL;
+		goto leave;
+	}
+
+	uio->uio_offset = *offp;
+	todo = uio->uio_resid;
+
+	if (mfd->mfd_seals & F_SEAL_GROW) {
+		if (*offp >= mfd->mfd_size) {
+			error = EPERM;
+			goto leave;
+		}
+
+		/* Truncate the write to fit in mfd_size */
+		if (*offp + uio->uio_resid >= mfd->mfd_size)
+			todo = mfd->mfd_size - *offp;
+	} else if (*offp + uio->uio_resid >= mfd->mfd_size) {
+		/* Grow to accommodate the write request. */
+		error = memfd_truncate(fp, *offp + uio->uio_resid);
+		if (error != 0)
+			goto leave;
+	}
+
+	error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
+	    UBC_WRITE|UBC_PARTIALOK);
+
+	/*
+	 * Propagate the position reached by the transfer (uiomove
+	 * advances uio_offset) when the caller asked for it; otherwise
+	 * consecutive write(2) calls would overwrite the same range.
+	 */
+	if (flags & FOF_UPDATE_OFFSET)
+		*offp = uio->uio_offset;
+
+	getnanotime(&mfd->mfd_mtime);
+
+leave:
+	if (offp == &fp->f_offset)
+		mutex_exit(&fp->f_lock);
+
+	return error;
+}
+
+static int
+memfd_ioctl(file_t *fp, u_long cmd, void *data)	/* ioctl hook: no commands are implemented */
+{
+
+	return EINVAL;	/* every request is rejected */
+}
+
+/*
+ * fcntl(2) hook implementing F_ADD_SEALS and F_GET_SEALS.
+ *
+ * data points at an int: the seals to add for F_ADD_SEALS, or the place
+ * to store the current seals for F_GET_SEALS.  Any other command fails
+ * with EINVAL.
+ */
+static int
+memfd_fcntl(file_t *fp, u_int cmd, void *data)
+{
+	struct memfd *mfd = fp->f_memfd;
+	int *sealsp = data;
+	int wanted;
+
+	if (cmd == F_GET_SEALS) {
+		*sealsp = mfd->mfd_seals;
+		return 0;
+	}
+	if (cmd != F_ADD_SEALS)
+		return EINVAL;
+
+	/* Once F_SEAL_SEAL is set no further seals may be added. */
+	if (mfd->mfd_seals & F_SEAL_SEAL)
+		return EPERM;
+
+	wanted = *sealsp;
+	if (wanted & ~MFD_KNOWN_SEALS)
+		return EINVAL;
+
+	/*
+	 * F_SEAL_WRITE can only be newly added while there are no
+	 * currently open mmaps; references on the uvm object beyond our
+	 * own are what memfd_mmap() takes for each mapping.
+	 *
+	 * XXX should only refuse if there are currently open mmaps
+	 * with PROT_WRITE.
+	 */
+	if ((wanted & F_SEAL_WRITE) != 0 &&
+	    (mfd->mfd_seals & F_SEAL_WRITE) == 0 &&
+	    mfd->mfd_uobj->uo_refs > 1)
+		return EBUSY;
+
+	mfd->mfd_seals |= wanted;
+	return 0;
+}
+
+/*
+ * Fill in *st for a memfd.  Everything not set below is left zeroed.
+ * Ownership comes from the credentials of the open file, the mode is
+ * S_IREAD plus S_IWRITE unless a write seal is set, and the change time
+ * mirrors the modification time.
+ */
+static int
+memfd_stat(file_t *fp, struct stat *st)
+{
+	const struct memfd *mfd = fp->f_memfd;
+	mode_t mode = S_IREAD | S_IWRITE;
+
+	/* Either write seal makes the file report as not writable. */
+	if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) != 0)
+		mode = S_IREAD;
+
+	memset(st, 0, sizeof(*st));
+
+	st->st_mode = mode;
+	st->st_size = mfd->mfd_size;
+	st->st_uid = kauth_cred_geteuid(fp->f_cred);
+	st->st_gid = kauth_cred_getegid(fp->f_cred);
+
+	st->st_atimespec = mfd->mfd_atime;
+	st->st_mtimespec = mfd->mfd_mtime;
+	st->st_ctimespec = mfd->mfd_mtime;
+	st->st_birthtimespec = mfd->mfd_btime;
+
+	return 0;
+}
+
+/*
+ * Close hook: drop our reference on the backing uvm object and free
+ * the struct memfd that hung off the file.  Always returns 0.
+ */
+static int
+memfd_close(file_t *fp)
+{
+	struct memfd *mfd = fp->f_memfd;
+
+	/* Unhook from the file first, then tear the object down. */
+	fp->f_memfd = NULL;
+
+	uao_detach(mfd->mfd_uobj);
+	mutex_destroy(&mfd->mfd_lock);
+	kmem_free(mfd, sizeof(*mfd));
+
+	return 0;
+}
+
+/*
+ * mmap hook: hand the backing uvm object to the caller.
+ *
+ * The requested range must lie within the current file size (EINVAL
+ * otherwise).  A writable mapping that is not MAP_PRIVATE is refused
+ * with EPERM while either write seal is set.  On success an extra
+ * reference is taken on the object for the mapping.
+ */
+static int
+memfd_mmap(file_t *fp, off_t *offp, size_t size, int prot, int *flagsp,
+    int *advicep, struct uvm_object **uobjp, int *maxprotp)
+{
+	struct memfd *mfd = fp->f_memfd;
+	struct uvm_object *uobj = mfd->mfd_uobj;
+	const int shared_write =
+	    (prot & VM_PROT_WRITE) != 0 && (*flagsp & MAP_PRIVATE) == 0;
+
+	/* uvm_mmap guarantees page-aligned offset and size.  */
+	KASSERT(*offp == round_page(*offp));
+	KASSERT(size == round_page(size));
+	KASSERT(size > 0);
+
+	if (*offp < 0 || *offp + size > mfd->mfd_size)
+		return EINVAL;
+
+	if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) != 0 && shared_write)
+		return EPERM;
+
+	uao_reference(uobj);
+	*uobjp = uobj;
+
+	*maxprotp = prot;
+	*advicep = UVM_ADV_RANDOM;
+
+	return 0;
+}
+
+/*
+ * Seek hook: compute a new offset from delta relative to the start of
+ * the file (SEEK_SET), the current offset (SEEK_CUR) or the current
+ * size (SEEK_END).
+ *
+ * The result is stored in *newoffp if that is non-NULL, and in the
+ * file's offset if FOF_UPDATE_OFFSET is set.  Returns EINVAL for an
+ * unknown whence or for a resulting offset that would be negative.
+ */
+static int
+memfd_seek(file_t *fp, off_t delta, int whence, off_t *newoffp,
+    int flags)
+{
+	off_t newoff;
+
+	switch (whence) {
+	case SEEK_CUR:
+		newoff = fp->f_offset + delta;
+		break;
+
+	case SEEK_END:
+		newoff = fp->f_memfd->mfd_size + delta;
+		break;
+
+	case SEEK_SET:
+		newoff = delta;
+		break;
+
+	default:
+		return EINVAL;
+	}
+
+	/*
+	 * memfd_read() and memfd_write() reject negative offsets, so do
+	 * not let one be stored in the first place.
+	 */
+	if (newoff < 0)
+		return EINVAL;
+
+	if (newoffp)
+		*newoffp = newoff;
+	if (flags & FOF_UPDATE_OFFSET)
+		fp->f_offset = newoff;
+
+	return 0;
+}
+
+/*
+ * Set the size of the memfd to length.
+ *
+ * Growing zeroes the newly exposed range; shrinking frees the pages
+ * that lie entirely beyond the new end.  Fails with EINVAL for a
+ * negative length and with EPERM when the matching F_SEAL_SHRINK or
+ * F_SEAL_GROW seal is set.  The size and modification time are updated
+ * under mfd_lock; the result of the page release is returned.
+ */
+static int
+memfd_truncate(file_t *fp, off_t length)
+{
+	struct memfd *mfd = fp->f_memfd;
+	int error = 0;
+
+	if (length < 0)
+		return EINVAL;
+	if (length == mfd->mfd_size)
+		return 0;
+
+	/* Honour the size seals before touching anything. */
+	if (length < mfd->mfd_size && (mfd->mfd_seals & F_SEAL_SHRINK) != 0)
+		return EPERM;
+	if (length > mfd->mfd_size && (mfd->mfd_seals & F_SEAL_GROW) != 0)
+		return EPERM;
+
+	mutex_enter(&mfd->mfd_lock);
+
+	if (length > mfd->mfd_size) {
+		/* Growing: make sure the new tail reads back as zeroes. */
+		ubc_zerorange(mfd->mfd_uobj, mfd->mfd_size,
+		    length - mfd->mfd_size, 0);
+	} else {
+		/* Shrinking: release whole pages past the new end, if any. */
+		const voff_t first = round_page(length);
+		const voff_t last = round_page(mfd->mfd_size);
+
+		if (first < last) {
+			rw_enter(mfd->mfd_uobj->vmobjlock, RW_WRITER);
+			error = (*mfd->mfd_uobj->pgops->pgo_put)(mfd->mfd_uobj,
+			    first, last, PGO_FREE);
+			/* pgo_put drops vmobjlock */
+		}
+	}
+
+	getnanotime(&mfd->mfd_mtime);
+	mfd->mfd_size = length;
+	mutex_exit(&mfd->mfd_lock);
+	return error;
+}

Reply via email to