Module Name: src
Committed By: christos
Date: Mon Jul 10 02:31:55 UTC 2023
Modified Files:
src/distrib/sets/lists/comp: mi
src/lib/libc/sys: Makefile.inc fcntl.2
src/sys/compat/linux/arch/amd64: syscalls.master
src/sys/compat/linux/common: linux_fcntl.h linux_file.c linux_misc.c
linux_sysctl.c
src/sys/kern: sys_descrip.c vfs_syscalls.c vfs_vnops.c
src/sys/sys: fcntl.h file.h mman.h
src/usr.bin/fstat: fstat.c misc.c
Added Files:
src/lib/libc/sys: memfd_create.2
src/sys/kern: sys_memfd.c
Log Message:
Add memfd_create(2) from GSoC 2023 by Theodore Preduta
To generate a diff of this commit:
cvs rdiff -u -r1.2436 -r1.2437 src/distrib/sets/lists/comp/mi
cvs rdiff -u -r1.250 -r1.251 src/lib/libc/sys/Makefile.inc
cvs rdiff -u -r1.49 -r1.50 src/lib/libc/sys/fcntl.2
cvs rdiff -u -r0 -r1.1 src/lib/libc/sys/memfd_create.2
cvs rdiff -u -r1.67 -r1.68 src/sys/compat/linux/arch/amd64/syscalls.master
cvs rdiff -u -r1.20 -r1.21 src/sys/compat/linux/common/linux_fcntl.h
cvs rdiff -u -r1.122 -r1.123 src/sys/compat/linux/common/linux_file.c
cvs rdiff -u -r1.256 -r1.257 src/sys/compat/linux/common/linux_misc.c
cvs rdiff -u -r1.47 -r1.48 src/sys/compat/linux/common/linux_sysctl.c
cvs rdiff -u -r1.47 -r1.48 src/sys/kern/sys_descrip.c
cvs rdiff -u -r0 -r1.1 src/sys/kern/sys_memfd.c
cvs rdiff -u -r1.559 -r1.560 src/sys/kern/vfs_syscalls.c
cvs rdiff -u -r1.241 -r1.242 src/sys/kern/vfs_vnops.c
cvs rdiff -u -r1.54 -r1.55 src/sys/sys/fcntl.h
cvs rdiff -u -r1.92 -r1.93 src/sys/sys/file.h
cvs rdiff -u -r1.62 -r1.63 src/sys/sys/mman.h
cvs rdiff -u -r1.117 -r1.118 src/usr.bin/fstat/fstat.c
cvs rdiff -u -r1.24 -r1.25 src/usr.bin/fstat/misc.c
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/distrib/sets/lists/comp/mi
diff -u src/distrib/sets/lists/comp/mi:1.2436 src/distrib/sets/lists/comp/mi:1.2437
--- src/distrib/sets/lists/comp/mi:1.2436 Sat Jul 8 13:43:13 2023
+++ src/distrib/sets/lists/comp/mi Sun Jul 9 22:31:54 2023
@@ -1,4 +1,4 @@
-# $NetBSD: mi,v 1.2436 2023/07/08 17:43:13 christos Exp $
+# $NetBSD: mi,v 1.2437 2023/07/10 02:31:54 christos Exp $
#
# Note: don't delete entries from here - mark them as "obsolete" instead.
./etc/mtree/set.comp comp-sys-root
@@ -4826,6 +4826,7 @@
./usr/share/man/cat2/lutimes.0 comp-c-catman .cat
./usr/share/man/cat2/m68k_sync_icache.0 comp-c-catman .cat
./usr/share/man/cat2/madvise.0 comp-c-catman .cat
+./usr/share/man/cat2/memfd_create.0 comp-c-catman .cat
./usr/share/man/cat2/mincore.0 comp-c-catman .cat
./usr/share/man/cat2/minherit.0 comp-c-catman .cat
./usr/share/man/cat2/mkdir.0 comp-c-catman .cat
@@ -13315,6 +13316,7 @@
./usr/share/man/html2/lutimes.html comp-c-htmlman html
./usr/share/man/html2/m68k_sync_icache.html comp-c-htmlman html
./usr/share/man/html2/madvise.html comp-c-htmlman html
+./usr/share/man/html2/memfd_create.html comp-c-htmlman html
./usr/share/man/html2/mincore.html comp-c-htmlman html
./usr/share/man/html2/minherit.html comp-c-htmlman html
./usr/share/man/html2/mkdir.html comp-c-htmlman html
@@ -21632,6 +21634,7 @@
./usr/share/man/man2/lutimes.2 comp-c-man .man
./usr/share/man/man2/m68k_sync_icache.2 comp-c-man .man
./usr/share/man/man2/madvise.2 comp-c-man .man
+./usr/share/man/man2/memfd_create.2 comp-c-man .man
./usr/share/man/man2/mincore.2 comp-c-man .man
./usr/share/man/man2/minherit.2 comp-c-man .man
./usr/share/man/man2/mkdir.2 comp-c-man .man
Index: src/lib/libc/sys/Makefile.inc
diff -u src/lib/libc/sys/Makefile.inc:1.250 src/lib/libc/sys/Makefile.inc:1.251
--- src/lib/libc/sys/Makefile.inc:1.250 Mon Nov 1 01:53:45 2021
+++ src/lib/libc/sys/Makefile.inc Sun Jul 9 22:31:54 2023
@@ -1,4 +1,4 @@
-# $NetBSD: Makefile.inc,v 1.250 2021/11/01 05:53:45 thorpej Exp $
+# $NetBSD: Makefile.inc,v 1.251 2023/07/10 02:31:54 christos Exp $
# @(#)Makefile.inc 8.3 (Berkeley) 10/24/94
# sys sources
@@ -134,7 +134,8 @@ ASM=\
_lwp_unpark_all.S _lwp_suspend.S _lwp_continue.S \
_lwp_wakeup.S _lwp_detach.S _lwp_setprivate.S \
_lwp_setname.S _lwp_getname.S _lwp_ctl.S \
- madvise.S mincore.S minherit.S mkdir.S mkdirat.S mkfifo.S mkfifoat.S \
+ madvise.S memfd_create.S mincore.S minherit.S mkdir.S mkdirat.S \
+ mkfifo.S mkfifoat.S \
__mknod50.S mlock.S mlockall.S modctl.S __mount50.S \
mprotect.S __msgctl50.S msgget.S munlock.S munlockall.S \
munmap.S \
@@ -275,7 +276,7 @@ MAN+= accept.2 access.2 acct.2 adjtime.2
_lwp_suspend.2 _lwp_wakeup.2 _lwp_wait.2 _lwp_kill.2 \
_lwp_getname.2 _lwp_getprivate.2 \
_lwp_park.2 _lwp_unpark.2 _lwp_unpark_all.2 \
- mkdir.2 mkfifo.2 mknod.2 \
+ memfd_create.2 mkdir.2 mkfifo.2 mknod.2 \
madvise.2 mincore.2 minherit.2 mlock.2 mlockall.2 mmap.2 modctl.2 \
mount.2 \
mprotect.2 mremap.2 msgctl.2 msgget.2 msgrcv.2 msgsnd.2 msync.2 \
Index: src/lib/libc/sys/fcntl.2
diff -u src/lib/libc/sys/fcntl.2:1.49 src/lib/libc/sys/fcntl.2:1.50
--- src/lib/libc/sys/fcntl.2:1.49 Sun Dec 4 14:01:19 2022
+++ src/lib/libc/sys/fcntl.2 Sun Jul 9 22:31:54 2023
@@ -1,4 +1,4 @@
-.\" $NetBSD: fcntl.2,v 1.49 2022/12/04 19:01:19 uwe Exp $
+.\" $NetBSD: fcntl.2,v 1.50 2023/07/10 02:31:54 christos Exp $
.\"
.\" Copyright (c) 1983, 1993
.\" The Regents of the University of California. All rights reserved.
@@ -29,7 +29,7 @@
.\"
.\" @(#)fcntl.2 8.2 (Berkeley) 1/12/94
.\"
-.Dd September 26, 2019
+.Dd July 5, 2023
.Dt FCNTL 2
.Os
.Sh NAME
@@ -162,6 +162,24 @@ in the buffer pointed to by
.Fa arg
should be pointing to a buffer of at least
.Dv MAXPATHLEN .
+.It Dv F_ADD_SEALS
+Add seals specified in
+.Fa arg
+to
+.Fa fd
+to restrict possible operations on
+.Fa fd
+as described below.
+Like flags, multiple seals can be specified at once.
+Additionally, specifying seals that are already associated with
+.Fa fd
+is a no-op.
+.It Dv F_GET_SEALS
+Get the seals currently associated with
+.Fa fd
+as described below
+.Fa ( arg
+is ignored).
.El
.Pp
The set of valid flags for the
@@ -324,13 +342,44 @@ or an
request fails or blocks respectively when another process has existing
locks on bytes in the specified region and the type of any of those
locks conflicts with the type specified in the request.
+.Pp
+Possible seals are:
+.Bl -tag -width F_SEAL_FUTURE_WRITE
+.It Dv F_SEAL_SEAL
+Prevent any further seals from being added to
+.Fa fd .
+.It Dv F_SEAL_SHRINK
+Prevent the size of
+.Fa fd
+from decreasing.
+.It Dv F_SEAL_GROW
+Prevent the size of
+.Fa fd
+from increasing.
+.It Dv F_SEAL_WRITE
+Prevent any write operations to
+.Fa fd .
+.Dv F_SEAL_WRITE
+cannot be applied if
+.Fa fd
+has any memory mappings.
+.It Dv F_SEAL_FUTURE_WRITE
+Like
+.Dv F_SEAL_WRITE
+but allow any current memory mappings of
+.Fa fd
+to remain open, including those with
+.Dv PROT_WRITE .
+.El
.Sh NOTES
-The
-.Dv F_GETPATH
-functionality is implemented using the reverse
+For
+.Dv F_GETPATH :
+.Bl -bullet -compact
+.It
+For vnodes, functionality is implemented using the reverse
.Xr namei 9
cache.
-The implications of this are:
+The implications of this are:
.Bl -bullet -compact
.It
For hard links where the file descriptor can resolve to multiple pathnames,
@@ -341,16 +390,25 @@ may fail if the corresponding entry has
.Xr namei 9
cache and return
.Er ENOENT .
+.El
.It
-File descriptors that don't point to vnodes are not handled, as
-well as symbolic links since there is currently no way to obtain
-a file descriptor pointing to a symbolic link.
+For a file descriptor created by
+.Xr memfd_create 2 ,
+the name provided at
+.Fa fd
+creation, with the prefix
+.Dq memfd:
+is used.
+.It
+Other types of file descriptors are not handled, and neither are
+symbolic links, since there is currently no way to obtain a file
+descriptor pointing to a symbolic link.
.El
.Sh RETURN VALUES
Upon successful completion, the value returned depends on
.Fa cmd
as follows:
-.Bl -tag -width F_GETOWNX -offset indent
+.Bl -tag -width F_GET_SEALS -offset indent
.It Dv F_DUPFD
A new file descriptor.
.It Dv F_GETFD
@@ -361,6 +419,9 @@ Value of flags.
Value of file descriptor owner.
.It Dv F_MAXFD
Value of the highest file descriptor open by the process.
+.It Dv F_GET_SEALS
+Value of the seals currently associated with
+.Fa fd .
.It other
Value other than \-1.
.El
@@ -473,6 +534,18 @@ is an exclusive lock
and
.Fa fildes
is not a valid file descriptor open for writing.
+.It Bq Er EBUSY
+The argument
+.Fa cmd
+is
+.Dv F_ADD_SEALS ,
+.Fa arg
+contains
+.Dv F_SEAL_WRITE
+and
+.Fa fd
+is currently mapped by
+.Xr mmap 2 .
.It Bq Er EDEADLK
The argument
.Fa cmd
@@ -512,6 +585,24 @@ and the data to which
points is not valid, or
.Fa fildes
refers to a file that does not support locking.
+.Pp
+The argument
+.Fa cmd
+is
+.Dv F_ADD_SEALS
+or
+.Dv F_GET_SEALS
+and
+.Fa fd
+does not support seals.
+.Pp
+The argument
+.Fa cmd
+is
+.Dv F_ADD_SEALS
+and
+.Fa arg
+contains set bits for unsupported seals.
.It Bq Er EMFILE
The argument
.Fa cmd
@@ -562,6 +653,15 @@ has been reached.
It can be modified using the
.Li kern.maxfiles
.Xr sysctl 7 .
+.It Bq Er EPERM
+The argument
+.Fa cmd
+is
+.Dv F_ADD_SEALS
+and
+.Fa fd
+already has
+.Dv F_SEAL_SEAL .
.It Bq Er ERANGE
The argument
.Fa cmd
Index: src/sys/compat/linux/arch/amd64/syscalls.master
diff -u src/sys/compat/linux/arch/amd64/syscalls.master:1.67 src/sys/compat/linux/arch/amd64/syscalls.master:1.68
--- src/sys/compat/linux/arch/amd64/syscalls.master:1.67 Wed Dec 1 23:29:48 2021
+++ src/sys/compat/linux/arch/amd64/syscalls.master Sun Jul 9 22:31:55 2023
@@ -1,4 +1,4 @@
- $NetBSD: syscalls.master,v 1.67 2021/12/02 04:29:48 ryo Exp $
+ $NetBSD: syscalls.master,v 1.68 2023/07/10 02:31:55 christos Exp $
; @(#)syscalls.master 8.1 (Berkeley) 7/19/93
@@ -567,6 +567,145 @@
311 UNIMPL process_vm_writev
312 UNIMPL kcmp
313 UNIMPL finit_module
+314 UNIMPL sched_setattr
+315 UNIMPL sched_getattr
+316 UNIMPL renameat2
+317 UNIMPL seccomp
+318 NOARGS { ssize_t|sys||getrandom(void *buf, size_t buflen, \
+ unsigned int flags); }
+319 STD { int|linux_sys||memfd_create(const char *name, \
+ unsigned int flags); }
+320 UNIMPL kexec_file_load
+321 UNIMPL bpf
+322 UNIMPL execveat
+323 UNIMPL userfaultfd
+324 UNIMPL membarrier
+325 UNIMPL mlock2
+326 UNIMPL copy_file_range
+327 UNIMPL preadv2
+328 UNIMPL pwritev2
+329 UNIMPL pkey_mprotect
+330 UNIMPL pkey_alloc
+331 UNIMPL pkey_free
+332 UNIMPL statx
+333 UNIMPL io_pgetevents
+334 UNIMPL rseq
+335 UNIMPL
+336 UNIMPL
+337 UNIMPL
+338 UNIMPL
+339 UNIMPL
+340 UNIMPL
+341 UNIMPL
+342 UNIMPL
+343 UNIMPL
+344 UNIMPL
+345 UNIMPL
+346 UNIMPL
+347 UNIMPL
+348 UNIMPL
+349 UNIMPL
+350 UNIMPL
+351 UNIMPL
+352 UNIMPL
+353 UNIMPL
+354 UNIMPL
+355 UNIMPL
+356 UNIMPL
+357 UNIMPL
+358 UNIMPL
+359 UNIMPL
+360 UNIMPL
+361 UNIMPL
+362 UNIMPL
+363 UNIMPL
+364 UNIMPL
+365 UNIMPL
+366 UNIMPL
+367 UNIMPL
+368 UNIMPL
+369 UNIMPL
+370 UNIMPL
+371 UNIMPL
+372 UNIMPL
+373 UNIMPL
+374 UNIMPL
+375 UNIMPL
+376 UNIMPL
+377 UNIMPL
+378 UNIMPL
+379 UNIMPL
+380 UNIMPL
+381 UNIMPL
+382 UNIMPL
+383 UNIMPL
+384 UNIMPL
+385 UNIMPL
+386 UNIMPL
+387 UNIMPL
+388 UNIMPL
+389 UNIMPL
+390 UNIMPL
+391 UNIMPL
+392 UNIMPL
+393 UNIMPL
+394 UNIMPL
+395 UNIMPL
+396 UNIMPL
+397 UNIMPL
+398 UNIMPL
+399 UNIMPL
+400 UNIMPL
+401 UNIMPL
+402 UNIMPL
+403 UNIMPL
+404 UNIMPL
+405 UNIMPL
+406 UNIMPL
+407 UNIMPL
+408 UNIMPL
+409 UNIMPL
+410 UNIMPL
+411 UNIMPL
+412 UNIMPL
+413 UNIMPL
+414 UNIMPL
+415 UNIMPL
+416 UNIMPL
+417 UNIMPL
+418 UNIMPL
+419 UNIMPL
+420 UNIMPL
+421 UNIMPL
+422 UNIMPL
+423 UNIMPL
+424 UNIMPL pidfd_send_signal
+425 UNIMPL io_uring_setup
+426 UNIMPL io_uring_enter
+427 UNIMPL io_uring_register
+428 UNIMPL open_tree
+429 UNIMPL move_mount
+430 UNIMPL fsopen
+431 UNIMPL fsconfig
+432 UNIMPL fsmount
+433 UNIMPL fspick
+434 UNIMPL pidfd_open
+435 UNIMPL clone3
+436 UNIMPL close_range
+437 UNIMPL openat2
+438 UNIMPL pidfd_getfd
+439 UNIMPL faccessat2
+440 UNIMPL process_madvise
+441 UNIMPL epoll_pwait2
+442 UNIMPL mount_setattr
+443 UNIMPL quotactl_fd
+444 UNIMPL landlock_create_ruleset
+445 UNIMPL landlock_add_rule
+446 UNIMPL landlock_restrict_self
+447 UNIMPL memfd_secret
+448 UNIMPL process_mrelease
+449 UNIMPL futex_waitv
+450 UNIMPL set_mempolicy_home_node
; we want a "nosys" syscall, we'll just add an extra entry for it.
-314 STD { int|linux_sys||nosys(void); }
+451 STD { int|linux_sys||nosys(void); }
Index: src/sys/compat/linux/common/linux_fcntl.h
diff -u src/sys/compat/linux/common/linux_fcntl.h:1.20 src/sys/compat/linux/common/linux_fcntl.h:1.21
--- src/sys/compat/linux/common/linux_fcntl.h:1.20 Wed Nov 24 21:27:08 2021
+++ src/sys/compat/linux/common/linux_fcntl.h Sun Jul 9 22:31:55 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: linux_fcntl.h,v 1.20 2021/11/25 02:27:08 ryo Exp $ */
+/* $NetBSD: linux_fcntl.h,v 1.21 2023/07/10 02:31:55 christos Exp $ */
/*-
* Copyright (c) 1995, 1998 The NetBSD Foundation, Inc.
@@ -100,6 +100,8 @@ struct linux_flock64 {
#define LINUX_F_DUPFD_CLOEXEC (LINUX_F_SPECIFIC_BASE + 6)
#define LINUX_F_SETPIPE_SZ (LINUX_F_SPECIFIC_BASE + 7)
#define LINUX_F_GETPIPE_SZ (LINUX_F_SPECIFIC_BASE + 8)
+#define LINUX_F_ADD_SEALS (LINUX_F_SPECIFIC_BASE + 9)
+#define LINUX_F_GET_SEALS (LINUX_F_SPECIFIC_BASE + 10)
/*
* We have to have 4 copies of the code that converts linux fcntl() file
Index: src/sys/compat/linux/common/linux_file.c
diff -u src/sys/compat/linux/common/linux_file.c:1.122 src/sys/compat/linux/common/linux_file.c:1.123
--- src/sys/compat/linux/common/linux_file.c:1.122 Wed Nov 24 22:08:04 2021
+++ src/sys/compat/linux/common/linux_file.c Sun Jul 9 22:31:55 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: linux_file.c,v 1.122 2021/11/25 03:08:04 ryo Exp $ */
+/* $NetBSD: linux_file.c,v 1.123 2023/07/10 02:31:55 christos Exp $ */
/*-
* Copyright (c) 1995, 1998, 2008 The NetBSD Foundation, Inc.
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: linux_file.c,v 1.122 2021/11/25 03:08:04 ryo Exp $");
+__KERNEL_RCSID(0, "$NetBSD: linux_file.c,v 1.123 2023/07/10 02:31:55 christos Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -440,6 +440,14 @@ linux_sys_fcntl(struct lwp *l, const str
cmd = F_DUPFD_CLOEXEC;
break;
+ case LINUX_F_ADD_SEALS:
+ cmd = F_ADD_SEALS;
+ break;
+
+ case LINUX_F_GET_SEALS:
+ cmd = F_GET_SEALS;
+ break;
+
default:
return EOPNOTSUPP;
}
Index: src/sys/compat/linux/common/linux_misc.c
diff -u src/sys/compat/linux/common/linux_misc.c:1.256 src/sys/compat/linux/common/linux_misc.c:1.257
--- src/sys/compat/linux/common/linux_misc.c:1.256 Wed Dec 1 23:29:48 2021
+++ src/sys/compat/linux/common/linux_misc.c Sun Jul 9 22:31:55 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: linux_misc.c,v 1.256 2021/12/02 04:29:48 ryo Exp $ */
+/* $NetBSD: linux_misc.c,v 1.257 2023/07/10 02:31:55 christos Exp $ */
/*-
* Copyright (c) 1995, 1998, 1999, 2008 The NetBSD Foundation, Inc.
@@ -57,7 +57,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: linux_misc.c,v 1.256 2021/12/02 04:29:48 ryo Exp $");
+__KERNEL_RCSID(0, "$NetBSD: linux_misc.c,v 1.257 2023/07/10 02:31:55 christos Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -169,9 +169,9 @@ const struct linux_mnttypes linux_fstype
const int linux_fstypes_cnt = sizeof(linux_fstypes) / sizeof(linux_fstypes[0]);
# ifdef DEBUG_LINUX
-#define DPRINTF(a) uprintf a
+#define DPRINTF(a) uprintf a
# else
-#define DPRINTF(a)
+#define DPRINTF(a)
# endif
/* Local linux_misc.c functions: */
@@ -1681,3 +1681,66 @@ linux_sys_eventfd2(struct lwp *l, const
return linux_do_eventfd2(l, SCARG(uap, initval), SCARG(uap, flags),
retval);
}
+
+#define LINUX_MFD_CLOEXEC 0x0001U
+#define LINUX_MFD_ALLOW_SEALING 0x0002U
+#define LINUX_MFD_HUGETLB 0x0004U
+#define LINUX_MFD_NOEXEC_SEAL 0x0008U
+#define LINUX_MFD_EXEC 0x0010U
+#define LINUX_MFD_HUGE_FLAGS (0x3fU << 26) /* U suffix: 0x3f << 26 does not fit in int */
+
+#define LINUX_MFD_ALL_FLAGS (LINUX_MFD_CLOEXEC|LINUX_MFD_ALLOW_SEALING \
+ |LINUX_MFD_HUGETLB|LINUX_MFD_NOEXEC_SEAL \
+ |LINUX_MFD_EXEC|LINUX_MFD_HUGE_FLAGS) /* any other bit is rejected with EINVAL */
+#define LINUX_MFD_KNOWN_FLAGS (LINUX_MFD_CLOEXEC|LINUX_MFD_ALLOW_SEALING) /* passed on to sys_memfd_create */
+
+#define LINUX_MFD_NAME_MAX 249 /* longest accepted name, not counting the NUL */
+
+/*
+ * memfd_create(2). Reject unknown or inconsistent flags and over-long
+ * names, then call NetBSD's version with only the flags it knows.
+ */
+int
+linux_sys_memfd_create(struct lwp *l,
+ const struct linux_sys_memfd_create_args *uap, register_t *retval)
+{
+ /* {
+ syscallarg(const char *) name;
+ syscallarg(unsigned int) flags;
+ } */
+ int error;
+ char *pbuf;
+ struct sys_memfd_create_args muap;
+ const unsigned int lflags = SCARG(uap, flags);
+
+ KASSERT(LINUX_MFD_NAME_MAX < NAME_MAX); /* Linux limit must stay below NAME_MAX */
+
+ if (lflags & ~LINUX_MFD_ALL_FLAGS) /* bits outside the Linux flag set */
+ return EINVAL;
+ if ((lflags & LINUX_MFD_HUGE_FLAGS) != 0 &&
+ (lflags & LINUX_MFD_HUGETLB) == 0) /* HUGE_* bits set without HUGETLB */
+ return EINVAL;
+ if ((lflags & LINUX_MFD_HUGETLB) && (lflags & LINUX_MFD_ALLOW_SEALING))
+ return EINVAL;
+
+ /* Linux has a stricter limit for name size; too-long names give EINVAL */
+ pbuf = PNBUF_GET();
+ error = copyinstr(SCARG(uap, name), pbuf, LINUX_MFD_NAME_MAX+1, NULL);
+ PNBUF_PUT(pbuf); /* the copy is only used for the length check */
+ pbuf = NULL;
+ if (error != 0) {
+ if (error == ENAMETOOLONG)
+ error = EINVAL;
+ return error;
+ }
+
+ if (lflags & ~LINUX_MFD_KNOWN_FLAGS) {
+ DPRINTF(("linux_sys_memfd_create: ignored flags %x\n",
+ lflags & ~LINUX_MFD_KNOWN_FLAGS));
+ }
+
+ SCARG(&muap, name) = SCARG(uap, name); /* user pointer is passed through unchanged */
+ SCARG(&muap, flags) = lflags & LINUX_MFD_KNOWN_FLAGS; /* pass on only the known flags */
+
+ return sys_memfd_create(l, &muap, retval);
+}
Index: src/sys/compat/linux/common/linux_sysctl.c
diff -u src/sys/compat/linux/common/linux_sysctl.c:1.47 src/sys/compat/linux/common/linux_sysctl.c:1.48
--- src/sys/compat/linux/common/linux_sysctl.c:1.47 Thu Sep 23 02:56:27 2021
+++ src/sys/compat/linux/common/linux_sysctl.c Sun Jul 9 22:31:55 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: linux_sysctl.c,v 1.47 2021/09/23 06:56:27 ryo Exp $ */
+/* $NetBSD: linux_sysctl.c,v 1.48 2023/07/10 02:31:55 christos Exp $ */
/*-
* Copyright (c) 2003, 2008 The NetBSD Foundation, Inc.
@@ -34,7 +34,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: linux_sysctl.c,v 1.47 2021/09/23 06:56:27 ryo Exp $");
+__KERNEL_RCSID(0, "$NetBSD: linux_sysctl.c,v 1.48 2023/07/10 02:31:55 christos Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -57,8 +57,8 @@ __KERNEL_RCSID(0, "$NetBSD: linux_sysctl
#include <compat/linux/common/linux_machdep.h>
char linux_sysname[128] = "Linux";
-char linux_release[128] = "3.11.6";
-char linux_version[128] = "#1 SMP PREEMPT Thu Oct 24 16:23:02 UTC 2013";
+char linux_release[128] = "6.3.10";
+char linux_version[128] = "#1 SMP PREEMPT_DYNAMIC Wed Jun 28 18:34:30 UTC 2023";
struct sysctlnode linux_sysctl_root = {
.sysctl_flags = SYSCTL_VERSION|
Index: src/sys/kern/sys_descrip.c
diff -u src/sys/kern/sys_descrip.c:1.47 src/sys/kern/sys_descrip.c:1.48
--- src/sys/kern/sys_descrip.c:1.47 Sun May 14 05:29:58 2023
+++ src/sys/kern/sys_descrip.c Sun Jul 9 22:31:55 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: sys_descrip.c,v 1.47 2023/05/14 09:29:58 riastradh Exp $ */
+/* $NetBSD: sys_descrip.c,v 1.48 2023/07/10 02:31:55 christos Exp $ */
/*-
* Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
@@ -67,7 +67,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: sys_descrip.c,v 1.47 2023/05/14 09:29:58 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: sys_descrip.c,v 1.48 2023/07/10 02:31:55 christos Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -315,26 +315,6 @@ out: if (fp)
return error;
}
-static int
-do_fcntl_getpath(struct lwp *l, file_t *fp, char *upath)
-{
- char *kpath;
- int error;
-
- if (fp->f_type != DTYPE_VNODE)
- return EOPNOTSUPP;
-
- kpath = PNBUF_GET();
-
- error = vnode_to_path(kpath, MAXPATHLEN, fp->f_vnode, l, l->l_proc);
- if (!error)
- error = copyoutstr(kpath, upath, MAXPATHLEN, NULL);
-
- PNBUF_PUT(kpath);
-
- return error;
-}
-
/*
* The file control system call.
*/
@@ -350,6 +330,7 @@ sys_fcntl(struct lwp *l, const struct sy
filedesc_t *fdp;
fdtab_t *dt;
file_t *fp;
+ char *kpath;
struct flock fl;
bool cloexec = false;
@@ -486,7 +467,30 @@ sys_fcntl(struct lwp *l, const struct sy
break;
case F_GETPATH:
- error = do_fcntl_getpath(l, fp, SCARG(uap, arg));
+ kpath = PNBUF_GET();
+
+ /* vnodes need extra context, so are handled separately */
+ if (fp->f_type == DTYPE_VNODE)
+ error = vnode_to_path(kpath, MAXPATHLEN, fp->f_vnode,
+ l, l->l_proc);
+ else
+ error = (*fp->f_ops->fo_fcntl)(fp, F_GETPATH, kpath);
+
+ if (error == 0)
+ error = copyoutstr(kpath, SCARG(uap, arg), MAXPATHLEN,
+ NULL);
+
+ PNBUF_PUT(kpath);
+ break;
+
+ case F_ADD_SEALS:
+ tmp = (int)(uintptr_t) SCARG(uap, arg);
+ error = (*fp->f_ops->fo_fcntl)(fp, F_ADD_SEALS, &tmp);
+ break;
+
+ case F_GET_SEALS:
+ error = (*fp->f_ops->fo_fcntl)(fp, F_GET_SEALS, &tmp);
+ *retval = tmp;
break;
default:
Index: src/sys/kern/vfs_syscalls.c
diff -u src/sys/kern/vfs_syscalls.c:1.559 src/sys/kern/vfs_syscalls.c:1.560
--- src/sys/kern/vfs_syscalls.c:1.559 Sat Apr 29 02:34:20 2023
+++ src/sys/kern/vfs_syscalls.c Sun Jul 9 22:31:55 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: vfs_syscalls.c,v 1.559 2023/04/29 06:34:20 riastradh Exp $ */
+/* $NetBSD: vfs_syscalls.c,v 1.560 2023/07/10 02:31:55 christos Exp $ */
/*-
* Copyright (c) 2008, 2009, 2019, 2020 The NetBSD Foundation, Inc.
@@ -70,7 +70,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.559 2023/04/29 06:34:20 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.560 2023/07/10 02:31:55 christos Exp $");
#ifdef _KERNEL_OPT
#include "opt_fileassoc.h"
@@ -4149,34 +4149,19 @@ sys_ftruncate(struct lwp *l, const struc
syscallarg(int) pad;
syscallarg(off_t) length;
} */
- struct vattr vattr;
- struct vnode *vp;
file_t *fp;
- int error;
+ int error, fd = SCARG(uap, fd);
- if (SCARG(uap, length) < 0)
- return EINVAL;
+ fp = fd_getfile(fd);
+ if (fp == NULL)
+ return EBADF;
+ if (fp->f_ops->fo_truncate == NULL)
+ error = EOPNOTSUPP;
+ else
+ error = (*fp->f_ops->fo_truncate)(fp, SCARG(uap, length));
- /* fd_getvnode() will use the descriptor for us */
- if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
- return (error);
- if ((fp->f_flag & FWRITE) == 0) {
- error = EINVAL;
- goto out;
- }
- vp = fp->f_vnode;
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- if (vp->v_type == VDIR)
- error = EISDIR;
- else if ((error = vn_writechk(vp)) == 0) {
- vattr_null(&vattr);
- vattr.va_size = SCARG(uap, length);
- error = VOP_SETATTR(vp, &vattr, fp->f_cred);
- }
- VOP_UNLOCK(vp);
- out:
- fd_putfile(SCARG(uap, fd));
- return (error);
+ fd_putfile(fd);
+ return error;
}
/*
Index: src/sys/kern/vfs_vnops.c
diff -u src/sys/kern/vfs_vnops.c:1.241 src/sys/kern/vfs_vnops.c:1.242
--- src/sys/kern/vfs_vnops.c:1.241 Sat Apr 22 09:53:02 2023
+++ src/sys/kern/vfs_vnops.c Sun Jul 9 22:31:55 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: vfs_vnops.c,v 1.241 2023/04/22 13:53:02 riastradh Exp $ */
+/* $NetBSD: vfs_vnops.c,v 1.242 2023/07/10 02:31:55 christos Exp $ */
/*-
* Copyright (c) 2009 The NetBSD Foundation, Inc.
@@ -66,7 +66,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.241 2023/04/22 13:53:02 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.242 2023/07/10 02:31:55 christos Exp $");
#include "veriexec.h"
@@ -125,6 +125,7 @@ static int vn_seek(struct file *, off_t,
static int vn_advlock(struct file *, void *, int, struct flock *, int);
static int vn_fpathconf(struct file *, int, register_t *);
static int vn_posix_fadvise(struct file *, off_t, off_t, int);
+static int vn_truncate(file_t *, off_t);
const struct fileops vnops = {
.fo_name = "vn",
@@ -142,6 +143,7 @@ const struct fileops vnops = {
.fo_advlock = vn_advlock,
.fo_fpathconf = vn_fpathconf,
.fo_posix_fadvise = vn_posix_fadvise,
+ .fo_truncate = vn_truncate,
};
/*
@@ -1331,6 +1333,33 @@ vn_posix_fadvise(struct file *fp, off_t
return error;
}
+static int
+vn_truncate(file_t *fp, off_t length) /* vnops.fo_truncate: set file size to length */
+{
+ struct vattr vattr;
+ struct vnode *vp;
+ int error = 0;
+
+ if (length < 0)
+ return EINVAL;
+
+ if ((fp->f_flag & FWRITE) == 0) /* descriptor is not open for writing */
+ return EINVAL;
+ vp = fp->f_vnode;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* held until VOP_UNLOCK below */
+ if (vp->v_type == VDIR) /* directories are refused */
+ error = EISDIR;
+ else if ((error = vn_writechk(vp)) == 0) {
+ vattr_null(&vattr);
+ vattr.va_size = length; /* the only attribute set here before VOP_SETATTR */
+ error = VOP_SETATTR(vp, &vattr, fp->f_cred);
+ }
+ VOP_UNLOCK(vp);
+
+ return error;
+}
+
+
/*
* Check that the vnode is still valid, and if so
* acquire requested lock.
Index: src/sys/sys/fcntl.h
diff -u src/sys/sys/fcntl.h:1.54 src/sys/sys/fcntl.h:1.55
--- src/sys/sys/fcntl.h:1.54 Mon Mar 30 16:17:42 2020
+++ src/sys/sys/fcntl.h Sun Jul 9 22:31:55 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: fcntl.h,v 1.54 2020/03/30 20:17:42 kamil Exp $ */
+/* $NetBSD: fcntl.h,v 1.55 2023/07/10 02:31:55 christos Exp $ */
/*-
* Copyright (c) 1983, 1990, 1993
@@ -200,6 +200,8 @@
#define F_GETNOSIGPIPE 13 /* get SIGPIPE disposition */
#define F_SETNOSIGPIPE 14 /* set SIGPIPE disposition */
#define F_GETPATH 15 /* get pathname associated with fd */
+#define F_ADD_SEALS 16 /* add seals */
+#define F_GET_SEALS 17 /* get seals */
#endif
/* file descriptor flags (F_GETFD, F_SETFD) */
@@ -215,6 +217,15 @@
#define F_POSIX 0x040 /* Use POSIX semantics for lock */
#endif
+/* types of seals (F_ADD_SEALS, F_GET_SEALS) */
+#if defined(_NETBSD_SOURCE)
+#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */
+#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
+#define F_SEAL_GROW 0x0004 /* prevent file from growing */
+#define F_SEAL_WRITE 0x0008 /* prevent writes */
+#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */
+#endif
+
/* Constants for fcntl's passed to the underlying fs - like ioctl's. */
#if defined(_NETBSD_SOURCE)
#define F_PARAM_MASK 0xfff
Index: src/sys/sys/file.h
diff -u src/sys/sys/file.h:1.92 src/sys/sys/file.h:1.93
--- src/sys/sys/file.h:1.92 Sat Apr 22 09:53:02 2023
+++ src/sys/sys/file.h Sun Jul 9 22:31:55 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: file.h,v 1.92 2023/04/22 13:53:02 riastradh Exp $ */
+/* $NetBSD: file.h,v 1.93 2023/07/10 02:31:55 christos Exp $ */
/*-
* Copyright (c) 2009 The NetBSD Foundation, Inc.
@@ -103,6 +103,7 @@ struct fileops {
int (*fo_fpathconf) (struct file *, int, register_t *);
int (*fo_posix_fadvise)
(struct file *, off_t, off_t, int);
+ int (*fo_truncate) (struct file *, off_t);
};
union file_data {
@@ -121,6 +122,7 @@ union file_data {
struct mqueue *fd_mq; // DTYPE_MQUEUE
struct ksem *fd_ks; // DTYPE_SEM
struct iscsifd *fd_iscsi; // DTYPE_MISC (iscsi)
+ struct memfd *fd_memfd; // DTYPE_MEMFD
};
/*
@@ -160,6 +162,7 @@ struct file {
#define f_ksem f_undata.fd_ks
#define f_eventfd f_undata.fd_eventfd
#define f_timerfd f_undata.fd_timerfd
+#define f_memfd f_undata.fd_memfd
#define f_rndctx f_undata.fd_rndctx
#define f_audioctx f_undata.fd_audioctx
@@ -184,10 +187,11 @@ struct file {
#define DTYPE_SEM 8 /* semaphore */
#define DTYPE_EVENTFD 9 /* eventfd */
#define DTYPE_TIMERFD 10 /* timerfd */
+#define DTYPE_MEMFD 11 /* memfd */
#define DTYPE_NAMES \
"0", "file", "socket", "pipe", "kqueue", "misc", "crypto", "mqueue", \
- "semaphore", "eventfd", "timerfd"
+ "semaphore", "eventfd", "timerfd", "memfd"
#ifdef _KERNEL
Index: src/sys/sys/mman.h
diff -u src/sys/sys/mman.h:1.62 src/sys/sys/mman.h:1.63
--- src/sys/sys/mman.h:1.62 Fri Dec 6 14:37:43 2019
+++ src/sys/sys/mman.h Sun Jul 9 22:31:55 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: mman.h,v 1.62 2019/12/06 19:37:43 christos Exp $ */
+/* $NetBSD: mman.h,v 1.63 2023/07/10 02:31:55 christos Exp $ */
/*-
* Copyright (c) 1982, 1986, 1993
@@ -212,7 +212,13 @@ typedef __off_t off_t; /* file offset
implemented in UVM */
#define MAP_INHERIT_ZERO 4 /* zero in child */
#define MAP_INHERIT_DEFAULT MAP_INHERIT_COPY
-#endif
+
+/*
+ * Flags to memfd_create
+ */
+#define MFD_CLOEXEC 0x1U
+#define MFD_ALLOW_SEALING 0x2U
+#endif /* _NETBSD_SOURCE */
#ifndef _KERNEL
@@ -234,12 +240,31 @@ int madvise(void *, size_t, int);
int mincore(void *, size_t, char *);
int minherit(void *, size_t, int);
void * mremap(void *, size_t, void *, size_t, int);
+int memfd_create(const char *, unsigned int);
#endif
int posix_madvise(void *, size_t, int);
int shm_open(const char *, int, mode_t);
int shm_unlink(const char *);
__END_DECLS
+#else
+
+#include <sys/syslimits.h> /* for NAME_MAX */
+#include <sys/timespec.h> /* for struct timespec */
+#include <sys/mutex.h> /* for kmutex_t */
+
+struct memfd { /* state behind a DTYPE_MEMFD file (file_data.fd_memfd) */
+ char mfd_name[NAME_MAX+1]; /* name string; printed by fstat(1) */
+ struct uvm_object *mfd_uobj; /* presumably the UVM object holding the data -- confirm */
+ size_t mfd_size; /* presumably the current file size in bytes -- confirm */
+ int mfd_seals; /* F_SEAL_* bits currently set */
+ kmutex_t mfd_lock; /* for truncate */
+
+ struct timespec mfd_btime; /* presumably birth time -- confirm */
+ struct timespec mfd_atime; /* presumably last access time -- confirm */
+ struct timespec mfd_mtime; /* presumably last modification time -- confirm */
+};
+
#endif /* !_KERNEL */
#endif /* !_SYS_MMAN_H_ */
Index: src/usr.bin/fstat/fstat.c
diff -u src/usr.bin/fstat/fstat.c:1.117 src/usr.bin/fstat/fstat.c:1.118
--- src/usr.bin/fstat/fstat.c:1.117 Fri Oct 28 01:27:16 2022
+++ src/usr.bin/fstat/fstat.c Sun Jul 9 22:31:55 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: fstat.c,v 1.117 2022/10/28 05:27:16 ozaki-r Exp $ */
+/* $NetBSD: fstat.c,v 1.118 2023/07/10 02:31:55 christos Exp $ */
/*-
* Copyright (c) 1988, 1993
@@ -39,7 +39,7 @@ __COPYRIGHT("@(#) Copyright (c) 1988, 19
#if 0
static char sccsid[] = "@(#)fstat.c 8.3 (Berkeley) 5/2/95";
#else
-__RCSID("$NetBSD: fstat.c,v 1.117 2022/10/28 05:27:16 ozaki-r Exp $");
+__RCSID("$NetBSD: fstat.c,v 1.118 2023/07/10 02:31:55 christos Exp $");
#endif
#endif /* not lint */
@@ -548,6 +548,7 @@ ftrans(fdfile_t *fp, int i)
case DTYPE_CRYPTO:
case DTYPE_MQUEUE:
case DTYPE_SEM:
+ case DTYPE_MEMFD:
if (checkfile == 0)
misctrans(&file, i);
break;
Index: src/usr.bin/fstat/misc.c
diff -u src/usr.bin/fstat/misc.c:1.24 src/usr.bin/fstat/misc.c:1.25
--- src/usr.bin/fstat/misc.c:1.24 Sun Sep 13 00:14:48 2020
+++ src/usr.bin/fstat/misc.c Sun Jul 9 22:31:55 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: misc.c,v 1.24 2020/09/13 04:14:48 isaki Exp $ */
+/* $NetBSD: misc.c,v 1.25 2023/07/10 02:31:55 christos Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
-__RCSID("$NetBSD: misc.c,v 1.24 2020/09/13 04:14:48 isaki Exp $");
+__RCSID("$NetBSD: misc.c,v 1.25 2023/07/10 02:31:55 christos Exp $");
#include <stdbool.h>
#include <sys/param.h>
@@ -56,6 +56,9 @@ __RCSID("$NetBSD: misc.c,v 1.24 2020/09/
#undef _KERNEL
#include <sys/cprng.h>
#include <sys/vnode.h>
+#define _KERNEL
+#include <sys/mman.h>
+#undef _KERNEL
#include <sys/mount.h>
#include <net/bpfdesc.h>
@@ -110,7 +113,9 @@ static struct nlist nl[] = {
{ .n_name = "audio_fileops" },
#define NL_PAD 19
{ .n_name = "pad_fileops" },
-#define NL_MAX 20
+#define NL_MEMFD 20
+ { .n_name = "memfd_fileops" },
+#define NL_MAX 21
{ .n_name = NULL }
};
@@ -263,6 +268,40 @@ p_audio(struct file *f)
return 0;
}
+static int
+p_memfd_seal(int seen, int all, int target, const char *name)
+{
+ if (all & target) /* seal is set: print its name, '|' first if not the first one */
+ (void)printf("%s%s", (seen ? "|" : ""), name);
+
+ return seen || (all & target); /* nonzero once any seal name has been printed */
+}
+
+static int
+p_memfd(struct file *f) /* print a memfd's name and its list of seals */
+{
+ int seal_yet = 0; /* becomes nonzero once a seal name has been printed */
+ struct memfd mfd; /* local copy filled in by KVM_READ below */
+
+ if (!KVM_READ(f->f_data, &mfd, sizeof(mfd))) {
+ dprintf("can't read memfd at %p for pid %d", f->f_data, Pid);
+ return 0;
+ }
+ (void)printf("* %s, seals=", mfd.mfd_name);
+ if (mfd.mfd_seals == 0)
+ (void)printf("0");
+ else { /* print each set seal by name, joined with '|' */
+ seal_yet = p_memfd_seal(seal_yet, mfd.mfd_seals, F_SEAL_SEAL, "F_SEAL_SEAL");
+ seal_yet = p_memfd_seal(seal_yet, mfd.mfd_seals, F_SEAL_SHRINK, "F_SEAL_SHRINK");
+ seal_yet = p_memfd_seal(seal_yet, mfd.mfd_seals, F_SEAL_GROW, "F_SEAL_GROW");
+ seal_yet = p_memfd_seal(seal_yet, mfd.mfd_seals, F_SEAL_WRITE, "F_SEAL_WRITE");
+ seal_yet = p_memfd_seal(seal_yet, mfd.mfd_seals, F_SEAL_FUTURE_WRITE, "F_SEAL_FUTURE_WRITE");
+ }
+
+ oprint(f, "\n");
+ return 0;
+}
+
int
pmisc(struct file *f, const char *name)
{
@@ -310,6 +349,8 @@ pmisc(struct file *f, const char *name)
case NL_PAD:
printf("* pad %p", f->f_data);
break;
+ case NL_MEMFD:
+ return p_memfd(f);
case NL_MAX:
printf("* %s ops=%p %p", name, f->f_ops, f->f_data);
break;
Added files:
Index: src/lib/libc/sys/memfd_create.2
diff -u /dev/null src/lib/libc/sys/memfd_create.2:1.1
--- /dev/null Sun Jul 9 22:31:55 2023
+++ src/lib/libc/sys/memfd_create.2 Sun Jul 9 22:31:54 2023
@@ -0,0 +1,125 @@
+.\" $NetBSD: memfd_create.2,v 1.1 2023/07/10 02:31:54 christos Exp $
+.\"
+.\" Copyright (c) 2023 The NetBSD Foundation, Inc.
+.\" All rights reserved.
+.\"
+.\" This code is derived from software contributed to The NetBSD Foundation
+.\" by Theodore Preduta.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+.\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+.\" POSSIBILITY OF SUCH DAMAGE.
+.\"
+.Dd July 5, 2023
+.Dt MEMFD_CREATE 2
+.Os
+.Sh NAME
+.Nm memfd_create
+.Nd create anonymous files
+.Sh LIBRARY
+.Lb libc
+.Sh SYNOPSIS
+.In sys/mman.h
+.Ft int
+.Fn memfd_create "const char *name" "unsigned int flags"
+.Sh DESCRIPTION
+The
+.Fn memfd_create
+system call returns a file descriptor to a file named
+.Fa name
+backed only by RAM.
+Initially, the size of the file is zero.
+.Pp
+The length of
+.Fa name
+must not exceed
+.Dv NAME_MAX-6
+characters, to allow for the prefix
+.Dq memfd:
+to be added.
+Since the file does not live on disk,
+.Fa name
+does not have to be unique.
+.Fa name
+is only intended to be used for debugging purposes and commands like
+.Xr fstat 1 .
+.Pp
+Additionally, any of the following may be specified as the
+.Fa flags :
+.Bl -tag -width MFD_ALLOW_SEALING
+.It Dv MFD_CLOEXEC
+Set the
+.Xr close 2
+on
+.Xr exec 3
+flag.
+.It Dv MFD_ALLOW_SEALING
+Allow adding seals to the file descriptor using the
+.Xr fcntl 2
+.Dv F_ADD_SEALS
+command.
+.El
+.Pp
+Otherwise, the returned file descriptor behaves the same as a regular file,
+including the ability to be mapped by
+.Xr mmap 2 .
+.Sh RETURN VALUES
+If successful, the
+.Fn memfd_create
+system call returns a non-negative integer.
+On failure, \-1 is returned and
+.Va errno
+is set to indicate the error.
+.Sh ERRORS
+.Fn memfd_create
+will fail if:
+.Bl -tag -width Er
+.It Bq Er EFAULT
+The argument
+.Fa name
+is
+.Dv NULL
+or points to invalid memory.
+.It Bq Er EINVAL
+The argument
+.Fa flags
+has any bits set other than
+.Dv MFD_CLOEXEC
+or
+.Dv MFD_ALLOW_SEALING .
+.It Bq Er ENAMETOOLONG
+The length of
+.Fa name
+appended with the prefix
+.Dq memfd:
+would exceed
+.Dv NAME_MAX .
+.It Bq Er ENFILE
+The system file table is full.
+.El
+.Sh SEE ALSO
+.Xr fcntl 2 ,
+.Xr mmap 2 ,
+.Xr shmget 2 ,
+.Xr shm_open 3
+.Sh HISTORY
+.Fn memfd_create
+is compatible with the Linux system call of the same name that first appeared in
+Linux 3.17.
Index: src/sys/kern/sys_memfd.c
diff -u /dev/null src/sys/kern/sys_memfd.c:1.1
--- /dev/null Sun Jul 9 22:31:55 2023
+++ src/sys/kern/sys_memfd.c Sun Jul 9 22:31:55 2023
@@ -0,0 +1,408 @@
+/* $NetBSD: sys_memfd.c,v 1.1 2023/07/10 02:31:55 christos Exp $ */
+
+/*-
+ * Copyright (c) 2023 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Theodore Preduta.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: sys_memfd.c,v 1.1 2023/07/10 02:31:55 christos Exp $");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/mman.h>
+#include <sys/syscallargs.h>
+
+#include <uvm/uvm_extern.h>
+#include <uvm/uvm_object.h>
+
+#define F_SEAL_ANY_WRITE (F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)
+#define MFD_KNOWN_SEALS (F_SEAL_SEAL|F_SEAL_SHRINK|F_SEAL_GROW \
+ |F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)
+
+static const char memfd_prefix[] = "memfd:";
+
+static int memfd_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
+static int memfd_write(file_t *, off_t *, struct uio *, kauth_cred_t, int);
+static int memfd_ioctl(file_t *, u_long, void *);
+static int memfd_fcntl(file_t *, u_int, void *);
+static int memfd_stat(file_t *, struct stat *);
+static int memfd_close(file_t *);
+static int memfd_mmap(file_t *, off_t *, size_t, int, int *, int *,
+ struct uvm_object **, int *);
+static int memfd_seek(file_t *, off_t, int, off_t *, int);
+static int memfd_truncate(file_t *, off_t);
+
+/*
+ * File operations vector installed on every memfd descriptor by
+ * sys_memfd_create().  read, write, ioctl, fcntl, stat, close, mmap,
+ * seek and truncate are implemented in this file; poll, kqfilter and
+ * restart use the generic fnullop_* handlers, and fpathconf and
+ * posix_fadvise are routed to eopnotsupp (presumably failing with
+ * EOPNOTSUPP -- confirm against its definition).
+ */
+static const struct fileops memfd_fileops = {
+ .fo_name = "memfd",
+ .fo_read = memfd_read,
+ .fo_write = memfd_write,
+ .fo_ioctl = memfd_ioctl,
+ .fo_fcntl = memfd_fcntl,
+ .fo_poll = fnullop_poll,
+ .fo_stat = memfd_stat,
+ .fo_close = memfd_close,
+ .fo_kqfilter = fnullop_kqfilter,
+ .fo_restart = fnullop_restart,
+ .fo_mmap = memfd_mmap,
+ .fo_seek = memfd_seek,
+ .fo_fpathconf = (void *)eopnotsupp,
+ .fo_posix_fadvise = (void *)eopnotsupp,
+ .fo_truncate = memfd_truncate,
+};
+
+/*
+ * memfd_create(2).  Create a file descriptor associated with anonymous
+ * memory.
+ *
+ * The user-supplied name is copied in behind the "memfd:" prefix and is
+ * only stored in the memfd.  flags may contain MFD_CLOEXEC and
+ * MFD_ALLOW_SEALING; any other bit yields EINVAL.  Without
+ * MFD_ALLOW_SEALING the memfd starts out with F_SEAL_SEAL set, which
+ * makes memfd_fcntl() refuse every later F_ADD_SEALS.
+ *
+ * On success the new descriptor is stored in *retval and 0 is returned.
+ * Otherwise the error from copyinstr() or fd_allocfile() is returned and
+ * everything set up here is torn down again.
+ */
+int
+sys_memfd_create(struct lwp *l, const struct sys_memfd_create_args *uap,
+    register_t *retval)
+{
+	/* {
+		syscallarg(const char *) name;
+		syscallarg(unsigned int) flags;
+	} */
+	int error, fd;
+	file_t *fp;
+	struct memfd *mfd;
+	struct proc *p = l->l_proc;
+	const unsigned int flags = SCARG(uap, flags);
+
+	/*
+	 * Sanity check: the prefix must leave room for a name.  Compare
+	 * directly; an unsigned subtraction could never go negative.
+	 */
+	KASSERT(sizeof(memfd_prefix) < NAME_MAX);
+
+	if (flags & ~(MFD_CLOEXEC|MFD_ALLOW_SEALING))
+		return EINVAL;
+
+	mfd = kmem_zalloc(sizeof(*mfd), KM_SLEEP);
+	mfd->mfd_size = 0;
+	mfd->mfd_uobj = uao_create(INT64_MAX - PAGE_SIZE, 0); /* same as tmpfs */
+	mutex_init(&mfd->mfd_lock, MUTEX_DEFAULT, IPL_NONE);
+
+	/* "memfd:" followed by the caller's name, NUL included. */
+	strcpy(mfd->mfd_name, memfd_prefix);
+	error = copyinstr(SCARG(uap, name),
+	    &mfd->mfd_name[sizeof(memfd_prefix) - 1],
+	    sizeof(mfd->mfd_name) - sizeof(memfd_prefix), NULL);
+	if (error != 0)
+		goto leave;
+
+	getnanotime(&mfd->mfd_btime);
+
+	if ((flags & MFD_ALLOW_SEALING) == 0)
+		mfd->mfd_seals |= F_SEAL_SEAL;
+
+	error = fd_allocfile(&fp, &fd);
+	if (error != 0)
+		goto leave;
+
+	fp->f_flag = FREAD|FWRITE;
+	fp->f_type = DTYPE_MEMFD;
+	fp->f_ops = &memfd_fileops;
+	fp->f_memfd = mfd;
+	fd_set_exclose(l, fd, (flags & MFD_CLOEXEC) != 0);
+	fd_affix(p, fp, fd);
+
+	*retval = fd;
+	return 0;
+
+leave:
+	/* Undo everything in the same order memfd_close() does. */
+	uao_detach(mfd->mfd_uobj);
+	mutex_destroy(&mfd->mfd_lock);
+	kmem_free(mfd, sizeof(*mfd));
+	return error;
+}
+
+/*
+ * Read from a memfd.
+ *
+ * Transfers at most uio_resid bytes starting at *offp, clipped to the
+ * current size of the memfd.  A read at or beyond the end transfers
+ * nothing and returns 0; a negative offset returns EINVAL.  When the
+ * caller passes FOF_UPDATE_OFFSET, *offp is moved to where the transfer
+ * stopped so that consecutive reads walk through the file.
+ *
+ * fp->f_lock is only taken when offp is the descriptor's own f_offset.
+ * The access time is refreshed on every call, successful or not.
+ */
+static int
+memfd_read(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
+    int flags)
+{
+	int error;
+	vsize_t todo;
+	struct memfd *mfd = fp->f_memfd;
+
+	if (offp == &fp->f_offset)
+		mutex_enter(&fp->f_lock);
+
+	if (*offp < 0) {
+		error = EINVAL;
+		goto leave;
+	}
+
+	/* Trying to read past the end does nothing. */
+	if (*offp >= mfd->mfd_size) {
+		error = 0;
+		goto leave;
+	}
+
+	uio->uio_offset = *offp;
+	todo = MIN(uio->uio_resid, mfd->mfd_size - *offp);
+	error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
+	    UBC_READ|UBC_PARTIALOK);
+
+	/*
+	 * Nobody else advances the offset for us.  uio_offset is expected
+	 * to have been moved past whatever was copied, even on a partial
+	 * transfer.
+	 */
+	if (flags & FOF_UPDATE_OFFSET)
+		*offp = uio->uio_offset;
+
+leave:
+	if (offp == &fp->f_offset)
+		mutex_exit(&fp->f_lock);
+
+	getnanotime(&mfd->mfd_atime);
+
+	return error;
+}
+
+/*
+ * Write to a memfd.
+ *
+ * Fails with EPERM when F_SEAL_WRITE or F_SEAL_FUTURE_WRITE is set and
+ * with EINVAL for a negative offset.  A write reaching the current end
+ * or beyond grows the memfd through memfd_truncate() first, unless
+ * F_SEAL_GROW is set: then a write starting at or past the end fails
+ * with EPERM and one that merely crosses it is cut short at the end.
+ * When the caller passes FOF_UPDATE_OFFSET, *offp is moved to where
+ * the transfer stopped so that consecutive writes append.
+ *
+ * fp->f_lock is only taken when offp is the descriptor's own f_offset.
+ */
+static int
+memfd_write(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
+    int flags)
+{
+	int error;
+	vsize_t todo;
+	struct memfd *mfd = fp->f_memfd;
+
+	if (mfd->mfd_seals & F_SEAL_ANY_WRITE)
+		return EPERM;
+
+	if (offp == &fp->f_offset)
+		mutex_enter(&fp->f_lock);
+
+	if (*offp < 0) {
+		error = EINVAL;
+		goto leave;
+	}
+
+	uio->uio_offset = *offp;
+	todo = uio->uio_resid;
+
+	if (mfd->mfd_seals & F_SEAL_GROW) {
+		if (*offp >= mfd->mfd_size) {
+			error = EPERM;
+			goto leave;
+		}
+
+		/* Truncate the write to fit in mfd_size */
+		if (*offp + uio->uio_resid >= mfd->mfd_size)
+			todo = mfd->mfd_size - *offp;
+	} else if (*offp + uio->uio_resid >= mfd->mfd_size) {
+		/* Grow to accommodate the write request. */
+		error = memfd_truncate(fp, *offp + uio->uio_resid);
+		if (error != 0)
+			goto leave;
+	}
+
+	error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
+	    UBC_WRITE|UBC_PARTIALOK);
+
+	/*
+	 * Nobody else advances the offset for us.  uio_offset is expected
+	 * to have been moved past whatever was copied, even on a partial
+	 * transfer.
+	 */
+	if (flags & FOF_UPDATE_OFFSET)
+		*offp = uio->uio_offset;
+
+	getnanotime(&mfd->mfd_mtime);
+
+leave:
+	if (offp == &fp->f_offset)
+		mutex_exit(&fp->f_lock);
+
+	return error;
+}
+
+/*
+ * ioctl(2) backend for memfds.  No ioctl command is implemented, so
+ * every request fails with EINVAL regardless of cmd and data.
+ */
+static int
+memfd_ioctl(file_t *fp, u_long cmd, void *data)
+{
+
+ return EINVAL;
+}
+
+/*
+ * fcntl(2) backend for memfds.  data points at an int holding a seal
+ * mask.
+ *
+ * F_GET_SEALS	stores the current seals in *data.
+ * F_ADD_SEALS	ORs the seals in *data into the memfd.  Fails with
+ *		EPERM once F_SEAL_SEAL is set, EINVAL for unknown seal
+ *		bits, and EBUSY when F_SEAL_WRITE is newly requested
+ *		while the backing object has extra references.
+ *
+ * Any other command returns EINVAL.
+ */
+static int
+memfd_fcntl(file_t *fp, u_int cmd, void *data)
+{
+	struct memfd *mfd = fp->f_memfd;
+	int *const sealsp = data;
+
+	if (cmd == F_GET_SEALS) {
+		*sealsp = mfd->mfd_seals;
+		return 0;
+	}
+	if (cmd != F_ADD_SEALS)
+		return EINVAL;
+
+	/* F_SEAL_SEAL freezes the seal set. */
+	if ((mfd->mfd_seals & F_SEAL_SEAL) != 0)
+		return EPERM;
+
+	if ((*sealsp & ~MFD_KNOWN_SEALS) != 0)
+		return EINVAL;
+
+	/*
+	 * F_SEAL_WRITE can only be added while nothing has the memfd
+	 * mapped.  The memfd itself holds one reference on the object
+	 * and memfd_mmap() takes another per mapping, so more than one
+	 * reference means a mapping exists.
+	 *
+	 * XXX ideally only mappings with PROT_WRITE would block the
+	 * seal.
+	 */
+	if ((*sealsp & F_SEAL_WRITE) != 0 &&
+	    (mfd->mfd_seals & F_SEAL_WRITE) == 0 &&
+	    mfd->mfd_uobj->uo_refs > 1)
+		return EBUSY;
+
+	mfd->mfd_seals |= *sealsp;
+	return 0;
+}
+
+/*
+ * fstat(2) backend for memfds.  Fills *st from scratch: owner and group
+ * come from the credentials the file was opened with, the size and the
+ * timestamps from the memfd.  The mode is owner-read, plus owner-write
+ * unless a write seal is set.  No separate change time is kept, so
+ * st_ctimespec mirrors the modification time.  Always returns 0.
+ */
+static int
+memfd_stat(file_t *fp, struct stat *st)
+{
+	struct memfd *mfd = fp->f_memfd;
+	const bool write_sealed = (mfd->mfd_seals & F_SEAL_ANY_WRITE) != 0;
+
+	memset(st, 0, sizeof(*st));
+
+	st->st_mode = write_sealed ? S_IREAD : (S_IREAD | S_IWRITE);
+	st->st_size = mfd->mfd_size;
+	st->st_uid = kauth_cred_geteuid(fp->f_cred);
+	st->st_gid = kauth_cred_getegid(fp->f_cred);
+
+	st->st_atimespec = mfd->mfd_atime;
+	st->st_mtimespec = mfd->mfd_mtime;
+	st->st_ctimespec = mfd->mfd_mtime;
+	st->st_birthtimespec = mfd->mfd_btime;
+
+	return 0;
+}
+
+/*
+ * Close backend for memfds: releases everything sys_memfd_create() set
+ * up for this file.  Always returns 0.
+ */
+static int
+memfd_close(file_t *fp)
+{
+ struct memfd *mfd = fp->f_memfd;
+
+ /* Drop the memfd's own reference on the backing object. */
+ uao_detach(mfd->mfd_uobj);
+ mutex_destroy(&mfd->mfd_lock);
+
+ kmem_free(mfd, sizeof(*mfd));
+ /* Clear the pointer so the freed memfd cannot be reached via fp. */
+ fp->f_memfd = NULL;
+
+ return 0;
+}
+
+/*
+ * mmap(2) backend for memfds.
+ *
+ * Hands the backing object to the caller in *uobjp with an extra
+ * reference, sets *maxprotp to the requested protection and *advicep
+ * to UVM_ADV_RANDOM.  The requested range must lie within the pages
+ * that hold the file; since size arrives rounded up to a page multiple
+ * it is checked against the page-rounded file size, so the page holding
+ * the last bytes of the file can be mapped too.
+ *
+ * Returns EINVAL for a negative or out-of-range request and EPERM for a
+ * writable shared mapping of a write-sealed memfd.
+ */
+static int
+memfd_mmap(file_t *fp, off_t *offp, size_t size, int prot, int *flagsp,
+    int *advicep, struct uvm_object **uobjp, int *maxprotp)
+{
+	struct memfd *mfd = fp->f_memfd;
+	off_t mapend;
+
+	/* uvm_mmap guarantees page-aligned offset and size. */
+	KASSERT(*offp == round_page(*offp));
+	KASSERT(size == round_page(size));
+	KASSERT(size > 0);
+
+	/* First byte past the last page that holds file data. */
+	mapend = round_page(mfd->mfd_size);
+
+	if (*offp < 0 || *offp > mapend)
+		return EINVAL;
+	/* Compare against the room left; *offp + size could wrap. */
+	if ((uint64_t)size > (uint64_t)(mapend - *offp))
+		return EINVAL;
+
+	if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) &&
+	    (prot & VM_PROT_WRITE) && (*flagsp & MAP_PRIVATE) == 0)
+		return EPERM;
+
+	/* The mapping owns this reference from here on. */
+	uao_reference(mfd->mfd_uobj);
+	*uobjp = mfd->mfd_uobj;
+
+	*maxprotp = prot;
+	*advicep = UVM_ADV_RANDOM;
+
+	return 0;
+}
+
+/*
+ * lseek(2) backend for memfds.
+ *
+ * Computes the new offset from delta relative to the start of the file
+ * (SEEK_SET), the current offset (SEEK_CUR) or the current size
+ * (SEEK_END).  The result is stored in *newoffp when that pointer is
+ * not NULL and in fp->f_offset when flags contains FOF_UPDATE_OFFSET.
+ *
+ * Returns EINVAL for an unknown whence or when the resulting offset
+ * would be negative; memfd_read() and memfd_write() reject negative
+ * offsets, so such a position would be unusable anyway.
+ */
+static int
+memfd_seek(file_t *fp, off_t delta, int whence, off_t *newoffp,
+    int flags)
+{
+	off_t newoff;
+
+	switch (whence) {
+	case SEEK_CUR:
+		newoff = fp->f_offset + delta;
+		break;
+
+	case SEEK_END:
+		newoff = fp->f_memfd->mfd_size + delta;
+		break;
+
+	case SEEK_SET:
+		newoff = delta;
+		break;
+
+	default:
+		return EINVAL;
+	}
+
+	if (newoff < 0)
+		return EINVAL;
+
+	if (newoffp)
+		*newoffp = newoff;
+	if (flags & FOF_UPDATE_OFFSET)
+		fp->f_offset = newoff;
+
+	return 0;
+}
+
+/*
+ * ftruncate(2) backend for memfds; also called by memfd_write() to grow
+ * the file ahead of a write.
+ *
+ * Sets the size of the memfd to length.  Growing zeroes the newly
+ * exposed range; shrinking asks the pager to free the whole pages that
+ * lie beyond the new end.  The modification time is refreshed whenever
+ * the size changes.
+ *
+ * Returns EINVAL for a negative length, EPERM when the change is
+ * forbidden by F_SEAL_SHRINK or F_SEAL_GROW, otherwise 0 or the error
+ * reported by the pager.  Note that the new size is recorded even when
+ * the pager reports an error.
+ */
+static int
+memfd_truncate(file_t *fp, off_t length)
+{
+ struct memfd *mfd = fp->f_memfd;
+ int error = 0;
+ voff_t start, end;
+
+ if (length < 0)
+ return EINVAL;
+ /* Nothing to do, and the modification time is left untouched. */
+ if (length == mfd->mfd_size)
+ return 0;
+
+ /* The seal checks happen before mfd_lock is taken. */
+ if ((mfd->mfd_seals & F_SEAL_SHRINK) && length < mfd->mfd_size)
+ return EPERM;
+ if ((mfd->mfd_seals & F_SEAL_GROW) && length > mfd->mfd_size)
+ return EPERM;
+
+ mutex_enter(&mfd->mfd_lock);
+
+ if (length > mfd->mfd_size)
+ /* Growing: zero everything between the old and the new end. */
+ ubc_zerorange(mfd->mfd_uobj, mfd->mfd_size,
+ length - mfd->mfd_size, 0);
+ else {
+ /* length < mfd->mfd_size, so try to get rid of excess pages */
+ start = round_page(length);
+ end = round_page(mfd->mfd_size);
+
+ if (start < end) { /* we actually have pages to remove */
+ rw_enter(mfd->mfd_uobj->vmobjlock, RW_WRITER);
+ error = (*mfd->mfd_uobj->pgops->pgo_put)(mfd->mfd_uobj,
+ start, end, PGO_FREE);
+ /* pgo_put drops vmobjlock */
+ }
+ }
+
+ getnanotime(&mfd->mfd_mtime);
+ mfd->mfd_size = length;
+ mutex_exit(&mfd->mfd_lock);
+ return error;
+}