Add comprehensive kernel API specification for the execveat() system
call.

Signed-off-by: Sasha Levin <sas...@kernel.org>
---
 fs/exec.c | 245 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 245 insertions(+)

diff --git a/fs/exec.c b/fs/exec.c
index 3d006105ab23d..49d8647c053ef 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -2223,6 +2223,251 @@ SYSCALL_DEFINE3(execve,
        return do_execve(getname(filename), argv, envp);
 }
 
+
+/* Valid flag combinations for execveat */
+static const s64 execveat_valid_flags[] = {
+       0,
+       AT_EMPTY_PATH,
+       AT_SYMLINK_NOFOLLOW,
+       AT_EXECVE_CHECK,
+       AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
+       AT_EMPTY_PATH | AT_EXECVE_CHECK,
+       AT_SYMLINK_NOFOLLOW | AT_EXECVE_CHECK,
+       AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW | AT_EXECVE_CHECK,
+};
+
+DEFINE_KERNEL_API_SPEC(sys_execveat)
+       KAPI_DESCRIPTION("Execute a new program relative to a directory file 
descriptor")
+       KAPI_LONG_DESC("Executes the program referred to by the combination of 
fd and filename. "
+                      "This system call is useful when implementing a secure 
execution environment "
+                      "or when the calling process has an open file descriptor 
but no access to "
+                      "the corresponding pathname. Like execve(), it replaces 
the current process "
+                      "image with a new process image.")
+       KAPI_CONTEXT(KAPI_CTX_PROCESS | KAPI_CTX_SLEEPABLE)
+
+       KAPI_PARAM(0, "fd", "int", "Directory file descriptor")
+               KAPI_PARAM_FLAGS(KAPI_PARAM_IN)
+               .type = KAPI_TYPE_FD,
+               .constraint_type = KAPI_CONSTRAINT_NONE,
+               .constraints = "AT_FDCWD for current directory, or valid 
directory file descriptor",
+       KAPI_PARAM_END
+
+       KAPI_PARAM(1, "filename", "const char __user *", "Pathname of the 
program to execute")
+               KAPI_PARAM_FLAGS(KAPI_PARAM_IN | KAPI_PARAM_USER | 
KAPI_PARAM_OPTIONAL)
+               .type = KAPI_TYPE_PATH,
+               .constraint_type = KAPI_CONSTRAINT_NONE,
+               .constraints = "Relative or absolute path; empty string with 
AT_EMPTY_PATH to use fd directly",
+       KAPI_PARAM_END
+
+       KAPI_PARAM(2, "argv", "const char __user *const __user *", "Array of 
argument strings passed to the new program")
+               KAPI_PARAM_FLAGS(KAPI_PARAM_IN | KAPI_PARAM_USER)
+               .type = KAPI_TYPE_USER_PTR,
+               .constraint_type = KAPI_CONSTRAINT_NONE,
+               .constraints = "NULL-terminated array of pointers to 
null-terminated strings",
+       KAPI_PARAM_END
+
+       KAPI_PARAM(3, "envp", "const char __user *const __user *", "Array of 
environment strings for the new program")
+               KAPI_PARAM_FLAGS(KAPI_PARAM_IN | KAPI_PARAM_USER)
+               .type = KAPI_TYPE_USER_PTR,
+               .constraint_type = KAPI_CONSTRAINT_NONE,
+               .constraints = "NULL-terminated array of pointers to 
null-terminated strings in form key=value",
+       KAPI_PARAM_END
+
+       KAPI_PARAM(4, "flags", "int", "Execution flags")
+               KAPI_PARAM_FLAGS(KAPI_PARAM_IN)
+               .type = KAPI_TYPE_INT,
+               .constraint_type = KAPI_CONSTRAINT_MASK,
+               .valid_mask = AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW | 
AT_EXECVE_CHECK,
+               .constraints = "0 or combination of AT_EMPTY_PATH, 
AT_SYMLINK_NOFOLLOW, and AT_EXECVE_CHECK",
+       KAPI_PARAM_END
+
+       /* Return specification */
+       KAPI_RETURN("long", "Does not return on success (except with 
AT_EXECVE_CHECK which returns 0); returns -1 on error")
+               .type = KAPI_TYPE_INT,
+               .check_type = KAPI_RETURN_ERROR_CHECK,
+       KAPI_RETURN_END
+
+       /* Error codes */
+       KAPI_ERROR(0, -E2BIG, "E2BIG", "Argument list too long", "The total 
size of argv and envp exceeds the system limit.")
+       KAPI_ERROR(1, -EACCES, "EACCES", "Permission denied", "Search 
permission denied on a component of the path, file is not regular, or execute 
permission denied for file or interpreter.")
+       KAPI_ERROR(2, -EBADF, "EBADF", "Bad file descriptor", "fd is not a 
valid file descriptor.")
+       KAPI_ERROR(3, -EFAULT, "EFAULT", "Bad address", "filename, argv, or 
envp points outside accessible address space.")
+       KAPI_ERROR(4, -EINVAL, "EINVAL", "Invalid flags or executable format", 
"Invalid flags specified, or ELF executable has more than one PT_INTERP 
segment.")
+       KAPI_ERROR(5, -EIO, "EIO", "I/O error", "An I/O error occurred while 
reading from the file system.")
+       KAPI_ERROR(6, -EISDIR, "EISDIR", "Is a directory", "An ELF interpreter 
was a directory.")
+       KAPI_ERROR(7, -ELIBBAD, "ELIBBAD", "Invalid ELF interpreter", "An ELF 
interpreter was not in a recognized format.")
+       KAPI_ERROR(8, -ELOOP, "ELOOP", "Too many symbolic links", "Too many 
symbolic links encountered, or AT_SYMLINK_NOFOLLOW was specified but filename 
refers to a symbolic link.")
+       KAPI_ERROR(9, -EMFILE, "EMFILE", "Too many open files", "The 
per-process limit on open file descriptors has been reached.")
+       KAPI_ERROR(10, -ENAMETOOLONG, "ENAMETOOLONG", "Filename too long", 
"filename or one of the strings in argv or envp is too long.")
+       KAPI_ERROR(11, -ENFILE, "ENFILE", "System file table overflow", "The 
system-wide limit on open files has been reached.")
+       KAPI_ERROR(12, -ENOENT, "ENOENT", "File not found", "The file filename 
or an interpreter does not exist, or filename is empty and AT_EMPTY_PATH was 
not specified in flags.")
+       KAPI_ERROR(13, -ENOEXEC, "ENOEXEC", "Exec format error", "An executable 
is not in a recognized format, is for wrong architecture, or has other format 
errors preventing execution.")
+       KAPI_ERROR(14, -ENOMEM, "ENOMEM", "Out of memory", "Insufficient kernel 
memory available.")
+       KAPI_ERROR(15, -ENOTDIR, "ENOTDIR", "Not a directory", "A component of 
the path prefix is not a directory, or fd is not a directory when a relative 
path is given.")
+       KAPI_ERROR(16, -EPERM, "EPERM", "Operation not permitted", "The 
filesystem is mounted nosuid, the user is not root, and the file has 
set-user-ID or set-group-ID bit set.")
+       KAPI_ERROR(17, -ETXTBSY, "ETXTBSY", "Text file busy", "The executable 
was open for writing by one or more processes.")
+       KAPI_ERROR(18, -EAGAIN, "EAGAIN", "Resource temporarily unavailable", 
"RLIMIT_NPROC limit exceeded - too many processes for this user.")
+       KAPI_ERROR(19, -EINTR, "EINTR", "Interrupted by signal", "The exec was 
interrupted by a signal during setup phase.")
+
+       /* Signal specifications */
+       KAPI_SIGNAL(0, 0, "FATAL_SIGNALS", KAPI_SIGNAL_RECEIVE, 
KAPI_SIGNAL_ACTION_TERMINATE)
+               KAPI_SIGNAL_CONDITION("Fatal signal pending during exec setup")
+               KAPI_SIGNAL_DESC("Fatal signals (checked via 
fatal_signal_pending()) can interrupt exec during setup phases like 
de_thread(). This causes exec to fail and the process to exit.")
+               KAPI_SIGNAL_RESTARTABLE
+       KAPI_SIGNAL_END
+
+       KAPI_SIGNAL(1, SIGKILL, "SIGKILL", KAPI_SIGNAL_SEND, 
KAPI_SIGNAL_ACTION_TERMINATE)
+               KAPI_SIGNAL_TARGET("All other threads in the thread group")
+               KAPI_SIGNAL_CONDITION("Multi-threaded process doing exec")
+               KAPI_SIGNAL_DESC("During de_thread(), zap_other_threads() sends 
SIGKILL to all other threads in the thread group to ensure only the execing 
thread survives.")
+       KAPI_SIGNAL_END
+
+       KAPI_SIGNAL(2, 0, "ALL_HANDLERS", KAPI_SIGNAL_HANDLE, 
KAPI_SIGNAL_ACTION_CUSTOM)
+               KAPI_SIGNAL_CONDITION("Signal has a handler installed")
+               KAPI_SIGNAL_DESC("flush_signal_handlers() resets all signal 
handlers to SIG_DFL except for signals that are ignored (SIG_IGN). This happens 
after de_thread() completes.")
+       KAPI_SIGNAL_END
+
+       KAPI_SIGNAL(3, 0, "IGNORED_SIGNALS", KAPI_SIGNAL_IGNORE, 
KAPI_SIGNAL_ACTION_CUSTOM)
+               KAPI_SIGNAL_CONDITION("Signal disposition is SIG_IGN")
+               KAPI_SIGNAL_DESC("Signals set to SIG_IGN are preserved across 
exec. This is POSIX-compliant behavior allowing parent processes to ignore 
signals in children.")
+       KAPI_SIGNAL_END
+
+       KAPI_SIGNAL(4, 0, "PENDING_SIGNALS", KAPI_SIGNAL_HANDLE, 
KAPI_SIGNAL_ACTION_CUSTOM)
+               KAPI_SIGNAL_CONDITION("Any pending signals")
+               KAPI_SIGNAL_DESC("All pending signals are cleared during exec. 
This includes both thread-specific and process-wide pending signals.")
+       KAPI_SIGNAL_END
+
+       KAPI_SIGNAL(5, 0, "TIMER_SIGNALS", KAPI_SIGNAL_HANDLE, 
KAPI_SIGNAL_ACTION_CUSTOM)
+               KAPI_SIGNAL_CONDITION("Timer-generated signals pending")
+               KAPI_SIGNAL_DESC("flush_itimer_signals() clears any pending 
timer signals (SIGALRM, SIGVTALRM, SIGPROF) to prevent confusion in the new 
program.")
+       KAPI_SIGNAL_END
+
+       KAPI_SIGNAL(6, SIGCHLD, "SIGCHLD", KAPI_SIGNAL_SEND, 
KAPI_SIGNAL_ACTION_DEFAULT)
+               KAPI_SIGNAL_TARGET("Parent process when this process exits")
+               KAPI_SIGNAL_CONDITION("Process exit after exec")
+               KAPI_SIGNAL_DESC("The exit_signal is set to SIGCHLD during 
exec, ensuring the parent will receive SIGCHLD when this process terminates.")
+       KAPI_SIGNAL_END
+
+       KAPI_SIGNAL(7, 0, "SIGALTSTACK", KAPI_SIGNAL_HANDLE, 
KAPI_SIGNAL_ACTION_CUSTOM)
+               KAPI_SIGNAL_CONDITION("Process had alternate signal stack")
+               KAPI_SIGNAL_DESC("Any alternate signal stack (sigaltstack) is 
not preserved across exec. The new program starts with no alternate stack.")
+       KAPI_SIGNAL_END
+
+       /* Side effects */
+       KAPI_SIDE_EFFECT(0, KAPI_EFFECT_PROCESS_STATE | KAPI_EFFECT_FREE_MEMORY 
| KAPI_EFFECT_ALLOC_MEMORY,
+                        "process image",
+                        "Replaces entire process image including code, data, 
heap, and stack")
+       KAPI_SIDE_EFFECT_END
+
+       KAPI_SIDE_EFFECT(1, KAPI_EFFECT_MODIFY_STATE | 
KAPI_EFFECT_RESOURCE_DESTROY,
+                        "file descriptors",
+                        "Closes all file descriptors with close-on-exec flag 
set")
+               KAPI_EFFECT_CONDITION("FD_CLOEXEC flag set")
+       KAPI_SIDE_EFFECT_END
+
+       KAPI_SIDE_EFFECT(2, KAPI_EFFECT_MODIFY_STATE,
+                        "signal handlers",
+                        "Resets all signal handlers to default, preserves 
ignored signals")
+       KAPI_SIDE_EFFECT_END
+
+       KAPI_SIDE_EFFECT(3, KAPI_EFFECT_PROCESS_STATE | KAPI_EFFECT_SIGNAL_SEND,
+                        "thread group",
+                        "Kills all other threads in the thread group with 
SIGKILL")
+               KAPI_EFFECT_CONDITION("Multi-threaded process")
+       KAPI_SIDE_EFFECT_END
+
+       KAPI_SIDE_EFFECT(4, KAPI_EFFECT_MODIFY_STATE,
+                        "process attributes",
+                        "Clears pending signals, timers, alternate signal 
stack, and various process attributes")
+       KAPI_SIDE_EFFECT_END
+
+       KAPI_SIDE_EFFECT(5, KAPI_EFFECT_FILESYSTEM,
+                        "executable file",
+                        "Opens and reads the executable file, may trigger 
filesystem operations")
+       KAPI_SIDE_EFFECT_END
+
+       KAPI_SIDE_EFFECT(6, KAPI_EFFECT_MODIFY_STATE,
+                        "security context",
+                        "May change SELinux/AppArmor context based on file 
labels and transitions")
+               KAPI_EFFECT_CONDITION("LSM enabled")
+       KAPI_SIDE_EFFECT_END
+
+       /* State transitions */
+       KAPI_STATE_TRANS(0, "process memory",
+                        "old program image", "new program image",
+                        "Complete replacement of process address space with 
new program")
+       KAPI_STATE_TRANS_END
+
+       KAPI_STATE_TRANS(1, "process credentials",
+                        "current credentials", "potentially modified 
credentials",
+                        "May change effective UID/GID based on file 
permissions")
+               KAPI_STATE_TRANS_COND("setuid/setgid binary")
+       KAPI_STATE_TRANS_END
+
+       KAPI_STATE_TRANS(2, "thread state",
+                        "multi-threaded", "single-threaded",
+                        "Process becomes single-threaded after killing other 
threads")
+               KAPI_STATE_TRANS_COND("Multi-threaded process")
+       KAPI_STATE_TRANS_END
+
+       KAPI_STATE_TRANS(3, "signal state",
+                        "custom handlers and pending signals", "default 
handlers, no pending signals",
+                        "Signal handling reset to clean state for new program")
+       KAPI_STATE_TRANS_END
+
+       KAPI_STATE_TRANS(4, "file descriptor table",
+                        "contains close-on-exec FDs", "close-on-exec FDs 
closed",
+                        "All file descriptors marked FD_CLOEXEC are closed 
during exec")
+               KAPI_STATE_TRANS_COND("FDs with FD_CLOEXEC")
+       KAPI_STATE_TRANS_END
+
+       KAPI_STATE_TRANS(5, "working directory",
+                        "fd-relative operations", "resolved to absolute paths",
+                        "Directory fd operations resolved before exec 
completes")
+               KAPI_STATE_TRANS_COND("Using dirfd != AT_FDCWD")
+       KAPI_STATE_TRANS_END
+
+       /* Locking information */
+       KAPI_LOCK(0, "cred_guard_mutex", KAPI_LOCK_MUTEX)
+               KAPI_LOCK_DESC("Protects against concurrent credential changes 
during exec")
+               KAPI_LOCK_ACQUIRED
+               KAPI_LOCK_DESC("Ensures atomic credential transition during 
exec process")
+       KAPI_LOCK_END
+
+       KAPI_LOCK(1, "sighand->siglock", KAPI_LOCK_SPINLOCK)
+               KAPI_LOCK_DESC("Protects signal handler modifications")
+               KAPI_LOCK_ACQUIRED
+               KAPI_LOCK_RELEASED
+               KAPI_LOCK_DESC("Taken during signal handler reset and pending 
signal clearing")
+       KAPI_LOCK_END
+
+       KAPI_SIDE_EFFECT_COUNT(7)
+       KAPI_STATE_TRANS_COUNT(6)
+
+       .error_count = 20,
+       .param_count = 5,
+       .since_version = "3.19",
+       .examples = "/* Execute /bin/echo using AT_FDCWD */\n"
+                   "char *argv[] = { \"echo\", \"hello\", NULL };\n"
+                   "char *envp[] = { \"PATH=/bin\", NULL };\n"
+                   "execveat(AT_FDCWD, \"/bin/echo\", argv, envp, 0);\n\n"
+                   "/* Execute via file descriptor */\n"
+                   "int fd = open(\"/bin/echo\", O_PATH);\n"
+                   "execveat(fd, \"\", argv, envp, AT_EMPTY_PATH);\n\n"
+                   "/* Execute relative to directory fd */\n"
+                   "int dirfd = open(\"/bin\", O_RDONLY | O_DIRECTORY);\n"
+                   "execveat(dirfd, \"echo\", argv, envp, 0);",
+       .notes = "execveat() was added to allow fexecve() to be implemented on 
systems that "
+                "do not have /proc mounted. When filename is an empty string 
and AT_EMPTY_PATH "
+                "is specified, the file descriptor fd specifies the file to be 
executed. "
+                "AT_SYMLINK_NOFOLLOW prevents following symbolic links. "
+                "AT_EXECVE_CHECK (since Linux 6.12) only checks if execution 
would be allowed "
+                "without actually executing. Like execve(), on success 
execveat() does not return "
+                "(except with AT_EXECVE_CHECK which returns 0).",
+       .signal_count = 8,
+       .lock_count = 2,
+KAPI_END_SPEC;
+
 SYSCALL_DEFINE5(execveat,
                int, fd, const char __user *, filename,
                const char __user *const __user *, argv,
-- 
2.39.5


Reply via email to