Module Name: src
Committed By: christos
Date: Sat Aug 19 17:57:55 UTC 2023
Modified Files:
src/sys/compat/linux/common: linux_fcntl.h linux_mod.c
Added Files:
src/sys/compat/linux/common: linux_inotify.c linux_inotify.h
Log Message:
Add new inotify support, from GSoC 2023 (Theodore Preduta).
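
For illustration, a minimal sketch of the Linux userland usage that this
compat code emulates (hypothetical example program, not part of the
commit; the calls map to the new linux_sys_inotify_* entry points):

	#include <sys/inotify.h>	/* Linux userland header */
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		char buf[4096];
		ssize_t n;
		int fd, wd;

		fd = inotify_init();
		if (fd == -1)
			return 1;
		wd = inotify_add_watch(fd, "/tmp", IN_CREATE|IN_DELETE);
		if (wd == -1)
			return 1;
		n = read(fd, buf, sizeof(buf));	/* blocks until events arrive */
		if (n > 0)
			printf("read %zd bytes of events\n", n);
		(void)inotify_rm_watch(fd, wd);
		return close(fd);
	}
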
To generate a diff of this commit:
cvs rdiff -u -r1.21 -r1.22 src/sys/compat/linux/common/linux_fcntl.h
cvs rdiff -u -r0 -r1.1 src/sys/compat/linux/common/linux_inotify.c \
src/sys/compat/linux/common/linux_inotify.h
cvs rdiff -u -r1.14 -r1.15 src/sys/compat/linux/common/linux_mod.c
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/sys/compat/linux/common/linux_fcntl.h
diff -u src/sys/compat/linux/common/linux_fcntl.h:1.21 src/sys/compat/linux/common/linux_fcntl.h:1.22
--- src/sys/compat/linux/common/linux_fcntl.h:1.21 Sun Jul 9 22:31:55 2023
+++ src/sys/compat/linux/common/linux_fcntl.h Sat Aug 19 13:57:54 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: linux_fcntl.h,v 1.21 2023/07/10 02:31:55 christos Exp $ */
+/* $NetBSD: linux_fcntl.h,v 1.22 2023/08/19 17:57:54 christos Exp $ */
/*-
* Copyright (c) 1995, 1998 The NetBSD Foundation, Inc.
@@ -32,6 +32,11 @@
#ifndef _LINUX_FCNTL_H
#define _LINUX_FCNTL_H
+#ifdef _KERNEL
+#include <compat/linux/common/linux_types.h> /* For linux_off_t */
+struct stat;
+#endif
+
/*
* The arguments in the flock structure have a different order from the
* BSD structure.
@@ -50,6 +55,7 @@
#define LINUX_AT_NO_AUTOMOUNT 0x0800
#define LINUX_AT_EMPTY_PATH 0x1000
+#ifdef _KERNEL
int linux_to_bsd_ioflags(int);
int linux_to_bsd_atflags(int);
int bsd_to_linux_statx(struct stat *, struct linux_statx *, unsigned int);
@@ -70,6 +76,7 @@ struct linux_flock64 {
off_t l_len;
linux_pid_t l_pid;
};
+#endif /* _KERNEL */
#if defined(__i386__)
#include <compat/linux/arch/i386/linux_fcntl.h>
@@ -103,6 +110,7 @@ struct linux_flock64 {
#define LINUX_F_ADD_SEALS (LINUX_F_SPECIFIC_BASE + 9)
#define LINUX_F_GET_SEALS (LINUX_F_SPECIFIC_BASE + 10)
+#ifdef _KERNEL
/*
* We have to have 4 copies of the code that converts linux fcntl() file
* locking to native form because there are 4 layouts for the structures.
@@ -171,6 +179,6 @@ LINUX##_to_bsd_##FLOCK(struct flock *bfp
LINUX##_to_bsd_##FLOCK(&bfl, &lfl); \
return do_fcntl_lock(fd, cmd == setlk ? F_SETLK : F_SETLKW, &bfl); \
} while (0)
-
+#endif /* _KERNEL */
#endif /* !_LINUX_FCNTL_H */
Index: src/sys/compat/linux/common/linux_mod.c
diff -u src/sys/compat/linux/common/linux_mod.c:1.14 src/sys/compat/linux/common/linux_mod.c:1.15
--- src/sys/compat/linux/common/linux_mod.c:1.14 Sun Apr 26 14:53:33 2020
+++ src/sys/compat/linux/common/linux_mod.c Sat Aug 19 13:57:54 2023
@@ -1,4 +1,4 @@
-/* $NetBSD: linux_mod.c,v 1.14 2020/04/26 18:53:33 thorpej Exp $ */
+/* $NetBSD: linux_mod.c,v 1.15 2023/08/19 17:57:54 christos Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: linux_mod.c,v 1.14 2020/04/26 18:53:33 thorpej Exp $");
+__KERNEL_RCSID(0, "$NetBSD: linux_mod.c,v 1.15 2023/08/19 17:57:54 christos Exp $");
#ifdef _KERNEL_OPT
#include "opt_execfmt.h"
@@ -46,6 +46,7 @@ __KERNEL_RCSID(0, "$NetBSD: linux_mod.c,
#include <sys/signalvar.h>
#include <sys/sysctl.h>
+#include <compat/linux/common/linux_inotify.h>
#include <compat/linux/common/linux_sysctl.h>
#include <compat/linux/common/linux_exec.h>
@@ -162,7 +163,12 @@ compat_linux_modcmd(modcmd_t cmd, void *
switch (cmd) {
case MODULE_CMD_INIT:
+ error = linux_inotify_init();
+ if (error != 0)
+ return error;
error = exec_add(linux_execsw, __arraycount(linux_execsw));
+ if (error)
+ linux_inotify_fini();
return error;
case MODULE_CMD_FINI:
@@ -170,6 +176,7 @@ compat_linux_modcmd(modcmd_t cmd, void *
if (error)
return error;
linux_sysctl_fini();
+ linux_inotify_fini();
return 0;
default:
Added files:
Index: src/sys/compat/linux/common/linux_inotify.c
diff -u /dev/null src/sys/compat/linux/common/linux_inotify.c:1.1
--- /dev/null Sat Aug 19 13:57:55 2023
+++ src/sys/compat/linux/common/linux_inotify.c Sat Aug 19 13:57:54 2023
@@ -0,0 +1,1317 @@
+/* $NetBSD: linux_inotify.c,v 1.1 2023/08/19 17:57:54 christos Exp $ */
+
+/*-
+ * Copyright (c) 2023 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Theodore Preduta.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: linux_inotify.c,v 1.1 2023/08/19 17:57:54 christos Exp $");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/bitops.h>
+#include <sys/dirent.h>
+#include <sys/event.h>
+#include <sys/eventvar.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/fcntl.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/selinfo.h>
+#include <sys/select.h>
+#include <sys/signal.h>
+#include <sys/vnode.h>
+
+#include <sys/syscallargs.h>
+
+#include <compat/linux/common/linux_machdep.h>
+#include <compat/linux/common/linux_fcntl.h>
+#include <compat/linux/common/linux_inotify.h>
+#include <compat/linux/common/linux_ipc.h>
+#include <compat/linux/common/linux_sched.h>
+#include <compat/linux/common/linux_sem.h>
+#include <compat/linux/common/linux_signal.h>
+
+#include <compat/linux/linux_syscallargs.h>
+
+/*
+ * inotify(2). This interface allows the user to get file system
+ * events and (unlike kqueue(2)) their order is strictly preserved.
+ * While nice, the API has enough gotchas that we don't want to add
+ * native entry points for it.  Among them:
+ *
+ * - Because data is returned via read(2), this API is prone to
+ *   unaligned memory accesses.  There is a note in the Linux man page
+ *   that says the name field of struct linux_inotify_event *can* be
+ *   used for alignment purposes.  In practice, even Linux doesn't
+ *   always do this, so for simplicity, we never do.
+ */
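+
+/*
+ * Illustrative sketch (for the reader, not normative): a read(2) on
+ * an inotify descriptor returns a packed stream of variable-length
+ * records, each a struct linux_inotify_event immediately followed by
+ * len bytes of name, with no alignment padding in between:
+ *
+ *	+----+------+--------+-----+-----------+----+------+-----
+ *	| wd | mask | cookie | len | name[len] | wd | mask | ...
+ *	+----+------+--------+-----+-----------+----+------+-----
+ *
+ * inotify_read() below emits exactly this layout.
+ */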
+
+#define LINUX_INOTIFY_MAX_QUEUED 16384
+#define LINUX_INOTIFY_MAX_FROM_KEVENT 3
+
+#if DEBUG_LINUX
+#define DPRINTF(x) uprintf x
+#else
+#define DPRINTF(x) __nothing
+#endif
+
+struct inotify_entry {
+ TAILQ_ENTRY(inotify_entry) ie_entries;
+ struct linux_inotify_event ie_event;
+ char ie_name[NAME_MAX+1];
+};
+
+struct inotify_dir_entries {
+ size_t ide_count;
+ struct inotify_dir_entry {
+ char name[NAME_MAX + 1];
+ ino_t fileno;
+ } ide_entries[];
+};
+#define INOTIFY_DIR_ENTRIES_SIZE(count) (sizeof(struct inotify_dir_entries) \
+	+ (count) * sizeof(struct inotify_dir_entry))
+
+struct inotifyfd {
+ int ifd_kqfd; /* kqueue fd used by this inotify */
+ /* instance */
+ struct selinfo ifd_sel; /* for EVFILT_READ by epoll */
+ kmutex_t ifd_lock; /* lock for ifd_sel, ifd_wds and */
+ /* ifd_nwds */
+
+ struct inotify_dir_entries **ifd_wds;
+ /* keeps track of watch descriptors */
+ /* for directories: snapshot of the */
+ /* directory state */
+ /* for files: an inotify_dir_entries */
+ /* with ide_count == 0 */
+ size_t ifd_nwds; /* max watch descriptor that can be */
+ /* stored in ifd_wds + 1 */
+
+ TAILQ_HEAD(, inotify_entry) ifd_qhead; /* queue of pending events */
+ size_t ifd_qcount; /* number of pending events */
+ kcondvar_t ifd_qcv; /* condvar for blocking reads */
+ kmutex_t ifd_qlock; /* lock for ifd_q* and interlock */
+ /* for ifd_qcv */
+};
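+
+/*
+ * Rough lifecycle sketch (a summary of the code below, for
+ * orientation): do_inotify_init() creates a hidden kqueue (ifd_kqfd)
+ * plus an inotifyfd; inotify_add_watch() opens the watched path, so
+ * the watch descriptor is simply the resulting file descriptor, which
+ * indexes ifd_wds and gets a "fake" EVFILT_VNODE knote registered in
+ * ifd_kqfd; inotify_filt_event() then translates kevents into struct
+ * inotify_entry records queued on ifd_qhead, which inotify_read()
+ * drains.
+ */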
+
+struct inotify_kevent_mask_pair {
+ uint32_t inotify;
+ uint32_t kevent;
+};
+
+static int inotify_kev_fetch_changes(void *, const struct kevent *,
+ struct kevent *, size_t, int);
+static int do_inotify_init(struct lwp *, register_t *, int);
+static int inotify_close_wd(struct inotifyfd *, int);
+static uint32_t inotify_mask_to_kevent_fflags(uint32_t, enum vtype);
+static void do_kevent_to_inotify(int32_t, uint32_t, uint32_t,
+ struct inotify_entry *, size_t *, char *);
+static int kevent_to_inotify(struct inotifyfd *, int, enum vtype, uint32_t,
+ uint32_t, struct inotify_entry *, size_t *);
+static int inotify_readdir(file_t *, struct dirent *, int *);
+static struct inotify_dir_entries *get_inotify_dir_entries(int);
+
+static int inotify_filt_attach(struct knote *);
+static void inotify_filt_detach(struct knote *);
+static int inotify_filt_event(struct knote *, long);
+static void inotify_read_filt_detach(struct knote *);
+static int inotify_read_filt_event(struct knote *, long);
+
+static int inotify_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
+static int inotify_close(file_t *);
+static int inotify_poll(file_t *, int);
+static int inotify_kqfilter(file_t *, struct knote *);
+static void inotify_restart(file_t *);
+
+static const char inotify_filtname[] = "LINUX_INOTIFY";
+static int inotify_filtid;
+
+/* "fake" EVFILT_VNODE attached to each file watched via ifd_wds */
+static const struct filterops inotify_filtops = {
+ .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
+ .f_attach = inotify_filt_attach,
+ .f_detach = inotify_filt_detach,
+ .f_event = inotify_filt_event,
+ .f_touch = NULL,
+};
+
+/* EVFILT_READ attached to inotifyfd (to support watching via epoll) */
+static const struct filterops inotify_read_filtops = {
+ .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
+ .f_attach = NULL, /* attached via .fo_kqfilter */
+ .f_detach = inotify_read_filt_detach,
+ .f_event = inotify_read_filt_event,
+ .f_touch = NULL,
+};
+
+static const struct fileops inotify_fileops = {
+ .fo_name = "inotify",
+ .fo_read = inotify_read,
+ .fo_write = fbadop_write,
+ .fo_ioctl = fbadop_ioctl,
+ .fo_fcntl = fnullop_fcntl,
+ .fo_poll = inotify_poll,
+ .fo_stat = fbadop_stat,
+ .fo_close = inotify_close,
+ .fo_kqfilter = inotify_kqfilter,
+ .fo_restart = inotify_restart,
+ .fo_fpathconf = (void *)eopnotsupp,
+};
+
+/* basic flag translations */
+static const struct inotify_kevent_mask_pair common_inotify_to_kevent[] = {
+ { .inotify = LINUX_IN_ATTRIB, .kevent = NOTE_ATTRIB, },
+ { .inotify = LINUX_IN_CLOSE_NOWRITE, .kevent = NOTE_CLOSE, },
+ { .inotify = LINUX_IN_OPEN, .kevent = NOTE_OPEN, },
+ { .inotify = LINUX_IN_MOVE_SELF, .kevent = NOTE_RENAME, },
+};
+static const size_t common_inotify_to_kevent_len =
+ __arraycount(common_inotify_to_kevent);
+
+static const struct inotify_kevent_mask_pair vreg_inotify_to_kevent[] = {
+ { .inotify = LINUX_IN_ACCESS, .kevent = NOTE_READ, },
+ { .inotify = LINUX_IN_ATTRIB, .kevent = NOTE_ATTRIB|NOTE_LINK, },
+ { .inotify = LINUX_IN_CLOSE_WRITE, .kevent = NOTE_CLOSE_WRITE, },
+ { .inotify = LINUX_IN_MODIFY, .kevent = NOTE_WRITE, },
+};
+static const size_t vreg_inotify_to_kevent_len =
+ __arraycount(vreg_inotify_to_kevent);
+
+static const struct inotify_kevent_mask_pair vdir_inotify_to_kevent[] = {
+ { .inotify = LINUX_IN_ACCESS, .kevent = NOTE_READ, },
+ { .inotify = LINUX_IN_CREATE, .kevent = NOTE_WRITE, },
+ { .inotify = LINUX_IN_DELETE, .kevent = NOTE_WRITE, },
+ { .inotify = LINUX_IN_MOVED_FROM, .kevent = NOTE_WRITE, },
+ { .inotify = LINUX_IN_MOVED_TO, .kevent = NOTE_WRITE, },
+};
+static const size_t vdir_inotify_to_kevent_len =
+ __arraycount(vdir_inotify_to_kevent);
+
+static const struct inotify_kevent_mask_pair common_kevent_to_inotify[] = {
+ { .kevent = NOTE_ATTRIB, .inotify = LINUX_IN_ATTRIB, },
+ { .kevent = NOTE_CLOSE, .inotify = LINUX_IN_CLOSE_NOWRITE, },
+ { .kevent = NOTE_CLOSE_WRITE, .inotify = LINUX_IN_CLOSE_WRITE, },
+ { .kevent = NOTE_OPEN, .inotify = LINUX_IN_OPEN, },
+ { .kevent = NOTE_READ, .inotify = LINUX_IN_ACCESS, },
+ { .kevent = NOTE_RENAME, .inotify = LINUX_IN_MOVE_SELF, },
+ { .kevent = NOTE_REVOKE, .inotify = LINUX_IN_UNMOUNT, },
+};
+static const size_t common_kevent_to_inotify_len =
+ __arraycount(common_kevent_to_inotify);
+
+static const struct inotify_kevent_mask_pair vreg_kevent_to_inotify[] = {
+ { .kevent = NOTE_DELETE|NOTE_LINK, .inotify = LINUX_IN_ATTRIB, },
+ { .kevent = NOTE_WRITE, .inotify = LINUX_IN_MODIFY, },
+};
+static const size_t vreg_kevent_to_inotify_len =
+ __arraycount(vreg_kevent_to_inotify);
+
+/*
+ * Register the custom kfilter for inotify.
+ */
+int
+linux_inotify_init(void)
+{
+ return kfilter_register(inotify_filtname, &inotify_filtops,
+ &inotify_filtid);
+}
+
+/*
+ * Unregister the custom kfilter for inotify.
+ */
+int
+linux_inotify_fini(void)
+{
+ return kfilter_unregister(inotify_filtname);
+}
+
+/*
+ * Copyin callback used by kevent. This copies already converted
+ * filters from kernel memory to the kevent internal kernel memory.
+ * Hence the memcpy instead of copyin.
+ */
+static int
+inotify_kev_fetch_changes(void *ctx, const struct kevent *changelist,
+ struct kevent *changes, size_t index, int n)
+{
+ memcpy(changes, changelist + index, n * sizeof(*changes));
+
+ return 0;
+}
+
+/*
+ * Initialize a new inotify fd.
+ */
+static int
+do_inotify_init(struct lwp *l, register_t *retval, int flags)
+{
+ file_t *fp;
+ int error, fd;
+ struct proc *p = l->l_proc;
+ struct inotifyfd *ifd;
+ struct sys_kqueue1_args kqa;
+
+ if (flags & ~(LINUX_IN_ALL_FLAGS))
+ return EINVAL;
+
+ ifd = kmem_zalloc(sizeof(*ifd), KM_SLEEP);
+ mutex_init(&ifd->ifd_lock, MUTEX_DEFAULT, IPL_NONE);
+ mutex_init(&ifd->ifd_qlock, MUTEX_DEFAULT, IPL_NONE);
+ cv_init(&ifd->ifd_qcv, "inotify");
+ selinit(&ifd->ifd_sel);
+ TAILQ_INIT(&ifd->ifd_qhead);
+
+ ifd->ifd_nwds = 1;
+ ifd->ifd_wds = kmem_zalloc(ifd->ifd_nwds * sizeof(*ifd->ifd_wds),
+ KM_SLEEP);
+
+ SCARG(&kqa, flags) = 0;
+ if (flags & LINUX_IN_NONBLOCK)
+ SCARG(&kqa, flags) |= O_NONBLOCK;
+ error = sys_kqueue1(l, &kqa, retval);
+ if (error != 0)
+ goto leave0;
+ ifd->ifd_kqfd = *retval;
+
+ error = fd_allocfile(&fp, &fd);
+ if (error != 0)
+ goto leave1;
+
+ fp->f_flag = FREAD;
+ if (flags & LINUX_IN_NONBLOCK)
+ fp->f_flag |= FNONBLOCK;
+ fp->f_type = DTYPE_MISC;
+ fp->f_ops = &inotify_fileops;
+ fp->f_data = ifd;
+ fd_set_exclose(l, fd, (flags & LINUX_IN_CLOEXEC) != 0);
+ fd_affix(p, fp, fd);
+
+ *retval = fd;
+ return 0;
+
+leave1:
+	/* fd_close() expects us to hold the fd via fd_getfile() */
+	if (fd_getfile(ifd->ifd_kqfd) != NULL)
+		fd_close(ifd->ifd_kqfd);
+leave0:
+	/* destroy the locks before freeing the memory that holds them */
+	mutex_destroy(&ifd->ifd_lock);
+	mutex_destroy(&ifd->ifd_qlock);
+	cv_destroy(&ifd->ifd_qcv);
+	seldestroy(&ifd->ifd_sel);
+
+	kmem_free(ifd->ifd_wds, ifd->ifd_nwds * sizeof(*ifd->ifd_wds));
+	kmem_free(ifd, sizeof(*ifd));
+
+ return error;
+}
+
+/*
+ * inotify_init(2). Initialize a new inotify fd with flags=0.
+ */
+int
+linux_sys_inotify_init(struct lwp *l, const void *v, register_t *retval)
+{
+ return do_inotify_init(l, retval, 0);
+}
+
+/*
+ * inotify_init1(2).  Initialize a new inotify fd with the given flags.
+ */
+int
+linux_sys_inotify_init1(struct lwp *l,
+ const struct linux_sys_inotify_init1_args *uap, register_t *retval)
+{
+ /* {
+ syscallarg(int) flags;
+ } */
+
+ return do_inotify_init(l, retval, SCARG(uap, flags));
+}
+
+/*
+ * Convert inotify mask to the fflags of an equivalent kevent.
+ */
+static uint32_t
+inotify_mask_to_kevent_fflags(uint32_t mask, enum vtype type)
+{
+ const struct inotify_kevent_mask_pair *type_inotify_to_kevent;
+ uint32_t fflags;
+ size_t i, type_inotify_to_kevent_len;
+
+ switch (type) {
+ case VREG:
+ case VDIR:
+ case VLNK:
+ break;
+
+ default:
+ return 0;
+ }
+
+ /* flags that all watches could have */
+ fflags = NOTE_DELETE|NOTE_REVOKE;
+ for (i = 0; i < common_inotify_to_kevent_len; i++)
+ if (mask & common_inotify_to_kevent[i].inotify)
+ fflags |= common_inotify_to_kevent[i].kevent;
+
+ /* flags that depend on type */
+ switch (type) {
+ case VREG:
+ type_inotify_to_kevent = vreg_inotify_to_kevent;
+ type_inotify_to_kevent_len = vreg_inotify_to_kevent_len;
+ break;
+
+ case VDIR:
+ type_inotify_to_kevent = vdir_inotify_to_kevent;
+ type_inotify_to_kevent_len = vdir_inotify_to_kevent_len;
+ break;
+
+ default:
+ type_inotify_to_kevent_len = 0;
+ break;
+ }
+ for (i = 0; i < type_inotify_to_kevent_len; i++)
+ if (mask & type_inotify_to_kevent[i].inotify)
+ fflags |= type_inotify_to_kevent[i].kevent;
+
+ return fflags;
+}
+
+/*
+ * inotify_add_watch(2).  Open a fd for pathname (applying any open
+ * flags implied by mask), track it, and add an equivalent kqueue event in
+ * ifd->ifd_kqfd.
+ */
+int
+linux_sys_inotify_add_watch(struct lwp *l,
+ const struct linux_sys_inotify_add_watch_args *uap, register_t *retval)
+{
+ /* {
+ syscallarg(int) fd;
+ syscallarg(const char *) pathname;
+ syscallarg(uint32_t) mask;
+ } */
+ int wd, dup_of_wd, i, error = 0;
+ file_t *fp, *wp, *cur_fp;
+ struct stat wst, cur_st;
+ struct inotifyfd *ifd;
+ struct inotify_dir_entries **new_wds;
+ struct knote *kn, *tmpkn;
+ struct sys_open_args oa;
+ struct kevent kev;
+ enum vtype wtype;
+ struct kevent_ops k_ops = {
+ .keo_private = NULL,
+ .keo_fetch_timeout = NULL,
+ .keo_fetch_changes = inotify_kev_fetch_changes,
+ .keo_put_events = NULL,
+ };
+ const int fd = SCARG(uap, fd);
+ const uint32_t mask = SCARG(uap, mask);
+
+ if (mask & ~LINUX_IN_ADD_KNOWN)
+ return EINVAL;
+
+ fp = fd_getfile(fd);
+ if (fp == NULL)
+ return EBADF;
+
+ if (fp->f_ops != &inotify_fileops) {
+ /* not an inotify fd */
+ error = EBADF;
+ goto leave0;
+ }
+
+ ifd = fp->f_data;
+
+ mutex_enter(&ifd->ifd_lock);
+
+ /* open a new file descriptor for the watch descriptor */
+ SCARG(&oa, path) = SCARG(uap, pathname);
+ SCARG(&oa, mode) = 0;
+ SCARG(&oa, flags) = O_RDONLY;
+ if (mask & LINUX_IN_DONT_FOLLOW)
+ SCARG(&oa, flags) |= O_NOFOLLOW;
+ if (mask & LINUX_IN_ONLYDIR)
+ SCARG(&oa, flags) |= O_DIRECTORY;
+
+ error = sys_open(l, &oa, retval);
+ if (error != 0)
+ goto leave1;
+ wd = *retval;
+
+ wp = fd_getfile(wd);
+ KASSERT(wp != NULL);
+ wtype = wp->f_vnode->v_type;
+ error = vn_stat(wp->f_vnode, &wst);
+ fd_putfile(wd);
+ if (error != 0)
+ goto leave1;
+
+ /* translate the flags */
+ memset(&kev, 0, sizeof(kev));
+ EV_SET(&kev, wd, inotify_filtid, EV_ADD|EV_ENABLE,
+ NOTE_DELETE|NOTE_REVOKE, 0, ifd);
+ if (mask & LINUX_IN_ONESHOT)
+ kev.flags |= EV_ONESHOT;
+ kev.fflags |= inotify_mask_to_kevent_fflags(mask, wtype);
+
+ /* Check to see if we already have a descriptor to wd's file. */
+ dup_of_wd = -1;
+ for (i = 0; i < ifd->ifd_nwds; i++) {
+ if (ifd->ifd_wds[i] != NULL) {
+ cur_fp = fd_getfile(i);
+ if (cur_fp == NULL) {
+ DPRINTF(("%s: wd=%d was closed externally\n",
+ __func__, i));
+ error = EBADF;
+ goto leave1;
+ }
+ if (cur_fp->f_type != DTYPE_VNODE) {
+ DPRINTF(("%s: wd=%d was replaced "
+ "with a non-vnode\n", __func__, i));
+ error = EBADF;
+ }
+ if (error == 0)
+ error = vn_stat(cur_fp->f_vnode, &cur_st);
+ fd_putfile(i);
+ if (error != 0)
+ goto leave1;
+
+ if (wst.st_ino == cur_st.st_ino) {
+ dup_of_wd = i;
+ break;
+ }
+ }
+ }
+
+	if (dup_of_wd == -1) {
+ /*
+ * If we do not have a descriptor to wd's file, we need to add
+ * a knote.
+ */
+ error = kevent1(retval, ifd->ifd_kqfd, &kev, 1, NULL, 0, NULL,
+ &k_ops);
+ if (error != 0) {
+			/* fd_close() expects us to hold wd */
+			if (fd_getfile(wd) != NULL)
+				fd_close(wd);
+ } else {
+ /* Success! */
+ *retval = wd;
+
+			/* Resize ifd_wds to accommodate wd. */
+ if (wd+1 > ifd->ifd_nwds) {
+ new_wds = kmem_zalloc(
+ (wd+1) * sizeof(*ifd->ifd_wds), KM_SLEEP);
+ memcpy(new_wds, ifd->ifd_wds,
+ ifd->ifd_nwds * sizeof(*ifd->ifd_wds));
+
+ kmem_free(ifd->ifd_wds,
+ ifd->ifd_nwds * sizeof(*ifd->ifd_wds));
+
+ ifd->ifd_wds = new_wds;
+ ifd->ifd_nwds = wd+1;
+ }
+
+ ifd->ifd_wds[wd] = get_inotify_dir_entries(wd);
+ }
+ } else {
+ /*
+ * If we do have a descriptor to wd's file, try to edit
+ * the relevant knote.
+ */
+
+		/* We do not need wd anymore; fd_close() expects it held. */
+		if (fd_getfile(wd) != NULL)
+			fd_close(wd);
+
+ if (mask & LINUX_IN_MASK_CREATE) {
+ error = EEXIST;
+ goto leave1;
+ }
+
+ wp = fd_getfile(dup_of_wd);
+ if (wp == NULL) {
+ DPRINTF(("%s: wd=%d was closed externally "
+ "(race, probably)\n", __func__, dup_of_wd));
+ error = EBADF;
+ goto leave1;
+ }
+
+ mutex_enter(wp->f_vnode->v_interlock);
+
+ /*
+ * XXX We are forced to find the appropriate knote
+ * manually because we cannot create a custom f_touch
+ * function for inotify_filtops. See filter_touch()
+ * in kern_event.c for details.
+ */
+ SLIST_FOREACH_SAFE(kn, &wp->f_vnode->v_klist->vk_klist,
+ kn_selnext, tmpkn) {
+ if (kn->kn_fop == &inotify_filtops
+ && ifd == kn->kn_kevent.udata) {
+ mutex_enter(&kn->kn_kq->kq_lock);
+ if (mask & LINUX_IN_MASK_ADD)
+ kn->kn_sfflags |= kev.fflags;
+ else
+ kn->kn_sfflags = kev.fflags;
+ wp->f_vnode->v_klist->vk_interest |=
+ kn->kn_sfflags;
+ mutex_exit(&kn->kn_kq->kq_lock);
+ }
+ }
+
+ mutex_exit(wp->f_vnode->v_interlock);
+ fd_putfile(dup_of_wd);
+ }
+
+leave1:
+ mutex_exit(&ifd->ifd_lock);
+leave0:
+ fd_putfile(fd);
+ return error;
+}
+
+/*
+ * Remove a wd from ifd and close wd.
+ */
+static int
+inotify_close_wd(struct inotifyfd *ifd, int wd)
+{
+ file_t *wp;
+ int error;
+ register_t retval;
+ struct kevent kev;
+ struct kevent_ops k_ops = {
+ .keo_private = NULL,
+ .keo_fetch_timeout = NULL,
+ .keo_fetch_changes = inotify_kev_fetch_changes,
+ .keo_put_events = NULL,
+ };
+
+ mutex_enter(&ifd->ifd_lock);
+
+ KASSERT(0 <= wd && wd < ifd->ifd_nwds && ifd->ifd_wds[wd] != NULL);
+
+ kmem_free(ifd->ifd_wds[wd],
+ INOTIFY_DIR_ENTRIES_SIZE(ifd->ifd_wds[wd]->ide_count));
+ ifd->ifd_wds[wd] = NULL;
+
+ mutex_exit(&ifd->ifd_lock);
+
+ wp = fd_getfile(wd);
+ if (wp == NULL) {
+ DPRINTF(("%s: wd=%d is already closed\n", __func__, wd));
+ return 0;
+ }
+ KASSERT(!mutex_owned(wp->f_vnode->v_interlock));
+
+ memset(&kev, 0, sizeof(kev));
+ EV_SET(&kev, wd, EVFILT_VNODE, EV_DELETE, 0, 0, 0);
+ error = kevent1(&retval, ifd->ifd_kqfd, &kev, 1, NULL, 0, NULL, &k_ops);
+ if (error != 0)
+ DPRINTF(("%s: attempt to disable all events for wd=%d "
+ "had error=%d\n", __func__, wd, error));
+
+ return fd_close(wd);
+}
+
+/*
+ * inotify_rm_watch(2). Close wd and remove it from ifd->ifd_wds.
+ */
+int
+linux_sys_inotify_rm_watch(struct lwp *l,
+ const struct linux_sys_inotify_rm_watch_args *uap, register_t *retval)
+{
+ /* {
+ syscallarg(int) fd;
+ syscallarg(int) wd;
+ } */
+ struct inotifyfd *ifd;
+ file_t *fp;
+ int error = 0;
+ const int fd = SCARG(uap, fd);
+ const int wd = SCARG(uap, wd);
+
+ fp = fd_getfile(fd);
+ if (fp == NULL)
+ return EBADF;
+ if (fp->f_ops != &inotify_fileops) {
+ /* not an inotify fd */
+ error = EINVAL;
+ goto leave;
+ }
+
+ ifd = fp->f_data;
+ if (wd < 0 || wd >= ifd->ifd_nwds || ifd->ifd_wds[wd] == NULL) {
+ error = EINVAL;
+ goto leave;
+ }
+
+ error = inotify_close_wd(ifd, wd);
+
+leave:
+ fd_putfile(fd);
+ return error;
+}
+
+/*
+ * Attach the inotify filter.
+ */
+static int
+inotify_filt_attach(struct knote *kn)
+{
+ file_t *fp = kn->kn_obj;
+ struct vnode *vp;
+
+ KASSERT(fp->f_type == DTYPE_VNODE);
+ vp = fp->f_vnode;
+
+ /*
+ * Needs to be set so that we get the same event handling as
+ * EVFILT_VNODE. Otherwise we don't get any events.
+ *
+ * A consequence of this is that modifications/removals of
+ * this knote need to specify EVFILT_VNODE rather than
+ * inotify_filtid.
+ */
+ kn->kn_filter = EVFILT_VNODE;
+
+ kn->kn_fop = &inotify_filtops;
+ kn->kn_hook = vp;
+ vn_knote_attach(vp, kn);
+
+ return 0;
+}
+
+/*
+ * Detach the inotify filter.
+ */
+static void
+inotify_filt_detach(struct knote *kn)
+{
+ struct vnode *vp = (struct vnode *)kn->kn_hook;
+
+ vn_knote_detach(vp, kn);
+}
+
+/*
+ * Create a single inotify event.
+ */
+static void
+do_kevent_to_inotify(int32_t wd, uint32_t mask, uint32_t cookie,
+ struct inotify_entry *buf, size_t *nbuf, char *name)
+{
+ KASSERT(*nbuf < LINUX_INOTIFY_MAX_FROM_KEVENT);
+
+ buf += *nbuf;
+
+ memset(buf, 0, sizeof(*buf));
+
+ buf->ie_event.wd = wd;
+ buf->ie_event.mask = mask;
+ buf->ie_event.cookie = cookie;
+
+ if (name != NULL) {
+ buf->ie_event.len = strlen(name) + 1;
+ KASSERT(buf->ie_event.len < sizeof(buf->ie_name));
+ strcpy(buf->ie_name, name);
+ }
+
+ ++(*nbuf);
+}
+
+/*
+ * Like vn_readdir(), but with vnode locking that depends on whether we
+ * already hold v_interlock (to avoid double locking in some situations).
+ */
+static int
+inotify_readdir(file_t *fp, struct dirent *dep, int *done)
+{
+ struct vnode *vp;
+ struct iovec iov;
+ struct uio uio;
+ int error, eofflag;
+
+ KASSERT(fp->f_type == DTYPE_VNODE);
+ vp = fp->f_vnode;
+ KASSERT(vp->v_type == VDIR);
+
+ iov.iov_base = dep;
+ iov.iov_len = sizeof(*dep);
+
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_rw = UIO_READ;
+ uio.uio_resid = sizeof(*dep);
+ UIO_SETUP_SYSSPACE(&uio);
+
+ mutex_enter(&fp->f_lock);
+ uio.uio_offset = fp->f_offset;
+ mutex_exit(&fp->f_lock);
+
+ /* XXX: should pass whether to lock or not */
+ if (!mutex_owned(vp->v_interlock))
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_READDIR(vp, &uio, fp->f_cred, &eofflag, NULL, NULL);
+ if (!mutex_owned(vp->v_interlock))
+ VOP_UNLOCK(vp);
+
+ mutex_enter(&fp->f_lock);
+ fp->f_offset = uio.uio_offset;
+ mutex_exit(&fp->f_lock);
+
+ *done = sizeof(*dep) - uio.uio_resid;
+ return error;
+}
+
+/*
+ * Create (and allocate) an appropriate inotify_dir_entries struct for wd,
+ * to be used in the ifd_wds table of an inotifyfd.  If the entries of a
+ * directory cannot be read, NULL is returned.
+ */
+static struct inotify_dir_entries *
+get_inotify_dir_entries(int wd)
+{
+ struct dirent de;
+ struct dirent *currdep;
+ struct inotify_dir_entries *idep = NULL;
+ file_t *wp;
+ int done, error;
+ size_t i, decount;
+
+ wp = fd_getfile(wd);
+ if (wp == NULL)
+ return NULL;
+ if (wp->f_type != DTYPE_VNODE)
+ goto leave;
+
+ /* for non-directories, we have 0 entries. */
+ if (wp->f_vnode->v_type != VDIR) {
+ idep = kmem_zalloc(INOTIFY_DIR_ENTRIES_SIZE(0), KM_SLEEP);
+ goto leave;
+ }
+
+ mutex_enter(&wp->f_lock);
+ wp->f_offset = 0;
+ mutex_exit(&wp->f_lock);
+ decount = 0;
+ for (;;) {
+ error = inotify_readdir(wp, &de, &done);
+ if (error != 0)
+ goto leave;
+ if (done == 0)
+ break;
+
+ currdep = &de;
+ while ((char *)currdep < ((char *)&de) + done) {
+ decount++;
+ currdep = _DIRENT_NEXT(currdep);
+ }
+ }
+
+ idep = kmem_zalloc(INOTIFY_DIR_ENTRIES_SIZE(decount), KM_SLEEP);
+ idep->ide_count = decount;
+
+ mutex_enter(&wp->f_lock);
+ wp->f_offset = 0;
+ mutex_exit(&wp->f_lock);
+ for (i = 0; i < decount;) {
+ error = inotify_readdir(wp, &de, &done);
+ if (error != 0 || done == 0) {
+ kmem_free(idep, INOTIFY_DIR_ENTRIES_SIZE(decount));
+ idep = NULL;
+ goto leave;
+ }
+
+ currdep = &de;
+ while ((char *)currdep < ((char *)&de) + done) {
+ idep->ide_entries[i].fileno = currdep->d_fileno;
+ strcpy(idep->ide_entries[i].name, currdep->d_name);
+
+ currdep = _DIRENT_NEXT(currdep);
+ i++;
+ }
+ }
+
+leave:
+ fd_putfile(wd);
+ return idep;
+}
+
+static size_t
+find_entry(struct inotify_dir_entries *i1, struct inotify_dir_entries *i2)
+{
+ for (size_t i = 0; i < i2->ide_count; i++)
+ if (i2->ide_entries[i].fileno != i1->ide_entries[i].fileno)
+ return i;
+ KASSERTMSG(0, "Entry not found");
+ return -1;
+}
+
+static void
+handle_write(struct inotifyfd *ifd, int wd, struct inotify_entry *buf,
+ size_t *nbuf)
+{
+ struct inotify_dir_entries *old_idep, *new_idep;
+ size_t i;
+
+ mutex_enter(&ifd->ifd_lock);
+
+ old_idep = ifd->ifd_wds[wd];
+ KASSERT(old_idep != NULL);
+ new_idep = get_inotify_dir_entries(wd);
+ if (new_idep == NULL) {
+ DPRINTF(("%s: directory for wd=%d could not be read\n",
+ __func__, wd));
+ mutex_exit(&ifd->ifd_lock);
+ return;
+ }
+
+ if (old_idep->ide_count < new_idep->ide_count) {
+ KASSERT(old_idep->ide_count + 1 == new_idep->ide_count);
+
+ /* Find the new entry. */
+ i = find_entry(new_idep, old_idep);
+ do_kevent_to_inotify(wd, LINUX_IN_CREATE, 0,
+ buf, nbuf, new_idep->ide_entries[i].name);
+ goto out;
+ }
+
+ if (old_idep->ide_count > new_idep->ide_count) {
+ KASSERT(old_idep->ide_count == new_idep->ide_count + 1);
+
+ /* Find the deleted entry. */
+ i = find_entry(old_idep, new_idep);
+
+ do_kevent_to_inotify(wd, LINUX_IN_DELETE, 0,
+ buf, nbuf, old_idep->ide_entries[i].name);
+ goto out;
+ }
+
+ /*
+ * XXX Because we are not watching the entire
+ * file system, the only time we know for sure
+ * that the event is a LINUX_IN_MOVED_FROM/
+ * LINUX_IN_MOVED_TO is when the move happens
+ * within a single directory... ie. the number
+ * of directory entries has not changed.
+ *
+ * Otherwise all we can say for sure is that
+ * something was created/deleted. So we issue a
+ * LINUX_IN_CREATE/LINUX_IN_DELETE.
+ */
+ ino_t changed = new_idep->ide_entries[new_idep->ide_count - 1].fileno;
+
+ /* Find the deleted entry. */
+ for (i = 0; i < old_idep->ide_count; i++)
+ if (old_idep->ide_entries[i].fileno == changed)
+ break;
+ KASSERT(i != old_idep->ide_count);
+
+ do_kevent_to_inotify(wd, LINUX_IN_MOVED_FROM, changed, buf, nbuf,
+ old_idep->ide_entries[i].name);
+
+ do_kevent_to_inotify(wd, LINUX_IN_MOVED_TO, changed, buf, nbuf,
+ new_idep->ide_entries[new_idep->ide_count - 1].name);
+
+out:
+ ifd->ifd_wds[wd] = new_idep;
+ mutex_exit(&ifd->ifd_lock);
+}
+
+/*
+ * Convert a kevent flags and fflags for EVFILT_VNODE to some number
+ * of inotify events.
+ */
+static int
+kevent_to_inotify(struct inotifyfd *ifd, int wd, enum vtype wtype,
+ uint32_t flags, uint32_t fflags, struct inotify_entry *buf,
+ size_t *nbuf)
+{
+ struct stat st;
+ file_t *wp;
+ size_t i;
+ int error = 0;
+
+ for (i = 0; i < common_kevent_to_inotify_len; i++)
+ if (fflags & common_kevent_to_inotify[i].kevent)
+ do_kevent_to_inotify(wd,
+ common_kevent_to_inotify[i].inotify, 0, buf, nbuf,
+ NULL);
+
+ if (wtype == VREG) {
+ for (i = 0; i < vreg_kevent_to_inotify_len; i++)
+ if (fflags & vreg_kevent_to_inotify[i].kevent)
+ do_kevent_to_inotify(wd,
+ vreg_kevent_to_inotify[i].inotify, 0,
+ buf, nbuf, NULL);
+ } else if (wtype == VDIR) {
+ for (i = 0; i < *nbuf; i++)
+ if (buf[i].ie_event.mask &
+ (LINUX_IN_ACCESS|LINUX_IN_ATTRIB
+ |LINUX_IN_CLOSE|LINUX_IN_OPEN))
+ buf[i].ie_event.mask |= LINUX_IN_ISDIR;
+
+ /* Need to disambiguate the possible NOTE_WRITEs. */
+ if (fflags & NOTE_WRITE)
+ handle_write(ifd, wd, buf, nbuf);
+ }
+
+ /*
+	 * Need to check whether wd actually has a link count of 0 before
+	 * issuing a LINUX_IN_DELETE_SELF.
+ */
+ if (fflags & NOTE_DELETE) {
+ wp = fd_getfile(wd);
+ KASSERT(wp != NULL);
+ KASSERT(wp->f_type == DTYPE_VNODE);
+ vn_stat(wp->f_vnode, &st);
+ fd_putfile(wd);
+
+ if (st.st_nlink == 0)
+ do_kevent_to_inotify(wd, LINUX_IN_DELETE_SELF, 0,
+ buf, nbuf, NULL);
+ }
+
+ /* LINUX_IN_IGNORED must be the last event issued for wd. */
+ if ((flags & EV_ONESHOT) || (fflags & (NOTE_REVOKE|NOTE_DELETE))) {
+ do_kevent_to_inotify(wd, LINUX_IN_IGNORED, 0, buf, nbuf, NULL);
+ /*
+ * XXX in theory we could call inotify_close_wd(ifd, wd) but if
+ * we get here we must already be holding v_interlock for
+ * wd... so we can't.
+ *
+ * For simplicity we do nothing, and so wd will only be closed
+ * when the inotify fd is closed.
+ */
+ }
+
+ return error;
+}
+
+/*
+ * Handle an event. Unlike EVFILT_VNODE, we translate the event to a
+ * linux_inotify_event and put it in our own custom queue.
+ */
+static int
+inotify_filt_event(struct knote *kn, long hint)
+{
+ struct vnode *vp = (struct vnode *)kn->kn_hook;
+ struct inotifyfd *ifd;
+ struct inotify_entry *cur_ie;
+ size_t nbuf, i;
+ uint32_t status;
+ struct inotify_entry buf[LINUX_INOTIFY_MAX_FROM_KEVENT];
+
+ /*
+ * If KN_WILLDETACH is set then
+ * 1. kn->kn_kevent.udata has already been trashed with a
+ * struct lwp *, so we don't have access to a real ifd
+ * anymore, and
+	 * 2. we're about to detach anyway, so we don't really care
+ * about the events.
+ * (Also because of this we need to get ifd under the same
+ * lock as kn->kn_status.)
+ */
+ mutex_enter(&kn->kn_kq->kq_lock);
+ status = kn->kn_status;
+ ifd = kn->kn_kevent.udata;
+ mutex_exit(&kn->kn_kq->kq_lock);
+ if (status & KN_WILLDETACH)
+ return 0;
+
+ /*
+ * Because we use kqueue() and file descriptors underneath,
+ * functions like inotify_add_watch can actually trigger
+	 * events (e.g. when we ourselves are watching for
+	 * LINUX_IN_OPEN).  In all cases where this could happen, we
+	 * must already own ifd->ifd_lock, so we can just drop these
+	 * events.
+	 */
+ if (mutex_owned(&ifd->ifd_lock))
+ return 0;
+
+ /*
+ * If we don't care about the NOTEs in hint, we don't generate
+ * any events.
+ */
+ hint &= kn->kn_sfflags;
+ if (hint == 0)
+ return 0;
+
+ KASSERT(mutex_owned(vp->v_interlock));
+
+ mutex_enter(&ifd->ifd_qlock);
+
+ /*
+	 * Early out: there's no point even translating the event if we
+	 * have nowhere to put it (and a LINUX_IN_Q_OVERFLOW has
+ * already been added).
+ */
+ if (ifd->ifd_qcount >= LINUX_INOTIFY_MAX_QUEUED)
+ goto leave;
+
+ nbuf = 0;
+ (void)kevent_to_inotify(ifd, kn->kn_id, vp->v_type, kn->kn_flags,
+ hint, buf, &nbuf);
+ for (i = 0; i < nbuf && ifd->ifd_qcount < LINUX_INOTIFY_MAX_QUEUED-1;
+ i++) {
+ cur_ie = kmem_zalloc(sizeof(*cur_ie), KM_SLEEP);
+ memcpy(cur_ie, &buf[i], sizeof(*cur_ie));
+
+ TAILQ_INSERT_TAIL(&ifd->ifd_qhead, cur_ie, ie_entries);
+ ifd->ifd_qcount++;
+ }
+ /* handle early overflow, by adding an overflow event to the end */
+ if (i != nbuf) {
+ nbuf = 0;
+ cur_ie = kmem_zalloc(sizeof(*cur_ie), KM_SLEEP);
+ do_kevent_to_inotify(-1, LINUX_IN_Q_OVERFLOW, 0,
+ cur_ie, &nbuf, NULL);
+
+ TAILQ_INSERT_TAIL(&ifd->ifd_qhead, cur_ie, ie_entries);
+ ifd->ifd_qcount++;
+ }
+
+ if (nbuf > 0) {
+ cv_signal(&ifd->ifd_qcv);
+
+ mutex_enter(&ifd->ifd_lock);
+ selnotify(&ifd->ifd_sel, 0, 0);
+ mutex_exit(&ifd->ifd_lock);
+ } else
+ DPRINTF(("%s: hint=%lx resulted in 0 inotify events\n",
+ __func__, hint));
+
+leave:
+ mutex_exit(&ifd->ifd_qlock);
+ return 0;
+}
+
+/*
+ * Read inotify events from the queue.
+ */
+static int
+inotify_read(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
+ int flags)
+{
+ struct inotify_entry *cur_iep;
+ size_t cur_size, nread;
+ int error = 0;
+ struct inotifyfd *ifd = fp->f_data;
+
+ mutex_enter(&ifd->ifd_qlock);
+
+ if (ifd->ifd_qcount == 0) {
+ if (fp->f_flag & O_NONBLOCK) {
+ error = EAGAIN;
+ goto leave;
+ }
+
+ while (ifd->ifd_qcount == 0) {
+ /* wait until there is an event to read */
+ error = cv_wait_sig(&ifd->ifd_qcv, &ifd->ifd_qlock);
+ if (error != 0) {
+ error = EINTR;
+ goto leave;
+ }
+ }
+ }
+
+ KASSERT(ifd->ifd_qcount > 0);
+ KASSERT(mutex_owned(&ifd->ifd_qlock));
+
+ nread = 0;
+ while (ifd->ifd_qcount > 0) {
+ cur_iep = TAILQ_FIRST(&ifd->ifd_qhead);
+ KASSERT(cur_iep != NULL);
+
+ cur_size = sizeof(cur_iep->ie_event) + cur_iep->ie_event.len;
+ if (cur_size > uio->uio_resid) {
+ if (nread == 0)
+ error = EINVAL;
+ break;
+ }
+
+ error = uiomove(&cur_iep->ie_event, sizeof(cur_iep->ie_event),
+ uio);
+ if (error != 0)
+ break;
+ error = uiomove(&cur_iep->ie_name, cur_iep->ie_event.len, uio);
+ if (error != 0)
+ break;
+
+ /* cleanup */
+ TAILQ_REMOVE(&ifd->ifd_qhead, cur_iep, ie_entries);
+ kmem_free(cur_iep, sizeof(*cur_iep));
+
+ nread++;
+ ifd->ifd_qcount--;
+ }
+
+leave:
+ /* Wake up the next reader, if the queue is not empty. */
+ if (ifd->ifd_qcount > 0)
+ cv_signal(&ifd->ifd_qcv);
+
+ mutex_exit(&ifd->ifd_qlock);
+ return error;
+}
+
+/*
+ * Close all the file descriptors associated with fp.
+ */
+static int
+inotify_close(file_t *fp)
+{
+ int error;
+ size_t i;
+ file_t *kqfp;
+ struct inotifyfd *ifd = fp->f_data;
+
+ for (i = 0; i < ifd->ifd_nwds; i++) {
+ if (ifd->ifd_wds[i] != NULL) {
+ error = inotify_close_wd(ifd, i);
+ if (error != 0)
+ return error;
+ }
+ }
+
+	/* fd_close() expects us to hold a reference to ifd->ifd_kqfd */
+ kqfp = fd_getfile(ifd->ifd_kqfd);
+ if (kqfp == NULL) {
+		DPRINTF(("%s: kqfd=%d is already closed\n", __func__,
+		    ifd->ifd_kqfd));
+ } else {
+ error = fd_close(ifd->ifd_kqfd);
+ if (error != 0)
+ return error;
+ }
+
+ mutex_destroy(&ifd->ifd_lock);
+ mutex_destroy(&ifd->ifd_qlock);
+ cv_destroy(&ifd->ifd_qcv);
+ seldestroy(&ifd->ifd_sel);
+
+ kmem_free(ifd->ifd_wds, ifd->ifd_nwds * sizeof(*ifd->ifd_wds));
+ kmem_free(ifd, sizeof(*ifd));
+ fp->f_data = NULL;
+
+ return 0;
+}
+
+/*
+ * Check if there are pending read events.
+ */
+static int
+inotify_poll(file_t *fp, int events)
+{
+ int revents;
+ struct inotifyfd *ifd = fp->f_data;
+
+ revents = 0;
+ if (events & (POLLIN|POLLRDNORM)) {
+ mutex_enter(&ifd->ifd_qlock);
+
+ if (ifd->ifd_qcount > 0)
+ revents |= events & (POLLIN|POLLRDNORM);
+
+ mutex_exit(&ifd->ifd_qlock);
+ }
+
+ return revents;
+}
+
+/*
+ * Attach EVFILT_READ to the inotify instance in fp.
+ *
+ * This is so you can watch inotify with epoll. No other kqueue
+ * filter needs to be supported.
+ */
+static int
+inotify_kqfilter(file_t *fp, struct knote *kn)
+{
+ struct inotifyfd *ifd = fp->f_data;
+
+ KASSERT(fp == kn->kn_obj);
+
+ if (kn->kn_filter != EVFILT_READ)
+ return EINVAL;
+
+ kn->kn_fop = &inotify_read_filtops;
+ mutex_enter(&ifd->ifd_lock);
+ selrecord_knote(&ifd->ifd_sel, kn);
+ mutex_exit(&ifd->ifd_lock);
+
+ return 0;
+}
+
+/*
+ * Detach a filter from an inotify instance.
+ */
+static void
+inotify_read_filt_detach(struct knote *kn)
+{
+ struct inotifyfd *ifd = ((file_t *)kn->kn_obj)->f_data;
+
+ mutex_enter(&ifd->ifd_lock);
+ selremove_knote(&ifd->ifd_sel, kn);
+ mutex_exit(&ifd->ifd_lock);
+}
+
+/*
+ * Handle EVFILT_READ events. Note that nothing is put in kn_data.
+ */
+static int
+inotify_read_filt_event(struct knote *kn, long hint)
+{
+ int rv;
+ struct inotifyfd *ifd = ((file_t *)kn->kn_obj)->f_data;
+
+ mutex_enter(&ifd->ifd_qlock);
+ rv = (ifd->ifd_qcount > 0);
+ mutex_exit(&ifd->ifd_qlock);
+
+ return rv;
+}
+
+/*
+ * Restart the inotify instance.
+ */
+static void
+inotify_restart(file_t *fp)
+{
+ struct inotifyfd *ifd = fp->f_data;
+
+ mutex_enter(&ifd->ifd_qlock);
+ cv_broadcast(&ifd->ifd_qcv);
+ mutex_exit(&ifd->ifd_qlock);
+}
Index: src/sys/compat/linux/common/linux_inotify.h
diff -u /dev/null src/sys/compat/linux/common/linux_inotify.h:1.1
--- /dev/null Sat Aug 19 13:57:55 2023
+++ src/sys/compat/linux/common/linux_inotify.h Sat Aug 19 13:57:54 2023
@@ -0,0 +1,92 @@
+/* $NetBSD: linux_inotify.h,v 1.1 2023/08/19 17:57:54 christos Exp $ */
+
+/*-
+ * Copyright (c) 2023 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Theodore Preduta.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_INOTIFY_H
+#define _LINUX_INOTIFY_H
+
+#include <sys/types.h>
+/* for LINUX_O_CLOEXEC, LINUX_O_NONBLOCK */
+#include <compat/linux/common/linux_fcntl.h>
+
+#define LINUX_IN_CLOEXEC LINUX_O_CLOEXEC
+#define LINUX_IN_NONBLOCK LINUX_O_NONBLOCK
+
+#define LINUX_IN_ALL_FLAGS (LINUX_IN_CLOEXEC|LINUX_IN_NONBLOCK)
+
+#define LINUX_IN_ACCESS 0x00000001
+#define LINUX_IN_MODIFY 0x00000002
+#define LINUX_IN_ATTRIB 0x00000004
+#define LINUX_IN_CLOSE_WRITE 0x00000008
+#define LINUX_IN_CLOSE_NOWRITE 0x00000010
+#define LINUX_IN_OPEN 0x00000020
+#define LINUX_IN_MOVED_FROM 0x00000040
+#define LINUX_IN_MOVED_TO 0x00000080
+#define LINUX_IN_CREATE 0x00000100
+#define LINUX_IN_DELETE 0x00000200
+#define LINUX_IN_DELETE_SELF 0x00000400
+#define LINUX_IN_MOVE_SELF 0x00000800
+#define LINUX_IN_UNMOUNT 0x00002000
+#define LINUX_IN_Q_OVERFLOW 0x00004000
+#define LINUX_IN_IGNORED 0x00008000
+
+#define LINUX_IN_ONLYDIR 0x01000000
+#define LINUX_IN_DONT_FOLLOW 0x02000000
+#define LINUX_IN_EXCL_UNLINK 0x04000000
+#define LINUX_IN_MASK_CREATE 0x10000000
+#define LINUX_IN_MASK_ADD 0x20000000
+#define LINUX_IN_ISDIR 0x40000000
+#define LINUX_IN_ONESHOT 0x80000000
+
+#define LINUX_IN_CLOSE (LINUX_IN_CLOSE_WRITE|LINUX_IN_CLOSE_NOWRITE)
+
+#define LINUX_IN_ADD_KNOWN (LINUX_IN_ACCESS|LINUX_IN_ATTRIB \
+ |LINUX_IN_CLOSE_WRITE|LINUX_IN_CLOSE_NOWRITE \
+ |LINUX_IN_CREATE|LINUX_IN_DELETE \
+ |LINUX_IN_DELETE_SELF|LINUX_IN_MODIFY \
+ |LINUX_IN_MOVE_SELF|LINUX_IN_MOVED_FROM \
+ |LINUX_IN_MOVED_TO|LINUX_IN_OPEN \
+ |LINUX_IN_DONT_FOLLOW|LINUX_IN_ONLYDIR \
+ |LINUX_IN_MASK_CREATE|LINUX_IN_MASK_ADD \
+ |LINUX_IN_ISDIR|LINUX_IN_ONESHOT)
+
+struct linux_inotify_event {
+ int32_t wd;
+ uint32_t mask;
+ uint32_t cookie;
+ uint32_t len;
+ char name[];
+};
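+
+/*
+ * Illustrative (hypothetical) consumer loop: records returned by
+ * read(2) are variable length, sizeof(struct linux_inotify_event)
+ * plus len bytes of name.  Assuming buf and n hold the result of a
+ * successful read, and keeping in mind the unaligned-access caveat
+ * described in linux_inotify.c:
+ *
+ *	for (char *p = buf; p < buf + n;) {
+ *		struct linux_inotify_event *ev = (void *)p;
+ *		handle(ev->wd, ev->mask, ev->len ? ev->name : NULL);
+ *		p += sizeof(*ev) + ev->len;
+ *	}
+ *
+ * where handle() is a stand-in for application logic.
+ */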
+
+#ifdef _KERNEL
+int linux_inotify_init(void);
+int linux_inotify_fini(void);
+#endif
+
+#endif /* !_LINUX_INOTIFY_H */