Hello. This pseudo filesystem allows a file descriptor to be bound to different kinds of events, so that they can be polled using epoll().
This particular morning hack supports signals only. If the idea is considered sound, I can cook up POSIX timers support. A note on signal delivery: if the special flag is set in signalfd(signo, flag), then signals are _not_ delivered through the pending mask update but only through the epoll queue (this behaviour is copied from kevent). The userspace signal code and the patch itself can be found at: http://tservice.net.ru/~s0mbre/archive/eventfs/ signal.c is also attached for the interested reader. Signed-off-by: Evgeniy Polyakov <[EMAIL PROTECTED]> diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index 2697e92..b14ee54 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -319,3 +319,4 @@ ENTRY(sys_call_table) .long sys_move_pages .long sys_getcpu .long sys_epoll_pwait + .long sys_signalfd /* 320 */ diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index eda7a0d..bc6336c 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -719,4 +719,5 @@ ia32_sys_call_table: .quad compat_sys_move_pages .quad sys_getcpu .quad sys_epoll_pwait + .quad sys_signalfd ia32_syscall_end: diff --git a/fs/Kconfig b/fs/Kconfig index 3c4886b..09803ad 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1032,6 +1032,15 @@ config CONFIGFS_FS Both sysfs and configfs can and should exist together on the same system. One is not a replacement for the other. +config EVENTFS + bool "Enable eventpoll filesystem support" if EMBEDDED + depends on EPOLL + default y + help + Allows to bind file descriptors to different kinds of objects + like signals and timers and work with them using epoll + family of system calls. 
+ endmenu menu "Miscellaneous filesystems" diff --git a/fs/Makefile b/fs/Makefile index 9edf411..185bcb1 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -22,6 +22,7 @@ endif obj-$(CONFIG_INOTIFY) += inotify.o obj-$(CONFIG_INOTIFY_USER) += inotify_user.o obj-$(CONFIG_EPOLL) += eventpoll.o +obj-$(CONFIG_EVENTFS) += eventfs.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o nfsd-$(CONFIG_NFSD) := nfsctl.o diff --git a/fs/eventfs.c b/fs/eventfs.c new file mode 100644 index 0000000..dae108c --- /dev/null +++ b/fs/eventfs.c @@ -0,0 +1,221 @@ +/* + * 2007 Copyright (c) Evgeniy Polyakov <[EMAIL PROTECTED]> + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mount.h> +#include <linux/device.h> +#include <linux/poll.h> +#include <asm/io.h> + +static inline void eventfs_set_signal_file(int sig, struct file *file) +{ + spin_lock_irq(&current->sighand->siglock); + current->signal_file[sig-1] = file; + spin_unlock_irq(&current->sighand->siglock); +} + +static int eventfs_signal_release(struct inode *inode, struct file *file) +{ + int sig = (int)((unsigned long)(file->private_data) & 0x0fffffff); + eventfs_set_signal_file(sig, NULL); + return 0; +} + +static unsigned int eventfs_signal_poll(struct file *file, struct poll_table_struct *wait) +{ + int sig = (int)((unsigned long)(file->private_data) & 0x0fffffff); + unsigned int mask = 0; + unsigned long flags; + + poll_wait(file, &current->signal_wait, wait); + + spin_lock_irqsave(&current->sighand->siglock, flags); + if (!sigismember(&current->blocked, sig) && (((unsigned long)(file->private_data)) & 0x40000000)) { + mask = POLLIN | POLLRDNORM; + file->private_data = (void *)(((unsigned long)(file->private_data)) & ~0x40000000); + } + spin_unlock_irqrestore(&current->sighand->siglock, flags); + + return mask; +} + +struct file_operations eventfs_signal_fops = { + .release = eventfs_signal_release, + .poll = eventfs_signal_poll, + .owner = THIS_MODULE, +}; + +static struct vfsmount *eventfs_mnt __read_mostly; + +static int eventfs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) +{ + return get_sb_pseudo(fs_type, "eventfs", NULL, 0x193748dd, mnt); +} + +static struct file_system_type eventfs_fs_type = { + .name = 
"eventfs", + .get_sb = eventfs_get_sb, + .kill_sb = kill_anon_super, +}; + +static int eventfs_delete_dentry(struct dentry *dentry) +{ + return 1; +} + +static struct dentry_operations eventfs_dentry_operations = { + .d_delete = eventfs_delete_dentry, +}; + +static int eventfs_init(struct file **filp, struct file_operations *fops, unsigned long priv) +{ + struct qstr this; + char name[32]; + struct dentry *dentry; + struct inode *inode; + struct file *file; + int err = -ENFILE, fd; + + file = get_empty_filp(); + if (!file) + goto err_out_exit; + + inode = new_inode(eventfs_mnt->mnt_sb); + if (!inode) + goto err_out_fput; + + inode->i_fop = fops; + + inode->i_state = I_DIRTY; + inode->i_mode = S_IRUSR | S_IWUSR; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + err = get_unused_fd(); + if (err < 0) + goto err_out_iput; + fd = err; + + sprintf(name, "[%lu]", inode->i_ino); + this.name = name; + this.len = strlen(name); + this.hash = inode->i_ino; + dentry = d_alloc(eventfs_mnt->mnt_sb->s_root, &this); + if (!dentry) + goto err_out_put_fd; + dentry->d_op = &eventfs_dentry_operations; + d_add(dentry, inode); + file->f_vfsmnt = mntget(eventfs_mnt); + file->f_dentry = dentry; + file->f_mapping = inode->i_mapping; + file->f_pos = 0; + file->f_flags = O_RDONLY; + file->f_op = fops; + file->f_mode = FMODE_READ; + file->f_version = 0; + file->private_data = (void *)priv; + + fd_install(fd, file); + *filp = file; + + return fd; + +err_out_put_fd: + put_unused_fd(fd); +err_out_iput: + iput(inode); +err_out_fput: + put_filp(file); +err_out_exit: + return err; +} + +asmlinkage long sys_signalfd(int sig, int flags) +{ + int fd, err = 0; + struct file *file; + unsigned long priv = sig; + + if (!valid_signal(sig) || sig < 1/* || sig_kernel_only(sig) */) + return -EINVAL; + + spin_lock_irq(&current->sighand->siglock); + file = current->signal_file[sig-1]; + if (file) + err = -EEXIST; + else + 
current->signal_file[sig-1] = (void *)1; + spin_unlock_irq(&current->sighand->siglock); + + if (err) + return err; + + file = NULL; + if (flags) + priv |= 0x80000000; + fd = eventfs_init(&file, &eventfs_signal_fops, priv); + if (fd < 0) + goto err_out_clean_file; + + eventfs_set_signal_file(sig, file); + + return fd; + +err_out_clean_file: + eventfs_set_signal_file(sig, NULL); + return fd; +} + +/* + * Eventfs subsystem initialization - create caches and register + * filesystem to get control file descriptors from. + */ +static int __init eventfs_sys_init(void) +{ + int err; + + err = register_filesystem(&eventfs_fs_type); + if (err) + goto err_out_exit; + + eventfs_mnt = kern_mount(&eventfs_fs_type); + err = PTR_ERR(eventfs_mnt); + if (IS_ERR(eventfs_mnt)) + goto err_out_unreg; + + printk(KERN_INFO "Eventfs subsystem has been successfully registered.\n"); + + return 0; + +err_out_unreg: + unregister_filesystem(&eventfs_fs_type); +err_out_exit: + return err; +} + +module_init(eventfs_sys_init); diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index 833fa17..c72a568 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -325,10 +325,11 @@ #define __NR_move_pages 317 #define __NR_getcpu 318 #define __NR_epoll_pwait 319 +#define __NR_signalfd 320 #ifdef __KERNEL__ -#define NR_syscalls 320 +#define NR_syscalls 321 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index c5f596e..62a21f3 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -619,8 +619,10 @@ __SYSCALL(__NR_sync_file_range, sys_sync_file_range) __SYSCALL(__NR_vmsplice, sys_vmsplice) #define __NR_move_pages 279 __SYSCALL(__NR_move_pages, sys_move_pages) +#define __NR_signalfd 280 +__SYSCALL(__NR_signalfd, sys_signalfd) -#define __NR_syscall_max __NR_move_pages +#define __NR_syscall_max __NR_signalfd #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git 
a/include/linux/sched.h b/include/linux/sched.h index 49fe299..22c1412 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -935,6 +935,10 @@ struct task_struct { /* signal handlers */ struct signal_struct *signal; struct sighand_struct *sighand; +#ifdef CONFIG_EVENTFS + struct file *signal_file[_NSIG]; + wait_queue_head_t signal_wait; +#endif sigset_t blocked, real_blocked; sigset_t saved_sigmask; /* To be restored with TIF_RESTORE_SIGMASK */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 1912c6c..b34f4e6 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -605,4 +605,6 @@ asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct g int kernel_execve(const char *filename, char *const argv[], char *const envp[]); +asmlinkage long sys_signalfd(int sig, int flags); + #endif diff --git a/kernel/fork.c b/kernel/fork.c index d154cc7..1b318da 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1128,6 +1128,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (retval) goto bad_fork_cleanup_namespaces; +#ifdef CONFIG_EVENTFS + memset(p->signal_file, 0, ARRAY_SIZE(p->signal_file)); + init_waitqueue_head(&p->signal_wait); +#endif + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? diff --git a/kernel/signal.c b/kernel/signal.c index 3670225..059977c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -739,6 +739,16 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, struct sigqueue * q = NULL; int ret = 0; +#ifdef CONFIG_EVENTFS + if (t->signal_file[sig-1]) { + struct file *file = t->signal_file[sig-1]; + file->private_data = (void *)(((unsigned long)(file->private_data)) | 0x40000000); + wake_up(&t->signal_wait); + if (((unsigned long)(file->private_data)) & 0x80000000) + return 1; + } +#endif + /* * fast-pathed signals for kernel-internal things like SIGSTOP * or SIGKILL. 
@@ -817,6 +827,18 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) ret = send_signal(sig, info, t, &t->pending); if (!ret && !sigismember(&t->blocked, sig)) signal_wake_up(t, sig == SIGKILL); +#ifdef CONFIG_EVENTFS + /* + * Eventfs allows to deliver signals through epoll queue, + * it is possible to setup epoll to not deliver + * signal through the usual way, in that case send_signal() + * returns 1 and signal is delivered only through epoll queue. + * We simulate successfull delivery notification through this hack: + */ + if (ret == 1) + ret = 0; + +#endif out: return ret; } @@ -1006,6 +1028,18 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) * to avoid several races. */ ret = send_signal(sig, info, p, &p->signal->shared_pending); +#ifdef CONFIG_EVENTFS + /* + * Eventfs allows to deliver signals through epoll queue, + * it is possible to setup epoll to not deliver + * signal through the usual way, in that case send_signal() + * returns 1 and signal is delivered only through epoll queue. + * We simulate successfull delivery notification through this hack: + */ + if (ret == 1) + ret = 0; + +#endif if (unlikely(ret)) return ret; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index d7306d0..c131d20 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -113,6 +113,8 @@ cond_syscall(sys_vm86); cond_syscall(compat_sys_ipc); cond_syscall(compat_sys_sysctl); +cond_syscall(sys_signalfd); + /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); cond_syscall(sys_pciconfig_write); -- Evgeniy Polyakov
/*
 * signal.c - userspace demo for the signalfd(sig, flags) syscall proposed
 * by the eventfs patch.
 *
 * Binds SIGUSR1 and SIGUSR2 to file descriptors with signalfd(), adds both
 * descriptors to an epoll set, sends both signals to itself and then prints
 * every descriptor that epoll_wait() reports.
 */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE /* exposes the syscall() prototype in <unistd.h> */
#endif

#include <sys/epoll.h>
#include <signal.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <linux/unistd.h>
#include <linux/types.h>

enum {
	MAX_EVENTS      = 256,   /* capacity of the epoll_wait() result array */
	WAIT_TIMEOUT_MS = 10000, /* epoll_wait() timeout */
};

/*
 * Raw wrapper for the proposed syscall. __NR_signalfd has to come from
 * kernel headers that carry the eventfs patch.
 * Returns the new descriptor, or -1 with errno set.
 */
static int signalfd(int sig, int flags)
{
	return (int)syscall(__NR_signalfd, sig, flags);
}

#if 1
#define ulog(...)	fprintf(stderr, __VA_ARGS__)
#else
#define ulog(...)
#endif

/* Logs the message followed by the textual and numeric value of errno. */
#define ulog_err(f, ...) \
	ulog(f ": %s [%d].\n", ##__VA_ARGS__, strerror(errno), errno)

/*
 * Ordinary handler, installed so that conventional delivery is visible.
 * NOTE(review): printf() is not async-signal-safe; tolerated here because
 * this is a single-threaded demo.
 */
static void sig_handler(int signo)
{
	printf("%s, signo: %d.\n", __func__, signo);
}

/*
 * Binds @sig to a new descriptor and registers it in the epoll set @ctl_fd.
 * @flags is passed through to signalfd().
 * Returns 0 on success and a negative value on failure; on failure no
 * descriptor is left open.
 */
static int epoll_signal_init(int ctl_fd, int sig, int flags)
{
	struct epoll_event event = { .events = EPOLLIN };
	int fd;

	fd = signalfd(sig, flags);
	if (fd < 0) {
		ulog_err("Failed to bind signal: %d, flags: %d", sig, flags);
		return fd;
	}

	event.data.fd = fd;
	if (epoll_ctl(ctl_fd, EPOLL_CTL_ADD, fd, &event) < 0) {
		ulog_err("Failed to perform control ADD operation: fd: %d", fd);
		close(fd); /* do not leak the descriptor we just created */
		return -1;
	}

	return 0;
}

int main(void)
{
	struct epoll_event event[MAX_EVENTS];
	int err, ctl_fd, i;

	ctl_fd = epoll_create(10);
	if (ctl_fd < 0) {
		ulog_err("Failed to create epoll control file descriptor");
		return -1;
	}

	signal(SIGUSR1, sig_handler);
	signal(SIGUSR2, sig_handler);

	/* Non-zero flags: SIGUSR1 is reported through the epoll queue only. */
	err = epoll_signal_init(ctl_fd, SIGUSR1, 1);
	if (err)
		return err;

	/* Zero flags: SIGUSR2 is also delivered the usual way. */
	err = epoll_signal_init(ctl_fd, SIGUSR2, 0);
	if (err)
		return err;

	ulog("%s: pid: %d\n", __func__, getpid());

	kill(getpid(), SIGUSR1);
	kill(getpid(), SIGUSR2);

	for (;;) {
		err = epoll_wait(ctl_fd, event, MAX_EVENTS, WAIT_TIMEOUT_MS);
		if (err < 0) {
			/* A handler interrupting the wait is expected; retry. */
			if (errno == EINTR)
				continue;
			/* Any other error is permanent; retrying would spin. */
			ulog_err("Failed to wait for events");
			return -1;
		}

		for (i = 0; i < err; ++i)
			printf("Dequeued signal, fd: %d.\n", event[i].data.fd);
	}

	return 0;
}