[RFC] error management in add_disk()
Hi, I have a problem with the error management of add_disk() and del_gendisk(). add_disk() adds an entry in /sys/block/. The filename in /sys/block is not (struct gen_disk)->disk_name but more or less the first KOBJ_NAME_LEN characters of (struct gen_disk)->disk_name. #define KOBJ_NAME_LEN 20 My problem occurs when we try to add 2 disks with different names, but when the KOBJ_NAME_LEN first characters are the same. It is not allowed to have several files in /sys/block/ with the same name. It does not prevent the disk to work, but the creation of the file in /sys/block will silently fail. See the call chain: add_disk() -> register_disk() -> kobject_add(&disk->kobj) del_gendisk() -> kobject_del(&disk->kobj) add_disk() and register_disk() return void, so the caller cannot know that there is a problem. kobject_add() returns an error but register_disk() ignores the error. The disk driver is unaware of the failure of disk_add(), and may call del_gendisk(). It deletes an object in /sys/block/ that was not added (bug!). The disk driver cannot check that, so the reference counter of /sys/block is decremented on kobject_del(). If the user run add_disk() and del_gendisk() a lot of times with different names having the same 20-characters beginning, the reference counter of the /sys/block directory will reach 0 (and it will be freed) although it still contains files. The attached test module triggers the problem. You can try something like: for i in $(seq 1 100) ; do insmod ./adddiskbug.ko ; rmmod adddiskbug ; done The attached patch fixes the problem by changing the prototype of add_disk() and register_disk() to return errors. Index: linux-2.6.23-rc1/block/genhd.c === --- linux-2.6.23-rc1.orig/block/genhd.c 2007-07-23 14:52:11.0 +0200 +++ linux-2.6.23-rc1/block/genhd.c 2007-07-23 18:08:58.0 +0200 @@ -175,13 +175,22 @@ * This function registers the partitioning information in @disk * with the kernel. */ -void add_disk(struct gendisk *disk) +int add_disk(struct gendisk *disk) { + int ret; + disk->flags |= GENHD_FL_UP; blk_register_region(MKDEV(disk->major, disk->first_minor), disk->minors, NULL, exact_match, exact_lock, disk); - register_disk(disk); + ret = register_disk(disk); + if (ret) { + blk_unregister_region(MKDEV(disk->major, disk->first_minor), + disk->minors); + return ret; + } blk_register_queue(disk); + + return 0; } EXPORT_SYMBOL(add_disk); Index: linux-2.6.23-rc1/include/linux/genhd.h === --- linux-2.6.23-rc1.orig/include/linux/genhd.h 2007-07-09 01:32:17.0 +0200 +++ linux-2.6.23-rc1/include/linux/genhd.h 2007-07-23 15:12:53.0 +0200 @@ -235,7 +235,7 @@ /* drivers/block/genhd.c */ extern int get_blkdev_list(char *, int); -extern void add_disk(struct gendisk *disk); +extern int add_disk(struct gendisk *disk); extern void del_gendisk(struct gendisk *gp); extern void unlink_gendisk(struct gendisk *gp); extern struct gendisk *get_gendisk(dev_t dev, int *part); Index: linux-2.6.23-rc1/fs/partitions/check.c === --- linux-2.6.23-rc1.orig/fs/partitions/check.c 2007-07-23 14:52:13.0 +0200 +++ linux-2.6.23-rc1/fs/partitions/check.c 2007-07-23 15:53:04.0 +0200 @@ -469,7 +469,7 @@ } /* Not exported, helper to add_disk(). */ -void register_disk(struct gendisk *disk) +int register_disk(struct gendisk *disk) { struct block_device *bdev; char *s; @@ -483,11 +483,11 @@ if (s) *s = '!'; if ((err = kobject_add(&disk->kobj))) - return; + return -EEXIST; err = disk_sysfs_symlinks(disk); if (err) { kobject_del(&disk->kobj); - return; + return -EEXIST; } disk_sysfs_add_subdirs(disk); @@ -523,6 +523,8 @@ continue; kobject_uevent(&p->kobj, KOBJ_ADD); } + + return 0; } int rescan_partitions(struct gendisk *disk, struct block_device *bdev) Index: linux-2.6.23-rc1/include/linux/blkdev.h === --- linux-2.6.23-rc1.orig/include/linux/blkdev.h2007-07-23 14:52:14.0 +0200 +++ linux-2.6.23-rc1/include/linux/blkdev.h 2007-07-23 15:58:47.0 +0200 @@ -643,7 +643,7 @@ extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); -extern void register_disk(struct gendisk *dev); +extern int register_disk(struct gendisk *dev); extern void generic_make_request(struct bio *bio); extern void blk_put_request(struct request *); extern void __blk_p
Re: [RFC] error management in add_disk()
Le Tue, 24 Jul 2007 14:28:05 +0100, Al Viro <[EMAIL PROTECTED]> a écrit : >On Tue, Jul 24, 2007 at 01:57:53PM +0200, Alban Crequy wrote: >> Hi, >> >> I have a problem with the error management of add_disk() and >> del_gendisk(). >> >> add_disk() adds an entry in /sys/block/. The filename >> in /sys/block is not (struct gen_disk)->disk_name but more or less >> the first KOBJ_NAME_LEN characters of (struct gen_disk)->disk_name. >> >> #define KOBJ_NAME_LEN 20 >> >> My problem occurs when we try to add 2 disks with different names, >> but when the KOBJ_NAME_LEN first characters are the same. > >So don't do that. I no more do that. But I still think it would be better if we found a way to manage errors in that case. I fear that parts of kernel make this error. For example, old version of GFS has this code: http://csourcesearch.net/package/gfs-kernel/2.6.9/gfs-kernel-2.6.9-27/src/gfs/diaper.c char buf[BDEVNAME_SIZE]; bdevname(real, buf); snprintf(gd->disk_name, sizeof(gd->disk_name), "diapered_%s", buf); Since BDEVNAME_SIZE is 32 and KOBJ_NAME_LEN is 20, the bug happens quite easily. I did not check closely if this is a problem, but there is other parts in the current kernel that build the disk_name with snprintf("...%s...") >> The attached test module triggers the problem. You can try something >> like: for i in $(seq 1 100) ; do insmod ./adddiskbug.ko ; rmmod >> adddiskbug ; done >> >> The attached patch fixes the problem by changing the prototype of >> add_disk() and register_disk() to return errors. > >This is bogus. Just what would callers do with these error values? >Ignore them silently? Bail out? Can't do - at that point disk just >might have been opened already. add_disk() is the point of no return; >we are already past the last point where we could bail out. I missed that point - that the disk might have been opened. Where is the point of no return in add_disk() exactly? Is it really before the kobject_add() that causes the problem? In this case, perhaps we can 1/ check that the kobject_add() will not fail before the point of no return, 2/ pass this point and then 3/ do the kobject_add(). And add appropriate locking to ensure that nobody add another disk with the same 20-characters truncated name between 1/ and 3/. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/5 RFC] Add an interface to discover relationships between namespaces
Hi, On 14 July 2016 at 20:20, Andrey Vagin wrote: > Each namespace has an owning user namespace and now there is not way > to discover these relationships. > > Pid and user namepaces are hierarchical. There is no way to discover > parent-child relationships too. > > Why we may want to know relationships between namespaces? > > One use would be visualization, in order to understand the running system. This looks interesting to me because I am interested in representing in a graphical way the relationship between different mounts in different mount namespaces (showing the ID, the parent-children relationships, mount peer groups, the master-slave relationships etc), specially for containers. The first idea was to take both /proc/1/mountinfo and /proc/$OTHER_PID/mountinfo and I can correlate the "shared:" and "master:" fields in the mountinfo files. But I cannot read the /proc/$pid/mountinfo of mount namespaces when there are no processes in those mount namespaces. For example, if those mount namespaces stay alive only because they contain "shared&slave" mounts between master mounts and slave mounts that I can see in /proc/$pid/mountinfo. Fictional example: # mntns 1, mountinfo 1 (visible via /proc/1/mountinfo) 61 0 253:1 / / rw shared:1 # mntns 2, mountinfo 2 (not visible via any /proc/$pid/mountinfo) 731 569 0:75 / / rw master:1 shared:42 # mntns 3, mountinfo 3 (not visible via any /proc/${container_pid}/mountinfo) 762 597 0:82 / / rw master:42 shared:76 As far as I understand, I cannot get a reference to the mntns2 fd because mnt namespaces are not hierarchical, and I cannot get its /proc/???/mountinfo because no processes live inside. Is there a way around it? Should this use case be handled together? Thanks! Alban > Another would be to answer the question: what capability does process X have > to > perform operations on a resource governed by namespace Y? > > One more use-case (which usually called abnormal) is checkpoint/restart. > In CRIU we age going to dump and restore nested namespaces. > > There [1] was a discussion about which interface to choose to determing > relationships between namespaces. > > Eric suggested to add two ioctl-s [2]: >> Grumble, Grumble. I think this may actually a case for creating ioctls >> for these two cases. Now that random nsfs file descriptors are bind >> mountable the original reason for using proc files is not as pressing. >> >> One ioctl for the user namespace that owns a file descriptor. >> One ioctl for the parent namespace of a namespace file descriptor. > > Here is an implementaions of these ioctl-s. > > [1] https://lkml.org/lkml/2016/7/6/158 > [2] https://lkml.org/lkml/2016/7/9/101 > > Cc: "Eric W. Biederman" > Cc: James Bottomley > Cc: "Michael Kerrisk (man-pages)" > Cc: "W. Trevor King" > Cc: Alexander Viro > Cc: Serge Hallyn > > -- > 2.5.5 > > ___ > Containers mailing list > contain...@lists.linux-foundation.org > https://lists.linuxfoundation.org/mailman/listinfo/containers
[PATCH] [RFC] proc connector: add namespace events
From: Alban Crequy The act of a process creating or joining a namespace via clone(), unshare() or setns() is a useful signal for monitoring applications. I am working on a monitoring application that keeps track of all the containers and all processes inside each container. The current way of doing it is by polling regularly in /proc for the list of processes and in /proc/*/ns/* to know which namespaces they belong to. This is inefficient on systems with a large number of containers and a large number of processes. Instead, I would inspect /proc only one time and get the updates with the proc connector. Unfortunately, the proc connector gives me the list of processes but does not notify me when a process changes namespaces. So I would still need to inspect /proc/*/ns/*. This patch add namespace events for processes. It generates a namespace event each time a process changes namespace via clone(), unshare() or setns(). For example, the following command: | # unshare -n -f ls -l /proc/self/ns/net | lrwxrwxrwx 1 root root 0 Sep 6 05:35 /proc/self/ns/net -> 'net:[4026532142]' causes the proc connector to generate the following events: | fork: ppid=696 pid=858 | exec: pid=858 | ns: pid=858 type=net reason=set old_inum=4026531957 inum=4026532142 | fork: ppid=858 pid=859 | exec: pid=859 | exit: pid=859 | exit: pid=858 Note: this patch is just a RFC, we are exploring other ways to achieve the same feature. The current implementation has the following limitations: - Ideally, I want to know whether the event is cause by clone(), unshare() or setns(). At the moment, the reason field only distinguishes between clone() and non-clone. - The event for pid namespaces is generated when pid_ns_for_children changes. I think that's ok, and it just needs to be documented for userspace in the same way it is already documented in pid_namespaces(7). Userspace really needs to know whether the event is caused by clone() or non-clone to interpret the event correctly. - Events for userns are not implemented yet. I skipped it for now because user namespaces are not managed with nsproxy as other namespaces. - The mnt namespace struct is more private than other so the code is a bit different for this. I don't know if there is a better way to do this. - Userspace needs a way to know whether namespace events are implemented in the proc connector. If not implemented, userspaces needs to fallback to polling changes in /proc/*/ns/*. I am not sure whether to add a Netlink message to query the kernel if the feature is implemented or otherwise. - There is no granularity when subscribing for proc connector events. I figured it might not be a problem since namespace events are more rare than other fork/exec events. It will probably not flood existing users of the proc connector. Signed-off-by: Alban Crequy --- drivers/connector/cn_proc.c | 28 + include/linux/cn_proc.h | 4 +++ include/uapi/linux/cn_proc.h | 16 +- kernel/nsproxy.c | 71 4 files changed, 118 insertions(+), 1 deletion(-) diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c index a782ce8..69e6815 100644 --- a/drivers/connector/cn_proc.c +++ b/drivers/connector/cn_proc.c @@ -246,6 +246,34 @@ void proc_comm_connector(struct task_struct *task) send_msg(msg); } +void proc_ns_connector(struct task_struct *task, int type, int reason, u64 old_inum, u64 inum) +{ + struct cn_msg *msg; + struct proc_event *ev; + __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); + + if (atomic_read(&proc_event_num_listeners) < 1) + return; + + msg = buffer_to_cn_msg(buffer); + ev = (struct proc_event *)msg->data; + memset(&ev->event_data, 0, sizeof(ev->event_data)); + ev->timestamp_ns = ktime_get_ns(); + ev->what = PROC_EVENT_NM; + ev->event_data.nm.process_pid = task->pid; + ev->event_data.nm.process_tgid = task->tgid; + ev->event_data.nm.type = type; + ev->event_data.nm.reason = reason; + ev->event_data.nm.old_inum = old_inum; + ev->event_data.nm.inum = inum; + + memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); + msg->ack = 0; /* not used */ + msg->len = sizeof(*ev); + msg->flags = 0; /* not used */ + send_msg(msg); +} + void proc_coredump_connector(struct task_struct *task) { struct cn_msg *msg; diff --git a/include/linux/cn_proc.h b/include/linux/cn_proc.h index 1d5b02a..2e6915e 100644 --- a/include/linux/cn_proc.h +++ b/include/linux/cn_proc.h @@ -26,6 +26,7 @@ void proc_id_connector(struct task_struct *task, int which_id); void proc_sid_connector(struct task_struct *task); void proc_ptrace_connector(struct task_struct *task, int which_id); void proc_comm_connector(struct task_struct *task);
Re: [PATCH] [RFC] proc connector: add namespace events
On 12 September 2016 at 23:39, Evgeniy Polyakov wrote: > Hi everyone > > 08.09.2016, 18:39, "Alban Crequy" : >> The act of a process creating or joining a namespace via clone(), >> unshare() or setns() is a useful signal for monitoring applications. > >> + if (old_ns->mnt_ns != new_ns->mnt_ns) >> + proc_ns_connector(tsk, CLONE_NEWNS, PROC_NM_REASON_CLONE, old_mntns_inum, >> new_mntns_inum); >> + >> + if (old_ns->uts_ns != new_ns->uts_ns) >> + proc_ns_connector(tsk, CLONE_NEWUTS, PROC_NM_REASON_CLONE, >> old_ns->uts_ns->ns.inum, new_ns->uts_ns->ns.inum); >> + >> + if (old_ns->ipc_ns != new_ns->ipc_ns) >> + proc_ns_connector(tsk, CLONE_NEWIPC, PROC_NM_REASON_CLONE, >> old_ns->ipc_ns->ns.inum, new_ns->ipc_ns->ns.inum); >> + >> + if (old_ns->net_ns != new_ns->net_ns) >> + proc_ns_connector(tsk, CLONE_NEWNET, PROC_NM_REASON_CLONE, >> old_ns->net_ns->ns.inum, new_ns->net_ns->ns.inum); >> + >> + if (old_ns->cgroup_ns != new_ns->cgroup_ns) >> + proc_ns_connector(tsk, CLONE_NEWCGROUP, PROC_NM_REASON_CLONE, >> old_ns->cgroup_ns->ns.inum, new_ns->cgroup_ns->ns.inum); >> + >> + if (old_ns->pid_ns_for_children != new_ns->pid_ns_for_children) >> + proc_ns_connector(tsk, CLONE_NEWPID, PROC_NM_REASON_CLONE, >> old_ns->pid_ns_for_children->ns.inum, new_ns->pid_ns_for_children->ns.inum); >> + } >> + > > Patch looks good to me from technical/connector point of view, but these even > multiplication is a bit weird imho. > > I'm not against it, but did you consider sending just 2 serialized ns > structures via single message, and client > would check all ns bits himself? I have not considered it, thanks for the suggestion. Should we offer the guarantee to userspace that it will always be send in one single message? If we want to give the information about the userns change too, it will be a bit more complicated to write the patch because it is not done in the same function. Note that I will probably not have the chance to spend more time on this patch soon because Iago will explore other methods with eBPF+kprobes instead. eBPF+kprobes would not have the same API stability though. I was curious to see if anyone would find the namespace addition to proc connector interesting for other projects.
overlayfs: regression bug from 4bacc9c9 (Make f_path always point to the overlay and f_inode to the underlay)
Hi, I'm reporting an issue in overlay fs that was introduced in v4.2 (it worked on v4.1): when overlay fs is mounted inside a overlay fs, I get a "no such device or address" error (ENXIO) during open(). After adding some debug printks, I found that the ENXIO comes from fs/inode.c:no_open(). The bug was initially reported on: https://github.com/coreos/rkt/issues/1537 The following commands can reproduce the issue: # mkdir upper lower work merged # mount -t overlay overlay -olowerdir=lower,upperdir=upper,workdir=work merged/ # cd merged # mkdir upper2 lower2 work2 merged2 # mount -t overlay overlay -olowerdir=lower2,upperdir=upper2,workdir=work2 merged2/ # cd merged2 # echo hello > test.txt bash: test.txt: No such device or address For debugging purposes, I added a "BUG()" in fs/inode.c:no_open() in order to see the stack at the moment the ENXIO is returned: [0.446166] Call Trace: [0.446166] [] do_dentry_open+0x1ff/0x2f0 [0.446166] [] ? inode_init_always+0x1b0/0x1b0 [0.446166] [] vfs_open+0x56/0x60 [0.446166] [] path_openat+0x1de/0x1250 [0.446166] [] ? filemap_fault+0xb1/0x420 [0.446166] [] ? __inc_zone_state+0x20/0x60 [0.446166] [] do_filp_open+0x91/0x100 [0.446166] [] ? kmem_cache_alloc+0x193/0x210 [0.446166] [] ? getname_flags+0x56/0x1f0 [0.446166] [] ? __alloc_fd+0x3f/0x100 [0.446166] [] do_sys_open+0x13a/0x230 [0.446166] [] SyS_open+0x1e/0x20 [0.446166] [] entry_SYSCALL_64_fastpath+0x12/0x71 git-bisect found the following: > 4bacc9c9234c7c8eec44f5ed4e960d9f96fa0f01 is the first bad commit > commit 4bacc9c9234c7c8eec44f5ed4e960d9f96fa0f01 > Author: David Howells > Date: Thu Jun 18 14:32:31 2015 +0100 > > overlayfs: Make f_path always point to the overlay > and f_inode to the underlay See patch (2) in https://lwn.net/Articles/648525/: > (2) The main VFS patch that makes an open file struct referring to a union > file have ->f_path point to the union/overlay file whilst ->f_inode and > ->f_mapping refer to the subordinate file that does the actual work. Cheers, Alban -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: overlayfs: regression bug from 4bacc9c9 (Make f_path always point to the overlay and f_inode to the underlay)
On 12 October 2015 at 15:50, Miklos Szeredi wrote: > On Wed, Oct 07, 2015 at 02:23:23PM +0200, Alban Crequy wrote: >> Hi, >> >> I'm reporting an issue in overlay fs that was introduced in v4.2 (it >> worked on v4.1): when overlay fs is mounted inside a overlay fs, I get >> a "no such device or address" error (ENXIO) during open(). After >> adding some debug printks, I found that the ENXIO comes from >> fs/inode.c:no_open(). >> >> The bug was initially reported on: >> https://github.com/coreos/rkt/issues/1537 >> >> The following commands can reproduce the issue: > > Thanks for the excellent report. > > See below for a fix. Please let me know if you see any issues. Thanks for the fix. I tested it and it fixes the test case with the commands I wrote in my previous email. Tested-by: Alban Crequy > Thanks, > Miklos > --- > > Subject: ovl: fix open in stacked overlay > From: Miklos Szeredi > > If two overlayfs filesystems are stacked on top of each other, then we need > recursion in ovl_d_select_inode(). > > I guess d_backing_inode() is supposed to do that. But currently it doesn't > and that functionality is open coded in vfs_open(). This is now copied > into ovl_d_select_inode() to fix this regression. > > Reported-by: Alban Crequy > Signed-off-by: Miklos Szeredi > Fixes: 4bacc9c9234c ("overlayfs: Make f_path always point to the overlay...") > Cc: David Howells > Cc: # v4.2+ > --- > fs/overlayfs/inode.c |3 +++ > 1 file changed, 3 insertions(+) > > --- a/fs/overlayfs/inode.c > +++ b/fs/overlayfs/inode.c > @@ -363,6 +363,9 @@ struct inode *ovl_d_select_inode(struct > ovl_path_upper(dentry, &realpath); > } > > + if (realpath.dentry->d_flags & DCACHE_OP_SELECT_INODE) > + return realpath.dentry->d_op->d_select_inode(realpath.dentry, > file_flags); > + > return d_backing_inode(realpath.dentry); > } > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCH v2] ima,fuse: introduce new fs flag FS_NO_IMA_CACHE
On Thu, Jan 18, 2018 at 10:25 PM, Mimi Zohar wrote: > On Tue, 2018-01-16 at 16:10 +0100, Alban Crequy wrote: >> From: Alban Crequy >> >> This patch forces files to be re-measured, re-appraised and re-audited >> on file systems with the feature flag FS_NO_IMA_CACHE. In that way, >> cached integrity results won't be used. >> >> For now, this patch adds the new flag only FUSE filesystems. This is >> needed because the userspace FUSE process can change the underlying >> files at any time. > > Thanks, it's working nicely. > > >> diff --git a/include/linux/fs.h b/include/linux/fs.h >> index 511fbaabf624..2bd7e73ebc2a 100644 >> --- a/include/linux/fs.h >> +++ b/include/linux/fs.h >> @@ -2075,6 +2075,7 @@ struct file_system_type { >> #define FS_BINARY_MOUNTDATA 2 >> #define FS_HAS_SUBTYPE 4 >> #define FS_USERNS_MOUNT 8 /* Can be mounted by userns >> root */ >> +#define FS_NO_IMA_CACHE 16 /* Force IMA to re-measure, >> re-appraise, re-audit files */ >> #define FS_RENAME_DOES_D_MOVE32768 /* FS will handle d_move() >> during rename() internally. */ >> struct dentry *(*mount) (struct file_system_type *, int, >> const char *, void *); >> > > Since IMA is going to need another flag, we probably should have a > consistent prefix (eg. "FS_IMA"). Maybe rename this flag to > FS_IMA_NO_CACHE. Ok, I can rename it. Is there a discussion about the other IMA flag? > I'm also wondering if this change should be > separated from the IMA change. Do you mean one patch for adding the flag and the IMA change and another patch for using the flag in FUSE? Thanks! Alban
Re: [RFC PATCH v2] ima,fuse: introduce new fs flag FS_NO_IMA_CACHE
On Fri, Jan 19, 2018 at 5:56 PM, Mimi Zohar wrote: > On Fri, 2018-01-19 at 11:35 +0100, Alban Crequy wrote: >> On Thu, Jan 18, 2018 at 10:25 PM, Mimi Zohar >> wrote: >> > On Tue, 2018-01-16 at 16:10 +0100, Alban Crequy wrote: >> >> From: Alban Crequy >> >> >> >> This patch forces files to be re-measured, re-appraised and re-audited >> >> on file systems with the feature flag FS_NO_IMA_CACHE. In that way, >> >> cached integrity results won't be used. >> >> >> >> For now, this patch adds the new flag only FUSE filesystems. This is >> >> needed because the userspace FUSE process can change the underlying >> >> files at any time. >> > >> > Thanks, it's working nicely. >> > >> > >> >> diff --git a/include/linux/fs.h b/include/linux/fs.h >> >> index 511fbaabf624..2bd7e73ebc2a 100644 >> >> --- a/include/linux/fs.h >> >> +++ b/include/linux/fs.h >> >> @@ -2075,6 +2075,7 @@ struct file_system_type { >> >> #define FS_BINARY_MOUNTDATA 2 >> >> #define FS_HAS_SUBTYPE 4 >> >> #define FS_USERNS_MOUNT 8 /* Can be mounted by userns >> >> root */ >> >> +#define FS_NO_IMA_CACHE 16 /* Force IMA to re-measure, >> >> re-appraise, re-audit files */ >> >> #define FS_RENAME_DOES_D_MOVE32768 /* FS will handle d_move() >> >> during rename() internally. */ >> >> struct dentry *(*mount) (struct file_system_type *, int, >> >> const char *, void *); >> >> >> > >> > Since IMA is going to need another flag, we probably should have a >> > consistent prefix (eg. "FS_IMA"). Maybe rename this flag to >> > FS_IMA_NO_CACHE. >> >> Ok, I can rename it. >> >> Is there a discussion about the other IMA flag? > > There's not a single thread that I can point to, but more of an on > going discussion as to what it means for a filesystem to support IMA > and how that decision is made. > > - Initial measuring, verifying, auditing files > - Safely detecting when a file changes > - Not applicable/supported > > With Sascha Hauer's patch "ima: Use i_version only when filesystem > supports it" and this patch, the second issue is addressed, but will > cause files to be re-validated, perhaps unnecessarily, impacting > performance. > > Some filesystems should not be evaluated, such as pseudo filesystems > (eg. cgroups, sysfs, devpts, pstorefs, efivarfs, debugfs, selinux, > smack). Instead of defining a flag indicating whether or not IMA is > applicable/supported, we should define a new flag, indicating whether > it is a pseudo filesystem. This would eliminate a large portion of at > least the builtin IMA policy rules. Thanks for the explanation. If that other flag is about whether it is a pseudo filesystem, it might not have "IMA" in the name though. >> > I'm also wondering if this change should be >> > separated from the IMA change. >> >> Do you mean one patch for adding the flag and the IMA change and >> another patch for using the flag in FUSE? > > The flag and FUSE usage of the flag, separately from IMA. Ok, I will send a v3 with the 2 changes. Alban
[RFC PATCH v3 1/2] fuse: introduce new fs_type flag FS_IMA_NO_CACHE
From: Alban Crequy This new fs_type flag FS_IMA_NO_CACHE means files should be re-measured, re-appraised and re-audited each time. Cached integrity results should not be used. It is useful in FUSE because the userspace FUSE process can change the underlying files at any time without notifying the kernel. Cc: linux-kernel@vger.kernel.org Cc: linux-integr...@vger.kernel.org Cc: linux-security-mod...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Miklos Szeredi Cc: Alexander Viro Cc: Mimi Zohar Cc: Dmitry Kasatkin Cc: James Morris Cc: "Serge E. Hallyn" Cc: Seth Forshee Cc: Christoph Hellwig Tested-by: Dongsu Park Signed-off-by: Alban Crequy --- fs/fuse/inode.c| 2 +- include/linux/fs.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 624f18bbfd2b..0a9e516461d5 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1205,7 +1205,7 @@ static void fuse_kill_sb_anon(struct super_block *sb) static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, .name = "fuse", - .fs_flags = FS_HAS_SUBTYPE, + .fs_flags = FS_HAS_SUBTYPE | FS_IMA_NO_CACHE, .mount = fuse_mount, .kill_sb= fuse_kill_sb_anon, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 511fbaabf624..ced841ba6701 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2075,6 +2075,7 @@ struct file_system_type { #define FS_BINARY_MOUNTDATA2 #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT8 /* Can be mounted by userns root */ +#define FS_IMA_NO_CACHE16 /* Force IMA to re-measure, re-appraise, re-audit files */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); -- 2.13.6
[RFC PATCH v3 0/2] ima,fuse: introduce new fs flag FS_IMA_NO_CACHE
This patchset v3 introduces a new fs flag FS_IMA_NO_CACHE and uses it in FUSE. This forces files to be re-measured, re-appraised and re-audited on file systems with the feature flag FS_IMA_NO_CACHE. In that way, cached integrity results won't be used. There was a previous attempt (unmerged) with a IMA option named "force" and using that option for FUSE filesystems. These patches use a different approach so that the IMA subsystem does not need to know about FUSE. - https://www.spinics.net/lists/linux-integrity/msg00948.html - https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1584131.html Changes since v1: https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1587390.html - include linux-fsdevel mailing list in cc - mark patch as RFC - based on next-integrity, without other unmerged FUSE / IMA patches Changes since v2: https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1587678.html - rename flag to FS_IMA_NO_CACHE - split patch into 2 The patchset is also available in our github repo: https://github.com/kinvolk/linux/tree/alban/fuse-flag-ima-nocache-v3 Alban Crequy (2): fuse: introduce new fs_type flag FS_IMA_NO_CACHE ima: force re-appraisal on filesystems with FS_IMA_NO_CACHE fs/fuse/inode.c | 2 +- include/linux/fs.h| 1 + security/integrity/ima/ima_main.c | 24 ++-- 3 files changed, 24 insertions(+), 3 deletions(-) -- 2.13.6
[RFC PATCH v3 2/2] ima: force re-appraisal on filesystems with FS_IMA_NO_CACHE
From: Alban Crequy This patch forces files to be re-measured, re-appraised and re-audited on file systems with the feature flag FS_IMA_NO_CACHE. In that way, cached integrity results won't be used. How to test this: The test I did was using a patched version of the memfs FUSE driver [1][2] and two very simple "hello-world" programs [4] (prog1 prints "hello world: 1" and prog2 prints "hello world: 2"). I copy prog1 and prog2 in the fuse-memfs mount point, execute them and check the sha1 hash in "/sys/kernel/security/ima/ascii_runtime_measurements". My patch on the memfs FUSE driver added a backdoor command to serve prog1 when the kernel asks for prog2 or vice-versa. In this way, I can exec prog1 and get it to print "hello world: 2" without ever replacing the file via the VFS, so the kernel is not aware of the change. The test was done using the branch "alban/fuse-flag-ima-nocache-v3" [3]. Step by step test procedure: 1. Mount the memfs FUSE using [2]: rm -f /tmp/memfs-switch* ; memfs -L DEBUG /mnt/memfs 2. Copy prog1 and prog2 using [4] cp prog1 /mnt/memfs/prog1 cp prog2 /mnt/memfs/prog2 3. Lookup the files and let the FUSE driver to keep the handles open: dd if=/mnt/memfs/prog1 bs=1 | (read -n 1 x ; sleep 3600 ) & dd if=/mnt/memfs/prog2 bs=1 | (read -n 1 x ; sleep 3600 ) & 4. Check the 2 programs work correctly: $ /mnt/memfs/prog1 hello world: 1 $ /mnt/memfs/prog2 hello world: 2 5. Check the measurements for prog1 and prog2: $ sudo cat /sys/kernel/security/ima/ascii_runtime_measurements \ | grep /mnt/memfs/prog 10 [...] ima-ng sha1:ac14c9268cd2[...] /mnt/memfs/prog1 10 [...] ima-ng sha1:799cb5d1e06d[...] /mnt/memfs/prog2 6. Use the backdoor command in my patched memfs to redirect file operations on file handle 3 to file handle 2: rm -f /tmp/memfs-switch* ; touch /tmp/memfs-switch-3-2 7. Check how the FUSE driver serves different content for the files: $ /mnt/memfs/prog1 hello world: 2 $ /mnt/memfs/prog2 hello world: 2 8. Check the measurements: sudo cat /sys/kernel/security/ima/ascii_runtime_measurements \ | grep /mnt/memfs/prog Without the patch, there are no new measurements, despite the FUSE driver having served different executables. With the patch, I can see additional measurements for prog1 and prog2 with the hashes reversed when the FUSE driver served the alternative content. [1] https://github.com/bbengfort/memfs [2] https://github.com/kinvolk/memfs/commits/alban/switch-files [3] https://github.com/kinvolk/linux/commits/alban/fuse-flag-ima-nocache-v3 [4] https://github.com/kinvolk/fuse-userns-patches/commit/cf1f5750cab0 Cc: linux-kernel@vger.kernel.org Cc: linux-integr...@vger.kernel.org Cc: linux-security-mod...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Miklos Szeredi Cc: Alexander Viro Cc: Mimi Zohar Cc: Dmitry Kasatkin Cc: James Morris Cc: "Serge E. Hallyn" Cc: Seth Forshee Cc: Christoph Hellwig Tested-by: Dongsu Park Signed-off-by: Alban Crequy --- security/integrity/ima/ima_main.c | 24 ++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 6d78cb26784d..8870a7bbe9b9 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "ima.h" @@ -228,9 +229,28 @@ static int process_measurement(struct file *file, char *buf, loff_t size, IMA_APPRAISE_SUBMASK | IMA_APPRAISED_SUBMASK | IMA_ACTION_FLAGS); - if (test_and_clear_bit(IMA_CHANGE_XATTR, &iint->atomic_flags)) - /* reset all flags if ima_inode_setxattr was called */ + /* +* Reset the measure, appraise and audit cached flags either if: +* - ima_inode_setxattr was called, or +* - based on filesystem feature flag +* forcing the file to be re-evaluated. +*/ + if (test_and_clear_bit(IMA_CHANGE_XATTR, &iint->atomic_flags)) { iint->flags &= ~IMA_DONE_MASK; + } else if (inode->i_sb->s_type->fs_flags & FS_IMA_NO_CACHE) { + if (action & IMA_MEASURE) { + iint->measured_pcrs = 0; + iint->flags &= + ~(IMA_COLLECTED | IMA_MEASURE | IMA_MEASURED); + } + if (action & IMA_APPRAISE) + iint->flags &= + ~(IMA_COLLECTED | IMA_APPRAISE | IMA_APPRAISED | + IMA_APPRAISE_SUBMASK | IMA_APPRAISED_SUBMASK); + if (action & IMA_AUDIT) + iint->flags &= + ~(IMA_COLLECTED | IMA_AUDIT | IMA_AUDITED); + } /* Determine if already appraised/measured based on bitmask * (IMA_MEASURE, IMA_MEASURED, IMA__APPRAISE, IMA__APPRAISED, -- 2.13.6
Re: [PATCH 08/11] fuse: Support fuse filesystems outside of init_user_ns
[Adding Tejun, David, Tom for question about cuse] On Fri, Dec 22, 2017 at 3:32 PM, Dongsu Park wrote: > From: Seth Forshee > > In order to support mounts from namespaces other than > init_user_ns, fuse must translate uids and gids to/from the > userns of the process servicing requests on /dev/fuse. This > patch does that, with a couple of restrictions on the namespace: > > - The userns for the fuse connection is fixed to the namespace >from which /dev/fuse is opened. > > - The namespace must be the same as s_user_ns. > > These restrictions simplify the implementation by avoiding the > need to pass around userns references and by allowing fuse to > rely on the checks in inode_change_ok for ownership changes. > Either restriction could be relaxed in the future if needed. > > For cuse the namespace used for the connection is also simply > current_user_ns() at the time /dev/cuse is opened. Was a use case discussed for using cuse in a new unprivileged userns? I ran some tests yesterday with cusexmp [1] and I could add a new char device as an unprivileged user with: $ unshare -U -r -m sh -c 'mount --bind /mnt/cuse /dev/cuse ; cusexmp --maj=99 --min=30 --name=foo where /mnt/cuse is previously mknod'ed correctly and chmod'ed 777. Then, I could see the new device: $ cat /proc/devices | grep foo 99 foo On normal distros, we don't have a /mnt/cuse chmod'ed 777 but still it seems dangerous if the dev node can be provided otherwise and if we don't have a use case for it. Thoughts? [1] https://github.com/fuse4x/fuse/blob/master/example/cusexmp.c#L9 Cheers, Alban > Patch v4 is available: https://patchwork.kernel.org/patch/8944661/ > > Cc: linux-fsde...@vger.kernel.org > Cc: linux-kernel@vger.kernel.org > Cc: Miklos Szeredi > Signed-off-by: Seth Forshee > Signed-off-by: Dongsu Park > --- > fs/fuse/cuse.c | 3 ++- > fs/fuse/dev.c| 11 --- > fs/fuse/dir.c| 14 +++--- > fs/fuse/fuse_i.h | 6 +- > fs/fuse/inode.c | 31 +++ > 5 files changed, 41 insertions(+), 24 deletions(-) > > diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c > index e9e97803..b1b83259 100644 > --- a/fs/fuse/cuse.c > +++ b/fs/fuse/cuse.c > @@ -48,6 +48,7 @@ > #include > #include > #include > +#include > > #include "fuse_i.h" > > @@ -498,7 +499,7 @@ static int cuse_channel_open(struct inode *inode, struct > file *file) > if (!cc) > return -ENOMEM; > > - fuse_conn_init(&cc->fc); > + fuse_conn_init(&cc->fc, current_user_ns()); > > fud = fuse_dev_alloc(&cc->fc); > if (!fud) { > diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c > index 17f0d05b..0f780e16 100644 > --- a/fs/fuse/dev.c > +++ b/fs/fuse/dev.c > @@ -114,8 +114,8 @@ static void __fuse_put_request(struct fuse_req *req) > > static void fuse_req_init_context(struct fuse_conn *fc, struct fuse_req *req) > { > - req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid()); > - req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid()); > + req->in.h.uid = from_kuid(fc->user_ns, current_fsuid()); > + req->in.h.gid = from_kgid(fc->user_ns, current_fsgid()); > req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); > } > > @@ -167,6 +167,10 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn > *fc, unsigned npages, > __set_bit(FR_WAITING, &req->flags); > if (for_background) > __set_bit(FR_BACKGROUND, &req->flags); > + if (req->in.h.uid == (uid_t)-1 || req->in.h.gid == (gid_t)-1) { > + fuse_put_request(fc, req); > + return ERR_PTR(-EOVERFLOW); > + } > > return req; > > @@ -1260,7 +1264,8 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, > struct file *file, > in = &req->in; > reqsize = in->h.len; > > - if (task_active_pid_ns(current) != fc->pid_ns) { > + if (task_active_pid_ns(current) != fc->pid_ns || > + current_user_ns() != fc->user_ns) { > rcu_read_lock(); > in->h.pid = pid_vnr(find_pid_ns(in->h.pid, fc->pid_ns)); > rcu_read_unlock(); > diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c > index 24967382..ad1cfac1 100644 > --- a/fs/fuse/dir.c > +++ b/fs/fuse/dir.c > @@ -858,8 +858,8 @@ static void fuse_fillattr(struct inode *inode, struct > fuse_attr *attr, > stat->ino = attr->ino; > stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 0); > stat->nlink = attr->nlink; > - stat->uid = make_kuid(&init_user_ns, attr->uid); > - stat->gid = make_kgid(&init_user_ns, attr->gid); > + stat->uid = make_kuid(fc->user_ns, attr->uid); > + stat->gid = make_kgid(fc->user_ns, attr->gid); > stat->rdev = inode->i_rdev; > stat->atime.tv_sec = attr->atime; > stat->atime.tv_nsec = attr->atimensec; > @@ -1475,17 +1475,17 @@ static bool update_mtime(unsigned ivalid, bool > trust_local_mtime) > re
Re: [PATCH 08/11] fuse: Support fuse filesystems outside of init_user_ns
On Wed, Jan 17, 2018 at 3:29 PM, Seth Forshee wrote: > On Wed, Jan 17, 2018 at 11:59:06AM +0100, Alban Crequy wrote: >> [Adding Tejun, David, Tom for question about cuse] >> >> On Fri, Dec 22, 2017 at 3:32 PM, Dongsu Park wrote: >> > From: Seth Forshee >> > >> > In order to support mounts from namespaces other than >> > init_user_ns, fuse must translate uids and gids to/from the >> > userns of the process servicing requests on /dev/fuse. This >> > patch does that, with a couple of restrictions on the namespace: >> > >> > - The userns for the fuse connection is fixed to the namespace >> >from which /dev/fuse is opened. >> > >> > - The namespace must be the same as s_user_ns. >> > >> > These restrictions simplify the implementation by avoiding the >> > need to pass around userns references and by allowing fuse to >> > rely on the checks in inode_change_ok for ownership changes. >> > Either restriction could be relaxed in the future if needed. >> > >> > For cuse the namespace used for the connection is also simply >> > current_user_ns() at the time /dev/cuse is opened. >> >> Was a use case discussed for using cuse in a new unprivileged userns? >> >> I ran some tests yesterday with cusexmp [1] and I could add a new char >> device as an unprivileged user with: >> >> $ unshare -U -r -m sh -c 'mount --bind /mnt/cuse /dev/cuse ; cusexmp >> --maj=99 --min=30 --name=foo >> >> where /mnt/cuse is previously mknod'ed correctly and chmod'ed 777. >> Then, I could see the new device: >> >> $ cat /proc/devices | grep foo >> 99 foo >> >> On normal distros, we don't have a /mnt/cuse chmod'ed 777 but still it >> seems dangerous if the dev node can be provided otherwise and if we >> don't have a use case for it. >> >> Thoughts? > > I can't remember the specific reasons, but I had concluded that letting > unprivileged users use cuse within a user namespace isn't safe. But > having a cuse device node usable by regular users at all is equally > unsafe I suspect, This makes sense. > so I don't think your example demonstrates any problem > specific to user namespaces. There shouldn't be any way to use a user > namespace to gain access permissions towards /dev/cuse, otherwise we > have bigger problems than cuse to worry about. >From my tests, the patch seem safe but I don't fully understand why that is. I am not trying to gain more permissions towards /dev/cuse but to create another cuse char file from within the unprivileged userns. I tested the scenario by patching the memfs userspace FUSE driver to generate the char device whenever the file is named "cuse" (turning the regular file into a char device with the cuse major/minor behind the scene): $ unshare -U -r -m # memfs /mnt/memfs & # ls -l /mnt/memfs # echo -n > /mnt/memfs/cuse -bash: /mnt/memfs/cuse: Input/output error # ls -l /mnt/memfs/cuse crwxrwxrwx. 1 root root 10, 203 Jan 17 18:24 /mnt/memfs/cuse # cat /mnt/memfs/cuse cat: /mnt/memfs/cuse: Permission denied But then, I could not use that char device, even though it seems to have the correct major/minor and permissions. The kernel FUSE code seems to call init_special_inode() to handle character devices. I don't understand why it seems to be safe. Thanks! Alban
Re: [PATCH 08/11] fuse: Support fuse filesystems outside of init_user_ns
On Wed, Jan 17, 2018 at 8:31 PM, Seth Forshee wrote: > On Wed, Jan 17, 2018 at 07:56:59PM +0100, Alban Crequy wrote: >> On Wed, Jan 17, 2018 at 3:29 PM, Seth Forshee >> wrote: >> > On Wed, Jan 17, 2018 at 11:59:06AM +0100, Alban Crequy wrote: >> >> [Adding Tejun, David, Tom for question about cuse] >> >> >> >> On Fri, Dec 22, 2017 at 3:32 PM, Dongsu Park wrote: >> >> > From: Seth Forshee >> >> > >> >> > In order to support mounts from namespaces other than >> >> > init_user_ns, fuse must translate uids and gids to/from the >> >> > userns of the process servicing requests on /dev/fuse. This >> >> > patch does that, with a couple of restrictions on the namespace: >> >> > >> >> > - The userns for the fuse connection is fixed to the namespace >> >> >from which /dev/fuse is opened. >> >> > >> >> > - The namespace must be the same as s_user_ns. >> >> > >> >> > These restrictions simplify the implementation by avoiding the >> >> > need to pass around userns references and by allowing fuse to >> >> > rely on the checks in inode_change_ok for ownership changes. >> >> > Either restriction could be relaxed in the future if needed. >> >> > >> >> > For cuse the namespace used for the connection is also simply >> >> > current_user_ns() at the time /dev/cuse is opened. >> >> >> >> Was a use case discussed for using cuse in a new unprivileged userns? >> >> >> >> I ran some tests yesterday with cusexmp [1] and I could add a new char >> >> device as an unprivileged user with: >> >> >> >> $ unshare -U -r -m sh -c 'mount --bind /mnt/cuse /dev/cuse ; cusexmp >> >> --maj=99 --min=30 --name=foo >> >> >> >> where /mnt/cuse is previously mknod'ed correctly and chmod'ed 777. >> >> Then, I could see the new device: >> >> >> >> $ cat /proc/devices | grep foo >> >> 99 foo >> >> >> >> On normal distros, we don't have a /mnt/cuse chmod'ed 777 but still it >> >> seems dangerous if the dev node can be provided otherwise and if we >> >> don't have a use case for it. >> >> >> >> Thoughts? >> > >> > I can't remember the specific reasons, but I had concluded that letting >> > unprivileged users use cuse within a user namespace isn't safe. But >> > having a cuse device node usable by regular users at all is equally >> > unsafe I suspect, >> >> This makes sense. >> >> > so I don't think your example demonstrates any problem >> > specific to user namespaces. There shouldn't be any way to use a user >> > namespace to gain access permissions towards /dev/cuse, otherwise we >> > have bigger problems than cuse to worry about. >> >> From my tests, the patch seem safe but I don't fully understand why that is. >> >> I am not trying to gain more permissions towards /dev/cuse but to >> create another cuse char file from within the unprivileged userns. I >> tested the scenario by patching the memfs userspace FUSE driver to >> generate the char device whenever the file is named "cuse" (turning >> the regular file into a char device with the cuse major/minor behind >> the scene): >> >> $ unshare -U -r -m >> # memfs /mnt/memfs & >> # ls -l /mnt/memfs >> # echo -n > /mnt/memfs/cuse >> -bash: /mnt/memfs/cuse: Input/output error >> # ls -l /mnt/memfs/cuse >> crwxrwxrwx. 1 root root 10, 203 Jan 17 18:24 /mnt/memfs/cuse >> # cat /mnt/memfs/cuse >> cat: /mnt/memfs/cuse: Permission denied >> >> But then, I could not use that char device, even though it seems to >> have the correct major/minor and permissions. The kernel FUSE code >> seems to call init_special_inode() to handle character devices. I >> don't understand why it seems to be safe. > > Because for new mounts in non-init user namespaces alloc_super() sets > SB_I_NODEV flag in s_iflags, which disallows opening device nodes in > that filesystem. I see. Thanks for the explanation!
Re: [PATCH v5 00/11] FUSE mounts from non-init user namespaces
On Tue, Jan 9, 2018 at 4:05 PM, Dongsu Park wrote: > Hi, > > On Mon, Dec 25, 2017 at 8:05 AM, Eric W. Biederman > wrote: >> Dongsu Park writes: >> >>> This patchset v5 is based on work by Seth Forshee and Eric Biederman. >>> The latest patchset was v4: >>> https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1132206.html >>> >>> At the moment, filesystems backed by physical medium can only be mounted >>> by real root in the initial user namespace. This restriction exists >>> because if it's allowed for root user in non-init user namespaces to >>> mount the filesystem, then it effectively allows the user to control the >>> underlying source of the filesystem. In case of FUSE, the source would >>> mean any underlying device. >>> >>> However, in many use cases such as containers, it's necessary to allow >>> filesystems to be mounted from non-init user namespaces. Goal of this >>> patchset is to allow FUSE filesystems to be mounted from non-init user >>> namespaces. Support for other filesystems like ext4 are not in the >>> scope of this patchset. >>> >>> Let me describe how to test mounting from non-init user namespaces. It's >>> assumed that tests are done via sshfs, a userspace filesystem based on >>> FUSE with ssh as backend. Testing system is Fedora 27. >> >> In general I am for this work, and more bodies and more eyes on it is >> generally better. >> >> I will review this after the New Year, I am out for the holidays right >> now. > > Thanks. I'll wait for your review. Hi Eric, Do you have some cycles for this now that it is the new year? A review on the associated ima issue would also be appreciated: https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1587678.html Cheers, Alban >>> >>> $ sudo dnf install -y sshfs >>> $ sudo mkdir -p /mnt/userns >>> >>> ### workaround to get the sshfs permission checks >>> $ sudo chown -R $UID:$UID /etc/ssh/ssh_config.d /usr/share/crypto-policies >>> >>> $ unshare -U -r -m >>> # sshfs root@localhost: /mnt/userns >>> >>> ### You can see sshfs being mounted from a non-init user namespace >>> # mount | grep sshfs >>> root@localhost: on /mnt/userns type fuse.sshfs >>> (rw,nosuid,nodev,relatime,user_id=0,group_id=0) >>> >>> # touch /mnt/userns/test >>> # ls -l /mnt/userns/test >>> -rw-r--r-- 1 root root 0 Dec 11 19:01 /mnt/userns/test >>> >>> >>> Open another terminal, check the mountpoint from outside the namespace. >>> >>> >>> $ grep userns /proc/$(pidof sshfs)/mountinfo >>> 131 102 0:35 / /mnt/userns rw,nosuid,nodev,relatime - fuse.sshfs >>> root@localhost: rw,user_id=0,group_id=0 >>> >>> >>> After all tests are done, you can unmount the filesystem >>> inside the namespace. >>> >>> >>> # fusermount -u /mnt/userns >>> >>> >>> Changes since v4: >>> * Remove other parts like ext4 to keep the patchset minimal for FUSE >>> * Add and change commit messages >>> * Describe how to test non-init user namespaces >>> >>> TODO: >>> * Think through potential security implications. There are 2 patches >>>being prepared for security issues. One is "ima: define a new policy >>>option named force" by Mimi Zohar, which adds an option to specify >>>that the results should not be cached: >>>https://marc.info/?l=linux-integrity&m=151275680115856&w=2 >>>The other one is to basically prevent FUSE results from being cached, >>>which is still in progress. >>> >>> * Test IMA/LSMs. Details are written in >>> >>> https://github.com/kinvolk/fuse-userns-patches/blob/master/tests/TESTING_INTEGRITY.md >>> >>> Patches 1-2 deal with an additional flag of lookup_bdev() to check for >>> additional inode permission. >>> >>> Patches 3-7 allow the superblock owner to change ownership of inodes, and >>> deal with additional capability checks w.r.t user namespaces. >>> >>> Patches 8-10 allow FUSE filesystems to be mounted outside of the init >>> user namespace. >>> >>> Patch 11 handles a corner case of non-root users in EVM. >>> >>> The patchset is also available in our github repo: >>> https://github.com/kinvolk/linux/tree/dongsu/fuse-userns-v5-1 >>> >>> >>> Eric W. Biederman (1): >>> fs: Allow superblock owner to change ownership of inodes >>> >>> Seth Forshee (10): >>> block_dev: Support checking inode permissions in lookup_bdev() >>> mtd: Check permissions towards mtd block device inode when mounting >>> fs: Don't remove suid for CAP_FSETID for userns root >>> fs: Allow superblock owner to access do_remount_sb() >>> capabilities: Allow privileged user in s_user_ns to set security.* >>> xattrs >>> fs: Allow CAP_SYS_ADMIN in s_user_ns to freeze and thaw filesystems >>> fuse: Support fuse filesystems outside of init_user_ns >>> fuse: Restrict allow_other to the superblock's namespace or a >>> descendant >>> fuse: Allow user namespace mounts >>> evm: Don't update hmacs in user ns mounts >>> >>> drivers/md/bcache/super.c | 2 +- >>> drivers/md/dm-table.c | 2 +- >>> driv
[PATCH] ima,fuse: introduce new fs flag FS_NO_IMA_CACHE
From: Alban Crequy This patch forces files to be re-measured, re-appraised and re-audited on file systems with the feature flag FS_NO_IMA_CACHE. In that way, cached integrity results won't be used. For now, only FUSE filesystems use this flag. This is because the userspace FUSE process can change the underlying files at any times. This patch is based on the patch "ima: define a new policy option named force" by Mimi. [1] How to test this: The test I did was using a patched version of the memfs FUSE driver [2][3] and two very simple "hello-world" programs [5] (prog1 prints "hello world: 1" and prog2 prints "hello world: 2"). I copy prog1 and prog2 in the fuse-memfs mount point, execute them and check the sha1 hash in "/sys/kernel/security/ima/ascii_runtime_measurements". My patch on the memfs FUSE driver added a backdoor command to serve prog1 when the kernel asks for prog2 or vice-versa. In this way, I can exec prog1 and get it to print "hello world: 2" without ever replacing the file via the VFS, so the kernel is not aware of the change. The test was done using the branch "alban/fuse-flag-ima-nocache" [4]. Step by step test procedure: 1. Mount the memfs FUSE using [3]: rm -f /tmp/memfs-switch* ; memfs -L DEBUG /mnt/memfs 2. Copy prog1 and prog2 using [5] cp prog1 /mnt/memfs/prog1 cp prog2 /mnt/memfs/prog2 3. Lookup the files and let the FUSE driver to keep the handles open: dd if=/mnt/memfs/prog1 bs=1 | (read -n 1 x ; sleep 3600 ) & dd if=/mnt/memfs/prog2 bs=1 | (read -n 1 x ; sleep 3600 ) & 4. Check the 2 programs work correctly: $ /mnt/memfs/prog1 hello world: 1 $ /mnt/memfs/prog2 hello world: 2 5. Check the measurements for prog1 and prog2: $ sudo cat /sys/kernel/security/ima/ascii_runtime_measurements|grep /mnt/memfs/prog 10 7ac5aed52061cb09120e977c6d04ee5c7b11c371 ima-ng sha1:ac14c9268cd2811f7a5adea17b27d84f50e1122c /mnt/memfs/prog1 10 9acc17a9a32aec4a676b8f6558e17a3d6c9a78e6 ima-ng sha1:799cb5d1e06d5c37ae7a76ba25ecd1bd01476383 /mnt/memfs/prog2 6. Use the backdoor command in my patched memfs to redirect file operations on file handle 3 to file handle 2: rm -f /tmp/memfs-switch* ; touch /tmp/memfs-switch-3-2 7. Check how the FUSE driver serves different content for the files: $ /mnt/memfs/prog1 hello world: 2 $ /mnt/memfs/prog2 hello world: 2 8. Check the measurements: sudo cat /sys/kernel/security/ima/ascii_runtime_measurements|grep /mnt/memfs/prog Without the patch, there are no new measurements, despite the FUSE driver having served different executables. With the patch, I can see additional measurements for prog1 and prog2 with the hashes reversed when the FUSE driver served the alternative content. [1] https://www.spinics.net/lists/linux-integrity/msg00948.html [2] https://github.com/bbengfort/memfs [3] https://github.com/kinvolk/memfs/commits/alban/switch-files [4] https://github.com/kinvolk/linux/commits/alban/fuse-flag-ima-nocache [5] https://github.com/kinvolk/fuse-userns-patches/commit/cf1f5750cab0 Cc: linux-integr...@vger.kernel.org Cc: linux-security-mod...@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: Miklos Szeredi Cc: Mimi Zohar Cc: Seth Forshee Cc: Christoph Hellwig Tested-by: Dongsu Park Signed-off-by: Alban Crequy --- fs/fuse/inode.c | 2 +- include/linux/fs.h| 1 + security/integrity/ima/ima_main.c | 11 +++ 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 8c98edee3628..b511e6469b0a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1212,7 +1212,7 @@ static void fuse_kill_sb_anon(struct super_block *sb) static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, .name = "fuse", - .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT, + .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT | FS_NO_IMA_CACHE, .mount = fuse_mount, .kill_sb= fuse_kill_sb_anon, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index fce19c491970..88da6908a2b2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2075,6 +2075,7 @@ struct file_system_type { #define FS_BINARY_MOUNTDATA2 #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT8 /* Can be mounted by userns root */ +#define FS_NO_IMA_CACHE16 /* Force IMA to re-measure, re-appraise, re-audit files */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 88af481502f7..e6e45ab15dfc 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -24,6 +24,7 @@ #include #include #i
Re: [PATCH] ima,fuse: introduce new fs flag FS_NO_IMA_CACHE
On Tue, Jan 16, 2018 at 11:41 AM, Alban Crequy wrote: > From: Alban Crequy > > This patch forces files to be re-measured, re-appraised and re-audited > on file systems with the feature flag FS_NO_IMA_CACHE. In that way, > cached integrity results won't be used. > > For now, only FUSE filesystems use this flag. This is because the > userspace FUSE process can change the underlying files at any times. > > This patch is based on the patch "ima: define a new policy option > named force" by Mimi. [1] > > How to test this: > > > > The test I did was using a patched version of the memfs FUSE driver > [2][3] and two very simple "hello-world" programs [5] (prog1 prints > "hello world: 1" and prog2 prints "hello world: 2"). > > I copy prog1 and prog2 in the fuse-memfs mount point, execute them and > check the sha1 hash in > "/sys/kernel/security/ima/ascii_runtime_measurements". > > My patch on the memfs FUSE driver added a backdoor command to serve > prog1 when the kernel asks for prog2 or vice-versa. In this way, I can > exec prog1 and get it to print "hello world: 2" without ever replacing > the file via the VFS, so the kernel is not aware of the change. > > The test was done using the branch "alban/fuse-flag-ima-nocache" [4]. > > Step by step test procedure: > > 1. Mount the memfs FUSE using [3]: > rm -f /tmp/memfs-switch* ; memfs -L DEBUG /mnt/memfs > > 2. Copy prog1 and prog2 using [5] > cp prog1 /mnt/memfs/prog1 > cp prog2 /mnt/memfs/prog2 > > 3. Lookup the files and let the FUSE driver to keep the handles open: > dd if=/mnt/memfs/prog1 bs=1 | (read -n 1 x ; sleep 3600 ) & > dd if=/mnt/memfs/prog2 bs=1 | (read -n 1 x ; sleep 3600 ) & > > 4. Check the 2 programs work correctly: > $ /mnt/memfs/prog1 > hello world: 1 > $ /mnt/memfs/prog2 > hello world: 2 > > 5. Check the measurements for prog1 and prog2: > $ sudo cat /sys/kernel/security/ima/ascii_runtime_measurements|grep > /mnt/memfs/prog > 10 7ac5aed52061cb09120e977c6d04ee5c7b11c371 ima-ng > sha1:ac14c9268cd2811f7a5adea17b27d84f50e1122c /mnt/memfs/prog1 > 10 9acc17a9a32aec4a676b8f6558e17a3d6c9a78e6 ima-ng > sha1:799cb5d1e06d5c37ae7a76ba25ecd1bd01476383 /mnt/memfs/prog2 > > 6. Use the backdoor command in my patched memfs to redirect file > operations on file handle 3 to file handle 2: > rm -f /tmp/memfs-switch* ; touch /tmp/memfs-switch-3-2 > > 7. Check how the FUSE driver serves different content for the files: > $ /mnt/memfs/prog1 > hello world: 2 > $ /mnt/memfs/prog2 > hello world: 2 > > 8. Check the measurements: > sudo cat /sys/kernel/security/ima/ascii_runtime_measurements|grep > /mnt/memfs/prog > > Without the patch, there are no new measurements, despite the FUSE > driver having served different executables. > > With the patch, I can see additional measurements for prog1 and prog2 > with the hashes reversed when the FUSE driver served the alternative > content. > > > > [1] https://www.spinics.net/lists/linux-integrity/msg00948.html > [2] https://github.com/bbengfort/memfs > [3] https://github.com/kinvolk/memfs/commits/alban/switch-files > [4] https://github.com/kinvolk/linux/commits/alban/fuse-flag-ima-nocache > [5] https://github.com/kinvolk/fuse-userns-patches/commit/cf1f5750cab0 > > Cc: linux-integr...@vger.kernel.org > Cc: linux-security-mod...@vger.kernel.org > Cc: linux-kernel@vger.kernel.org > Cc: Miklos Szeredi > Cc: Mimi Zohar > Cc: Seth Forshee > Cc: Christoph Hellwig > Tested-by: Dongsu Park > Signed-off-by: Alban Crequy > --- > fs/fuse/inode.c | 2 +- > include/linux/fs.h| 1 + > security/integrity/ima/ima_main.c | 11 +++ > 3 files changed, 9 insertions(+), 5 deletions(-) > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > index 8c98edee3628..b511e6469b0a 100644 > --- a/fs/fuse/inode.c > +++ b/fs/fuse/inode.c > @@ -1212,7 +1212,7 @@ static void fuse_kill_sb_anon(struct super_block *sb) > static struct file_system_type fuse_fs_type = { > .owner = THIS_MODULE, > .name = "fuse", > - .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT, > + .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT | FS_NO_IMA_CACHE, I just realised I should not have submitted this patch based on the branch I am on because FS_USERNS_MOUNT is not there now, so this patch does not apply cleanly on next-integrity at the moment. Sorry about that. > .mount = fuse_mount, > .kill_sb= fuse_kill_sb_anon, > }; > diff --git a/include/linux/fs.h b/include/linux/fs.h > index fce19c491970..88da6
[RFC PATCH v2] ima,fuse: introduce new fs flag FS_NO_IMA_CACHE
From: Alban Crequy This patch forces files to be re-measured, re-appraised and re-audited on file systems with the feature flag FS_NO_IMA_CACHE. In that way, cached integrity results won't be used. For now, this patch adds the new flag only FUSE filesystems. This is needed because the userspace FUSE process can change the underlying files at any time. How to test this: The test I did was using a patched version of the memfs FUSE driver [2][3] and two very simple "hello-world" programs [5] (prog1 prints "hello world: 1" and prog2 prints "hello world: 2"). I copy prog1 and prog2 in the fuse-memfs mount point, execute them and check the sha1 hash in "/sys/kernel/security/ima/ascii_runtime_measurements". My patch on the memfs FUSE driver added a backdoor command to serve prog1 when the kernel asks for prog2 or vice-versa. In this way, I can exec prog1 and get it to print "hello world: 2" without ever replacing the file via the VFS, so the kernel is not aware of the change. The test was done using the branch "alban/fuse-flag-ima-nocache-v2" [4]. Step by step test procedure: 1. Mount the memfs FUSE using [3]: rm -f /tmp/memfs-switch* ; memfs -L DEBUG /mnt/memfs 2. Copy prog1 and prog2 using [5] cp prog1 /mnt/memfs/prog1 cp prog2 /mnt/memfs/prog2 3. Lookup the files and let the FUSE driver to keep the handles open: dd if=/mnt/memfs/prog1 bs=1 | (read -n 1 x ; sleep 3600 ) & dd if=/mnt/memfs/prog2 bs=1 | (read -n 1 x ; sleep 3600 ) & 4. Check the 2 programs work correctly: $ /mnt/memfs/prog1 hello world: 1 $ /mnt/memfs/prog2 hello world: 2 5. Check the measurements for prog1 and prog2: $ sudo cat /sys/kernel/security/ima/ascii_runtime_measurements \ | grep /mnt/memfs/prog 10 [...] ima-ng sha1:ac14c9268cd2[...] /mnt/memfs/prog1 10 [...] ima-ng sha1:799cb5d1e06d[...] /mnt/memfs/prog2 6. Use the backdoor command in my patched memfs to redirect file operations on file handle 3 to file handle 2: rm -f /tmp/memfs-switch* ; touch /tmp/memfs-switch-3-2 7. Check how the FUSE driver serves different content for the files: $ /mnt/memfs/prog1 hello world: 2 $ /mnt/memfs/prog2 hello world: 2 8. Check the measurements: sudo cat /sys/kernel/security/ima/ascii_runtime_measurements \ | grep /mnt/memfs/prog Without the patch, there are no new measurements, despite the FUSE driver having served different executables. With the patch, I can see additional measurements for prog1 and prog2 with the hashes reversed when the FUSE driver served the alternative content. [1] https://www.spinics.net/lists/linux-integrity/msg00948.html [2] https://github.com/bbengfort/memfs [3] https://github.com/kinvolk/memfs/commits/alban/switch-files [4] https://github.com/kinvolk/linux/commits/alban/fuse-flag-ima-nocache-v2 [5] https://github.com/kinvolk/fuse-userns-patches/commit/cf1f5750cab0 Cc: linux-kernel@vger.kernel.org Cc: linux-integr...@vger.kernel.org Cc: linux-security-mod...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Miklos Szeredi Cc: Alexander Viro Cc: Mimi Zohar Cc: Dmitry Kasatkin Cc: James Morris Cc: "Serge E. Hallyn" Cc: Seth Forshee Cc: Christoph Hellwig Tested-by: Dongsu Park Signed-off-by: Alban Crequy --- There was a previous attempt (unmerged) with a IMA option named "force" and using that option for FUSE filesystems. This patch uses a different approach so that the IMA subsystem does not need to know about FUSE. - https://www.spinics.net/lists/linux-integrity/msg00948.html - https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1584131.html Changes since v1: https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1587390.html - include linux-fsdevel mailing list in cc - mark patch as RFC - based on next-integrity, without other unmerged FUSE / IMA patches --- fs/fuse/inode.c | 2 +- include/linux/fs.h| 1 + security/integrity/ima/ima_main.c | 24 ++-- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 624f18bbfd2b..5bf14289afba 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1205,7 +1205,7 @@ static void fuse_kill_sb_anon(struct super_block *sb) static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, .name = "fuse", - .fs_flags = FS_HAS_SUBTYPE, + .fs_flags = FS_HAS_SUBTYPE | FS_NO_IMA_CACHE, .mount = fuse_mount, .kill_sb= fuse_kill_sb_anon, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 511fbaabf624..2bd7e73ebc2a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2075,6 +2075,7 @@ struct file_system_type { #define FS_BINARY_MOUNTDATA2 #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT8 /* Can be mounted by userns root */ +#define FS_N
Re: [PATCH] [RFC] bpf: tracing: new helper bpf_get_current_cgroup_ino
;__x64_sys_nanosleep "cgid = %llx", $cgid' > PID TID COMMFUNC - > 40674067a.out __x64_sys_nanosleep cgid = 106b2 > 40674067a.out __x64_sys_nanosleep cgid = 106b2 > 40674067a.out __x64_sys_nanosleep cgid = 106b2 > ^C[yhs@localhost tools]$ > The kernel and user space cgid matches. Will provide a > formal patch later. > On Mon, May 21, 2018 at 5:24 PM, Y Song wrote: > > On Mon, May 21, 2018 at 9:26 AM, Alexei Starovoitov > > wrote: > >> On Sun, May 13, 2018 at 07:33:18PM +0200, Alban Crequy wrote: > >>> > >>> +BPF_CALL_2(bpf_get_current_cgroup_ino, u32, hierarchy, u64, flags) > >>> +{ > >>> + // TODO: pick the correct hierarchy instead of the mem controller > >>> + struct cgroup *cgrp = task_cgroup(current, memory_cgrp_id); > >>> + > >>> + if (unlikely(!cgrp)) > >>> + return -EINVAL; > >>> + if (unlikely(hierarchy)) > >>> + return -EINVAL; > >>> + if (unlikely(flags)) > >>> + return -EINVAL; > >>> + > >>> + return cgrp->kn->id.ino; > >> > >> ino only is not enough to identify cgroup. It needs generation number too. > >> I don't quite see how hierarchy and flags can be used in the future. > >> Also why limit it to memcg? > >> > >> How about something like this instead: > >> > >> BPF_CALL_2(bpf_get_current_cgroup_id) > >> { > >> struct cgroup *cgrp = task_dfl_cgroup(current); > >> > >> return cgrp->kn->id.id; > >> } > >> The user space can use fhandle api to get the same 64-bit id. > > > > I think this should work. This will also be useful to bcc as user > > space can encode desired id > > in the bpf program and compared that id to the current cgroup id, so we can have > > cgroup level tracing (esp. stat collection) support. To cope with > > cgroup hierarchy, user can use > > cgroup-array based approach or explicitly compare against multiple cgroup id's.
[PATCH] [RFC] bpf: tracing: new helper bpf_get_current_cgroup_ino
From: Alban Crequy bpf_get_current_cgroup_ino() allows BPF trace programs to get the inode of the cgroup where the current process resides. My use case is to get statistics about syscalls done by a specific Kubernetes container. I have a tracepoint on raw_syscalls/sys_enter and a BPF map containing the cgroup inode that I want to trace. I use bpf_get_current_cgroup_ino() and I quickly return from the tracepoint if the inode is not in the BPF hash map. Without this BPF helper, I would need to keep track of all pids in the container. The Netlink proc connector can be used to follow process creation and destruction but it is racy. This patch only looks at the memory cgroup, which was enough for me since each Kubernetes container is placed in a different mem cgroup. For a generic implementation, I'm not sure how to proceed: it seems I would need to use 'for_each_root(root)' (see example in proc_cgroup_show() from kernel/cgroup/cgroup.c) but I don't know if taking the cgroup mutex is possible in the BPF helper function. It might be ok in the tracepoint raw_syscalls/sys_enter but could the mutex already be taken in some other tracepoints? Signed-off-by: Alban Crequy --- include/uapi/linux/bpf.h | 11 ++- kernel/trace/bpf_trace.c | 25 + 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c5ec89732a8d..38ac3959cdf3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -755,6 +755,14 @@ union bpf_attr { * @addr: pointer to struct sockaddr to bind socket to * @addr_len: length of sockaddr structure * Return: 0 on success or negative error code + * + * u64 bpf_get_current_cgroup_ino(hierarchy, flags) + * Get the cgroup{1,2} inode of current task under the specified hierarchy. + * @hierarchy: cgroup hierarchy + * @flags: reserved for future use + * Return: + * == 0 error + *> 0 inode of the cgroup */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -821,7 +829,8 @@ union bpf_attr { FN(msg_apply_bytes),\ FN(msg_cork_bytes), \ FN(msg_pull_data), \ - FN(bind), + FN(bind), \ + FN(get_current_cgroup_ino), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 56ba0f2a01db..9bf92a786639 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -524,6 +524,29 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_get_current_cgroup_ino, u32, hierarchy, u64, flags) +{ + // TODO: pick the correct hierarchy instead of the mem controller + struct cgroup *cgrp = task_cgroup(current, memory_cgrp_id); + + if (unlikely(!cgrp)) + return -EINVAL; + if (unlikely(hierarchy)) + return -EINVAL; + if (unlikely(flags)) + return -EINVAL; + + return cgrp->kn->id.ino; +} + +static const struct bpf_func_proto bpf_get_current_cgroup_ino_proto = { + .func = bpf_get_current_cgroup_ino, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_DONTCARE, + .arg2_type = ARG_DONTCARE, +}; + static const struct bpf_func_proto * tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -564,6 +587,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_prandom_u32_proto; case BPF_FUNC_probe_read_str: return &bpf_probe_read_str_proto; + case BPF_FUNC_get_current_cgroup_ino: + return &bpf_get_current_cgroup_ino_proto; default: return NULL; } -- 2.14.3
Re: [PATCH v3 4/4] seccomp: add support for passing fds via USER_NOTIF
On Thu, 31 May 2018 at 16:52, Tycho Andersen wrote: > > The idea here is that the userspace handler should be able to pass an fd > back to the trapped task, for example so it can be returned from socket(). > > I've proposed one API here, but I'm open to other options. In particular, > this only lets you return an fd from a syscall, which may not be enough in > all cases. For example, if an fd is written to an output parameter instead > of returned, the current API can't handle this. Another case is that > netlink takes as input fds sometimes (IFLA_NET_NS_FD, e.g.). If netlink > ever decides to install an fd and output it, we wouldn't be able to handle > this either. > > Still, the vast majority of interesting cases are covered by this API, so > perhaps it is Enough. > > I've left it as a separate commit for two reasons: > * It illustrates the way in which we would grow struct seccomp_notif and > struct seccomp_notif_resp without using netlink > * It shows just how little code is needed to accomplish this :) > > v2: new in v2 > v3: no changes > > Signed-off-by: Tycho Andersen > CC: Kees Cook > CC: Andy Lutomirski > CC: Oleg Nesterov > CC: Eric W. Biederman > CC: "Serge E. Hallyn" > CC: Christian Brauner > CC: Tyler Hicks > CC: Akihiro Suda > --- > include/uapi/linux/seccomp.h | 2 + > kernel/seccomp.c | 49 +++- > tools/testing/selftests/seccomp/seccomp_bpf.c | 112 ++ > 3 files changed, 161 insertions(+), 2 deletions(-) > > diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h > index 8160e6cad528..3124427219cb 100644 > --- a/include/uapi/linux/seccomp.h > +++ b/include/uapi/linux/seccomp.h > @@ -71,6 +71,8 @@ struct seccomp_notif_resp { > __u64 id; > __s32 error; > __s64 val; > + __u8 return_fd; > + __u32 fd; > }; > > #endif /* _UAPI_LINUX_SECCOMP_H */ > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > index 6dc99c65c2f4..2ee958b3efde 100644 > --- a/kernel/seccomp.c > +++ b/kernel/seccomp.c > @@ -77,6 +77,8 @@ struct seccomp_knotif { > /* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */ > int error; > long val; > + struct file *file; > + unsigned int flags; > > /* Signals when this has entered SECCOMP_NOTIFY_REPLIED */ > struct completion ready; > @@ -780,10 +782,32 @@ static void seccomp_do_user_notification(int > this_syscall, > goto remove_list; > } > > - ret = n.val; > - err = n.error; > + if (n.file) { > + int fd; > + > + fd = get_unused_fd_flags(n.flags); > + if (fd < 0) { > + err = fd; > + ret = -1; > + goto remove_list; > + } > + > + ret = fd; > + err = 0; > + > + fd_install(fd, n.file); > + /* Don't fput, since fd has a reference now */ > + n.file = NULL; Do we want the cgroup classid and netprio to be applied here, before the fd_install? I am looking at similar code in net/core/scm.c scm_detach_fds(): sock = sock_from_file(fp[i], &err); if (sock) { sock_update_netprioidx(&sock->sk->sk_cgrp_data); sock_update_classid(&sock->sk->sk_cgrp_data); } The listener process might live in a different cgroup with a different classid & netprio, so it might not be applied as the app might expect. Also, should we update the struct sock_cgroup_data of the socket, in order to make the BPF helper function bpf_skb_under_cgroup() work wrt the cgroup of the target process instead of the listener process? I am looking at cgroup_sk_alloc(). I don't know what's the correct behaviour we want here. > + } else { > + ret = n.val; > + err = n.error; > + } > + > > remove_list: > + if (n.file) > + fput(n.file); > + > list_del(&n.list); > out: > mutex_unlock(&match->notify_lock); > @@ -1598,6 +1622,27 @@ static ssize_t seccomp_notify_write(struct file *file, > const char __user *buf, > knotif->state = SECCOMP_NOTIFY_REPLIED; > knotif->error = resp.error; > knotif->val = resp.val; > + > + if (resp.return_fd) { > + struct fd fd; > + > + /* > +* This is a little hokey: we need a real fget() (i.e. not > +* __fget_light(), which is what fdget does), but we also need > +* the flags from strcut fd. So, we get it, put it, and get it > +* again for real. > +*/ > + fd = fdget(resp.fd); > + knotif->flags = fd.flags; > + fdput(fd); > + > + knotif->file = fget(resp.fd); > + if (!knotif->file) { > +
[PATCH] [RFC][WIP] namespace.c: Allow some unprivileged proc mounts when not fully visible
Since Linux v4.2 with commit 1b852bceb0d1 ("mnt: Refactor the logic for mounting sysfs and proc in a user namespace"), new mounts of proc or sysfs in non init userns are only allowed when there is at least one fully-visible proc or sysfs mount. This is to enforce that proc/sysfs files masked by a mount are still masked in a new mount in a unprivileged userns. The locked mount logic for bind mounts (has_locked_children()) was not enough in the case of proc/sysfs new mounts because some files in proc (/proc/kcore) exist as a singleton rather than being owned by a specific proc mount. Unfortunately, this blocks me from using userns from within a Docker container because Docker containers mask entries such as /proc/kcore. My use case is to build container images with arbitrary commands (such as using "RUN" commands in Dockerfiles) without privileges and from within a Docker container. Those arbitrary commands could be shell scripts that require /proc. The following commands show my problem: $ sudo docker run -ti --rm --cap-add=SYS_ADMIN busybox sh -c 'unshare -U -r -p -m -f mount -t proc proc /home && echo ok' mount: permission denied (are you root?) $ sudo docker run -ti --rm --cap-add=SYS_ADMIN busybox sh -c 'mkdir -p /unmasked-proc && mount -t proc proc /unmasked-proc && unshare -U -r -p -m -f mount -t proc proc /home && echo ok' ok This patch is a WIP attempt to ease new proc mounts in a user namespace even when the proc mount in the parent container has masked entries. However, to preserve the security guarantee of mount_too_revealing(), the same masked entries in the old proc mount must be masked in the new proc mount. It cannot be masked with mounts covering the entries because it's not possible to use MS_REC for new proc mount and add covering submounts at the same time. Instead, it introduces new options in proc to disable some proc entries (TBD). A proc entry will be disabled when all other proc mounts have the same entry disabled, or when all other proc mounts have the same entry masked by a submount. The granularity does not need to be per proc entry. It is simpler to define categories of entries that can be hidden. In practice, only a few entries need to support disablement and what matters is that the new proc mount is more masked than the other proc mounts. Granularity can be improved later if use cases exist. The flag IOP_USERNS_HIDEABLE is added on some proc inodes that are singletons such as /proc/kcore. This flag is used in mnt_already_visible() to signal that, as an exception to the general rule, the file can be masked by a mount without blocking the new proc mount. The hideable category is computed (WIP) and returned (WIP) in order to configure the new proc mount before attaching it to the mount tree. For my use case, I will need to support at least the following entries: $ sudo docker run -ti --rm busybox sh -c 'mount|grep /proc/' proc on /proc/asound type proc (ro,nosuid,nodev,noexec,relatime) proc on /proc/bus type proc (ro,nosuid,nodev,noexec,relatime) proc on /proc/fs type proc (ro,nosuid,nodev,noexec,relatime) proc on /proc/irq type proc (ro,nosuid,nodev,noexec,relatime) proc on /proc/sys type proc (ro,nosuid,nodev,noexec,relatime) proc on /proc/sysrq-trigger type proc (ro,nosuid,nodev,noexec,relatime) tmpfs on /proc/kcore type tmpfs (rw,context="...",nosuid,mode=755) tmpfs on /proc/latency_stats type tmpfs (rw,context="...",nosuid,mode=755) tmpfs on /proc/timer_list type tmpfs (rw,context="...",nosuid,mode=755) tmpfs on /proc/sched_debug type tmpfs (rw,context="...",nosuid,mode=755) tmpfs on /proc/scsi type tmpfs (ro,seclabel,relatime) This patch can be tested in the following way: $ sudo unshare -p -f -m sh -c "mount --bind /dev/null /proc/cmdline && unshare -U -r -p -m -f mount -t proc proc /proc && echo ok" mount: /proc: permission denied. (this patch does not support /proc/cmdline as hideable) $ sudo unshare -p -f -m sh -c "mount --bind /dev/null /proc/kcore && unshare -U -r -p -m -f mount -t proc proc /proc && echo ok" ok (this patch marks /proc/kcore as hideable: the new mounts works fine, whereas it didn't work on vanilla kernels) Signed-off-by: Alban Crequy --- fs/namespace.c | 26 +- fs/proc/generic.c | 5 + fs/proc/inode.c| 2 ++ fs/proc/internal.h | 1 + include/linux/fs.h | 1 + 5 files changed, 30 insertions(+), 5 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 9d1374ab6e06..0d466885c181 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2489,7 +2489,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) return err; } -static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags); +static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags
SECCOMP_IOCTL_NOTIF_ADDFD race condition
Hi, With the addfd feature (added in “seccomp: Introduce addfd ioctl to seccomp user notifier”, commit 7cf97b125455), the new file is installed in the target process during the SECCOMP_IOCTL_NOTIF_ADDFD operation and not at the end with the SECCOMP_IOCTL_NOTIF_SEND operation. This can cause race conditions when the target process is interrupted by a signal (EINTR) and restarted automatically. This is more noticeable in multithreaded processes like with Golang. In Golang 1.14: https://golang.org/doc/go1.14 > "A consequence of the implementation of preemption is that on Unix systems, > including Linux and macOS systems, programs built with Go 1.14 will receive > more signals than programs built with earlier releases. This means that > programs that use packages like syscall or golang.org/x/sys/unix will see > more slow system calls fail with EINTR errors. Those programs will have to > handle those errors in some way, most likely looping to try the system call > again." In my test, I added a seccomp policy which returns SECCOMP_RET_USER_NOTIF on execve() and I added a sleep(2) in the seccomp agent (using https://github.com/kinvolk/seccompagent/) between SECCOMP_IOCTL_NOTIF_RECV and SECCOMP_IOCTL_NOTIF_SEND to make it a bit slow to reply with SECCOMP_USER_NOTIF_FLAG_CONTINUE. I got the following strace log going on in a loop: [pid 2656199] execve("/bin/sh", ["sh", "-c", "sleep infinity"], 0xc63b00 /* 11 vars */ [pid 2656200] <... nanosleep resumed>NULL) = 0 [pid 2656200] epoll_pwait(7, [], 128, 0, NULL, 0) = 0 [pid 2656200] getpid() = 1 [pid 2656200] tgkill(1, 1, SIGURG) = 0 [pid 2656199] <... execve resumed>) = ? ERESTARTSYS (To be restarted if SA_RESTART is set) [pid 2656200] nanosleep({tv_sec=0, tv_nsec=1000}, [pid 2656199] --- SIGURG {si_signo=SIGURG, si_code=SI_TKILL, si_pid=1, si_uid=0} --- [pid 2656199] rt_sigreturn({mask=[]}) = 59 [pid 2656199] execve("/bin/sh", ["sh", "-c", "sleep infinity"], 0xc63b00 /* 11 vars */ On the seccomp agent side, the ioctl(SECCOMP_IOCTL_NOTIF_SEND) returns ENOENT, and then it receives the same notification at the next iteration of the loop. The SIGURG signal is sent by the Golang runtime, causing the execve to be interrupted, and restarted automatically, triggering the new seccomp notification. In this example with execve, this is not a big deal because the seccomp agent doesn't add a fd. But on a open() or accept() syscall, I fear that the seccomp agent could install a file descriptor without knowing that the syscall will be interrupted soon after, but before the SECCOMP_IOCTL_NOTIF_SEND is completed. I understand the need to have two different ioctl() to add the fd and to reply to the seccomp notification because the seccomp agent needs to know the fd number being assigned before specifying the return value of the syscall with that number. What do you think is the best way to solve this problem? Here are a few ideas: - Idea 1: add a second flag for the struct seccomp_notif_resp “SECCOMP_USER_NOTIF_FLAG_RETURN_FD” to instruct seccomp to override the return value with the first fd to install. It would not help to emulate recvfrom() with SCM_RIGHTS but it will solve the problem for syscalls that return a fd because we can then implement a new ioctl (“SECCOMP_IOCTL_NOTIF_SEND_WITH_FDS”?) that does the addfd and the notification response in one step. Other ideas but they cause more problems: - Idea 2: We need some kind of transactions where the fd is sent with the first ioctl() and installed in the fd table but marked somehow to be closed automatically if the syscall is interrupted with EINTR outside of the control of the seccomp agent. The new fd in the fd table would be committed at the end if the syscall is not interrupted. But this introduces other issues: another thread could call dup() on the fd before it gets closed. Or another process sharing the fd table with CLONE_FILES could do the same. Should the not-yet-committed fds be visible in /proc//fd/? Or inherited to new processes created by fork()? - Idea 3: We could add fds in a temporary location but not in the `struct files_struct` of the target process, and only commit at SECCOMP_IOCTL_NOTIF_SEND time. In this way, threads or processes sharing the fd table with CLONE_FILES would not be impacted. However, this could open new race conditions if other threads are installing fds in the same slots in the fd table. Also, this seems quite dangerous to add this concept of "inflight" fd for seccomp because there are already inflight fds for SCM_RIGHT and a garbage collector to clean circular references (net/unix/garbage.c). If we add an inflight fd mechanism on seccomp, a malicious user could just use SECCOMP_IOCTL_NOTIF_ADDFD to send a unix socket that has the seccomp-fd inflight in SCM_RIGHT. Then, the malicious seccomp agent would close(seccompFd) and we will be in a situation where both the seccomp-fd and the unix socket are not attached to any process but the
[PATCH] [RFC] cgroup: reject cgroup names with non-printing characters
/proc//cgroup contains one cgroup path on each line. If cgroup names are allowed to contain "\n", applications cannot parse /proc//cgroup safely. I use < 0x20 as seen in vfat_bad_char; is it safe to use isprint()? Signed-off-by: Alban Crequy --- kernel/cgroup.c | 19 +++ 1 file changed, 19 insertions(+) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 70776ae..e2df5e7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4378,6 +4378,20 @@ err_free_css: return err; } +/* Inspired by vfat_bad_char: do not accept non-printing characters. In + * particular, reject '\n' to prevent making /proc//cgroup unparsable. + */ +static inline int cgroup_is_used_badchars(const char *s) +{ + int i; + + for (i = 0; s[i] != '\0'; i++) + if (s[i] < 0x20) + return -EINVAL; + + return 0; +} + static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { @@ -4387,6 +4401,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, struct kernfs_node *kn; int ssid, ret; + /* do not create cgroups with bad names */ + ret = cgroup_is_used_badchars(name); + if (ret) + return ret; + parent = cgroup_kn_lock_live(parent_kn); if (!parent) return -ENODEV; -- 1.8.5.3 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2] cgroup: reject cgroup names with '\n'
/proc//cgroup contains one cgroup path on each line. If cgroup names are allowed to contain "\n", applications cannot parse /proc//cgroup safely. Signed-off-by: Alban Crequy --- v2: Fixed according to comments from Tejun Heo: only reject '\n' kernel/cgroup.c | 5 + 1 file changed, 5 insertions(+) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7dc8788..c3d1802 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4543,6 +4543,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, struct cftype *base_files; int ssid, ret; + /* Do not accept '\n' to prevent making /proc//cgroup unparsable. +*/ + if (strchr(name, '\n')) + return -EINVAL; + parent = cgroup_kn_lock_live(parent_kn); if (!parent) return -ENODEV; -- 1.8.5.3 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: CGroup Namespaces (v6)
Hi, Thanks for the patches! On 8 December 2015 at 00:06, wrote: > Hi, > > following is a revised set of the CGroup Namespace patchset which Aditya > Kali has previously sent. The code can also be found in the cgroupns.v6 > branch of > > https://git.kernel.org/cgit/linux/kernel/git/sergeh/linux-security.git/ > > To summarize the semantics: > > 1. CLONE_NEWCGROUP re-uses 0x0200, which was previously CLONE_STOPPED > > 2. unsharing a cgroup namespace makes all your current cgroups your new > cgroup root. > > 3. /proc/pid/cgroup always shows cgroup paths relative to the reader's > cgroup namespce root. A task outside of your cgroup looks like > > 8:memory:/../../.. > > 4. when a task mounts a cgroupfs, the cgroup which shows up as root depends > on the mounting task's cgroup namespace. > > 5. setns to a cgroup namespace switches your cgroup namespace but not > your cgroups. > > With this, using github.com/hallyn/lxc #2015-11-09/cgns (and > github.com/hallyn/lxcfs #2015-11-10/cgns) we can start a container in a full > proper cgroup namespace, avoiding either cgmanager or lxcfs cgroup bind > mounts. I tested cgroupns.v6 with systemd-nspawn + patches from https://github.com/systemd/systemd/pull/2112 using unshare(CLONE_NEWCGROUP) booted with systemd.unified_cgroup_hierarchy=1 in Fedora22. Tested with and without userns. It worked for me :) Do you need people to run more tests, with other scenarios? Do you have patches already for /usr/bin/unshare and /usr/bin/nsenter? > This is completely backward compatible and will be completely invisible > to any existing cgroup users (except for those running inside a cgroup > namespace and looking at /proc/pid/cgroup of tasks outside their > namespace.) > > Changes from V5: > 1. To get a root dentry for cgroup namespace mount, walk the path from the >kernfs root dentry. > > Changes from V4: > 1. Move the FS_USERNS_MOUNT flag to last patch > 2. Rebase onto cgroup/for-4.5 > 3. Don't non-init user namespaces to bind new subsystems when mounting. > 4. Address feedback from Tejun (thanks). Specificaly, not addressed: >. kernfs_obtain_root - walking dentry from kernfs root. > (I think that's the only piece) > 5. Dropped unused get_task_cgroup fn/patch. > 6. Reworked kernfs_path_from_node_locked() to try to simplify the logic. >It now finds a common ancestor, walks from the source to it, then back >up to the target. > > Changes from V3: > 1. Rebased onto latest cgroup changes. In particular switch to >css_set_lock and ns_common. > 2. Support all hierarchies. > > Changes from V2: > 1. Added documentation in Documentation/cgroups/namespace.txt > 2. Fixed a bug that caused crash > 3. Incorporated some other suggestions from last patchset: >- removed use of threadgroup_lock() while creating new cgroupns >- use task_lock() instead of rcu_read_lock() while accessing > task->nsproxy >- optimized setns() to own cgroupns >- simplified code around sane-behavior mount option parsing > 4. Restored ACKs from Serge Hallyn from v1 on few patches that have >not changed since then. > > Changes from V1: > 1. No pinning of processes within cgroupns. Tasks can be freely moved >across cgroups even outside of their cgroupns-root. Usual DAC/MAC policies >apply as before. > 2. Path in /proc//cgroup is now always shown and is relative to >cgroupns-root. So path can contain '/..' strings depending on cgroupns-root >of the reader and cgroup of . > 3. setns() does not require the process to first move under target >cgroupns-root. > > Changes form RFC (V0): > 1. setns support for cgroupns > 2. 'mount -t cgroup cgroup ' from inside a cgroupns now >mounts the cgroup hierarcy with cgroupns-root as the filesystem root. > 3. writes to cgroup files outside of cgroupns-root are not allowed > 4. visibility of /proc//cgroup is further restricted by not showing >anything if the is in a sibling cgroupns and its cgroup falls outside >your cgroupns-root. > > > ___ > Containers mailing list > contain...@lists.linux-foundation.org > https://lists.linuxfoundation.org/mailman/listinfo/containers -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [lxc-devel] CGroup Namespaces (v10)
Hi, On 29 January 2016 at 09:54, wrote: > Hi, > > following is a revised set of the CGroup Namespace patchset which Aditya > Kali has previously sent. The code can also be found in the cgroupns.v10 > branch of > > https://git.kernel.org/cgit/linux/kernel/git/sergeh/linux-security.git/ > > To summarize the semantics: > > 1. CLONE_NEWCGROUP re-uses 0x0200, which was previously CLONE_STOPPED What's the best way for a userspace application to test at run-time whether the kernel supports cgroup namespaces? Would you recommend to test if the file /proc/self/ns/cgroup exists? Thanks! Alban
Re: [PATCH 2/2] user_namespaces.7: Update the documention to reflect the fixes for negative groups
Hello, Thanks for updating the man page. On 12 December 2014 at 22:54, Eric W. Biederman wrote: (...) > Furthermore to preserve in some form the useful applications that have > been setting gid_map without privilege the file /proc/[pid]/setgroups > was added to allow disabling setgroups. With the setgroups system > call permanently disabled in a user namespace it again becomes safe to > allow writes to gid_map without privilege. > > Here is my meager attempt to update user_namespaces.7 to reflect these > issues. The program userns_child_exec.c in user_namespaces.7 should be updated to write in /proc/.../setgroups, near the line: /* Update the UID and GID maps in the child */ Otherwise, the example given in the manpage does not work: $ ./userns_child_exec -p -m -U -M '0 1000 1' -G '0 1000 1' bash Cheers, Alban -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] bpf: bpftool, fix documentation for attach types
From: Alban Crequy bpftool has support for attach types "stream_verdict" and "stream_parser" but the documentation was referring to them with "skb_verdict" and "skb_parse". The inconsistency comes from commit b7d3826c2ed6 ("bpf: bpftool, add support for attaching programs to maps"). This patch changes the documentation to match the implementation. Signed-off-by: Alban Crequy --- tools/bpf/bpftool/prog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 0640e9bc0ada..dfaa019a60f0 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1198,7 +1198,7 @@ static int do_help(int argc, char **argv) " cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n" " cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n" " cgroup/sendmsg4 | cgroup/sendmsg6 }\n" - " ATTACH_TYPE := { msg_verdict | skb_verdict | skb_parse |\n" + " ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n" "flow_dissector }\n" " " HELP_SPEC_OPTIONS "\n" "", -- 2.20.1
Re: [PATCH] bpf: bpftool, fix documentation for attach types
On Mon, Feb 11, 2019 at 2:26 PM Quentin Monnet wrote: > > 2019-02-11 13:54 UTC+0100 ~ Alban Crequy > > From: Alban Crequy > > > > bpftool has support for attach types "stream_verdict" and > > "stream_parser" but the documentation was referring to them with > > "skb_verdict" and "skb_parse". The inconsistency comes from commit > > b7d3826c2ed6 ("bpf: bpftool, add support for attaching programs to > > maps"). > > > > This patch changes the documentation to match the implementation. > > > > Signed-off-by: Alban Crequy > > --- > > tools/bpf/bpftool/prog.c | 2 +- > > 1 file changed, 1 insertion(+), 1 deletion(-) > > > > diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c > > index 0640e9bc0ada..dfaa019a60f0 100644 > > --- a/tools/bpf/bpftool/prog.c > > +++ b/tools/bpf/bpftool/prog.c > > @@ -1198,7 +1198,7 @@ static int do_help(int argc, char **argv) > > " cgroup/bind4 | cgroup/bind6 | > > cgroup/post_bind4 |\n" > > " cgroup/post_bind6 | cgroup/connect4 | > > cgroup/connect6 |\n" > > " cgroup/sendmsg4 | cgroup/sendmsg6 }\n" > > - " ATTACH_TYPE := { msg_verdict | skb_verdict | > > skb_parse |\n" > > + " ATTACH_TYPE := { msg_verdict | stream_verdict | > > stream_parser |\n" > > "flow_dissector }\n" > > " " HELP_SPEC_OPTIONS "\n" > > "", > > > > Thanks Alban! > > Could you please fix the man page and the bash completion file at the > same time? Yes, I will do that soon. Sorry for the delay in replying. Thanks, Alban
[PATCH bpf-next v2] bpf: bpftool, fix documentation for attach types
From: Alban Crequy bpftool has support for attach types "stream_verdict" and "stream_parser" but the documentation was referring to them as "skb_verdict" and "skb_parse". The inconsistency comes from commit b7d3826c2ed6 ("bpf: bpftool, add support for attaching programs to maps"). This patch changes the documentation to match the implementation: - "bpftool prog help" - man pages - bash completion Signed-off-by: Alban Crequy --- Changes v1 to v2: - fix man pages & bash completion (from Quentin's review) --- tools/bpf/bpftool/Documentation/bpftool-prog.rst | 2 +- tools/bpf/bpftool/bash-completion/bpftool| 4 ++-- tools/bpf/bpftool/prog.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 7e59495cb028..12bc1e2d4b46 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -42,7 +42,7 @@ PROG COMMANDS | **cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** | } | *ATTACH_TYPE* := { -| **msg_verdict** | **skb_verdict** | **skb_parse** | **flow_dissector** +| **msg_verdict** | **stream_verdict** | **stream_parser** | **flow_dissector** | } diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 763dd12482aa..b803827d01e8 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -311,8 +311,8 @@ _bpftool() return 0 ;; 5) -COMPREPLY=( $( compgen -W 'msg_verdict skb_verdict \ -skb_parse flow_dissector' -- "$cur" ) ) +COMPREPLY=( $( compgen -W 'msg_verdict stream_verdict \ +stream_parser flow_dissector' -- "$cur" ) ) return 0 ;; 6) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 33ed0806ccc0..db978c8d76a8 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1199,7 +1199,7 @@ static int do_help(int argc, char **argv) " cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n" " cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n" " cgroup/sendmsg4 | cgroup/sendmsg6 }\n" - " ATTACH_TYPE := { msg_verdict | skb_verdict | skb_parse |\n" + " ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n" "flow_dissector }\n" " " HELP_SPEC_OPTIONS "\n" "", -- 2.20.1
[PATCH bpf-next v1] bpf, lpm: fix lookup bug in map_delete_elem
From: Alban Crequy trie_delete_elem() was deleting an entry even though it was not matching if the prefixlen was correct. This patch adds a check on matchlen. Reproducer: $ sudo bpftool map create /sys/fs/bpf/mylpm type lpm_trie key 8 value 1 entries 128 name mylpm flags 1 $ sudo bpftool map update pinned /sys/fs/bpf/mylpm key hex 10 00 00 00 aa bb cc dd value hex 01 $ sudo bpftool map dump pinned /sys/fs/bpf/mylpm key: 10 00 00 00 aa bb cc dd value: 01 Found 1 element $ sudo bpftool map delete pinned /sys/fs/bpf/mylpm key hex 10 00 00 00 ff ff ff ff $ echo $? 0 $ sudo bpftool map dump pinned /sys/fs/bpf/mylpm Found 0 elements Signed-off-by: Alban Crequy --- kernel/bpf/lpm_trie.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index abf1002080df..93a5cbbde421 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -471,6 +471,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) } if (!node || node->prefixlen != key->prefixlen || + node->prefixlen != matchlen || (node->flags & LPM_TREE_NODE_FLAG_IM)) { ret = -ENOENT; goto out; -- 2.20.1
Re: [PATCH bpf-next v1] bpf, lpm: fix lookup bug in map_delete_elem
On Thu, Feb 21, 2019 at 11:24 PM Martin Lau wrote: > > On Thu, Feb 21, 2019 at 05:39:26PM +0100, Alban Crequy wrote: > > From: Alban Crequy > > > > trie_delete_elem() was deleting an entry even though it was not matching > > if the prefixlen was correct. This patch adds a check on matchlen. > > > > Reproducer: > > > > $ sudo bpftool map create /sys/fs/bpf/mylpm type lpm_trie key 8 value 1 > > entries 128 name mylpm flags 1 > > $ sudo bpftool map update pinned /sys/fs/bpf/mylpm key hex 10 00 00 00 aa > > bb cc dd value hex 01 > > $ sudo bpftool map dump pinned /sys/fs/bpf/mylpm > > key: 10 00 00 00 aa bb cc dd value: 01 > > Found 1 element > > $ sudo bpftool map delete pinned /sys/fs/bpf/mylpm key hex 10 00 00 00 ff > > ff ff ff > > $ echo $? > > 0 > > $ sudo bpftool map dump pinned /sys/fs/bpf/mylpm > > Found 0 elements > The change makes sense to me. Can you add this reproducer to > tools/testing/selftests/bpf/test_lpm_map.c? > > Bug fix should be for the "bpf" tree instead of "bpf-next" > Fixes tag is also required, like > > Fixes: e454cf595853 ("bpf: Implement map_delete_elem for > BPF_MAP_TYPE_LPM_TRIE") > Cc: Craig Gallek Thanks! I'll send a v2 shortly with the selftest and the tags, based on "bpf" tree. Cheers, Alban > > Signed-off-by: Alban Crequy > > --- > > kernel/bpf/lpm_trie.c | 1 + > > 1 file changed, 1 insertion(+) > > > > diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c > > index abf1002080df..93a5cbbde421 100644 > > --- a/kernel/bpf/lpm_trie.c > > +++ b/kernel/bpf/lpm_trie.c > > @@ -471,6 +471,7 @@ static int trie_delete_elem(struct bpf_map *map, void > > *_key) > > } > > > > if (!node || node->prefixlen != key->prefixlen || > > + node->prefixlen != matchlen || > > (node->flags & LPM_TREE_NODE_FLAG_IM)) { > > ret = -ENOENT; > > goto out; > > -- > > 2.20.1 > >
[PATCH bpf v2] bpf, lpm: fix lookup bug in map_delete_elem
From: Alban Crequy trie_delete_elem() was deleting an entry even though it was not matching if the prefixlen was correct. This patch adds a check on matchlen. Reproducer: $ sudo bpftool map create /sys/fs/bpf/mylpm type lpm_trie key 8 value 1 entries 128 name mylpm flags 1 $ sudo bpftool map update pinned /sys/fs/bpf/mylpm key hex 10 00 00 00 aa bb cc dd value hex 01 $ sudo bpftool map dump pinned /sys/fs/bpf/mylpm key: 10 00 00 00 aa bb cc dd value: 01 Found 1 element $ sudo bpftool map delete pinned /sys/fs/bpf/mylpm key hex 10 00 00 00 ff ff ff ff $ echo $? 0 $ sudo bpftool map dump pinned /sys/fs/bpf/mylpm Found 0 elements A similar reproducer is added in the selftests. Without the patch: $ sudo ./tools/testing/selftests/bpf/test_lpm_map test_lpm_map: test_lpm_map.c:485: test_lpm_delete: Assertion `bpf_map_delete_elem(map_fd, key) == -1 && errno == ENOENT' failed. Aborted With the patch: test_lpm_map runs without errors. Fixes: e454cf595853 ("bpf: Implement map_delete_elem for BPF_MAP_TYPE_LPM_TRIE") Cc: Craig Gallek Signed-off-by: Alban Crequy --- Changes v1 to v2: - add selftest (review from Martin) - update commitmsg tags (review from Martin) - rebase on "bpf" tree and test again (review from Martin) --- kernel/bpf/lpm_trie.c | 1 + tools/testing/selftests/bpf/test_lpm_map.c | 10 ++ 2 files changed, 11 insertions(+) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index abf1002080df..93a5cbbde421 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -471,6 +471,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) } if (!node || node->prefixlen != key->prefixlen || + node->prefixlen != matchlen || (node->flags & LPM_TREE_NODE_FLAG_IM)) { ret = -ENOENT; goto out; diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c index 147e34cfceb7..02d7c871862a 100644 --- a/tools/testing/selftests/bpf/test_lpm_map.c +++ b/tools/testing/selftests/bpf/test_lpm_map.c @@ -474,6 +474,16 @@ static void test_lpm_delete(void) assert(bpf_map_lookup_elem(map_fd, key, &value) == -1 && errno == ENOENT); + key->prefixlen = 30; // unused prefix so far + inet_pton(AF_INET, "192.255.0.0", key->data); + assert(bpf_map_delete_elem(map_fd, key) == -1 && + errno == ENOENT); + + key->prefixlen = 16; // same prefix as the root node + inet_pton(AF_INET, "192.255.0.0", key->data); + assert(bpf_map_delete_elem(map_fd, key) == -1 && + errno == ENOENT); + /* assert initial lookup */ key->prefixlen = 32; inet_pton(AF_INET, "192.168.0.1", key->data); -- 2.20.1
Re: [PATCH bpf v2] bpf, lpm: fix lookup bug in map_delete_elem
adding cc: Craig Gallek I got the "git send-email" command wrong, and the cc to Craig was removed, despite being listed in the commitmsg. Sorry for my mistake. There is a copy of the patch at this URL, if needed: https://github.com/kinvolk/linux/commit/5c522b02ee447f2eb060f840ba709af6b425f932 On Fri, Feb 22, 2019 at 2:19 PM Alban Crequy wrote: > > From: Alban Crequy > > trie_delete_elem() was deleting an entry even though it was not matching > if the prefixlen was correct. This patch adds a check on matchlen. > > Reproducer: > > $ sudo bpftool map create /sys/fs/bpf/mylpm type lpm_trie key 8 value 1 > entries 128 name mylpm flags 1 > $ sudo bpftool map update pinned /sys/fs/bpf/mylpm key hex 10 00 00 00 aa bb > cc dd value hex 01 > $ sudo bpftool map dump pinned /sys/fs/bpf/mylpm > key: 10 00 00 00 aa bb cc dd value: 01 > Found 1 element > $ sudo bpftool map delete pinned /sys/fs/bpf/mylpm key hex 10 00 00 00 ff ff > ff ff > $ echo $? > 0 > $ sudo bpftool map dump pinned /sys/fs/bpf/mylpm > Found 0 elements > > A similar reproducer is added in the selftests. > > Without the patch: > > $ sudo ./tools/testing/selftests/bpf/test_lpm_map > test_lpm_map: test_lpm_map.c:485: test_lpm_delete: Assertion > `bpf_map_delete_elem(map_fd, key) == -1 && errno == ENOENT' failed. > Aborted > > With the patch: test_lpm_map runs without errors. > > Fixes: e454cf595853 ("bpf: Implement map_delete_elem for > BPF_MAP_TYPE_LPM_TRIE") > Cc: Craig Gallek > Signed-off-by: Alban Crequy > > --- > > Changes v1 to v2: > - add selftest (review from Martin) > - update commitmsg tags (review from Martin) > - rebase on "bpf" tree and test again (review from Martin) > --- > kernel/bpf/lpm_trie.c | 1 + > tools/testing/selftests/bpf/test_lpm_map.c | 10 ++ > 2 files changed, 11 insertions(+) > > diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c > index abf1002080df..93a5cbbde421 100644 > --- a/kernel/bpf/lpm_trie.c > +++ b/kernel/bpf/lpm_trie.c > @@ -471,6 +471,7 @@ static int trie_delete_elem(struct bpf_map *map, void > *_key) > } > > if (!node || node->prefixlen != key->prefixlen || > + node->prefixlen != matchlen || > (node->flags & LPM_TREE_NODE_FLAG_IM)) { > ret = -ENOENT; > goto out; > diff --git a/tools/testing/selftests/bpf/test_lpm_map.c > b/tools/testing/selftests/bpf/test_lpm_map.c > index 147e34cfceb7..02d7c871862a 100644 > --- a/tools/testing/selftests/bpf/test_lpm_map.c > +++ b/tools/testing/selftests/bpf/test_lpm_map.c > @@ -474,6 +474,16 @@ static void test_lpm_delete(void) > assert(bpf_map_lookup_elem(map_fd, key, &value) == -1 && > errno == ENOENT); > > + key->prefixlen = 30; // unused prefix so far > + inet_pton(AF_INET, "192.255.0.0", key->data); > + assert(bpf_map_delete_elem(map_fd, key) == -1 && > + errno == ENOENT); > + > + key->prefixlen = 16; // same prefix as the root node > + inet_pton(AF_INET, "192.255.0.0", key->data); > + assert(bpf_map_delete_elem(map_fd, key) == -1 && > + errno == ENOENT); > + > /* assert initial lookup */ > key->prefixlen = 32; > inet_pton(AF_INET, "192.168.0.1", key->data); > -- > 2.20.1 >
[PATCH bpf-next v1 5/7] tools: bpftool: support loading map by fd from parent process
From: Alban Crequy Using a file descriptor passed by the parent process enables applications to fork a bpftool command to inspect a map they know by file descriptor even when they don't support bpffs or map ids. Documentation and bash completion updated as well. Signed-off-by: Alban Crequy --- tools/bpf/bpftool/Documentation/bpftool-map.rst | 8 tools/bpf/bpftool/bash-completion/bpftool | 10 +- tools/bpf/bpftool/map.c | 13 + 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index dfd8352fa453..658fe2fb8ecf 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -265,6 +265,14 @@ would be lost as soon as bpftool exits). anon_inode:bpf-map +**# bpftool map exec pinned /sys/fs/bpf/foo fd 99 cmd -- bpftool map show fd 99** + +:: + + 10: hash name some_map flags 0x0 + key 4B value 8B max_entries 2048 memlock 167936B + + SEE ALSO **bpf**\ (2), diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 63cd53c4d3a7..9ec1833bda1e 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -253,7 +253,7 @@ _bpftool() esac local PROG_TYPE='id pinned tag' -local MAP_TYPE='id pinned' +local MAP_TYPE='id pinned fd' case $command in show|list) [[ $prev != "$command" ]] && return 0 @@ -343,7 +343,7 @@ _bpftool() obj=${words[3]} if [[ ${words[-4]} == "map" ]]; then -COMPREPLY=( $( compgen -W "id pinned" -- "$cur" ) ) +COMPREPLY=( $( compgen -W "id pinned fd" -- "$cur" ) ) return 0 fi if [[ ${words[-3]} == "map" ]]; then @@ -406,7 +406,7 @@ _bpftool() esac ;; map) -local MAP_TYPE='id pinned' +local MAP_TYPE='id pinned fd' case $command in show|list|dump|peek|pop|dequeue) case $prev in @@ -462,7 +462,7 @@ _bpftool() return 0 ;; innermap) -COMPREPLY+=( $( compgen -W "id pinned" -- "$cur" ) ) +COMPREPLY+=( $( compgen -W "id pinned fd" -- "$cur" ) ) return 0 ;; id) @@ -525,7 +525,7 @@ _bpftool() # map, depending on the type of the map to update. case "$(_bpftool_map_guess_map_type)" in array_of_maps|hash_of_maps) -local MAP_TYPE='id pinned' +local MAP_TYPE='id pinned fd' COMPREPLY+=( $( compgen -W "$MAP_TYPE" \ -- "$cur" ) ) return 0 diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 768497364cee..9befcabc299d 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -125,6 +125,19 @@ int map_parse_fd(int *argc, char ***argv) NEXT_ARGP(); return open_obj_pinned_any(path, BPF_OBJ_MAP); + } else if (is_prefix(**argv, "fd")) { + char *endptr; + + NEXT_ARGP(); + + fd = strtoul(**argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as fd", **argv); + return -1; + } + NEXT_ARGP(); + + return dup(fd); } p_err("expected 'id' or 'pinned', got: '%s'?", **argv); -- 2.20.1
[PATCH bpf-next v1] tools/bpftool: create map of maps
From: Alban Crequy Before this patch, there was no way to fill attr.inner_map_fd, necessary for array_of_maps or hash_of_maps. This patch adds keyword 'innermap' to pass the innermap, either as an id or as a pinned map. Example of commands: $ sudo bpftool map create /sys/fs/bpf/innermap type hash \ key 8 value 8 entries 64 name innermap flags 1 $ sudo bpftool map create /sys/fs/bpf/outermap type hash_of_maps \ innermap pinned /sys/fs/bpf/innermap key 64 value 4 \ entries 64 name myoutermap flags 1 $ sudo bpftool map show pinned /sys/fs/bpf/outermap 47: hash_of_maps name myoutermap flags 0x1 key 64B value 4B max_entries 64 memlock 12288B Documentation and bash completion updated as well. Signed-off-by: Alban Crequy --- tools/bpf/bpftool/Documentation/bpftool-prog.rst | 2 +- tools/bpf/bpftool/bash-completion/bpftool| 13 + tools/bpf/bpftool/map.c | 5 - 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 9386bd6e0396..35902ab95cc5 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -25,7 +25,7 @@ PROG COMMANDS | **bpftool** **prog dump xlated** *PROG* [{**file** *FILE* | **opcodes** | **visual** | **linum**}] | **bpftool** **prog dump jited** *PROG* [{**file** *FILE* | **opcodes** | **linum**}] | **bpftool** **prog pin** *PROG* *FILE* -| **bpftool** **prog { load | loadall }** *OBJ* *PATH* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*] +| **bpftool** **prog { load | loadall }** *OBJ* *PATH* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*] [**innermap** MAP] | **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*] | **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*] | **bpftool** **prog tracelog** diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index b803827d01e8..dcdc39510dc3 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -461,6 +461,18 @@ _bpftool() _sysfs_get_netdevs return 0 ;; +innermap) +COMPREPLY+=( $( compgen -W "id pinned" -- "$cur" ) ) +return 0 +;; +id) +_bpftool_get_map_ids +return 0 +;; +pinned) +_filedir +return 0 +;; *) _bpftool_once_attr 'type' _bpftool_once_attr 'key' @@ -469,6 +481,7 @@ _bpftool() _bpftool_once_attr 'name' _bpftool_once_attr 'flags' _bpftool_once_attr 'dev' +_bpftool_once_attr 'innermap' return 0 ;; esac diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index e0c650d91784..7d8ce903a471 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1151,6 +1151,9 @@ static int do_create(int argc, char **argv) return -1; } NEXT_ARG(); + } else if (is_prefix(*argv, "innermap")) { + NEXT_ARG(); + attr.inner_map_fd = map_parse_fd(&argc, &argv); } } @@ -1231,7 +1234,7 @@ static int do_help(int argc, char **argv) "Usage: %s %s { show | list } [MAP]\n" " %s %s create FILE type TYPE key KEY_SIZE value VALUE_SIZE \\\n" " entries MAX_ENTRIES name NAME [flags FLAGS] \\\n" - " [dev NAME]\n" + " [dev NAME] [innermap MAP]\n" " %s %s dump MAP\n" " %s %s update MAP [key DATA] [value VALUE] [UPDATE_FLAGS]\n" " %s %s lookup MAP [key DATA]\n" -- 2.20.1
Re: [PATCH bpf-next v1] tools/bpftool: create map of maps
Thanks for the reviews, Jakub and Quentin! I will address it and resend a new version when bpf-next opens again. I'm also preparing some other patches on "bpftool map" about pinning and passing file descriptors to help applications that don't support map pinning directly. On Tue, Mar 5, 2019 at 6:32 PM Jakub Kicinski wrote: > > On Tue, 5 Mar 2019 17:38:03 +0100, Alban Crequy wrote: > > From: Alban Crequy > > > > Before this patch, there was no way to fill attr.inner_map_fd, necessary > > for array_of_maps or hash_of_maps. > > > > This patch adds keyword 'innermap' to pass the innermap, either as an id > > or as a pinned map. > > > > Example of commands: > > > > $ sudo bpftool map create /sys/fs/bpf/innermap type hash \ > > key 8 value 8 entries 64 name innermap flags 1 > > $ sudo bpftool map create /sys/fs/bpf/outermap type hash_of_maps \ > > innermap pinned /sys/fs/bpf/innermap key 64 value 4 \ > > entries 64 name myoutermap flags 1 > > $ sudo bpftool map show pinned /sys/fs/bpf/outermap > > 47: hash_of_maps name myoutermap flags 0x1 > > key 64B value 4B max_entries 64 memlock 12288B > > > > Documentation and bash completion updated as well. > > > > Signed-off-by: Alban Crequy > > bpf-next is closed let's continue reviewing, but you'll probably have > to repost after the merge window :( > > > diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c > > index e0c650d91784..7d8ce903a471 100644 > > --- a/tools/bpf/bpftool/map.c > > +++ b/tools/bpf/bpftool/map.c > > @@ -1151,6 +1151,9 @@ static int do_create(int argc, char **argv) > > return -1; > > } > > NEXT_ARG(); > > + } else if (is_prefix(*argv, "innermap")) { > > + NEXT_ARG(); > > + attr.inner_map_fd = map_parse_fd(&argc, &argv); > > You need to check if the return value is not -1, and also close this > file descriptor (a) when done, (b) when error happens. > > > } > > } > >
Re: [PATCH 0/3] namei: implement various scoping AT_* flags
On Sat, Sep 29, 2018 at 12:35 PM Aleksa Sarai wrote: > > The need for some sort of control over VFS's path resolution (to avoid > malicious paths resulting in inadvertent breakouts) has been a very > long-standing desire of many userspace applications. This patchset is a > revival of Al Viro's old AT_NO_JUMPS[1] patchset with a few additions. > > The most obvious change is that AT_NO_JUMPS has been split as dicussed > in the original thread, along with a further split of AT_NO_PROCLINKS > which means that each individual property of AT_NO_JUMPS is now a > separate flag: > > * Path-based escapes from the starting-point using "/" or ".." are > blocked by AT_BENEATH. > * Mountpoint crossings are blocked by AT_XDEV. > * /proc/$pid/fd/$fd resolution is blocked by AT_NO_PROCLINKS (more > correctly it actually blocks any user of nd_jump_link() because it > allows out-of-VFS path resolution manipulation). > > AT_NO_JUMPS is now effectively (AT_BENEATH|AT_XDEV|AT_NO_PROCLINKS). At > Linus' suggestion in the original thread, I've also implemented > AT_NO_SYMLINKS which just denies _all_ symlink resolution (including > "proclink" resolution). It seems quite useful to me. > An additional improvement was made to AT_XDEV. The original AT_NO_JUMPS > path didn't consider "/tmp/.." as a mountpoint crossing -- this patch > blocks this as well (feel free to ask me to remove it if you feel this > is not sane). > > Currently I've only enabled these for openat(2) and the stat(2) family. > I would hope we could enable it for basically every *at(2) syscall -- > but many of them appear to not have a @flags argument and thus we'll > need to add several new syscalls to do this. I'm more than happy to send > those patches, but I'd prefer to know that this preliminary work is > acceptable before doing a bunch of copy-paste to add new sets of *at(2) > syscalls. What do you think of an equivalent feature AT_NO_SYMLINKS flag for mount()? I guess that would have made the fix for CVE-2017-1002101 in Kubernetes easier to write: https://kubernetes.io/blog/2018/04/04/fixing-subpath-volume-vulnerability/ > One additional feature I've implemented is AT_THIS_ROOT (I imagine this > is probably going to be more contentious than the refresh of > AT_NO_JUMPS, so I've included it in a separate patch). The patch itself > describes my reasoning, but the shortened version of the premise is that > continer runtimes need to have a way to resolve paths within a > potentially malicious rootfs. Container runtimes currently do this in > userspace[2] which has implicit race conditions that are not resolvable > in userspace (or use fork+exec+chroot and SCM_RIGHTS passing which is > inefficient). AT_THIS_ROOT allows for per-call chroot-like semantics for > path resolution, which would be invaluable for us -- and the > implementation is basically identical to AT_BENEATH (except that we > don't return errors when someone actually hits the root). > > I've added some selftests for this, but it's not clear to me whether > they should live here or in xfstests (as far as I can tell there are no > other VFS tests in selftests, while there are some tests that look like > generic VFS tests in xfstests). If you'd prefer them to be included in > xfstests, let me know. > > [1]: https://lore.kernel.org/patchwork/patch/784221/ > [2]: https://github.com/cyphar/filepath-securejoin > > Aleksa Sarai (3): > namei: implement O_BENEATH-style AT_* flags > namei: implement AT_THIS_ROOT chroot-like path resolution > selftests: vfs: add AT_* path resolution tests > > fs/fcntl.c| 2 +- > fs/namei.c| 158 -- > fs/open.c | 10 ++ > fs/stat.c | 15 +- > include/linux/fcntl.h | 3 +- > include/linux/namei.h | 8 + > include/uapi/asm-generic/fcntl.h | 20 +++ > include/uapi/linux/fcntl.h| 10 ++ > tools/testing/selftests/Makefile | 1 + > tools/testing/selftests/vfs/.gitignore| 1 + > tools/testing/selftests/vfs/Makefile | 13 ++ > tools/testing/selftests/vfs/at_flags.h| 40 + > tools/testing/selftests/vfs/common.sh | 37 > .../selftests/vfs/tests/0001_at_beneath.sh| 72 > .../selftests/vfs/tests/0002_at_xdev.sh | 54 ++ > .../vfs/tests/0003_at_no_proclinks.sh | 50 ++ > .../vfs/tests/0004_at_no_symlinks.sh | 49 ++ > .../selftests/vfs/tests/0005_at_this_root.sh | 66 > tools/testing/selftests/vfs/vfs_helper.c | 154 + > 19 files changed, 707 insertions(+), 56 deletions(-) > create mode 100644 tools/testing/selftests/vfs/.gitignore > create mode 100644 tools/testing/selftests/vfs/Makefile > create mode 100644 tools/testing/selftests/vfs/at_flags.h > create mode
Re: [PATCH] [RFC] bpf: tracing: new helper bpf_get_current_cgroup_ino
On Mon, May 14, 2018 at 9:38 PM, Y Song wrote: > > On Sun, May 13, 2018 at 10:33 AM, Alban Crequy wrote: > > From: Alban Crequy > > > > bpf_get_current_cgroup_ino() allows BPF trace programs to get the inode > > of the cgroup where the current process resides. > > > > My use case is to get statistics about syscalls done by a specific > > Kubernetes container. I have a tracepoint on raw_syscalls/sys_enter and > > a BPF map containing the cgroup inode that I want to trace. I use > > bpf_get_current_cgroup_ino() and I quickly return from the tracepoint if > > the inode is not in the BPF hash map. > > Alternatively, the kernel already has bpf_current_task_under_cgroup helper > which uses a cgroup array to store cgroup fd's. If the current task is > in the hierarchy of a particular cgroup, the helper will return true. > > One difference between your helper and bpf_current_task_under_cgroup() is > that your helper tests against a particular cgroup, not including its > children, but > bpf_current_task_under_cgroup() will return true even the task is in a > nested cgroup. > > Maybe this will work for you? I like the behaviour that it checks for children cgroups. But with the cgroup array, I can test only one cgroup at a time. I would like to be able to enable my tracer for a few Kubernetes containers or all by adding the inodes of a few cgroups in a hash map. So I could keep separate stats for each. With bpf_current_task_under_cgroup(), I would need to iterate over the list of cgroups, which is difficult with BPF. Also, Kubernetes is cgroup-v1 only and bpf_current_task_under_cgroup() is cgroup-v2 only. In Kubernetes, the processes remain in the root of the v2 hierarchy. I'd like to be able to select the cgroup hierarchy in my helper so it'd work for both v1 and v2. > > Without this BPF helper, I would need to keep track of all pids in the > > container. The Netlink proc connector can be used to follow process > > creation and destruction but it is racy. > > > > This patch only looks at the memory cgroup, which was enough for me > > since each Kubernetes container is placed in a different mem cgroup. > > For a generic implementation, I'm not sure how to proceed: it seems I > > would need to use 'for_each_root(root)' (see example in > > proc_cgroup_show() from kernel/cgroup/cgroup.c) but I don't know if > > taking the cgroup mutex is possible in the BPF helper function. It might > > be ok in the tracepoint raw_syscalls/sys_enter but could the mutex > > already be taken in some other tracepoints? > > mutex is not allowed in a helper since it can block. Ok. I don't know how to implement my helper properly then. Maybe I could just use the 1st cgroup-v1 hierarchy (the name=systemd one) so I don't have to iterate over the hierarchies. But would that be acceptable? Cheers, Alban > > Signed-off-by: Alban Crequy > > --- > > include/uapi/linux/bpf.h | 11 ++- > > kernel/trace/bpf_trace.c | 25 + > > 2 files changed, 35 insertions(+), 1 deletion(-) > > > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > > index c5ec89732a8d..38ac3959cdf3 100644 > > --- a/include/uapi/linux/bpf.h > > +++ b/include/uapi/linux/bpf.h > > @@ -755,6 +755,14 @@ union bpf_attr { > > * @addr: pointer to struct sockaddr to bind socket to > > * @addr_len: length of sockaddr structure > > * Return: 0 on success or negative error code > > + * > > + * u64 bpf_get_current_cgroup_ino(hierarchy, flags) > > + * Get the cgroup{1,2} inode of current task under the specified > > hierarchy. > > + * @hierarchy: cgroup hierarchy > > Not sure what is the value to specify hierarchy here. > A cgroup directory fd? > > > + * @flags: reserved for future use > > + * Return: > > + * == 0 error > > looks like < 0 means error. > > > + *> 0 inode of the cgroup >>= 0 means good? > > */ > > #define __BPF_FUNC_MAPPER(FN) \ > > FN(unspec), \ > > @@ -821,7 +829,8 @@ union bpf_attr { > > FN(msg_apply_bytes),\ > > FN(msg_cork_bytes), \ > > FN(msg_pull_data), \ > > - FN(bind), > > + FN(bind), \ > > + FN(get_current_cgroup_ino), > > > > /* integer value in 'imm' field of BPF_CALL instruction selects which > > helper > > * function eBPF program intends to call > > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_tr
Re: [PATCH 2/2] pidns: Expose task pid_ns_for_children to userspace
On 14 January 2017 at 15:15, Kirill Tkhai wrote: > For correct checkpointing/restoring of a task from userspace > it's need to know the task's pid_ns_for_children. Currently, > there is no a sane way to do that (the only possible trick > is to force the task create a new child and to analize the > child's /proc/[pid]/ns/pid link, that is performance-stupid). > > The patch exposes pid_ns_for_children to ns directory > in standard way with the name "pid_for_children": > > ~# ls /proc/5531/ns -l | grep pid > lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid -> pid:[4026531836] > lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid_for_children -> pid:[4026532286] > > Signed-off-by: Kirill Tkhai What's happening if a process, after unsharing CLONE_NEWPID, does not fork but instead let another process open the new "pid_for_children" and then setns()+fork()? Is that other process allowed to create the "pid 1" in the new pid namespaces? Is that also allowed if the other process lives in a sibling pid namespace? If so, that would break what pid_namespaces(7) says: "the parental relationship between processes mirrors the parental relationship between PID namespaces: the parent of a process is either in the same namespace or resides in the immediate parent PID namespace."
Re: [PATCH v4 0/3] perf: add support for analyzing events for containers
Hi, > Currently, there is no trivial mechanism to analyze events based on > containers. perf -G can be used, but it will not filter events for the > containers created after perf is invoked, making it difficult to assess/ > analyze performance issues of multiple containers at once. > > This patch-set overcomes this limitation by using cgroup identifier as > container unique identifier. A new PERF_RECORD_NAMESPACES event that > records namespaces related info is introduced, from which the cgroup > namespace's device & inode numbers are used as cgroup identifier. This > is based on the assumption that each container is created with it's own > cgroup namespace allowing assessment/analysis of multiple containers > using cgroup identifier. > > The first patch introduces PERF_RECORD_NAMESPACES in kernel while the > second patch makes the corresponding changes in perf tool to read this > PERF_RECORD_NAMESPACES events. The third patch adds a cgroup identifier > column in perf report, which contains the cgroup namespace's device and > inode numbers. I have a question for the pid namespace: does the new perf event gives the pid namespace of the task, or the pid_ns_for_children from the nsproxy? From my limited understanding, v4 seems to do the former, as opposed to v3. When synthesizing events from /proc/$PID/ns/pid, it cannot take the pid_ns_for_children, so I wanted to make sure it is consistent. Cheers, Alban
Re: [lxc-devel] CGroup Namespaces (v10)
On 29 January 2016 at 09:54, wrote: > Hi, > > following is a revised set of the CGroup Namespace patchset which Aditya > Kali has previously sent. The code can also be found in the cgroupns.v10 > branch of > > https://git.kernel.org/cgit/linux/kernel/git/sergeh/linux-security.git/ > > To summarize the semantics: > > 1. CLONE_NEWCGROUP re-uses 0x0200, which was previously CLONE_STOPPED > > 2. unsharing a cgroup namespace makes all your current cgroups your new > cgroup root. > > 3. /proc/pid/cgroup always shows cgroup paths relative to the reader's > cgroup namespce root. A task outside of your cgroup looks like > > 8:memory:/../../.. > > 4. when a task mounts a cgroupfs, the cgroup which shows up as root depends > on the mounting task's cgroup namespace. > > 5. setns to a cgroup namespace switches your cgroup namespace but not > your cgroups. > > With this, using github.com/hallyn/lxc #2015-11-09/cgns (and > github.com/hallyn/lxcfs #2015-11-10/cgns) we can start a container in a full > proper cgroup namespace, avoiding either cgmanager or lxcfs cgroup bind > mounts. > > This is completely backward compatible and will be completely invisible > to any existing cgroup users (except for those running inside a cgroup > namespace and looking at /proc/pid/cgroup of tasks outside their > namespace.) Hi, I just noticed commit c38c4597e4bf ("netfilter: implement xt_cgroup cgroup2 path match") which, as far as I understand, introduces a new userland facing API containing the full cgroup path. Does it mean that the cgroupns patchset should include cgroup path translation in xt_cgroup? > Changes from V9: > 1. Update to latest Linus tree > 2. A few locking fixes > > Changes from V8: > 1. Incorporate updated documentation from tj. > 2. Put lookup_one_len() under inode lock > 3. Make cgroup_path non-namespaced, so only calls to cgroup_path_ns() are >namespaced. > 4. Make cgroup_path{,_ns} take the needed locks, since external callers cannot >do so. > 5. Fix the bisectability problem of to_cg_ns() being defined after use > > Changes from V7: > 1. Rework kernfs_path_from_node_locked to return the string length > 2. Rename and reorder args to kernfs_path_from_node > 3. cgroup.c: undo accidental conversoins to inline > 4. cgroup.h: move ns declarations to bottom. > 5. Rework the documentation to fit the style of the rest of cgroup.txt > > Changes from V6: > 1. Switch to some WARN_ONs to provide stack traces > 2. Rename kernfs_node_distance to kernfs_depth > 3. Make sure kernfs_common_ancestor() nodes are from same root > 4. Split kernfs changes for cgroup_mount into separate patch > 5. Rename kernfs_obtain_root to kernfs_node_dentry > (And more, see patch changelogs) > > Changes from V5: > 1. To get a root dentry for cgroup namespace mount, walk the path from the >kernfs root dentry. > > Changes from V4: > 1. Move the FS_USERNS_MOUNT flag to last patch > 2. Rebase onto cgroup/for-4.5 > 3. Don't non-init user namespaces to bind new subsystems when mounting. > 4. Address feedback from Tejun (thanks). Specificaly, not addressed: >. kernfs_obtain_root - walking dentry from kernfs root. > (I think that's the only piece) > 5. Dropped unused get_task_cgroup fn/patch. > 6. Reworked kernfs_path_from_node_locked() to try to simplify the logic. >It now finds a common ancestor, walks from the source to it, then back >up to the target. > > Changes from V3: > 1. Rebased onto latest cgroup changes. In particular switch to >css_set_lock and ns_common. > 2. Support all hierarchies. > > Changes from V2: > 1. Added documentation in Documentation/cgroups/namespace.txt > 2. Fixed a bug that caused crash > 3. Incorporated some other suggestions from last patchset: >- removed use of threadgroup_lock() while creating new cgroupns >- use task_lock() instead of rcu_read_lock() while accessing > task->nsproxy >- optimized setns() to own cgroupns >- simplified code around sane-behavior mount option parsing > 4. Restored ACKs from Serge Hallyn from v1 on few patches that have >not changed since then. > > Changes from V1: > 1. No pinning of processes within cgroupns. Tasks can be freely moved >across cgroups even outside of their cgroupns-root. Usual DAC/MAC policies >apply as before. > 2. Path in /proc//cgroup is now always shown and is relative to >cgroupns-root. So path can contain '/..' strings depending on cgroupns-root >of the reader and cgroup of . > 3. setns() does not require the process to first move under target >cgroupns-root. > > Changes form RFC (V0): > 1. setns support for cgroupns > 2. 'mount -t cgroup cgroup ' from inside a cgroupns now >mounts the cgroup hierarcy with cgroupns-root as the filesystem root. > 3. writes to cgroup files outside of cgroupns-root are not allowed > 4. visibility of /proc//cgroup is further restricted by not showing >anything if the is in a sibling cgroupns and its cgroup falls outside >your cgroupn
Re: [PATCH 8/8] netfilter: implement xt_cgroup cgroup2 path match
Hi, On 7 December 2015 at 23:38, Tejun Heo wrote: > This patch implements xt_cgroup path match which matches cgroup2 > membership of the associated socket. The match is recursive and > invertible. Is there any plans to implement a similar cgroup2 path match in a cgroup classifier in tc? I wonder if it will be possible to use cgroup to classify packets in tc without net_cls.class_id in cgroup2. Thanks! Alban
[PATCH] selftests/cgroupns: new test for cgroup namespaces
From: Alban Crequy This adds the selftest "cgroupns_test" in order to test the CGroup Namespace patchset. cgroupns_test creates two child processes. They perform a list of actions defined by the array cgroupns_test. This array can easily be extended to more scenarios without adding much code. They are synchronized with eventfds to ensure only one action is performed at a time. The memory is shared between the processes (CLONE_VM) so each child process can know the pid of their siblings without extra IPC. The output explains the scenario being played. Short extract: > current cgroup: /user.slice/user-0.slice/session-1.scope > child process #0: check that process #self (pid=482) has cgroup > /user.slice/user-0.slice/session-1.scope > child process #0: unshare cgroupns > child process #0: move process #self (pid=482) to cgroup cgroup-a/subcgroup-a > child process #0: join parent cgroupns The test does not change the mount namespace and does not mount any new cgroup2 filesystem. Therefore this does not test that the cgroup2 mount is correctly rooted to the cgroupns root at mount time. Signed-off-by: Alban Crequy Acked-by: Serge E. Hallyn --- Changelog: 20160131 - rebase on sergeh/cgroupns.v10 and fix conflicts 20160115 - Detect where cgroup2 is mounted, don't assume /sys/fs/cgroup (suggested by sergeh) - Check more error conditions (from krnowak's review) - Coding style (from krnowak's review) - Update error message for Linux >= 4.5 (from krnowak's review) 20160104 - Fix coding style (from sergeh's review) - Fix printf formatting (from sergeh's review) - Fix parsing of /proc/pid/cgroup (from sergeh's review) - Fix concatenation of cgroup paths 20151219 - First version This patch is available in the cgroupns.v10-tests branch of https://github.com/kinvolk/linux.git It is rebased on top of Serge Hallyn's cgroupns.v10 branch of https://git.kernel.org/cgit/linux/kernel/git/sergeh/linux-security.git/ Test results: - SUCCESS on kernel cgroupns.v10 booted with systemd.unified_cgroup_hierarchy=1 - SUCCESS on kernel cgroupns.v10 booted with systemd.unified_cgroup_hierarchy=0 --- tools/testing/selftests/Makefile | 1 + tools/testing/selftests/cgroupns/Makefile| 11 + tools/testing/selftests/cgroupns/cgroupns_test.c | 445 +++ 3 files changed, 457 insertions(+) create mode 100644 tools/testing/selftests/cgroupns/Makefile create mode 100644 tools/testing/selftests/cgroupns/cgroupns_test.c diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index b04afc3..b373135 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -1,5 +1,6 @@ TARGETS = breakpoints TARGETS += capabilities +TARGETS += cgroupns TARGETS += cpu-hotplug TARGETS += efivarfs TARGETS += exec diff --git a/tools/testing/selftests/cgroupns/Makefile b/tools/testing/selftests/cgroupns/Makefile new file mode 100644 index 000..0fdbe0a --- /dev/null +++ b/tools/testing/selftests/cgroupns/Makefile @@ -0,0 +1,11 @@ +CFLAGS += -I../../../../usr/include/ +CFLAGS += -I../../../../include/uapi/ + +all: cgroupns_test + +TEST_PROGS := cgroupns_test + +include ../lib.mk + +clean: + $(RM) cgroupns_test diff --git a/tools/testing/selftests/cgroupns/cgroupns_test.c b/tools/testing/selftests/cgroupns/cgroupns_test.c new file mode 100644 index 000..71e2336 --- /dev/null +++ b/tools/testing/selftests/cgroupns/cgroupns_test.c @@ -0,0 +1,445 @@ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "../kselftest.h" + +#define STACK_SIZE 65536 + +static char cgroup_mountpoint[4096]; +static char root_cgroup[4096]; + +#define CHILDREN_COUNT 2 +typedef struct { + pid_t pid; + uint8_t *stack; + int start_semfd; + int end_semfd; +} cgroupns_child_t; +cgroupns_child_t children[CHILDREN_COUNT]; + +typedef enum { + UNSHARE_CGROUPNS, + JOIN_CGROUPNS, + CHECK_CGROUP, + CHECK_CGROUP_WITH_ROOT_PREFIX, + MOVE_CGROUP, + MOVE_CGROUP_WITH_ROOT_PREFIX, +} cgroupns_action_t; + +static const struct { + int actor_id; + cgroupns_action_t action; + int target_id; + char *path; +} cgroupns_tests[] = { + { 0, CHECK_CGROUP_WITH_ROOT_PREFIX, -1, "/"}, + { 0, CHECK_CGROUP_WITH_ROOT_PREFIX, 0, "/"}, + { 0, CHECK_CGROUP_WITH_ROOT_PREFIX, 1, "/"}, + { 1, CHECK_CGROUP_WITH_ROOT_PREFIX, -1, "/"}, + { 1, CHECK_CGROUP_WITH_ROOT_PREFIX, 0, "/"}, + { 1, CHECK_CGROUP_WITH_ROOT_PREFIX, 1, "/"}, + + { 0, UNSHARE_CG
[PATCH] [RFC] selftests/cgroupns: new test for cgroup namespaces
From: Alban Crequy This adds the selftest "cgroupns_test" in order to test the CGroup Namespace patchset. cgroupns_test creates two child processes. They perform a list of actions defined by the array cgroupns_test. This array can easily be extended to more scenarios without adding much code. They are synchronized with eventfds to ensure only one action is performed at a time. The memory is shared between the processes (CLONE_VM) so each child process can know the pid of their siblings without extra IPC. The output explains the scenario being played. Short extract: > current cgroup: /user.slice/user-0.slice/session-1.scope > child process #0: check that process #self (pid=482) has cgroup > /user.slice/user-0.slice/session-1.scope > child process #0: unshare cgroupns > child process #0: move process #self (pid=482) to cgroup cgroup-a/subcgroup-a > child process #0: join parent cgroupns The test does not change the mount namespace and does not mount any new cgroup2 filesystem. Therefore this does not test that the cgroup2 mount is correctly rooted to the cgroupns root at mount time. Signed-off-by: Alban Crequy --- This patch is available in the cgroupns.v7-tests branch of https://github.com/kinvolk/linux.git It is based on top of Serge Hallyn's cgroupns.v7 branch of https://git.kernel.org/cgit/linux/kernel/git/sergeh/linux-security.git/ I see Linux does not have a lot of selftests and there are more Linux container tests in Linux Test Project: https://github.com/linux-test-project/ltp/tree/master/testcases/kernel/containers Is it better to send this test here or to LTP? --- tools/testing/selftests/Makefile | 1 + tools/testing/selftests/cgroupns/Makefile| 11 + tools/testing/selftests/cgroupns/cgroupns_test.c | 378 +++ 3 files changed, 390 insertions(+) create mode 100644 tools/testing/selftests/cgroupns/Makefile create mode 100644 tools/testing/selftests/cgroupns/cgroupns_test.c diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index c8edff6..694325a 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -1,4 +1,5 @@ TARGETS = breakpoints +TARGETS += cgroupns TARGETS += cpu-hotplug TARGETS += efivarfs TARGETS += exec diff --git a/tools/testing/selftests/cgroupns/Makefile b/tools/testing/selftests/cgroupns/Makefile new file mode 100644 index 000..0fdbe0a --- /dev/null +++ b/tools/testing/selftests/cgroupns/Makefile @@ -0,0 +1,11 @@ +CFLAGS += -I../../../../usr/include/ +CFLAGS += -I../../../../include/uapi/ + +all: cgroupns_test + +TEST_PROGS := cgroupns_test + +include ../lib.mk + +clean: + $(RM) cgroupns_test diff --git a/tools/testing/selftests/cgroupns/cgroupns_test.c b/tools/testing/selftests/cgroupns/cgroupns_test.c new file mode 100644 index 000..d45017c --- /dev/null +++ b/tools/testing/selftests/cgroupns/cgroupns_test.c @@ -0,0 +1,378 @@ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "../kselftest.h" + +#define STACK_SIZE 65536 + +static char root_cgroup[4096]; + +#define CHILDREN_COUNT 2 +typedef struct { + int pid; + uint8_t *stack; + int start_semfd; + int end_semfd; +} cgroupns_child_t; +cgroupns_child_t children[CHILDREN_COUNT]; + +typedef enum { + UNSHARE_CGROUPNS, + JOIN_CGROUPNS, + CHECK_CGROUP, + CHECK_CGROUP_WITH_ROOT_PREFIX, + MOVE_CGROUP, + MOVE_CGROUP_WITH_ROOT_PREFIX, +} cgroupns_action_t; + +static const struct { + int actor_pid; + cgroupns_action_t action; + int target_pid; + char *path; +} cgroupns_tests[] = { + { 0, CHECK_CGROUP_WITH_ROOT_PREFIX, -1, ""}, + { 0, CHECK_CGROUP_WITH_ROOT_PREFIX, 0, ""}, + { 0, CHECK_CGROUP_WITH_ROOT_PREFIX, 1, ""}, + { 1, CHECK_CGROUP_WITH_ROOT_PREFIX, -1, ""}, + { 1, CHECK_CGROUP_WITH_ROOT_PREFIX, 0, ""}, + { 1, CHECK_CGROUP_WITH_ROOT_PREFIX, 1, ""}, + + { 0, UNSHARE_CGROUPNS, -1, NULL}, + + { 0, CHECK_CGROUP, -1, "/"}, + { 0, CHECK_CGROUP, 0, "/"}, + { 0, CHECK_CGROUP, 1, "/"}, + { 1, CHECK_CGROUP_WITH_ROOT_PREFIX, -1, ""}, + { 1, CHECK_CGROUP_WITH_ROOT_PREFIX, 0, ""}, + { 1, CHECK_CGROUP_WITH_ROOT_PREFIX, 1, ""}, + + { 1, UNSHARE_CGROUPNS, -1, NULL}, + + { 0, CHECK_CGROUP, -1, "/"}, + { 0, CHECK_CGROUP, 0, "/"}, + { 0, CHECK_CGROUP, 1, "/"}, + { 1, CHECK_CGROUP, -1, "/"}, + { 1, CHECK_CGROUP, 0, "/"}, + { 1, CHECK_CGROUP, 1, "/"}, + + { 0, MOVE_CGROUP_WITH_ROOT_PREFIX, -1, "cgroup-a"}, +
Re: [PATCH v4 0/2] NFS: Fix interaction between fs_context and user namespaces
Hi, I tested the patches on top of 5.10.0-rc3+ and I could mount an NFS share with a different user namespace. fsopen() is done in the container namespaces (user, mnt and net namespaces) while fsconfig(), fsmount() and move_mount() are done on the host namespaces. The mount on the host is available in the container via mount propagation from the host mount. With this, the files on the NFS server with uid 0 are available in the container with uid 0. On the host, they are available with uid 4294967294 (make_kuid(&init_user_ns, -2)). The code to reproduce my test is available at: https://github.com/kinvolk/nfs-mount-in-userns And the results and traces are attached at the end. While the basic feature works, I have some thoughts. First, doing the fsopen() in the container namespaces implements two features in one step: 1. Selection of the userns for the id mapping translation. 2. Selection of the netns for the connection to the NFS server. I was wondering if this only considers the scenario where the user wants to make the connection to the NFS server from the network namespace of the container. I think there is another valid use case to use the userns of the container but the netns of the host or a third-party netns. We can use the correct set of setns() to do the fsopen() in the container userns but in the host netns, but then we’d be in a netns that does not belong to the current userns, so we would not have any capability over it. In my tests, that seems to work fine when the netns and the userns of the fs_context are not related. Still, I would find the API cleaner if the userns and netns were selected explicitly with something like: sfd = fsopen("nfs4", FSOPEN_CLOEXEC); usernsfd = pidfd_open(...); or usernsfd = open(“/proc/pid/ns/user”) fsconfig(sfd, FSCONFIG_SET_FD, "userns", NULL, usernsfd); netnsfd = pidfd_open(...); or netnsfd = open(“/proc/pid/ns/net”) fsconfig(sfd, FSCONFIG_SET_FD, "netns", NULL, netnsfd); This would avoid the need for fd passing after the fsopen(). This would require fsconfig() (possibly in nfs_fs_context_parse_param()) to do the capability check but making it more explicit sounds better to me. Second, the capability check in fsopen() is the following: if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)) This means that we cannot just create a temporary userns with the desired id mapping, but we additionally need to enter a mntns owned by the userns. However the code in fsopen() does not seem to do anything with the mntns (The new mount will only be associated with the current mntns at move_mount() time), so we could just create a temporary userns + mntns. It seems weird to me that the capability check is done in relation to the current mntns even though the code does not do anything with it. In Kubernetes, the NFS mount is done before creating the user namespace. pkg/kubelet/kubelet.go Kubelet's syncPod() will do the following in this order: 1. Mount the volumes with CSI or other volume implementations: WaitForAttachAndMount() line 1667 2. Call the CRI's createPodSandbox via kl.containerRuntime.SyncPod() line 1678 to create the user namespace and network namespace. This means that at the time of the NFS mount, we have not yet created the user namespace or the network namespace, and even less configured it with the CNI plugin. With this API where the id mapping for the NFS mount is decided at the superblock level, we would need to refactor the Kubelet code to be able to call the CSI mount after the creation of the sandbox, and after the configuration with CNI. This will be more complicated to integrate in Kubernetes than the idmapped mounts patch set where the id mapping is set at the bind mount level (https://lists.linuxfoundation.org/pipermail/containers/2020-October/042477.html). However, it is less invasive. This approach works for NFS volumes in Kubernetes but would not work with other volumes like hostPath (bind mount from the host) where we don’t have a new superblock. Lastly, I checked the implementation of nfs_compare_super() and it seems fine. In Kubernetes, we want to be able to create several Kubernetes pods with different userns and mount the same NFS share in several pods. The kernel will have to create different NFS superblocks for that scenario and it does that correctly in nfs_compare_super() by comparing the userns and comparing the netns as well. - Running ./nfs-mount-in-userns strace: Process 4022 attached [pid 4022] fsopen("nfs4", FSOPEN_CLOEXEC) = 6 [pid 4022] +++ exited with 0 +++ --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=4022, si_uid=0, si_status=0, si_utime=0, si_stime=0} --- fsconfig(7, FSCONFIG_SET_STRING, "source", "127.0.0.1:/server", 0) = 0 fsconfig(7, FSCONFIG_SET_STRING, "vers", "4.2", 0) = 0 fsconfig(7, FSCONFIG_SET_STRING, "addr", "127.0.0.1", 0) = 0 fsconfig(7, FSCONFIG_SET_STRING, "clientaddr", "127.0.0.1", 0) = 0 fsconfig(7, FSCONFIG_CMD_CREATE, NULL, NULL, 0) = 0 fsmount(7, FSMOUNT_CLOEXEC
[PATCH v1] tracing/kprobes: expose maxactive for kretprobe in kprobe_events
When a kretprobe is installed on a kernel function, there is a maximum limit of how many calls in parallel it can catch (aka "maxactive"). A kernel module could call register_kretprobe() and initialize maxactive (see example in samples/kprobes/kretprobe_example.c). But that is not exposed to userspace and it is currently not possible to choose maxactive when writing to /sys/kernel/debug/tracing/kprobe_events The default maxactive can be as low as 1 on single-core with a non-preemptive kernel. This is too low and we need to increase it not only for recursive functions, but for functions that sleep or resched. This patch updates the format of the command that can be written to kprobe_events so that maxactive can be optionally specified. I need this for a bpf program attached to the kretprobe of inet_csk_accept, which can sleep for a long time. BugLink: https://github.com/iovisor/bcc/issues/1072 Signed-off-by: Alban Crequy --- Documentation/trace/kprobetrace.txt | 4 +++- kernel/trace/trace_kprobe.c | 34 +- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index 41ef9d8..655ca7e 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -23,7 +23,7 @@ current_tracer. Instead of that, add probe points via Synopsis of kprobe_events - p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe - r[:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS]: Set a return probe + r[MAXACTIVE][:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe -:[GRP/]EVENT: Clear a probe GRP : Group name. If omitted, use "kprobes" for it. @@ -32,6 +32,8 @@ Synopsis of kprobe_events MOD : Module name which has given SYM. SYM[+offs]: Symbol+offset where the probe is inserted. MEMADDR : Address where the probe is inserted. + MAXACTIVE : Maximum number of instances of the specified function that + can be probed simultaneously, or 0 for the default.(*) FETCHARGS : Arguments. Each probe can have up to 128 args. %REG : Fetch register REG diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5f688cc..807e01c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -282,6 +282,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, void *addr, const char *symbol, unsigned long offs, +int maxactive, int nargs, bool is_return) { struct trace_kprobe *tk; @@ -309,6 +310,8 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, else tk->rp.kp.pre_handler = kprobe_dispatcher; + tk->rp.maxactive = maxactive; + if (!event || !is_good_name(event)) { ret = -EINVAL; goto error; @@ -598,8 +601,10 @@ static int create_trace_kprobe(int argc, char **argv) { /* * Argument syntax: -* - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] -* - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] +* - Add kprobe: +* p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] +* - Add kretprobe: +* r[MAXACTIVE][:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] * Fetch args: * $retval : fetch return value * $stack : fetch stack address @@ -619,6 +624,7 @@ static int create_trace_kprobe(int argc, char **argv) int i, ret = 0; bool is_return = false, is_delete = false; char *symbol = NULL, *event = NULL, *group = NULL; + int maxactive = 0; char *arg; unsigned long offset = 0; void *addr = NULL; @@ -637,8 +643,26 @@ static int create_trace_kprobe(int argc, char **argv) return -EINVAL; } - if (argv[0][1] == ':') { + if (is_return && isdigit(argv[0][1]) && strchr(&argv[0][1], ':')) { + event = strchr(&argv[0][1], ':') + 1; + event[-1] = '\0'; + ret = kstrtouint(&argv[0][1], 0, &maxactive); + if (ret) { + pr_info("Failed to parse maxactive.\n"); + return ret; + } + /* kretprobes instances are iterated over via a list. The +* maximum should stay reasonable. +*/ + if (maxactive > 1024) { + pr_info("Maxactive is too big.\n"); +
Re: [PATCH v1] tracing/kprobes: expose maxactive for kretprobe in kprobe_events
Thanks for the review, On Tue, Mar 28, 2017 at 5:23 PM, Masami Hiramatsu wrote: > On Tue, 28 Mar 2017 15:52:22 +0200 > Alban Crequy wrote: > >> When a kretprobe is installed on a kernel function, there is a maximum >> limit of how many calls in parallel it can catch (aka "maxactive"). A >> kernel module could call register_kretprobe() and initialize maxactive >> (see example in samples/kprobes/kretprobe_example.c). >> >> But that is not exposed to userspace and it is currently not possible to >> choose maxactive when writing to /sys/kernel/debug/tracing/kprobe_events > > I see, I found same issue last week... > >> >> The default maxactive can be as low as 1 on single-core with a >> non-preemptive kernel. This is too low and we need to increase it not >> only for recursive functions, but for functions that sleep or resched. >> >> This patch updates the format of the command that can be written to >> kprobe_events so that maxactive can be optionally specified. > > Good! this is completely same what I'm planning to add. > >> >> I need this for a bpf program attached to the kretprobe of >> inet_csk_accept, which can sleep for a long time. > > I'm also preparing another patch for using kmalloc(GFP_ATOMIC) in > kretprobe_pre_handler(), since this manual way is sometimes hard to > estimate how many instances needed. Anyway, since that may involve > unwilling memory allocation, this patch also needed. Where is that kretprobe_pre_handler()? I don't see any allocations in pre_handler_kretprobe(). >> BugLink: https://github.com/iovisor/bcc/issues/1072 > > Could you also add Lukasz to Cc list, since he had reported an issue > related this one. > > https://www.spinics.net/lists/linux-trace/msg00448.html > > Please see my comments below. > >> Signed-off-by: Alban Crequy >> --- >> Documentation/trace/kprobetrace.txt | 4 +++- >> kernel/trace/trace_kprobe.c | 34 +- >> 2 files changed, 32 insertions(+), 6 deletions(-) >> >> diff --git a/Documentation/trace/kprobetrace.txt >> b/Documentation/trace/kprobetrace.txt >> index 41ef9d8..655ca7e 100644 >> --- a/Documentation/trace/kprobetrace.txt >> +++ b/Documentation/trace/kprobetrace.txt >> @@ -23,7 +23,7 @@ current_tracer. Instead of that, add probe points via >> Synopsis of kprobe_events >> - >>p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe >> - r[:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe >> + r[MAXACTIVE][:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return >> probe >>-:[GRP/]EVENT : Clear a probe >> >> GRP : Group name. If omitted, use "kprobes" for it. >> @@ -32,6 +32,8 @@ Synopsis of kprobe_events >> MOD : Module name which has given SYM. >> SYM[+offs] : Symbol+offset where the probe is inserted. >> MEMADDR : Address where the probe is inserted. >> + MAXACTIVE : Maximum number of instances of the specified function that >> + can be probed simultaneously, or 0 for the default.(*) > > Here, yoy don't need (*), since [MAXACTIVE] is not a FETCHARGS. Why not? (*) refers to "(*) only for return probe." and the maxactive only makes sense for the kretprobe. >> FETCHARGS : Arguments. Each probe can have up to 128 args. >>%REG : Fetch register REG >> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c >> index 5f688cc..807e01c 100644 >> --- a/kernel/trace/trace_kprobe.c >> +++ b/kernel/trace/trace_kprobe.c >> @@ -282,6 +282,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const >> char *group, >>void *addr, >>const char *symbol, >>unsigned long offs, >> + int maxactive, >>int nargs, bool is_return) >> { >> struct trace_kprobe *tk; >> @@ -309,6 +310,8 @@ static struct trace_kprobe *alloc_trace_kprobe(const >> char *group, >> else >> tk->rp.kp.pre_handler = kprobe_dispatcher; >> >> + tk->rp.maxactive = maxactive; >> + >> if (!event || !is_good_name(event)) { >> ret = -EINVAL; >> goto error; >> @@ -598,8 +601,10 @@ static int create_trace_kprobe(int argc, char **argv) >> {
[PATCH v3] tracing/kprobes: expose maxactive for kretprobe in kprobe_events
From: Alban Crequy When a kretprobe is installed on a kernel function, there is a maximum limit of how many calls in parallel it can catch (aka "maxactive"). A kernel module could call register_kretprobe() and initialize maxactive (see example in samples/kprobes/kretprobe_example.c). But that is not exposed to userspace and it is currently not possible to choose maxactive when writing to /sys/kernel/debug/tracing/kprobe_events The default maxactive can be as low as 1 on single-core with a non-preemptive kernel. This is too low and we need to increase it not only for recursive functions, but for functions that sleep or resched. This patch updates the format of the command that can be written to kprobe_events so that maxactive can be optionally specified. I need this for a bpf program attached to the kretprobe of inet_csk_accept, which can sleep for a long time. This patch includes a basic selftest: > # ./ftracetest -v test.d/kprobe/ > === Ftrace unit tests === > [1] Kprobe dynamic event - adding and removing[PASS] > [2] Kprobe dynamic event - busy event check [PASS] > [3] Kprobe dynamic event with arguments [PASS] > [4] Kprobes event arguments with types[PASS] > [5] Kprobe dynamic event with function tracer [PASS] > [6] Kretprobe dynamic event with arguments[PASS] > [7] Kretprobe dynamic event with maxactive[PASS] > > # of passed: 7 > # of failed: 0 > # of unresolved: 0 > # of untested: 0 > # of unsupported: 0 > # of xfailed: 0 > # of undefined(test bug): 0 BugLink: https://github.com/iovisor/bcc/issues/1072 Signed-off-by: Alban Crequy --- Changes since v2: - Explain the default maxactive value in the documentation. (Review from Steven Rostedt) Changes since v1: - Remove "(*)" from documentation. (Review from Masami Hiramatsu) - Fix support for "r100" without the event name (Review from Masami Hiramatsu) - Get rid of magic numbers within the code. (Review from Steven Rostedt) Note that I didn't use KRETPROBE_MAXACTIVE_ALLOC since that patch is not merged. - Return -E2BIG when maxactive is too big. - Add basic selftest --- Documentation/trace/kprobetrace.txt| 5 ++- kernel/trace/trace_kprobe.c| 39 ++ .../ftrace/test.d/kprobe/kretprobe_maxactive.tc| 39 ++ 3 files changed, 76 insertions(+), 7 deletions(-) create mode 100644 tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index 41ef9d8..25f3960 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -23,7 +23,7 @@ current_tracer. Instead of that, add probe points via Synopsis of kprobe_events - p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe - r[:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS]: Set a return probe + r[MAXACTIVE][:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe -:[GRP/]EVENT: Clear a probe GRP : Group name. If omitted, use "kprobes" for it. @@ -32,6 +32,9 @@ Synopsis of kprobe_events MOD : Module name which has given SYM. SYM[+offs]: Symbol+offset where the probe is inserted. MEMADDR : Address where the probe is inserted. + MAXACTIVE : Maximum number of instances of the specified function that + can be probed simultaneously, or 0 for the default value + as defined in Documentation/kprobes.txt section 1.3.1. FETCHARGS : Arguments. Each probe can have up to 128 args. %REG : Fetch register REG diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c5089c7..ae81f3c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -25,6 +25,7 @@ #include "trace_probe.h" #define KPROBE_EVENT_SYSTEM "kprobes" +#define KRETPROBE_MAXACTIVE_MAX 4096 /** * Kprobe event core functions @@ -282,6 +283,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, void *addr, const char *symbol, unsigned long offs, +int maxactive, int nargs, bool is_return) { struct trace_kprobe *tk; @@ -309,6 +311,8 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, else tk->rp.kp.pre_handler = kprobe_dispatcher; + tk->rp.maxactive = maxactive; + if (!event || !is_good_name(event)) { ret = -EINVAL; goto error; @@ -598,8 +602,10 @@ static int create_trace_kprobe(int argc, char **ar
Re: [RFC PATCH tip/master 2/3] kprobes: Allocate kretprobe instance if its free list is empty
On Thu, Mar 30, 2017 at 8:53 AM, Ingo Molnar wrote: > > * Masami Hiramatsu wrote: > >> > So this is something I missed while the original code was merged, but the >> > concept >> > looks a bit weird: why do we do any "allocation" while a handler is >> > executing? >> > >> > That's fundamentally fragile. What's the maximum number of parallel >> > 'kretprobe_instance' required per kretprobe - one per CPU? >> >> It depends on the place where we put the probe. If the probed function will >> be >> blocked (yield to other tasks), then we need a same number of threads on >> the system which can invoke the function. So, ultimately, it is same >> as function_graph tracer, we need it for each thread. > > So then put it into task_struct (assuming there's no > kretprobe-inside-kretprobe > nesting allowed). There's just no way in hell we should be calling any complex > kernel function from kernel probes! Some kprobes are called from an interruption context. We have a kprobe on tcp_set_state() and this is sometimes called when the network card receives a packet. > I mean, think about it, a kretprobe can be installed in a lot of places, and > now > we want to call get_free_pages() from it?? This would add a massive amount of > fragility. > > Instrumentation must be _simple_, every patch that adds more complexity to the > most fundamental code path of it should raise a red flag ... > > So let's make this more robust, ok? > > Thanks, > > Ingo Thanks, Alban
Re: [RFC PATCH tip/master 1/3] trace: kprobes: Show sum of probe/retprobe nmissed count
On Wed, Mar 29, 2017 at 7:22 AM, Masami Hiramatsu wrote: > Show sum of probe and retprobe nmissed count in > kprobe_profile, since retprobe can be missed even > if the kprobe itself succeeeded. > This explains user why their return probe didn't hit > sometimes. > > Signed-off-by: Masami Hiramatsu I tested this patch with my kretprobe on "inet_csk_accept" when there are many processes waiting in the accept() syscall. I can now successfully see the nmissed counter in /sys/kernel/debug/tracing/kprobe_profile being incremented when the kretprobe is missed. Tested-by: Alban Crequy > --- > kernel/trace/trace_kprobe.c |2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c > index 013f4e7..bbdc3de 100644 > --- a/kernel/trace/trace_kprobe.c > +++ b/kernel/trace/trace_kprobe.c > @@ -896,7 +896,7 @@ static int probes_profile_seq_show(struct seq_file *m, > void *v) > seq_printf(m, " %-44s %15lu %15lu\n", >trace_event_name(&tk->tp.call), >trace_kprobe_nhit(tk), > - tk->rp.kp.nmissed); > + tk->rp.kp.nmissed + tk->rp.nmissed); > > return 0; > } >
[PATCH v2] tracing/kprobes: expose maxactive for kretprobe in kprobe_events
When a kretprobe is installed on a kernel function, there is a maximum limit of how many calls in parallel it can catch (aka "maxactive"). A kernel module could call register_kretprobe() and initialize maxactive (see example in samples/kprobes/kretprobe_example.c). But that is not exposed to userspace and it is currently not possible to choose maxactive when writing to /sys/kernel/debug/tracing/kprobe_events The default maxactive can be as low as 1 on single-core with a non-preemptive kernel. This is too low and we need to increase it not only for recursive functions, but for functions that sleep or resched. This patch updates the format of the command that can be written to kprobe_events so that maxactive can be optionally specified. I need this for a bpf program attached to the kretprobe of inet_csk_accept, which can sleep for a long time. This patch includes a basic selftest: > # ./ftracetest -v test.d/kprobe/ > === Ftrace unit tests === > [1] Kprobe dynamic event - adding and removing[PASS] > [2] Kprobe dynamic event - busy event check [PASS] > [3] Kprobe dynamic event with arguments [PASS] > [4] Kprobes event arguments with types[PASS] > [5] Kprobe dynamic event with function tracer [PASS] > [6] Kretprobe dynamic event with arguments[PASS] > [7] Kretprobe dynamic event with maxactive[PASS] > > # of passed: 7 > # of failed: 0 > # of unresolved: 0 > # of untested: 0 > # of unsupported: 0 > # of xfailed: 0 > # of undefined(test bug): 0 BugLink: https://github.com/iovisor/bcc/issues/1072 Signed-off-by: Alban Crequy --- Changes since v1: - Remove "(*)" from documentation. (Review from Masami Hiramatsu) - Fix support for "r100" without the event name (Review from Masami Hiramatsu) - Get rid of magic numbers within the code. (Review from Steven Rostedt) Note that I didn't use KRETPROBE_MAXACTIVE_ALLOC since that patch is not merged. - Return -E2BIG when maxactive is too big. - Add basic selftest --- Documentation/trace/kprobetrace.txt| 4 ++- kernel/trace/trace_kprobe.c| 39 ++ .../ftrace/test.d/kprobe/kretprobe_maxactive.tc| 39 ++ 3 files changed, 75 insertions(+), 7 deletions(-) create mode 100644 tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index 41ef9d8..7051a20 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -23,7 +23,7 @@ current_tracer. Instead of that, add probe points via Synopsis of kprobe_events - p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe - r[:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS]: Set a return probe + r[MAXACTIVE][:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe -:[GRP/]EVENT: Clear a probe GRP : Group name. If omitted, use "kprobes" for it. @@ -32,6 +32,8 @@ Synopsis of kprobe_events MOD : Module name which has given SYM. SYM[+offs]: Symbol+offset where the probe is inserted. MEMADDR : Address where the probe is inserted. + MAXACTIVE : Maximum number of instances of the specified function that + can be probed simultaneously, or 0 for the default. FETCHARGS : Arguments. Each probe can have up to 128 args. %REG : Fetch register REG diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c5089c7..ae81f3c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -25,6 +25,7 @@ #include "trace_probe.h" #define KPROBE_EVENT_SYSTEM "kprobes" +#define KRETPROBE_MAXACTIVE_MAX 4096 /** * Kprobe event core functions @@ -282,6 +283,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, void *addr, const char *symbol, unsigned long offs, +int maxactive, int nargs, bool is_return) { struct trace_kprobe *tk; @@ -309,6 +311,8 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, else tk->rp.kp.pre_handler = kprobe_dispatcher; + tk->rp.maxactive = maxactive; + if (!event || !is_good_name(event)) { ret = -EINVAL; goto error; @@ -598,8 +602,10 @@ static int create_trace_kprobe(int argc, char **argv) { /* * Argument syntax: -* - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] -* - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] +* - Add
Re: [RFC PATCH v4 1/2] fuse: introduce new fs_type flag FS_IMA_NO_CACHE
On Fri, Feb 2, 2018 at 5:10 PM, Miklos Szeredi wrote: > On Fri, Feb 2, 2018 at 4:33 PM, Mimi Zohar wrote: >> On Fri, 2018-02-02 at 10:20 -0500, Mimi Zohar wrote: >>> Hi Miklos, >>> >>> On Tue, 2018-01-30 at 19:06 +0100, Dongsu Park wrote: >>> > From: Alban Crequy >>> > >>> > This new fs_type flag FS_IMA_NO_CACHE means files should be re-measured, >>> > re-appraised and re-audited each time. Cached integrity results should >>> > not be used. >>> > >>> > It is useful in FUSE because the userspace FUSE process can change the >>> > underlying files at any time without notifying the kernel. > > I don't really have an understanding what IMA is doing, I think the > same thing applies to any network filesystem (i.e. ones with > d_revalidate). > > Isn't that the case? Hi Miklos, >From my limited understanding, network filesystems might need that too, yes. I don't know if there are people interested in using both IMA and network filesystems. If so, they would have to write that patch and test it. It is not a new issue, for neither network filesystems or FUSE. But I am more interested in the FUSE use case because FUSE can be mounted by unprivileged users either today with fusermount installed with setuid, or soon with the coming patches to allow FUSE mounts in a non-init user namespace. That makes the issue more visible than for network filesystems where unprivileged users cannot mount. Cheers, Alban
Re: [PATCH net-next v7 08/10] bpf: Add a Landlock sandbox example
Hi Mickaël, On 21 August 2017 at 02:09, Mickaël Salaün wrote: > Add a basic sandbox tool to create a process isolated from some part of > the system. This sandbox create a read-only environment. It is only > allowed to write to a character device such as a TTY: ... > + /* > +* This check allows the action on the file if it is a directory or a > +* pipe. Otherwise, a message is printed to the eBPF log. > +*/ > + if (S_ISCHR(ret) || S_ISFIFO(ret)) > + return 0; The comment says "directory", but the code checks for "character device". Thanks! Alban
[RFC v2 0/2] proc connector: get namespace events
This is v2 of the patch set to add namespace events in the proc connector. The act of a process creating or joining a namespace via clone(), unshare() or setns() is a useful signal for monitoring applications. I am working on a monitoring application that keeps track of all the containers and all processes inside each container. The current way of doing it is by polling regularly in /proc for the list of processes and in /proc/*/ns/* to know which namespaces they belong to. This is inefficient on systems with a large number of containers and a large number of processes. Instead, I would inspect /proc only one time and get the updates with the proc connector. Unfortunately, the proc connector gives me the list of processes but does not notify me when a process changes namespaces. So I would still need to inspect /proc/*/ns/*. (1) Add namespace events for processes. It generates a namespace event each time a process changes namespace via clone(), unshare() or setns(). (2) Add a way for userspace to detect if proc connector is able to send namespace events. Changes since RFC-v1: https://lkml.org/lkml/2016/9/8/588 * Supports userns. * The reason field says exactly whether it is clone/setns/unshare. * Sends aggregated messages containing details of several namespaces changes. Suggested by Evgeniy Polyakov. * Add patch 2 to detect if proc connector is able to send namespace events. This patch set is available in the git repository at: https://github.com/kinvolk/linux.git alban/proc_ns_connector-v2-5 Alban Crequy (2): proc connector: add namespace events proc connector: add a "get feature" op drivers/connector/cn_proc.c | 163 --- include/linux/cn_proc.h | 25 +++ include/uapi/linux/cn_proc.h | 27 ++- kernel/fork.c| 10 +++ kernel/nsproxy.c | 6 ++ 5 files changed, 220 insertions(+), 11 deletions(-) -- 2.7.4
[RFC v2 1/2] proc connector: add namespace events
From: Alban Crequy The act of a process creating or joining a namespace via clone(), unshare() or setns() is a useful signal for monitoring applications. I am working on a monitoring application that keeps track of all the containers and all processes inside each container. The current way of doing it is by polling regularly in /proc for the list of processes and in /proc/*/ns/* to know which namespaces they belong to. This is inefficient on systems with a large number of containers and a large number of processes. Instead, I would inspect /proc only one time and get the updates with the proc connector. Unfortunately, the proc connector gives me the list of processes but does not notify me when a process changes namespaces. So I would still need to inspect /proc/*/ns/*. This patch adds namespace events for processes. It generates a namespace event each time a process changes namespace via clone(), unshare() or setns(). For example, the following command: | # unshare -n -i -f ls -l /proc/self/ns/ | total 0 | lrwxrwxrwx 1 root root 0 Sep 25 22:31 cgroup -> 'cgroup:[4026531835]' | lrwxrwxrwx 1 root root 0 Sep 25 22:31 ipc -> 'ipc:[4026532208]' | lrwxrwxrwx 1 root root 0 Sep 25 22:31 mnt -> 'mnt:[4026531840]' | lrwxrwxrwx 1 root root 0 Sep 25 22:31 net -> 'net:[4026532210]' | lrwxrwxrwx 1 root root 0 Sep 25 22:31 pid -> 'pid:[4026531836]' | lrwxrwxrwx 1 root root 0 Sep 25 22:31 user -> 'user:[4026531837]' | lrwxrwxrwx 1 root root 0 Sep 25 22:31 uts -> 'uts:[4026531838]' causes the proc connector to generate the following events: | fork: ppid=691 pid=808 | exec: pid=808 | ns: pid=808 reason=unshare count=2 | type=ipc 4026531839 -> 4026532208 | type=net 4026531957 -> 4026532210 | fork: ppid=808 pid=809 | exec: pid=809 | exit: pid=809 | exit: pid=808 Signed-off-by: Alban Crequy --- drivers/connector/cn_proc.c | 138 +++ include/linux/cn_proc.h | 25 include/uapi/linux/cn_proc.h | 23 +++- kernel/fork.c| 10 kernel/nsproxy.c | 6 ++ 5 files changed, 201 insertions(+), 1 deletion(-) diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c index a782ce8..c38733d 100644 --- a/drivers/connector/cn_proc.c +++ b/drivers/connector/cn_proc.c @@ -30,8 +30,13 @@ #include #include #include +#include +#include +#include +#include #include +#include /* * Size of a cn_msg followed by a proc_event structure. Since the @@ -296,6 +301,139 @@ void proc_exit_connector(struct task_struct *task) send_msg(msg); } +void proc_ns_connector_prepare(struct ns_event_prepare *prepare, u16 reason) +{ + struct nsproxy *ns = current->nsproxy; + struct ns_common *mntns; + + prepare->num_listeners = atomic_read(&proc_event_num_listeners); + + if (prepare->num_listeners < 1) + return; + + prepare->reason = reason; + + prepare->user_inum = current->cred->user_ns->ns.inum; + prepare->uts_inum = ns->uts_ns->ns.inum; + prepare->ipc_inum = ns->ipc_ns->ns.inum; + + mntns = mntns_operations.get(current); + if (mntns) { + prepare->mnt_inum = mntns->inum; + mntns_operations.put(mntns); + } else + prepare->mnt_inum = 0; + + prepare->pid_inum = ns->pid_ns_for_children->ns.inum; + prepare->net_inum = ns->net_ns->ns.inum; + prepare->cgroup_inum = ns->cgroup_ns->ns.inum; +} + +void proc_ns_connector_send(struct ns_event_prepare *prepare, struct task_struct *task) +{ + struct nsproxy *ns = task->nsproxy; + struct ns_common *mntns; + struct cn_msg *msg; + struct proc_event *ev; + __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); + int count; + + if (prepare->num_listeners < 1) + return; + + if (atomic_read(&proc_event_num_listeners) < 1) + return; + + msg = buffer_to_cn_msg(buffer); + ev = (struct proc_event *)msg->data; + memset(&ev->event_data, 0, sizeof(ev->event_data)); + ev->timestamp_ns = ktime_get_ns(); + ev->what = PROC_EVENT_NS; + + ev->event_data.ns.process_pid = task->pid; + ev->event_data.ns.process_tgid = task->tgid; + ev->event_data.ns.reason = prepare->reason; + count = 0; + + /* user */ + if (prepare->user_inum != task->cred->user_ns->ns.inum) { + ev->event_data.ns.items[count].type = CLONE_NEWUSER; + ev->event_data.ns.items[count].flags = 0; + ev->event_data.ns.items[count].old_inum = prepare->user_inum; + ev->event_data.ns.items[count].inum = task->cred->user_ns->ns.inum; +
[RFC v2 2/2] proc connector: add a "get feature" op
From: Alban Crequy As more kinds of events are being added in the proc connector, userspace needs a way to detect whether the kernel supports those new events. When a kind of event is not supported, userspace should report an error propertly, or fallback to other methods (regular polling of procfs). The events fork, exec, uid, gid, sid, ptrace, comm, exit were added together. Then commit 2b5faa4c ("connector: Added coredumping event to the process connector") added coredump events but without a way for userspace to detect if the kernel will emit those. So I am grouping them all together in PROC_CN_FEATURE_BASIC. - PROC_CN_FEATURE_BASIC: supports fork, exec, uid, gid, sid, ptrace, comm, exit, coredump. - PROC_CN_FEATURE_NS: supports ns. Signed-off-by: Alban Crequy --- drivers/connector/cn_proc.c | 25 +++-- include/uapi/linux/cn_proc.h | 4 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c index c38733d..5f9ace6 100644 --- a/drivers/connector/cn_proc.c +++ b/drivers/connector/cn_proc.c @@ -442,15 +442,12 @@ void proc_ns_connector_send(struct ns_event_prepare *prepare, struct task_struct * values because it's not being returned via syscall return * mechanisms. */ -static void cn_proc_ack(int err, int rcvd_seq, int rcvd_ack) +static void cn_proc_ack(int err, u16 flags, int rcvd_seq, int rcvd_ack) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8); - if (atomic_read(&proc_event_num_listeners) < 1) - return; - msg = buffer_to_cn_msg(buffer); ev = (struct proc_event *)msg->data; memset(&ev->event_data, 0, sizeof(ev->event_data)); @@ -462,7 +459,7 @@ static void cn_proc_ack(int err, int rcvd_seq, int rcvd_ack) memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = rcvd_ack + 1; msg->len = sizeof(*ev); - msg->flags = 0; /* not used */ + msg->flags = flags; send_msg(msg); } @@ -475,9 +472,12 @@ static void cn_proc_mcast_ctl(struct cn_msg *msg, { enum proc_cn_mcast_op *mc_op = NULL; int err = 0; + u16 flags = 0; - if (msg->len != sizeof(*mc_op)) - return; + if (msg->len != sizeof(*mc_op)) { + err = EINVAL; + goto out; + } /* * Events are reported with respect to the initial pid @@ -485,8 +485,10 @@ static void cn_proc_mcast_ctl(struct cn_msg *msg, * other namespaces. */ if ((current_user_ns() != &init_user_ns) || - (task_active_pid_ns(current) != &init_pid_ns)) - return; + (task_active_pid_ns(current) != &init_pid_ns)) { + err = EPERM; + goto out; + } /* Can only change if privileged. */ if (!__netlink_ns_capable(nsp, &init_user_ns, CAP_NET_ADMIN)) { @@ -496,6 +498,9 @@ static void cn_proc_mcast_ctl(struct cn_msg *msg, mc_op = (enum proc_cn_mcast_op *)msg->data; switch (*mc_op) { + case PROC_CN_GET_FEATURES: + flags = PROC_CN_FEATURE_BASIC | PROC_CN_FEATURE_NS; + break; case PROC_CN_MCAST_LISTEN: atomic_inc(&proc_event_num_listeners); break; @@ -508,7 +513,7 @@ static void cn_proc_mcast_ctl(struct cn_msg *msg, } out: - cn_proc_ack(err, msg->seq, msg->ack); + cn_proc_ack(err, flags, msg->seq, msg->ack); } /* diff --git a/include/uapi/linux/cn_proc.h b/include/uapi/linux/cn_proc.h index 3270e8c..2ea0e5d 100644 --- a/include/uapi/linux/cn_proc.h +++ b/include/uapi/linux/cn_proc.h @@ -25,10 +25,14 @@ * for events on the connector. */ enum proc_cn_mcast_op { + PROC_CN_GET_FEATURES = 0, PROC_CN_MCAST_LISTEN = 1, PROC_CN_MCAST_IGNORE = 2 }; +#define PROC_CN_FEATURE_BASIC 0x0001 +#define PROC_CN_FEATURE_NS0x0002 + /* * From the user's point of view, the process * ID is the thread group ID and thread ID is the internal -- 2.7.4