Re: [PATCH 22/28] mm: add support for non block device backed swap files
Starting review in the middle, because this is the part I'm most familiar with. New addres_space_operations methods are added: int swapfile(struct address_space *, int); Separate -swapon() and -swapoff() methods would be so much cleaner IMO. Also is there a reason why 'struct file *' cannot be supplied to these functions? [snip] +int swap_set_page_dirty(struct page *page) +{ + struct swap_info_struct *sis = page_swap_info(page); + + if (sis-flags SWP_FILE) { + const struct address_space_operations *a_ops = + sis-swap_file-f_mapping-a_ops; + int (*spd)(struct page *) = a_ops-set_page_dirty; +#ifdef CONFIG_BLOCK + if (!spd) + spd = __set_page_dirty_buffers; +#endif This ifdef is not really needed. Just require -set_page_dirty() be filled in by filesystems which want swapfiles (and others too, in the longer term, the fallback is just historical crud). Here's an incremental patch addressing these issues and beautifying the new code. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] Index: linux/mm/page_io.c === --- linux.orig/mm/page_io.c 2008-02-26 11:15:58.0 +0100 +++ linux/mm/page_io.c 2008-02-26 13:40:55.0 +0100 @@ -106,8 +106,10 @@ int swap_writepage(struct page *page, st } if (sis-flags SWP_FILE) { - ret = sis-swap_file-f_mapping- - a_ops-swap_out(sis-swap_file, page, wbc); + struct file *swap_file = sis-swap_file; + struct address_space *mapping = swap_file-f_mapping; + + ret = mapping-a_ops-swap_out(swap_file, page, wbc); if (!ret) count_vm_event(PSWPOUT); return ret; @@ -136,12 +138,13 @@ void swap_sync_page(struct page *page) struct swap_info_struct *sis = page_swap_info(page); if (sis-flags SWP_FILE) { - const struct address_space_operations *a_ops = - sis-swap_file-f_mapping-a_ops; - if (a_ops-sync_page) - a_ops-sync_page(page); - } else + struct address_space *mapping = sis-swap_file-f_mapping; + + if (mapping-a_ops-sync_page) + mapping-a_ops-sync_page(page); + } else { block_sync_page(page); + } } int swap_set_page_dirty(struct page *page) @@ -149,17 
+152,12 @@ int swap_set_page_dirty(struct page *pag struct swap_info_struct *sis = page_swap_info(page); if (sis-flags SWP_FILE) { - const struct address_space_operations *a_ops = - sis-swap_file-f_mapping-a_ops; - int (*spd)(struct page *) = a_ops-set_page_dirty; -#ifdef CONFIG_BLOCK - if (!spd) - spd = __set_page_dirty_buffers; -#endif - return (*spd)(page); - } + struct address_space *mapping = sis-swap_file-f_mapping; - return __set_page_dirty_nobuffers(page); + return mapping-a_ops-set_page_dirty(page); + } else { + return __set_page_dirty_nobuffers(page); + } } int swap_readpage(struct file *file, struct page *page) @@ -172,8 +170,10 @@ int swap_readpage(struct file *file, str BUG_ON(PageUptodate(page)); if (sis-flags SWP_FILE) { - ret = sis-swap_file-f_mapping- - a_ops-swap_in(sis-swap_file, page); + struct file *swap_file = sis-swap_file; + struct address_space *mapping = swap_file-f_mapping; + + ret = mapping-a_ops-swap_in(swap_file, page); if (!ret) count_vm_event(PSWPIN); return ret; Index: linux/include/linux/fs.h === --- linux.orig/include/linux/fs.h 2008-02-26 11:15:58.0 +0100 +++ linux/include/linux/fs.h2008-02-26 13:29:40.0 +0100 @@ -485,7 +485,8 @@ struct address_space_operations { /* * swapfile support */ - int (*swapfile)(struct address_space *, int); + int (*swapon)(struct file *file); + int (*swapoff)(struct file *file); int (*swap_out)(struct file *file, struct page *page, struct writeback_control *wbc); int (*swap_in)(struct file *file, struct page *page); Index: linux/mm/swapfile.c === --- linux.orig/mm/swapfile.c2008-02-26 12:43:57.0 +0100 +++ linux/mm/swapfile.c 2008-02-26 13:34:57.0 +0100 @@ -1014,9 +1014,11 @@ static void destroy_swap_extents(struct } if (sis-flags SWP_FILE) { + struct file *swap_file = sis-swap_file; + struct address_space *mapping = swap_file
Re: [patch 00/10] mount ownership and unprivileged mount syscall (v8)
On Sat, Feb 23, 2008 at 06:33:13PM +0100, Miklos Szeredi wrote: c) just what is limited by that sysctl? AFAICS, rbind is allowed if mountpoint is on user vfsmount and it seems to create vfsmounts without eating into that limit just fine... What's the point of limiting the amount of vfsmounts marked user when you do not limit the number of vfsmount one can allocate? The limit is there, so that unprivileged users cannot create an insane number of mounts. It's just a safety thing, analogous to /proc/sys/fs/file-max. Can't they? Looks like one can create any number of vfsmounts without getting more than one marked MNT_USER... permit_mount() will set MS_SETUSER in flags, and do_loopback() will set CL_SETUSER based on that flag. If you are trying to limit the number of superblocks (i.e. active instances of filesystems), then I'd say that vfsmounts make piss-poor proxies for those and it would be better to count the objects you really want to count... I think I really want to limit vfsmounts. But not because these take so much memory or anything, just to be safe against stupid users playing with rbind and propagation, and things like that. Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[rfc patch] how to show propagation state for mounts
If you get down to it, the thing is about delegating control over part of namespace to somebody, without letting them control, see, etc. the rest of it. So I'd rather be very conservative about extra information we allow to piggyback on that. I don't know... perhaps with stable peer group IDs it would be OK to show peer group ID by (our) vfsmount + peer group ID of master + peer group ID of nearest dominating group that has intersection with our namespace. Then we don't leak information (AFAICS), get full propagation information between our vfsmounts and cooperating tasks in different namespaces can figure the things out as much as possible without leaking 3rd-party information to either. Here's a patch against current -mm implementing this (with some cleanups thrown in). Done some testing on it as well, it wasn't entirely trivial to figure out a setup, where propagation goes out of the namespace first, then comes back in: mount --bind /mnt1 /mnt1 mount --make-shared /mnt1 mount --bind /mnt2 /mnt2 mount --make-shared /mnt2 newns mount --make-slave /mnt1 old ns: mount --make-slave /mnt2 mount --bind /mnt1/tmp /mnt1/tmp new ns: mount --make-shared /mnt1/tmp mount --bind /mnt1/tmp /mnt2/tmp Voila. 
Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/pnode.c === --- linux.orig/fs/pnode.c 2008-02-22 15:27:23.0 +0100 +++ linux/fs/pnode.c2008-02-22 15:27:26.0 +0100 @@ -9,8 +9,12 @@ #include linux/mnt_namespace.h #include linux/mount.h #include linux/fs.h +#include linux/idr.h #include pnode.h +static DEFINE_SPINLOCK(mnt_pgid_lock); +static DEFINE_IDA(mnt_pgid_ida); + /* return the next shared peer mount of @p */ static inline struct vfsmount *next_peer(struct vfsmount *p) { @@ -27,36 +31,90 @@ static inline struct vfsmount *next_slav return list_entry(p-mnt_slave.next, struct vfsmount, mnt_slave); } -static int __peer_group_id(struct vfsmount *mnt) +static void __set_mnt_shared(struct vfsmount *mnt) { - struct vfsmount *m; - int id = mnt-mnt_id; + mnt-mnt_flags = ~MNT_PNODE_MASK; + mnt-mnt_flags |= MNT_SHARED; +} + +void set_mnt_shared(struct vfsmount *mnt) +{ + int res; - for (m = next_peer(mnt); m != mnt; m = next_peer(m)) - id = min(id, m-mnt_id); + retry: + spin_lock(mnt_pgid_lock); + if (IS_MNT_SHARED(mnt)) { + spin_unlock(mnt_pgid_lock); + return; + } - return id; + res = ida_get_new(mnt_pgid_ida, mnt-mnt_pgid); + spin_unlock(mnt_pgid_lock); + if (res == -EAGAIN) { + if (ida_pre_get(mnt_pgid_ida, GFP_KERNEL)) + goto retry; + } + __set_mnt_shared(mnt); +} + +void clear_mnt_shared(struct vfsmount *mnt) +{ + if (IS_MNT_SHARED(mnt)) { + mnt-mnt_flags = ~MNT_SHARED; + mnt-mnt_pgid = -1; + } +} + +void make_mnt_peer(struct vfsmount *old, struct vfsmount *mnt) +{ + mnt-mnt_pgid = old-mnt_pgid; + list_add(mnt-mnt_share, old-mnt_share); + __set_mnt_shared(mnt); } -/* return the smallest ID within the peer group */ int get_peer_group_id(struct vfsmount *mnt) { + return mnt-mnt_pgid; +} + +int get_master_id(struct vfsmount *mnt) +{ int id; spin_lock(vfsmount_lock); - id = __peer_group_id(mnt); + id = get_peer_group_id(mnt-mnt_master); spin_unlock(vfsmount_lock); return id; } -/* return the smallest ID within the master's peer group */ -int 
get_master_id(struct vfsmount *mnt) +static struct vfsmount *get_peer_in_ns(struct vfsmount *mnt, + struct mnt_namespace *ns) { - int id; + struct vfsmount *m = mnt; + + do { + if (m-mnt_ns == ns) + return m; + m = next_peer(m); + } while (m != mnt); + + return NULL; +} + +int get_dominator_id_same_ns(struct vfsmount *mnt) +{ + int id = -1; + struct vfsmount *m; spin_lock(vfsmount_lock); - id = __peer_group_id(mnt-mnt_master); + for (m = mnt-mnt_master; m != NULL; m = m-mnt_master) { + struct vfsmount *d = get_peer_in_ns(m, mnt-mnt_ns); + if (d) { + id = d-mnt_pgid; + break; + } + } spin_unlock(vfsmount_lock); return id; @@ -80,7 +138,13 @@ static int do_make_slave(struct vfsmount if (peer_mnt == mnt) peer_mnt = NULL; } - list_del_init(mnt-mnt_share); + if (!list_empty(mnt-mnt_share)) + list_del_init(mnt-mnt_share); + else if (IS_MNT_SHARED(mnt)) { + spin_lock(mnt_pgid_lock); + ida_remove(mnt_pgid_ida, mnt-mnt_pgid); + spin_unlock
Re: NFS/LSM: allow NFS to control all of its own mount options
Please don't introduce a special case for just nfs. All filesystems should control their mount options, so please provide some library helpers for context= handling and move it into all filesystems that can support selinux. Hmm, looks like selinux is not showing it's mount options in /proc/mounts. Well, actually there's no infrastructure for it either. Here's a template patch (completely untested). Selinux guys, please fill in the details and submit, thanks. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] Index: linux/fs/namespace.c === --- linux.orig/fs/namespace.c 2008-02-20 10:51:11.0 +0100 +++ linux/fs/namespace.c2008-02-20 10:51:25.0 +0100 @@ -385,6 +385,7 @@ static int show_vfsmnt(struct seq_file * if (mnt-mnt_flags fs_infop-flag) seq_puts(m, fs_infop-str); } + security_sb_show_options(m, mnt-mnt_sb); if (mnt-mnt_sb-s_op-show_options) err = mnt-mnt_sb-s_op-show_options(m, mnt); seq_puts(m, 0 0\n); Index: linux/include/linux/security.h === --- linux.orig/include/linux/security.h 2008-02-18 21:20:03.0 +0100 +++ linux/include/linux/security.h 2008-02-20 11:02:04.0 +0100 @@ -80,6 +80,7 @@ struct xfrm_selector; struct xfrm_policy; struct xfrm_state; struct xfrm_user_sec_ctx; +struct seq_file; extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb); extern int cap_netlink_recv(struct sk_buff *skb, int cap); @@ -1226,6 +1227,7 @@ struct security_operations { int (*sb_copy_data)(struct file_system_type *type, void *orig, void *copy); int (*sb_kern_mount) (struct super_block *sb, void *data); + int (*sb_show_options) (struct seq_file *, struct super_block *sb); int (*sb_statfs) (struct dentry *dentry); int (*sb_mount) (char *dev_name, struct nameidata * nd, char *type, unsigned long flags, void *data); @@ -1487,6 +1489,7 @@ int security_sb_alloc(struct super_block void security_sb_free(struct super_block *sb); int security_sb_copy_data(struct file_system_type *type, void *orig, void *copy); int security_sb_kern_mount(struct super_block *sb, void *data); +int 
security_sb_show_options(struct seq_file *, struct super_block *sb); int security_sb_statfs(struct dentry *dentry); int security_sb_mount(char *dev_name, struct nameidata *nd, char *type, unsigned long flags, void *data); @@ -1744,6 +1747,12 @@ static inline int security_sb_kern_mount return 0; } +static inline int security_sb_show_options (struct seq_file *m, + struct super_block *sb) +{ + return 0; +} + static inline int security_sb_statfs (struct dentry *dentry) { return 0; Index: linux/security/security.c === --- linux.orig/security/security.c 2008-02-18 21:20:06.0 +0100 +++ linux/security/security.c 2008-02-20 10:56:16.0 +0100 @@ -252,6 +252,14 @@ int security_sb_kern_mount(struct super_ return security_ops-sb_kern_mount(sb, data); } +int security_sb_show_options (struct seq_file *m, struct super_block *sb) +{ + if (security_ops-sb_show_options) + return security_ops-sb_show_options(m, sb); + else + return 0; +} + int security_sb_statfs(struct dentry *dentry) { return security_ops-sb_statfs(dentry); Index: linux/security/selinux/hooks.c === --- linux.orig/security/selinux/hooks.c 2008-02-18 21:20:06.0 +0100 +++ linux/security/selinux/hooks.c 2008-02-20 10:58:57.0 +0100 @@ -590,6 +590,12 @@ out: return rc; } +static int selinux_sb_show_options(struct seq_file *m, struct super_block *sb) +{ + /* ... */ + return 0; +} + static int superblock_doinit(struct super_block *sb, void *data) { struct superblock_security_struct *sbsec = sb-s_security; @@ -4797,6 +4803,7 @@ static struct security_operations selinu .sb_free_security = selinux_sb_free_security, .sb_copy_data = selinux_sb_copy_data, .sb_kern_mount =selinux_sb_kern_mount, + .sb_show_options = selinux_sb_show_options, .sb_statfs =selinux_sb_statfs, .sb_mount = selinux_mount, .sb_umount =selinux_umount, - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
how to show propagation state for mounts
mountinfo - IMO needs a sane discussion of what and how should be shown wrt propagation state Here's my take on the matter. The propagation tree can either be represented 1) from root to leaf listing members of peer groups and their slaves explicitly, 2) or from leaf to root by identifying each peer group and then for each mount showing the id of its own group and the id of the group's master. 2) can have two variants: 2a) id of peer group is constant in time 2b) id of peer group may change The current patch does 2b). Having a fixed id for each peer group would mean introducing a new object to anchor the peer group into, which would add complexity to the whole thing. All of these are implementable, just need to decide which one we want. Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: how to show propagation state for mounts
On Wed, Feb 20, 2008 at 04:39:15PM +0100, Miklos Szeredi wrote: mountinfo - IMO needs a sane discussion of what and how should be shown wrt propagation state Here's my take on the matter. The propagation tree can be either be represented 1) from root to leaf listing members of peer groups and their slaves explicitly, 2) or from leaf to root by identifying each peer group and then for each mount showing the id of its own group and the id of the group's master. 2) can have two variants: 2a) id of peer group is constant in time 2b) id of peer group may change The current patch does 2b). Having a fixed id for each peer group would mean introducing a new object to anchor the peer group into, which would add complexity to the whole thing. All of these are implementable, just need to decide which one we want. Eh... Much more interesting question: since the propagation tree spans multiple namespaces in a lot of normal uses, how do we deal with reconstructing propagation through the parts that are not present in our namespace? Moreover, what should and what should not be kept private to namespace? Full exposure of mount trees is definitely over the top (it shows potentially sensitive information), so we probably want less than that. FWIW, my gut feeling is that for each peer group that intersects with our namespace we ought to expose in some form * all vfsmounts belonging to that intersection * the nearest dominating peer group (== master (of master ...) of) that also has a non-empty intersection with our namespace It's less about the form of representation (after all, we generate poll events when contents of that sucker changes, so one *can* get a consistent snapshot of the entire thing) and more about having it self-contained when we have namespaces in the play. IOW, the data in there should give answers to questions that make sense. Do events get propagated from this vfsmount I have to that vfsmount I have? 
is a meaningful one; ditto for are events here propagated to somewhere I don't see? or are events getting propagated here from somewhere I don't see?. Well, assuming you see only one namespace. When I'm experimenting with namespaces and propagations, I see both (each in a separate xterm) and I do want to know how propagation between them happens. Your suggestion doesn't deal with that problem. Otherwise, yes it makes sense to have a consistent view of the tree shown for each namespace. Perhaps the solution is to restrict viewing the whole tree to privileged processes. Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: how to show propagation state for mounts
I wonder, what is wrong in reporting mounts in other namespaces that either receive and send propagation to mounts in our namespace? A plenty. E.g. if foo trusts control over /var/blah to bar, it's not obvious that foo has any business knowing if bar gets it from somebody else in turn. And I'm not sure that bar has any business knowing that foo has the damn thing attached in five places instead of just one, let alone _where_ it has been attached. If you get down to it, the thing is about delegating control over part of namespace to somebody, without letting them control, see, etc. the rest of it. So I'd rather be very conservative about extra information we allow to piggyback on that. I don't know... perhaps with stable peer group IDs it would be OK to show peer group ID by (our) vfsmount + peer group ID of master + peer group ID of nearest dominating group that has intersection with our namespace. Then we don't leak information (AFAICS), get full propagation information between our vfsmounts and cooperating tasks in different namespaces can figure the things out as much as possible without leaking 3rd-party information to either. This sounds fine. I'll have a look at implementing a stable peer group ID (it doesn't need a separate object, I realized that now). Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
git tree with VFS stuff
I've created a git tree with the following mounts related stuff: - read-only bind mounts - /proc/pid/mountinfo - unprivileged mounts git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfsstuff.git master I guess, giving these a spin in linux-next wouldn't hurt? Thanks, Miklos Dave Hansen (33): reiserfs: eliminate private use of struct file in xattr hppfs pass vfsmount to dentry_open() check for null vfsmount in dentry_open() fix up new filp allocators do namei_flags calculation inside open_namei() merge open_namei() and do_filp_open() r/o bind mounts: stub functions r/o bind mounts: create helper to drop file write access r/o bind mounts: drop write during emergency remount r/o bind mounts: elevate write count for vfs_rmdir() r/o bind mounts: elevate write count for callers of vfs_mkdir() r/o bind mounts: elevate mnt_writers for unlink callers r/o bind mounts: elevate write count for xattr_permission() callers r/o bind mounts: elevate write count for ncp_ioctl() r/o bind mounts: write counts for time functions r/o bind mounts: elevate write count for do_utimes() r/o bind mounts: write count for file_update_time() r/o bind mounts: write counts for link/symlink r/o bind mounts: elevate write count for ioctls() r/o bind mounts: elevate write count for open()s r/o bind mounts: get write access for vfs_rename() callers r/o bind mounts: elevate write count for chmod/chown callers r/o bind mounts: write counts for truncate() r/o bind mounts: elevate count for xfs timestamp updates r/o bind mounts: make access() use new r/o helper r/o bind mounts: check mnt instead of superblock directly r/o bind mounts: get callers of vfs_mknod/create() r/o bind mounts: track numbers of writers to mounts r/o bind mounts: honor mount writer counts at remount r/o bind mounts: debugging for missed calls ehea-fix fixes for missed struct paths from akpm Revert ehea-fix Miklos Szeredi (10): unprivileged mounts: add user mounts to the kernel unprivileged mounts: allow unprivileged umount 
unprivileged mounts: propagate error values from clone_mnt unprivileged mounts: account user mounts unprivileged mounts: allow unprivileged bind mounts unprivileged mounts: allow unprivileged mounts unprivileged mounts: add sysctl tunable for safe property unprivileged mounts: make fuse safe unprivileged mounts: propagation: inherit owner from parent unprivileged mounts: add no submounts flag Ram Pai (1): vfs-create-proc-pid-mountinfo - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Add MS_BIND_FLAGS mount flag
Maybe instead of messing with masks, it's better to introduce a get_flags() or a more general mount_stat() operation, and let userspace deal with setting and clearing flags, just as we do for stat/chmod? So we'd have mount_stat(path, stat); mount_bind(from, to, flags); mount_set_flags(path, flags); mount_move(from, to); and perhaps mount_remount(path, opt_string, flags); Sounds reasonable to me. But it wouldn't directly solve the do a recursive bind mount setting the MS_READONLY flag on all children problem, so we'd need some of the earlier suggestions too. Doh, you're right. Let's try the original idea, but a bit cleaner: /* flags: */ #define MNT_CTRL_RECURSE (1 << 0) /* mnt_flags: */ #define MNT_NOSUID 0x01 #define MNT_NODEV 0x02 #define MNT_NOEXEC 0x04 #define MNT_NOATIME 0x08 #define MNT_NODIRATIME 0x10 #define MNT_RELATIME 0x20 #define MNT_SHARED 0x1000 #define MNT_UNBINDABLE 0x2000 #define MNT_PNODE_MASK 0x3000 struct mount_param { u64 flags; /* control flags */ u64 mnt_flags; /* new mount flags */ u64 mnt_flags_mask; /* mask for new mount flags */ }; int mount_bindat(int fromfd, const char *frompath, int tofd, const char *topath, struct mount_param *param); int mount_setflagsat(int fd, const char *path, struct mount_param *param); int mount_moveat(int fromfd, const char *frompath, int tofd, const char *topath); ... I deliberately did not use the MS_* flags, which are currently a messy mix of things with totally different meanings. Does this solve all the issues? Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Add MS_BIND_FLAGS mount flag
And I'm not against doing it with the at* variants, as Trond suggested. If you're going to change the syscall, then you should ensure that it solves _all_ the problems that are known at this time. Ignoring the automounter issue is just going to force us to redo the syscall in a couple of months... Sure. Although, an (almost) equivalent userspace code would be: mount_fooat(int fd, const char *path) { char tmpbuf[64]; int tmpfd = openat(fd, path); sprintf(tmpbuf, "/proc/self/fd/%i", tmpfd); return mount_foo(tmpbuf, ...); } Or is there something (other than not requiring proc) that the *at variant gives? Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Add MS_BIND_FLAGS mount flag
On Thu, Feb 14, 2008 at 9:31 AM, Miklos Szeredi [EMAIL PROTECTED] wrote: I deliberately not used the MS_* flags, which is currently a messy mix of things with totally different meanings. Does this solve all the issues? We should add a size parameter either in the mount_params or as a final argument, for future extensibility. OK, let's add it to mount_params then. And we might as well include MNT_READONLY in the API on the assumption that per-mount readonly will be available soon. Right. That patch-set should already have been merged into 2.6.25... Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 01/26] mount options: add documentation
Could also please explain why you want to go via user mounts. Other OS use a daemon for that, which e.g. can maintain access controls. How do you want to manage this? The unprivileged mounts patches do contain a simple form of access control. I don't think anything more is needed, but of course, having unprivileged mounts in the kernel does not prevent the use of a more sophisticated access control daemon in userspace, if that becomes necessary. A I don't think anything more is needed lets go off all sorts of warning lights. Most things start out simple, so IMO it's very worth it to check where it might go to to know the limits beforehand. The main question here is why should a kernel based solution be preferable over a daemon based solution? A daemon based solution would work for the normal case, where we have a single mount namespace and a single /etc/mtab file, and we hope it doesn't get too much out of sync with what is actually in the kernel (on remount the mount options do get out of sync, but hey, we seem to be able to live with that). However, once you start using multiple namespaces, the daemon based solution quickly becomes unusable, because you would need a separate daemon for each namespace, and it would have to somehow keep track of mount propagations in userspace (which is basically impossible), etc, etc... Does that answer your question? Thanks, Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 07/10] unprivileged mounts: add sysctl tunable for safe property
Maybe sysctls just need to check capabilities, instead of uids. I think that would make a lot of sense anyway. Would it be as simple as tagging the inodes with capability sets? One set for writing, or one each for reading and writing? Yes, or something even simpler, like mapping the owner permission bits to CAP_SYS_ADMIN. There seem to be very few different permissions under /proc/sys: --w--- -r--r--r-- -rw--- -rw-r--r-- As long as the group and other bits are always the same, and we accept that the owner bits really mean CAP_SYS_ADMIN and not something else, But I would assume some things under /proc/sys/net/ipv4 or /proc/sys/net/ath0 require CAP_NET_ADMIN rather than CAP_SYS_ADMIN? I guess so. I'm not very familiar with the different capabilities :) How about this patch then: a hybrid solution between just relying on permission bits, and specifying separate capability sets for read and write in addition to the permission bits. Untested, the 'cap' field obviously still needs to be filled in where appropriate. Miklos Index: linux/include/linux/sysctl.h === --- linux.orig/include/linux/sysctl.h 2008-02-04 12:29:01.0 +0100 +++ linux/include/linux/sysctl.h2008-02-07 15:19:06.0 +0100 @@ -1041,6 +1041,7 @@ struct ctl_table void *data; int maxlen; mode_t mode; + int cap;/* Capability needed to read/write */ struct ctl_table *child; struct ctl_table *parent; /* Automatically set */ proc_handler *proc_handler; /* Callback for text formatting */ Index: linux/kernel/sysctl.c === --- linux.orig/kernel/sysctl.c 2008-02-05 22:17:05.0 +0100 +++ linux/kernel/sysctl.c 2008-02-07 15:30:45.0 +0100 @@ -1527,14 +1527,26 @@ out: * some sysctl variables are readonly even to root. 
*/ -static int test_perm(int mode, int op) +static int test_perm(struct ctl_table *table, int op) { - if (!current-euid) - mode = 6; - else if (in_egroup_p(0)) - mode = 3; + int cap = table-cap; + mode_t mode = table-mode; + + if (!cap) + cap = CAP_SYS_ADMIN; + + if ((op MAY_READ) !(mode S_IRUGO)) + return -EACCES; + + if ((op MAY_WRITE) !(mode S_IWUGO)) + return -EACCES; + + if (capable(cap)) + return 0; + if ((mode op 0007) == op) return 0; + return -EACCES; } @@ -1544,7 +1556,7 @@ int sysctl_perm(struct ctl_table *table, error = security_sysctl(table, op); if (error) return error; - return test_perm(table-mode, op); + return test_perm(table, op); } #ifdef CONFIG_SYSCTL_SYSCALL - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 07/10] unprivileged mounts: add sysctl tunable for safe property
+ t->table[0].mode = 0644; Yikes, this could be a problem for containers, as it's simply tied to uid 0, whereas tying it to a capability would let us solve it with capability bounds. This might mean more urgency to get user namespaces working at least with sysfs, else this is a quick way around having CAP_SYS_ADMIN taken out of a container's capability bounding set. I think I understand the problem, but not the solution. How are user namespaces going to help? Maybe sysctls just need to check capabilities, instead of uids. I think that would make a lot of sense anyway. Thanks, Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 02/10] unprivileged mounts: allow unprivileged umount
From: Miklos Szeredi [EMAIL PROTECTED] The owner doesn't need sysadmin capabilities to call umount(). Similar behavior as umount(8) on mounts having user=UID option in /etc/mtab. The difference is that umount also checks /etc/fstab, presumably to exclude another mount on the same mountpoint. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] Acked-by: Serge Hallyn [EMAIL PROTECTED] --- Index: linux/fs/namespace.c === --- linux.orig/fs/namespace.c 2008-02-04 23:47:50.0 +0100 +++ linux/fs/namespace.c2008-02-04 23:47:53.0 +0100 @@ -1033,6 +1033,27 @@ static int do_umount(struct vfsmount *mn return retval; } +static bool is_mount_owner(struct vfsmount *mnt, uid_t uid) +{ + return (mnt-mnt_flags MNT_USER) mnt-mnt_uid == uid; +} + +/* + * umount is permitted for + * - sysadmin + * - mount owner, if not forced umount + */ +static bool permit_umount(struct vfsmount *mnt, int flags) +{ + if (capable(CAP_SYS_ADMIN)) + return true; + + if (flags MNT_FORCE) + return false; + + return is_mount_owner(mnt, current-fsuid); +} + /* * Now umount can handle mount points as well as block devices. * This is important for filesystems which use unnamed block devices. @@ -1056,7 +1077,7 @@ asmlinkage long sys_umount(char __user * goto dput_and_out; retval = -EPERM; - if (!capable(CAP_SYS_ADMIN)) + if (!permit_umount(nd.path.mnt, flags)) goto dput_and_out; retval = do_umount(nd.path.mnt, flags); -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 05/10] unprivileged mounts: allow unprivileged bind mounts
From: Miklos Szeredi [EMAIL PROTECTED] Allow bind mounts to unprivileged users if the following conditions are met: - mountpoint is not a symlink - parent mount is owned by the user - the number of user mounts is below the maximum Unprivileged mounts imply MS_SETUSER, and will also have the nosuid and nodev mount flags set. In particular, if mounting process doesn't have CAP_SETUID capability, then the nosuid flag will be added, and if it doesn't have CAP_MKNOD capability, then the nodev flag will be added. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] Acked-by: Serge Hallyn [EMAIL PROTECTED] --- Index: linux/fs/namespace.c === --- linux.orig/fs/namespace.c 2008-02-04 23:47:58.0 +0100 +++ linux/fs/namespace.c2008-02-04 23:48:00.0 +0100 @@ -545,6 +545,11 @@ static void __set_mnt_user(struct vfsmou WARN_ON(mnt-mnt_flags MNT_USER); mnt-mnt_uid = current-fsuid; mnt-mnt_flags |= MNT_USER; + + if (!capable(CAP_SETUID)) + mnt-mnt_flags |= MNT_NOSUID; + if (!capable(CAP_MKNOD)) + mnt-mnt_flags |= MNT_NODEV; } static void set_mnt_user(struct vfsmount *mnt) @@ -1160,22 +1165,26 @@ asmlinkage long sys_oldumount(char __use #endif -static int mount_is_safe(struct nameidata *nd) +/* + * Conditions for unprivileged mounts are: + * - mountpoint is not a symlink + * - mountpoint is in a mount owned by the user + */ +static bool permit_mount(struct nameidata *nd, int *flags) { + struct inode *inode = nd-path.dentry-d_inode; + if (capable(CAP_SYS_ADMIN)) - return 0; - return -EPERM; -#ifdef notyet - if (S_ISLNK(nd-path.dentry-d_inode-i_mode)) - return -EPERM; - if (nd-path.dentry-d_inode-i_mode S_ISVTX) { - if (current-uid != nd-path.dentry-d_inode-i_uid) - return -EPERM; - } - if (vfs_permission(nd, MAY_WRITE)) - return -EPERM; - return 0; -#endif + return true; + + if (S_ISLNK(inode-i_mode)) + return false; + + if (!is_mount_owner(nd-path.mnt, current-fsuid)) + return false; + + *flags |= MS_SETUSER; + return true; } static int lives_below_in_same_fs(struct dentry *d, struct 
dentry *dentry) @@ -1419,9 +1428,10 @@ static int do_loopback(struct nameidata int clone_fl; struct nameidata old_nd; struct vfsmount *mnt = NULL; - int err = mount_is_safe(nd); - if (err) - return err; + int err; + + if (!permit_mount(nd, flags)) + return -EPERM; if (!old_name || !*old_name) return -EINVAL; err = path_lookup(old_name, LOOKUP_FOLLOW, old_nd); -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 08/10] unprivileged mounts: make fuse safe
From: Miklos Szeredi [EMAIL PROTECTED] Don't require the user_id= and group_id= options for unprivileged mounts, but if they are present, verify them for sanity. Disallow the allow_other option for unprivileged mounts. Document new way of enabling unprivileged mounts for fuse. Document problems with unprivileged mounts. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] Acked-by: Serge Hallyn [EMAIL PROTECTED] --- Index: linux/fs/fuse/inode.c === --- linux.orig/fs/fuse/inode.c 2008-02-04 23:47:46.0 +0100 +++ linux/fs/fuse/inode.c 2008-02-04 23:48:06.0 +0100 @@ -359,6 +359,19 @@ static int parse_fuse_opt(char *opt, str d-max_read = ~0; d-blksize = FUSE_DEFAULT_BLKSIZE; + /* +* For unprivileged mounts use current uid/gid. Still allow +* user_id and group_id options for compatibility, but +* only if they match these values. +*/ + if (!capable(CAP_SYS_ADMIN)) { + d-user_id = current-uid; + d-user_id_present = 1; + d-group_id = current-gid; + d-group_id_present = 1; + + } + while ((p = strsep(opt, ,)) != NULL) { int token; int value; @@ -387,6 +400,8 @@ static int parse_fuse_opt(char *opt, str case OPT_USER_ID: if (match_int(args[0], value)) return 0; + if (d-user_id_present d-user_id != value) + return 0; d-user_id = value; d-user_id_present = 1; break; @@ -394,6 +409,8 @@ static int parse_fuse_opt(char *opt, str case OPT_GROUP_ID: if (match_int(args[0], value)) return 0; + if (d-group_id_present d-group_id != value) + return 0; d-group_id = value; d-group_id_present = 1; break; @@ -603,6 +620,10 @@ static int fuse_fill_super(struct super_ if (!parse_fuse_opt((char *) data, d, is_bdev)) return -EINVAL; + /* This is a privileged option */ + if ((d.flags FUSE_ALLOW_OTHER) !capable(CAP_SYS_ADMIN)) + return -EPERM; + if (is_bdev) { #ifdef CONFIG_BLOCK if (!sb_set_blocksize(sb, d.blksize)) Index: linux/Documentation/filesystems/fuse.txt === --- linux.orig/Documentation/filesystems/fuse.txt 2008-01-24 23:58:37.0 +0100 +++ linux/Documentation/filesystems/fuse.txt2008-02-05 
19:34:24.0 +0100 @@ -215,11 +215,87 @@ the filesystem. There are several ways - Abort filesystem through the FUSE control filesystem. Most powerful method, always works. -How do non-privileged mounts work? -~~ +Unprivileged fuse mounts + -Since the mount() system call is a privileged operation, a helper -program (fusermount) is needed, which is installed setuid root. +Possible problems with unprivileged fuse mounts +--- + +FUSE was designed from the beginning to be safe for unprivileged +users. This has also been verified in practice over many years, with +some distributions enabling unprivileged FUSE mounts by default. + +However, there are cases when unprivileged mounting a fuse filesystem +may be problematic, particularly for multi-user systems with untrusted +users. So here are few words of warning: + +Due to the design of the process freezer, a hanging (due to network +problems, etc) or malicious filesystem may prevent suspending to ram +or hibernation to succeed. This is not actually unique to FUSE, as +any hanging network filesystem will have the same affect. + +It is not always possible to use kill(2) (not even with SIGKILL) to +terminate a process using a FUSE filesystem (see section Interrupting +filesystem operations above). As a special case of the above, +killing a self-deadlocked FUSE process is not possible, and even +killall5 will not terminate it. + +If the above could pose a threat to the system, it is recommended, +that unprivileged fuse mounts are not enabled. + +Ways of enabling user mounts + + +Now there are two different ways of allowing unprivileged fuse mounts: + + 1) new way: unprivileged mount syscall + + 2) old way: suid-root fusermount utility + +Unprivileged mount syscall +-- + +To enable this do + + echo 1 /proc/sys/fs/types/fuse/usermount_safe + +or add this line to /etc/sysctl.conf: + + fs.types.fuse.usermount_safe = 1 + +More information can be found in Documentation/filesystems/proc.txt +under the /proc/sys/fs/types/ heading. 
Also see
[patch 04/10] unprivileged mounts: account user mounts
From: Miklos Szeredi [EMAIL PROTECTED] Add sysctl variables for accounting and limiting the number of user mounts. The maximum number of user mounts is set to 1024 by default. This won't in itself enable user mounts, setting a mount to be owned by a user is first needed. [akpm] - don't use enumerated sysctls Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] Acked-by: Serge Hallyn [EMAIL PROTECTED] --- Index: linux/Documentation/filesystems/proc.txt === --- linux.orig/Documentation/filesystems/proc.txt 2008-02-04 23:47:47.0 +0100 +++ linux/Documentation/filesystems/proc.txt2008-02-04 23:47:58.0 +0100 @@ -1052,6 +1052,15 @@ reaches aio-max-nr then io_setup will fa raising aio-max-nr does not result in the pre-allocation or re-sizing of any kernel data structures. +nr_user_mounts and max_user_mounts +-- + +These represent the number of user mounts and the maximum number of +user mounts respectively. User mounts may be created by +unprivileged users. User mounts may also be created with sysadmin +privileges on behalf of a user, in which case nr_user_mounts may +exceed max_user_mounts. 
+ 2.2 /proc/sys/fs/binfmt_misc - Miscellaneous binary formats --- Index: linux/fs/namespace.c === --- linux.orig/fs/namespace.c 2008-02-04 23:47:56.0 +0100 +++ linux/fs/namespace.c2008-02-04 23:47:58.0 +0100 @@ -46,6 +46,9 @@ static struct list_head *mount_hashtable static struct kmem_cache *mnt_cache __read_mostly; static struct rw_semaphore namespace_sem; +int nr_user_mounts; +int max_user_mounts = 1024; + /* /sys/fs */ struct kobject *fs_kobj; EXPORT_SYMBOL_GPL(fs_kobj); @@ -511,21 +514,70 @@ static struct vfsmount *skip_mnt_tree(st return p; } -static void set_mnt_user(struct vfsmount *mnt) +static void dec_nr_user_mounts(void) +{ + spin_lock(vfsmount_lock); + nr_user_mounts--; + spin_unlock(vfsmount_lock); +} + +static int reserve_user_mount(void) +{ + int err = 0; + + spin_lock(vfsmount_lock); + /* +* EMFILE was error returned by mount(2) in the old days, when +* the mount count was limited. Reuse this error value to +* mean, that the maximum number of user mounts has been +* exceeded. 
+*/ + if (nr_user_mounts = max_user_mounts !capable(CAP_SYS_ADMIN)) + err = -EMFILE; + else + nr_user_mounts++; + spin_unlock(vfsmount_lock); + return err; +} + +static void __set_mnt_user(struct vfsmount *mnt) { WARN_ON(mnt-mnt_flags MNT_USER); mnt-mnt_uid = current-fsuid; mnt-mnt_flags |= MNT_USER; } +static void set_mnt_user(struct vfsmount *mnt) +{ + __set_mnt_user(mnt); + spin_lock(vfsmount_lock); + nr_user_mounts++; + spin_unlock(vfsmount_lock); +} + +static void clear_mnt_user(struct vfsmount *mnt) +{ + if (mnt-mnt_flags MNT_USER) { + mnt-mnt_uid = 0; + mnt-mnt_flags = ~MNT_USER; + dec_nr_user_mounts(); + } +} + static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, int flag) { struct super_block *sb = old-mnt_sb; - struct vfsmount *mnt = alloc_vfsmnt(old-mnt_devname); + struct vfsmount *mnt; + if (flag CL_SETUSER) { + int err = reserve_user_mount(); + if (err) + return ERR_PTR(err); + } + mnt = alloc_vfsmnt(old-mnt_devname); if (!mnt) - return ERR_PTR(-ENOMEM); + goto alloc_failed; mnt-mnt_flags = old-mnt_flags; atomic_inc(sb-s_active); @@ -537,7 +589,7 @@ static struct vfsmount *clone_mnt(struct /* don't copy the MNT_USER flag */ mnt-mnt_flags = ~MNT_USER; if (flag CL_SETUSER) - set_mnt_user(mnt); + __set_mnt_user(mnt); if (flag CL_SLAVE) { list_add(mnt-mnt_slave, old-mnt_slave_list); @@ -562,6 +614,11 @@ static struct vfsmount *clone_mnt(struct spin_unlock(vfsmount_lock); } return mnt; + + alloc_failed: + if (flag CL_SETUSER) + dec_nr_user_mounts(); + return ERR_PTR(-ENOMEM); } static inline void __mntput(struct vfsmount *mnt) @@ -577,6 +634,7 @@ static inline void __mntput(struct vfsmo */ WARN_ON(atomic_read(mnt-__mnt_writers)); dput(mnt-mnt_root); + clear_mnt_user(mnt); free_vfsmnt(mnt); deactivate_super(sb); } @@ -1446,6 +1504,7 @@ static int do_remount(struct nameidata * else err = do_remount_sb(sb, flags, data, 0); if (!err) { + clear_mnt_user(nd-path.mnt
[patch 06/10] unprivileged mounts: allow unprivileged mounts
From: Miklos Szeredi [EMAIL PROTECTED] For safe filesystems allow unprivileged mounting and forced unmounting. A filesystem type is considered safe, if mounting it by an unprivileged user may not cause a security problem. This is somewhat subjective, so setting this property is left to userspace (implemented in the next patch). Since most filesystems haven't been designed with unprivileged mounting in mind, a thorough audit is recommended before setting this property. Make this a separate integer member in 'struct file_system_type' instead of a flag, since that is easier to handle by sysctl code. Move subtype handling from do_kern_mount() into do_new_mount(). All other callers are kernel-internal and do not need subtype support. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] Acked-by: Serge Hallyn [EMAIL PROTECTED] --- Index: linux/fs/namespace.c === --- linux.orig/fs/namespace.c 2008-02-04 23:48:00.0 +0100 +++ linux/fs/namespace.c2008-02-04 23:48:02.0 +0100 @@ -1105,14 +1105,16 @@ static bool is_mount_owner(struct vfsmou /* * umount is permitted for * - sysadmin - * - mount owner, if not forced umount + * - mount owner + *o if not forced umount, + *o if forced umount, and filesystem is safe */ static bool permit_umount(struct vfsmount *mnt, int flags) { if (capable(CAP_SYS_ADMIN)) return true; - if (flags MNT_FORCE) + if ((flags MNT_FORCE) !(mnt-mnt_sb-s_type-fs_safe)) return false; return is_mount_owner(mnt, current-fsuid); @@ -1170,13 +1172,17 @@ asmlinkage long sys_oldumount(char __use * - mountpoint is not a symlink * - mountpoint is in a mount owned by the user */ -static bool permit_mount(struct nameidata *nd, int *flags) +static bool permit_mount(struct nameidata *nd, struct file_system_type *type, +int *flags) { struct inode *inode = nd-path.dentry-d_inode; if (capable(CAP_SYS_ADMIN)) return true; + if (type !type-fs_safe) + return false; + if (S_ISLNK(inode-i_mode)) return false; @@ -1430,7 +1436,7 @@ static int do_loopback(struct nameidata struct 
vfsmount *mnt = NULL; int err; - if (!permit_mount(nd, flags)) + if (!permit_mount(nd, NULL, flags)) return -EPERM; if (!old_name || !*old_name) return -EINVAL; @@ -1611,30 +1617,76 @@ out: return err; } +static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) +{ + int err; + const char *subtype = strchr(fstype, '.'); + if (subtype) { + subtype++; + err = -EINVAL; + if (!subtype[0]) + goto err; + } else + subtype = ; + + mnt-mnt_sb-s_subtype = kstrdup(subtype, GFP_KERNEL); + err = -ENOMEM; + if (!mnt-mnt_sb-s_subtype) + goto err; + return mnt; + + err: + mntput(mnt); + return ERR_PTR(err); +} + /* * create a new mount for userspace and request it to be added into the * namespace's tree */ -static int do_new_mount(struct nameidata *nd, char *type, int flags, +static int do_new_mount(struct nameidata *nd, char *fstype, int flags, int mnt_flags, char *name, void *data) { + int err; struct vfsmount *mnt; + struct file_system_type *type; - if (!type || !memchr(type, 0, PAGE_SIZE)) + if (!fstype || !memchr(fstype, 0, PAGE_SIZE)) return -EINVAL; - /* we need capabilities... 
*/ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - mnt = do_kern_mount(type, flags ~MS_SETUSER, name, data); - if (IS_ERR(mnt)) + type = get_fs_type(fstype); + if (!type) + return -ENODEV; + + err = -EPERM; + if (!permit_mount(nd, type, flags)) + goto out_put_filesystem; + + if (flags MS_SETUSER) { + err = reserve_user_mount(); + if (err) + goto out_put_filesystem; + } + + mnt = vfs_kern_mount(type, flags ~MS_SETUSER, name, data); + if (!IS_ERR(mnt) (type-fs_flags FS_HAS_SUBTYPE) + !mnt-mnt_sb-s_subtype) + mnt = fs_set_subtype(mnt, fstype); + put_filesystem(type); + if (IS_ERR(mnt)) { + if (flags MS_SETUSER) + dec_nr_user_mounts(); return PTR_ERR(mnt); + } if (flags MS_SETUSER) - set_mnt_user(mnt); + __set_mnt_user(mnt); return do_add_mount(mnt, nd, mnt_flags, NULL); + + out_put_filesystem: + put_filesystem(type); + return err; } /* @@ -1665,7 +1717,7 @@ int do_add_mount(struct vfsmount *newmnt if (S_ISLNK(newmnt-mnt_root
[patch 10/10] unprivileged mounts: add no submounts flag
From: Miklos Szeredi [EMAIL PROTECTED] Add a new mount flag nosubmnt, which denies submounts for the owner. This would be useful, if we want to support traditional /etc/fstab based user mounts. In this case mount(8) would still have to be suid-root, to check the mountpoint against the user/users flag in /etc/fstab, but /etc/mtab would no longer be mandatory for storing the actual owner of the mount. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] Acked-by: Serge Hallyn [EMAIL PROTECTED] --- Index: linux/fs/namespace.c === --- linux.orig/fs/namespace.c 2008-02-04 23:48:08.0 +0100 +++ linux/fs/namespace.c2008-02-04 23:48:10.0 +0100 @@ -783,6 +783,7 @@ static void show_mnt_opts(struct seq_fil { MNT_NOATIME, ,noatime }, { MNT_NODIRATIME, ,nodiratime }, { MNT_RELATIME, ,relatime }, + { MNT_NOSUBMNT, ,nosubmnt }, { 0, NULL } }; const struct proc_fs_info *fs_infop; @@ -1189,6 +1190,9 @@ static bool permit_mount(struct nameidat if (S_ISLNK(inode-i_mode)) return false; + if (nd-path.mnt-mnt_flags MNT_NOSUBMNT) + return false; + if (!is_mount_owner(nd-path.mnt, current-fsuid)) return false; @@ -2033,9 +2037,11 @@ long do_mount(char *dev_name, char *dir_ mnt_flags |= MNT_RELATIME; if (flags MS_RDONLY) mnt_flags |= MNT_READONLY; + if (flags MS_NOSUBMNT) + mnt_flags |= MNT_NOSUBMNT; - flags = ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | - MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT); + flags = ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_NOATIME | + MS_NODIRATIME | MS_RELATIME | MS_KERNMOUNT | MS_NOSUBMNT); /* ... 
and get the mountpoint */ retval = path_lookup(dir_name, LOOKUP_FOLLOW, nd); Index: linux/include/linux/fs.h === --- linux.orig/include/linux/fs.h 2008-02-04 23:48:08.0 +0100 +++ linux/include/linux/fs.h2008-02-04 23:48:10.0 +0100 @@ -129,6 +129,7 @@ extern int dir_notify_enable; #define MS_KERNMOUNT (122) /* this is a kern_mount call */ #define MS_I_VERSION (123) /* Update inode I_version field */ #define MS_SETUSER (124) /* set mnt_uid to current user */ +#define MS_NOSUBMNT(125) /* don't allow unprivileged submounts */ #define MS_ACTIVE (130) #define MS_NOUSER (131) Index: linux/include/linux/mount.h === --- linux.orig/include/linux/mount.h2008-02-04 23:47:50.0 +0100 +++ linux/include/linux/mount.h 2008-02-04 23:48:10.0 +0100 @@ -30,6 +30,7 @@ struct mnt_namespace; #define MNT_NODIRATIME 0x10 #define MNT_RELATIME 0x20 #define MNT_READONLY 0x40/* does the user want this to be r/o? */ +#define MNT_NOSUBMNT 0x80 #define MNT_SHRINKABLE 0x100 #define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */ -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 01/10] unprivileged mounts: add user mounts to the kernel
From: Miklos Szeredi [EMAIL PROTECTED] This patchset adds support for keeping mount ownership information in the kernel, and allow unprivileged mount(2) and umount(2) in certain cases. The mount owner has the following privileges: - unmount the owned mount - create a submount under the owned mount The sysadmin can set the owner explicitly on mount and remount. When an unprivileged user creates a mount, then the owner is automatically set to the user. The following use cases are envisioned: 1) Private namespace, with selected mounts owned by user. E.g. /home/$USER is a good candidate for allowing unpriv mounts and unmounts within. 2) Private namespace, with all mounts owned by user and having the nosuid flag. User can mount and umount anywhere within the namespace, but suid programs will not work. 3) Global namespace, with a designated directory, which is a mount owned by the user. E.g. /mnt/users/$USER is set up so that it is bind mounted onto itself, and set to be owned by $USER. The user can add/remove mounts only under this directory. The following extra security measures are taken for unprivileged mounts: - usermounts are limited by a sysctl tunable - force nosuid,nodev mount options on the created mount This series increases the size of vmlinux by about 1.5k on x86_64. For testing unprivileged mounts (and for other purposes) simple mount/umount utilities are available from: http://www.kernel.org/pub/linux/kernel/people/mszeredi/mmount/ A preliminary patch for util-linux-ng to add the same functionality to mount(8) and umount(8) is available here: http://lkml.org/lkml/2008/1/16/103 This patch: A new mount flag, MS_SETUSER is used to make a mount owned by a user. If this flag is specified, then the owner will be set to the current fsuid and the mount will be marked with the MNT_USER flag. On remount don't preserve previous owner, and treat MS_SETUSER as for a new mount. The MS_SETUSER flag is ignored on mount move. 
The MNT_USER flag is not copied on any kind of mount cloning: namespace creation, binding or propagation. For bind mounts the cloned mount(s) are set to MNT_USER depending on the MS_SETUSER mount flag. In all the other cases MNT_USER is always cleared. For MNT_USER mounts a user=UID option is added to /proc/PID/mounts. This is compatible with how mount ownership is stored in /etc/mtab. The rationale for using MS_SETUSER and MNT_USER to distinguish user mounts from non-user or legacy mounts is as follows: a) Mount(2) and umount(2) on legacy mounts always need CAP_SYS_ADMIN capability. As opposed to user mounts, which will only require that the mount owner matches the current fsuid. So a process with fsuid=0 should not be able to mount/umount legacy mounts without the CAP_SYS_ADMIN capability. b) Legacy userspace programs may set fsuid to nonzero before calling mount(2). In such an unlikely case, this patchset would cause an unintended side effect of making the mount owned by the fsuid. c) For legacy mounts, no user=UID option should be shown in /proc/mounts for backwards compatibility. 
Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] Acked-by: Serge Hallyn [EMAIL PROTECTED] --- Index: linux/fs/namespace.c === --- linux.orig/fs/namespace.c 2008-02-04 23:47:47.0 +0100 +++ linux/fs/namespace.c2008-02-04 23:47:50.0 +0100 @@ -511,6 +511,13 @@ static struct vfsmount *skip_mnt_tree(st return p; } +static void set_mnt_user(struct vfsmount *mnt) +{ + WARN_ON(mnt-mnt_flags MNT_USER); + mnt-mnt_uid = current-fsuid; + mnt-mnt_flags |= MNT_USER; +} + static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, int flag) { @@ -525,6 +532,11 @@ static struct vfsmount *clone_mnt(struct mnt-mnt_mountpoint = mnt-mnt_root; mnt-mnt_parent = mnt; + /* don't copy the MNT_USER flag */ + mnt-mnt_flags = ~MNT_USER; + if (flag CL_SETUSER) + set_mnt_user(mnt); + if (flag CL_SLAVE) { list_add(mnt-mnt_slave, old-mnt_slave_list); mnt-mnt_master = old; @@ -712,6 +724,8 @@ static void show_mnt_opts(struct seq_fil if (mnt-mnt_flags fs_infop-flag) seq_puts(m, fs_infop-str); } + if (mnt-mnt_flags MNT_USER) + seq_printf(m, ,user=%i, mnt-mnt_uid); } static void show_type(struct seq_file *m, struct super_block *sb) @@ -1320,8 +1334,9 @@ static int do_change_type(struct nameida /* * do loopback mount. */ -static int do_loopback(struct nameidata *nd, char *old_name, int recurse) +static int do_loopback(struct nameidata *nd, char *old_name, int flags) { + int clone_fl; struct nameidata old_nd
[patch 07/10] unprivileged mounts: add sysctl tunable for safe property
From: Miklos Szeredi [EMAIL PROTECTED] Add the following: /proc/sys/fs/types/${FS_TYPE}/usermount_safe Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/filesystems.c === --- linux.orig/fs/filesystems.c 2008-02-04 23:47:46.0 +0100 +++ linux/fs/filesystems.c 2008-02-04 23:48:04.0 +0100 @@ -12,6 +12,7 @@ #include linux/kmod.h #include linux/init.h #include linux/module.h +#include linux/sysctl.h #include asm/uaccess.h /* @@ -51,6 +52,57 @@ static struct file_system_type **find_fi return p; } +#define MAX_FILESYSTEM_VARS 1 + +struct filesystem_sysctl_table { + struct ctl_table_header *header; + struct ctl_table table[MAX_FILESYSTEM_VARS + 1]; +}; + +/* + * Create /sys/fs/types/${FSNAME} directory with per fs-type tunables. + */ +static int filesystem_sysctl_register(struct file_system_type *fs) +{ + struct filesystem_sysctl_table *t; + struct ctl_path path[] = { + { .procname = fs, .ctl_name = CTL_FS }, + { .procname = types, .ctl_name = CTL_UNNUMBERED }, + { .procname = fs-name, .ctl_name = CTL_UNNUMBERED }, + { } + }; + + t = kzalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return -ENOMEM; + + + t-table[0].ctl_name = CTL_UNNUMBERED; + t-table[0].procname = usermount_safe; + t-table[0].maxlen = sizeof(int); + t-table[0].data = fs-fs_safe; + t-table[0].mode = 0644; + t-table[0].proc_handler = proc_dointvec; + + t-header = register_sysctl_paths(path, t-table); + if (!t-header) { + kfree(t); + return -ENOMEM; + } + + fs-sysctl_table = t; + + return 0; +} + +static void filesystem_sysctl_unregister(struct file_system_type *fs) +{ + struct filesystem_sysctl_table *t = fs-sysctl_table; + + unregister_sysctl_table(t-header); + kfree(t); +} + /** * register_filesystem - register a new filesystem * @fs: the file system structure @@ -80,6 +132,13 @@ int register_filesystem(struct file_syst else *p = fs; write_unlock(file_systems_lock); + + if (res == 0) { + res = filesystem_sysctl_register(fs); + if (res != 0) + unregister_filesystem(fs); + } + return res; } @@ 
-108,6 +167,7 @@ int unregister_filesystem(struct file_sy *tmp = fs-next; fs-next = NULL; write_unlock(file_systems_lock); + filesystem_sysctl_unregister(fs); return 0; } tmp = (*tmp)-next; Index: linux/include/linux/fs.h === --- linux.orig/include/linux/fs.h 2008-02-04 23:48:02.0 +0100 +++ linux/include/linux/fs.h2008-02-04 23:48:04.0 +0100 @@ -1444,6 +1444,7 @@ struct file_system_type { struct module *owner; struct file_system_type * next; struct list_head fs_supers; + struct filesystem_sysctl_table *sysctl_table; struct lock_class_key s_lock_key; struct lock_class_key s_umount_key; Index: linux/Documentation/filesystems/proc.txt === --- linux.orig/Documentation/filesystems/proc.txt 2008-02-04 23:47:58.0 +0100 +++ linux/Documentation/filesystems/proc.txt2008-02-04 23:48:04.0 +0100 @@ -44,6 +44,7 @@ Table of Contents 2.14 /proc/pid/io - Display the IO accounting fields 2.15 /proc/pid/coredump_filter - Core dump filtering settings 2.16 /proc/pid/mountinfo - Information about mounts + 2.17 /proc/sys/fs/types - File system type specific parameters -- Preface @@ -2392,4 +2393,34 @@ For more information see: Documentation/filesystems/sharedsubtree.txt +2.17 /proc/sys/fs/types/ - File system type specific parameters + + +There's a separate directory /proc/sys/fs/types/type/ for each +filesystem type, containing the following files: + +usermount_safe +-- + +Setting this to non-zero will allow filesystems of this type to be +mounted by unprivileged users (note, that there are other +prerequisites as well). + +Fuse has been designed to be as safe as possible, and some +distributions already ship with unprivileged fuse mounts enabled by +default. There are still some situations (multi-user systems with +untrusted users in particular), where enabling this for fuse might not +be appropriate. For more details, see Documentation/filesystems/fuse.txt + +Procfs is also safe, but unprivileged mounting of it is not usually +necessary (bind mounting is equivalent). + +Most
[patch 2/3] mm: Add NR_WRITEBACK_TEMP counter
From: Miklos Szeredi [EMAIL PROTECTED] Fuse will use temporary buffers to write back dirty data from memory mappings (normal writes are done synchronously). This is needed, because there cannot be any guarantee about the time in which a write will complete. By using temporary buffers, from the MM's point if view the page is written back immediately. If the writeout was due to memory pressure, this effectively migrates data from a full zone to a less full zone. This patch adds a new counter (NR_WRITEBACK_TEMP) for the number of pages used as temporary buffers. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/proc/proc_misc.c === --- linux.orig/fs/proc/proc_misc.c 2008-02-04 12:29:00.0 +0100 +++ linux/fs/proc/proc_misc.c 2008-02-04 13:01:35.0 +0100 @@ -178,6 +178,7 @@ static int meminfo_read_proc(char *page, PageTables: %8lu kB\n NFS_Unstable: %8lu kB\n Bounce: %8lu kB\n + WritebackTmp: %8lu kB\n CommitLimit: %8lu kB\n Committed_AS: %8lu kB\n VmallocTotal: %8lu kB\n @@ -209,6 +210,7 @@ static int meminfo_read_proc(char *page, K(global_page_state(NR_PAGETABLE)), K(global_page_state(NR_UNSTABLE_NFS)), K(global_page_state(NR_BOUNCE)), + K(global_page_state(NR_WRITEBACK_TEMP)), K(allowed), K(committed), (unsigned long)VMALLOC_TOTAL 10, Index: linux/include/linux/mmzone.h === --- linux.orig/include/linux/mmzone.h 2008-02-04 12:29:01.0 +0100 +++ linux/include/linux/mmzone.h2008-02-04 13:01:35.0 +0100 @@ -95,6 +95,7 @@ enum zone_stat_item { NR_UNSTABLE_NFS,/* NFS unstable pages */ NR_BOUNCE, NR_VMSCAN_WRITE, + NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */ #ifdef CONFIG_NUMA NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ Index: linux/drivers/base/node.c === --- linux.orig/drivers/base/node.c 2008-02-04 12:28:53.0 +0100 +++ linux/drivers/base/node.c 2008-02-04 13:01:35.0 +0100 @@ -64,6 +64,7 @@ static ssize_t node_read_meminfo(struct Node %d PageTables: %8lu kB\n Node %d NFS_Unstable: %8lu kB\n Node 
%d Bounce: %8lu kB\n + Node %d WritebackTmp: %8lu kB\n Node %d Slab: %8lu kB\n Node %d SReclaimable: %8lu kB\n Node %d SUnreclaim: %8lu kB\n, @@ -86,6 +87,7 @@ static ssize_t node_read_meminfo(struct nid, K(node_page_state(nid, NR_PAGETABLE)), nid, K(node_page_state(nid, NR_UNSTABLE_NFS)), nid, K(node_page_state(nid, NR_BOUNCE)), + nid, K(node_page_state(nid, NR_WRITEBACK_TEMP)), nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) + node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)), Index: linux/mm/page-writeback.c === --- linux.orig/mm/page-writeback.c 2008-02-04 13:01:23.0 +0100 +++ linux/mm/page-writeback.c 2008-02-04 13:01:35.0 +0100 @@ -211,7 +211,8 @@ clip_bdi_dirty_limit(struct backing_dev_ avail_dirty = dirty - (global_page_state(NR_FILE_DIRTY) + global_page_state(NR_WRITEBACK) + -global_page_state(NR_UNSTABLE_NFS)); +global_page_state(NR_UNSTABLE_NFS) + +global_page_state(NR_WRITEBACK_TEMP)); if (avail_dirty 0) avail_dirty = 0; -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 3/3] fuse: support writable mmap
From: Miklos Szeredi [EMAIL PROTECTED] Quoting Linus (3 years ago, FUSE inclusion discussions): User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal write() calls). Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. This patch depends on mm-bdi-allow-setting-a-maximum-for-the-bdi-dirty-limit-fix.patch Fuse page writeback design -- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? 
The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safety measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. 
Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/fuse/dev.c === --- linux.orig/fs/fuse/dev.c2008-02-04 15:24:03.0 +0100 +++ linux/fs/fuse/dev.c 2008-02-04 15:24:47.0 +0100 @@ -47,6 +47,14 @@ struct fuse_req *fuse_request_alloc(void return req; } +struct fuse_req *fuse_request_alloc_nofs(void) +{ + struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_NOFS); + if (req) + fuse_request_init(req); + return req; +} + void fuse_request_free(struct fuse_req *req) { kmem_cache_free(fuse_req_cachep, req); @@ -430,6 +438,17 @@ void request_send_background(struct fuse } /* + * Called under fc-lock + * + * fc-connected must have been checked previously + */ +void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req) +{ + req-isreply = 1; + request_send_nowait_locked(fc, req); +} + +/* * Lock the request. Up to the next unlock_request() there mustn't be * anything that could cause a page-fault. If the request was already * aborted bail out. Index: linux/fs/fuse/dir.c === --- linux.orig/fs/fuse/dir.c2008-02-04 15:24:03.0 +0100 +++ linux/fs/fuse/dir.c 2008-02-04 15:24:47.0 +0100 @@ -1107,6 +1107,50 @@ static void iattr_to_fattr(struct iattr } /* + * Prevent concurrent writepages on inode + * + * This is done by adding a negative bias to the inode write counter + * and waiting for all pending writes to finish. + */ +void fuse_set_nowrite(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode
[patch 0/3] fuse: writable mmap
This is a short series for fuse writable mmap support. The first two patches are small additions to mm infrastructure. The third is a large patch for fuse. It also depends on the "mm: bdi: export BDI attributes in sysfs" series. I don't mind if this goes into 2.6.25 (guess, that depends on whether the bdi things go). Thanks, Miklos -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 0/3] add perform_write to a_ops
a_ops->perform_write() was left out from Nick Piggin's new a_ops patchset, as it was non-essential, and postponed for later inclusion. This short series reintroduces it, but only adds the fuse implementation and not simple_perform_write(), which I'm not sure would be a significant improvement. This allows larger than 4k buffered writes for fuse, which is one of the most requested features. This goes on top of the "fuse: writable mmap" patches. Thanks, Miklos -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 3/3] fuse: implement perform_write
From: Nick Piggin [EMAIL PROTECTED] Introduce fuse_perform_write. With fusexmp (a passthrough filesystem), large (1MB) writes into a backing tmpfs filesystem are sped up by almost 4 times (256MB/s vs 71MB/s). [EMAIL PROTECTED]: - split into smaller functions - testing Signed-off-by: Nick Piggin [EMAIL PROTECTED] Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/fuse/file.c === --- linux.orig/fs/fuse/file.c 2008-02-04 17:11:18.0 +0100 +++ linux/fs/fuse/file.c2008-02-04 17:11:59.0 +0100 @@ -677,6 +677,148 @@ static int fuse_write_end(struct file *f return res; } +static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, + struct inode *inode, loff_t pos, + size_t count) +{ + size_t res; + unsigned offset; + unsigned i; + + for (i = 0; i req-num_pages; i++) + fuse_wait_on_page_writeback(inode, req-pages[i]-index); + + res = fuse_send_write(req, file, inode, pos, count, NULL); + + offset = req-page_offset; + count = res; + for (i = 0; i req-num_pages; i++) { + struct page *page = req-pages[i]; + + if (!req-out.h.error !offset count = PAGE_CACHE_SIZE) + SetPageUptodate(page); + + /* Just ignore count underflow on last page */ + count -= PAGE_CACHE_SIZE - offset; + offset = 0; + + unlock_page(page); + page_cache_release(page); + } + + return res; +} + +static ssize_t fuse_fill_write_pages(struct fuse_req *req, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos) +{ + struct fuse_conn *fc = get_fuse_conn(mapping-host); + unsigned offset = pos (PAGE_CACHE_SIZE - 1); + size_t count = 0; + int err; + + req-page_offset = offset; + + do { + size_t tmp; + struct page *page; + pgoff_t index = pos PAGE_CACHE_SHIFT; + size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset, +iov_iter_count(ii)); + + bytes = min_t(size_t, bytes, fc-max_write - count); + + again: + err = -EFAULT; + if (iov_iter_fault_in_readable(ii, bytes)) + break; + + err = -ENOMEM; + page = __grab_cache_page(mapping, index); + if (!page) + break; + + 
pagefault_disable(); + tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); + pagefault_enable(); + flush_dcache_page(page); + + if (!tmp) { + unlock_page(page); + page_cache_release(page); + bytes = min(bytes, iov_iter_single_seg_count(ii)); + goto again; + } + + err = 0; + req-pages[req-num_pages] = page; + req-num_pages++; + + iov_iter_advance(ii, tmp); + count += tmp; + pos += tmp; + offset += tmp; + if (offset == PAGE_CACHE_SIZE) + offset = 0; + + } while (iov_iter_count(ii) count fc-max_write +req-num_pages FUSE_MAX_PAGES_PER_REQ offset == 0); + + return count 0 ? count : err; +} + +static ssize_t fuse_perform_write(struct file *file, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos) +{ + struct inode *inode = mapping-host; + struct fuse_conn *fc = get_fuse_conn(inode); + int err = 0; + ssize_t res = 0; + + if (is_bad_inode(inode)) + return -EIO; + + do { + struct fuse_req *req; + ssize_t count; + + req = fuse_get_req(fc); + if (IS_ERR(req)) { + err = PTR_ERR(req); + break; + } + + count = fuse_fill_write_pages(req, mapping, ii, pos); + if (count = 0) { + err = count; + } else { + size_t num_written; + + num_written = fuse_send_write_pages(req, file, inode, + pos, count); + err = req-out.h.error; + if (!err) { + res += num_written; + pos += num_written; + + /* break out of the loop on short write */ + if (num_written != count
Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)
In FUSE interrupts are sent to userspace, and the filesystem decides what to do with them. So it is entirely possible and valid for a filesystem to ignore an interrupt. If an operation was non-blocking (such as one returning an error), then there would in fact be no purpose in checking interrupts. Why do you think that it is valid to ignore pending signals? You seem to be asserting that it okay for processes to hang, uninterruptibly, when accessing files on fuse mounted file systems? Perhaps the right error to return when there is a signal pending is EINTR and not ESTALE or some other error? There has to be some way for the application to detect that its system call was interrupted due to a signal pending. Traditionally a lot of filesystem related system calls are not interruptible, and for good reason. For example what happens, if an app receives a signal, while the filesystem is performing a rename() request? It would be very confusing if the call returned EINTR, but the rename would successfully complete regardless. We had a related problem with the open(O_CREAT) call in fuse, which was interruptible between the creation and the actual open because of a design mistake. So it could return EINTR, after the file was created, and this broke a real world application (don't have details at hand, but could dig them out if you are interested). I don't know what NFS does, but returning EINTR without actually canceling an operation in the server is generally not a good idea. So while sending a signal might reliably work in NFS to break out of the loop, it does not necessarily work for other filesystems, and fuse may not be the only one affected. Have you noticed another one? I would be happy to chat with the developers for that file system to see if this support would negatively impact them. Oh, I have no idea. And I wouldn't want to do a full audit of all the filesystems to find out. But if you do, please go ahead. 
A few solutions come to mind, perhaps the best is to introduce a kernel internal errno value (ERETRYSTALE), that forces the relevant system calls to be retried. NFS could transform ESTALE errors to ERETRYSTALE and get the desired behavior, while other filesystems would not be affected. We don't need more error numbers, we've got plenty already. :-) That's a rather poor excuse against a simple solution which would spare us some backward compatibility problems. Do you have anything more specific about any real problems? I see lots of mays and coulds, but I don't see anything that I can do to make this support better. Implement the above suggestion? Or something else. Otherwise I have to NAK this patch due to the possibility of it breaking existing fuse installations. Thanks, Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)
I don't know what NFS does, but returning EINTR without actually canceling an operation in the server is generally not a good idea. This is what NFS has been doing, for several decades, and no one has complained yet. Is it really? Man nfs says something quite different (emphasis mine): intr: If an NFS file operation has a *major timeout* and it is hard mounted, then allow signals to interrupt the file operation and cause it to return EINTR to the calling program. The *default* is to *not* allow file operations to be *interrupted*. Have you noticed another one? I would be happy to chat with the developers for that file system to see if this support would negatively impact them. Oh, I have no idea. And I wouldn't want to do a full audit of all the filesystems to find out. But if you do, please go ahead. Well, you brought it up. I thought that perhaps you had something other than FUD. It's not FUD, it's being careful not to break an implementation when changing an API in a backward incompatible way. Please describe this real and existing fuse installation so that I can better understand the situation and the real requirements here. I have already done so: Also up till now, returning ESTALE in a fuse filesystem was a perfectly valid thing to do. This patch changes the behavior of that rather drastically. There might be installed systems that rely on current behavior, and we want to avoid breaking those on a kernel upgrade. Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 1/3] mm: bdi: export bdi_writeout_inc()
From: Miklos Szeredi [EMAIL PROTECTED] Fuse needs this for writable mmap support. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/include/linux/backing-dev.h === --- linux.orig/include/linux/backing-dev.h 2008-02-04 12:29:01.0 +0100 +++ linux/include/linux/backing-dev.h 2008-02-04 13:01:23.0 +0100 @@ -149,6 +149,8 @@ static inline unsigned long bdi_stat_err int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +extern void bdi_writeout_inc(struct backing_dev_info *bdi); + /* * Flags in backing_dev_info::capability * - The first two flags control whether dirty pages will contribute to the Index: linux/mm/page-writeback.c === --- linux.orig/mm/page-writeback.c 2008-02-04 12:29:01.0 +0100 +++ linux/mm/page-writeback.c 2008-02-04 13:01:23.0 +0100 @@ -168,6 +168,16 @@ static inline void __bdi_writeout_inc(st bdi-max_prop_frac); } +void bdi_writeout_inc(struct backing_dev_info *bdi) +{ + unsigned long flags; + + local_irq_save(flags); + __bdi_writeout_inc(bdi); + local_irq_restore(flags); +} +EXPORT_SYMBOL(bdi_writeout_inc); + static inline void task_dirty_inc(struct task_struct *tsk) { prop_inc_single(vm_dirties, tsk-dirties); -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 1/3] vfs: introduce perform_write in a_ops
From: Nick Piggin [EMAIL PROTECTED] Introduce a new perform_write() address space operation. This is a single-call, bulk version of write_begin/write_end operations. It is only used in the buffered write path (write_begin must still be implemented), and not for in-kernel writes to pagecache. For some filesystems, using this can provide significant speedups. Signed-off-by: Nick Piggin [EMAIL PROTECTED] Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/include/linux/fs.h === --- linux.orig/include/linux/fs.h 2008-02-04 15:24:03.0 +0100 +++ linux/include/linux/fs.h2008-02-04 16:24:19.0 +0100 @@ -469,6 +469,9 @@ struct address_space_operations { loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); + ssize_t (*perform_write)(struct file *, struct address_space *mapping, + struct iov_iter *i, loff_t pos); + /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); void (*invalidatepage) (struct page *, unsigned long); Index: linux/mm/filemap.c === --- linux.orig/mm/filemap.c 2008-02-04 15:24:03.0 +0100 +++ linux/mm/filemap.c 2008-02-04 16:22:55.0 +0100 @@ -2312,7 +2312,9 @@ generic_file_buffered_write(struct kiocb struct iov_iter i; iov_iter_init(i, iov, nr_segs, count, written); - if (a_ops-write_begin) + if (a_ops-perform_write) + status = a_ops-perform_write(file, mapping, i, pos); + else if (a_ops-write_begin) status = generic_perform_write(file, i, pos); else status = generic_perform_write_2copy(file, i, pos); Index: linux/Documentation/filesystems/vfs.txt === --- linux.orig/Documentation/filesystems/vfs.txt2008-02-04 12:28:50.0 +0100 +++ linux/Documentation/filesystems/vfs.txt 2008-02-04 16:23:44.0 +0100 @@ -533,6 +533,9 @@ struct address_space_operations { int (*write_end)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); + ssize_t (*perform_write)(struct file *, struct address_space *mapping, 
+ struct iov_iter *i, loff_t pos); + sector_t (*bmap)(struct address_space *, sector_t); int (*invalidatepage) (struct page *, unsigned long); int (*releasepage) (struct page *, int); @@ -664,6 +667,17 @@ struct address_space_operations { Returns 0 on failure, otherwise the number of bytes (= 'copied') that were able to be copied into pagecache. + perform_write: This is a single-call, bulk version of write_begin/write_end +operations. It is only used in the buffered write path (write_begin +must still be implemented), and not for in-kernel writes to pagecache. +It takes an iov_iter structure, which provides a descriptor for the +source data (and has associated iov_iter_xxx helpers to operate on +that data). There are also file, mapping, and pos arguments, which +specify the destination of the data. + +Returns 0 on failure if nothing was written out, otherwise returns +the number of bytes copied into pagecache. + bmap: called by the VFS to map a logical block offset within object to physical block number. This method is used by the FIBMAP ioctl and for working with swap-files. To be able to swap to -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH] vfs: optimization to /proc/pid/mountinfo patch
1) reports deleted inode in dentry_path() consistent with that in __d_path() 2) modified __d_path() to use prepend(), reducing the size of __d_path() 3) moved all the functionality that reports mount information in /proc under CONFIG_PROC_FS. Could not verify if the code would work with CONFIG_PROC_FS=n, since it was impossible to disable CONFIG_PROC_FS. Looking for ideas on how to disable CONFIG_PROC_FS. Signed-off-by: Ram Pai [EMAIL PROTECTED] --- fs/dcache.c | 59 +++ fs/namespace.c |2 + fs/seq_file.c|2 + include/linux/dcache.h |3 ++ include/linux/seq_file.h |3 ++ 5 files changed, 34 insertions(+), 35 deletions(-) Index: linux-2.6.23/fs/dcache.c === --- linux-2.6.23.orig/fs/dcache.c +++ linux-2.6.23/fs/dcache.c @@ -1747,6 +1747,17 @@ shouldnt_be_hashed: goto shouldnt_be_hashed; } +static int prepend(char **buffer, int *buflen, const char *str, + int namelen) +{ + *buflen -= namelen; + if (*buflen 0) + return 1; This is confusing. Should return -ENAMETOOLONG instead (see Chapter 16 in Documentation/CodingStyle). + *buffer -= namelen; + memcpy(*buffer, str, namelen); + return 0; +} + /** * d_path - return the path of a dentry * @dentry: dentry to report @@ -1768,17 +1779,11 @@ static char *__d_path(struct dentry *den { char * end = buffer+buflen; char * retval; - int namelen; - *--end = '\0'; - buflen--; - if (!IS_ROOT(dentry) d_unhashed(dentry)) { - buflen -= 10; - end -= 10; - if (buflen 0) + prepend(end, buflen, \0, 1); + if (!IS_ROOT(dentry) d_unhashed(dentry) + prepend(end, buflen, (deleted), 10)) And this should test for prepend() != 0 or prepend() < 0 instead, otherwise it could easily be misread as if prepend() succeeded, then And similarly for all the later calls. Thanks, Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 2/3] fuse: clean up setting i_size in write
From: Miklos Szeredi [EMAIL PROTECTED] Extract common code for setting i_size in write functions into a common helper. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/fuse/file.c === --- linux.orig/fs/fuse/file.c 2008-02-04 13:01:39.0 +0100 +++ linux/fs/fuse/file.c2008-02-04 13:02:03.0 +0100 @@ -610,13 +610,24 @@ static int fuse_write_begin(struct file return 0; } +static void fuse_write_update_size(struct inode *inode, loff_t pos) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(fc-lock); + fi-attr_version = ++fc-attr_version; + if (pos inode-i_size) + i_size_write(inode, pos); + spin_unlock(fc-lock); +} + static int fuse_buffered_write(struct file *file, struct inode *inode, loff_t pos, unsigned count, struct page *page) { int err; size_t nres; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); unsigned offset = pos (PAGE_CACHE_SIZE - 1); struct fuse_req *req; @@ -643,12 +654,7 @@ static int fuse_buffered_write(struct fi err = -EIO; if (!err) { pos += nres; - spin_lock(fc-lock); - fi-attr_version = ++fc-attr_version; - if (pos inode-i_size) - i_size_write(inode, pos); - spin_unlock(fc-lock); - + fuse_write_update_size(inode, pos); if (count == PAGE_CACHE_SIZE) SetPageUptodate(page); } @@ -766,12 +772,8 @@ static ssize_t fuse_direct_io(struct fil } fuse_put_request(fc, req); if (res 0) { - if (write) { - spin_lock(fc-lock); - if (pos inode-i_size) - i_size_write(inode, pos); - spin_unlock(fc-lock); - } + if (write) + fuse_write_update_size(inode, pos); *ppos = pos; } fuse_invalidate_attr(inode); -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 0/3] add perform_write to a_ops
a_ops->perform_write() was left out from Nick Piggin's new a_ops patchset, as it was non-essential, and postponed for later inclusion. This short series reintroduces it, but only adds the fuse implementation and not simple_perform_write(), which I'm not sure would be a significant improvement. This allows larger than 4k buffered writes for fuse, which is one of the most requested features. This goes on top of the "fuse: writable mmap" patches. Please don't do this, but rather implement your own .aio_write. There's very little in generic_file_aio_write that wouldn't be handled by ->perform_write and we should rather factor those up or move to higher layers than adding this ill-defined abstraction. Moving up to higher layers might not be possible, due to lock/unlock of i_mutex being inside generic_file_aio_write(). But with fuse being the only user, it's not a huge issue duplicating some code. Nick, were there any other candidates, that would want to use such an interface in the future? Thanks, Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)
Would you describe the situation that would cause the kernel to go into an infinite loop, please? The patch basically does: do { ... error = inode->i_op->foo() ... } while (error == ESTALE); What is the guarantee, that ->foo() will not always return ESTALE? You skimmed over some stuff, like the pathname lookup component contained in the first set of dots... I can't guarantee that ->foo() won't always return ESTALE. That said, the loop is not unbreakable. At least for NFS, a signal to the process will interrupt the loop because the error returned will change from ESTALE to EINTR. In FUSE interrupts are sent to userspace, and the filesystem decides what to do with them. So it is entirely possible and valid for a filesystem to ignore an interrupt. If an operation was non-blocking (such as one returning an error), then there would in fact be no purpose in checking interrupts. So while sending a signal might reliably work in NFS to break out of the loop, it does not necessarily work for other filesystems, and fuse may not be the only one affected. Also up till now, returning ESTALE in a fuse filesystem was a perfectly valid thing to do. This patch changes the behavior of that rather drastically. There might be installed systems that rely on current behavior, and we want to avoid breaking those on a kernel upgrade. A few solutions come to mind, perhaps the best is to introduce a kernel internal errno value (ERETRYSTALE), that forces the relevant system calls to be retried. NFS could transform ESTALE errors to ERETRYSTALE and get the desired behavior, while other filesystems would not be affected. Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 2/3] mm: bdi: use MAJOR:MINOR in /sys/class/bdi
From: Miklos Szeredi [EMAIL PROTECTED] Uniformly use MAJOR:MINOR in /sys/class/bdi/ for both block devices and non-block device backed filesystems: FUSE and NFS. Add symlink for block devices: /sys/block/name/bdi - /sys/class/bdi/bdi Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/block/genhd.c === --- linux.orig/block/genhd.c2008-02-02 22:41:03.0 +0100 +++ linux/block/genhd.c 2008-02-02 22:50:03.0 +0100 @@ -178,13 +178,17 @@ static int exact_lock(dev_t devt, void * */ void add_disk(struct gendisk *disk) { + struct backing_dev_info *bdi; + disk-flags |= GENHD_FL_UP; blk_register_region(MKDEV(disk-major, disk-first_minor), disk-minors, NULL, exact_match, exact_lock, disk); register_disk(disk); blk_register_queue(disk); - bdi_register(disk-queue-backing_dev_info, NULL, - blk-%s, disk-disk_name); + + bdi = disk-queue-backing_dev_info; + bdi_register_dev(bdi, MKDEV(disk-major, disk-first_minor)); + sysfs_create_link(disk-dev.kobj, bdi-dev-kobj, bdi); } EXPORT_SYMBOL(add_disk); @@ -192,8 +196,9 @@ EXPORT_SYMBOL(del_gendisk); /* in partit void unlink_gendisk(struct gendisk *disk) { - blk_unregister_queue(disk); + sysfs_remove_link(disk-dev.kobj, bdi); bdi_unregister(disk-queue-backing_dev_info); + blk_unregister_queue(disk); blk_unregister_region(MKDEV(disk-major, disk-first_minor), disk-minors); } Index: linux/include/linux/backing-dev.h === --- linux.orig/include/linux/backing-dev.h 2008-02-02 22:41:03.0 +0100 +++ linux/include/linux/backing-dev.h 2008-02-02 22:50:03.0 +0100 @@ -62,6 +62,7 @@ void bdi_destroy(struct backing_dev_info int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...); +int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); void bdi_unregister(struct backing_dev_info *bdi); static inline void __add_bdi_stat(struct backing_dev_info *bdi, Index: linux/mm/backing-dev.c === --- linux.orig/mm/backing-dev.c 2008-02-02 22:43:36.0 +0100 +++ linux/mm/backing-dev.c 2008-02-02 22:50:03.0 +0100 @@ 
-143,6 +143,12 @@ exit: } EXPORT_SYMBOL(bdi_register); +int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) +{ + return bdi_register(bdi, NULL, %u:%u, MAJOR(dev), MINOR(dev)); +} +EXPORT_SYMBOL(bdi_register_dev); + void bdi_unregister(struct backing_dev_info *bdi) { if (bdi-dev) { Index: linux/fs/fuse/inode.c === --- linux.orig/fs/fuse/inode.c 2008-02-02 22:41:03.0 +0100 +++ linux/fs/fuse/inode.c 2008-02-02 22:50:03.0 +0100 @@ -472,8 +472,7 @@ static struct fuse_conn *new_conn(struct err = bdi_init(fc-bdi); if (err) goto error_kfree; - err = bdi_register(fc-bdi, NULL, fuse-%u:%u, - MAJOR(fc-dev), MINOR(fc-dev)); + err = bdi_register_dev(fc-bdi, fc-dev); if (err) goto error_bdi_destroy; fc-reqctr = 0; Index: linux/fs/nfs/super.c === --- linux.orig/fs/nfs/super.c 2008-02-02 22:41:03.0 +0100 +++ linux/fs/nfs/super.c2008-02-02 22:50:03.0 +0100 @@ -1477,8 +1477,7 @@ static int nfs_compare_super(struct supe static int nfs_bdi_register(struct nfs_server *server) { - return bdi_register(server-backing_dev_info, NULL, nfs-%u:%u, - MAJOR(server-s_dev), MINOR(server-s_dev)); + return bdi_register_dev(server-backing_dev_info, server-s_dev); } static int nfs_get_sb(struct file_system_type *fs_type, Index: linux/Documentation/ABI/testing/sysfs-class-bdi === --- linux.orig/Documentation/ABI/testing/sysfs-class-bdi2008-02-02 22:41:03.0 +0100 +++ linux/Documentation/ABI/testing/sysfs-class-bdi 2008-02-02 22:50:03.0 +0100 @@ -6,17 +6,13 @@ Description: Provide a place in sysfs for the backing_dev_info object. This allows us to see and set the various BDI specific variables. -The bdi identifyer can take the following forms: +The bdi identifier can be either of the following: -blk-NAME +MAJOR:MINOR - Block devices, NAME is 'sda', 'loop0', etc... - -FSTYPE-MAJOR:MINOR - - Non-block device backed filesystems which provide their own - BDI, such as NFS and FUSE. MAJOR:MINOR is the value of st_dev - for files on this filesystem. + Device number for block devices
[patch 3/3] mm: bdi: move statistics to debugfs
From: Miklos Szeredi [EMAIL PROTECTED] Move BDI statistics to debugfs: /sys/kernel/debug/bdi/bdi/stats Use postcore_initcall() to initialize the sysfs class and debugfs, because debugfs is initialized in core_initcall(). Update descriptions in ABI documentation. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/include/linux/backing-dev.h === --- linux.orig/include/linux/backing-dev.h 2008-02-02 23:08:41.0 +0100 +++ linux/include/linux/backing-dev.h 2008-02-02 23:08:41.0 +0100 @@ -16,6 +16,7 @@ #include asm/atomic.h struct page; +struct dentry; /* * Bits in backing_dev_info.state @@ -55,6 +56,11 @@ struct backing_dev_info { unsigned int max_ratio, max_prop_frac; struct device *dev; + +#ifdef CONFIG_DEBUG_FS + struct dentry *debug_dir; + struct dentry *debug_stats; +#endif }; int bdi_init(struct backing_dev_info *bdi); Index: linux/mm/backing-dev.c === --- linux.orig/mm/backing-dev.c 2008-02-02 23:08:41.0 +0100 +++ linux/mm/backing-dev.c 2008-02-02 23:12:47.0 +0100 @@ -10,6 +10,80 @@ static struct class *bdi_class; +#ifdef CONFIG_DEBUG_FS +#include linux/debugfs.h +#include linux/seq_file.h + +static struct dentry *bdi_debug_root; + +static void bdi_debug_init(void) +{ + bdi_debug_root = debugfs_create_dir(bdi, NULL); +} + +static int bdi_debug_stats_show(struct seq_file *m, void *v) +{ + struct backing_dev_info *bdi = m-private; + long background_thresh; + long dirty_thresh; + long bdi_thresh; + + get_dirty_limits(background_thresh, dirty_thresh, bdi_thresh, bdi); + +#define K(x) ((x) (PAGE_SHIFT - 10)) + seq_printf(m, + BdiWriteback: %8lu kB\n + BdiReclaimable: %8lu kB\n + BdiDirtyThresh: %8lu kB\n + DirtyThresh: %8lu kB\n + BackgroundThresh: %8lu kB\n, + (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), + (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), + K(bdi_thresh), + K(dirty_thresh), + K(background_thresh)); +#undef K + + return 0; +} + +static int bdi_debug_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, 
bdi_debug_stats_show, inode-i_private); +} + +static const struct file_operations bdi_debug_stats_fops = { + .open = bdi_debug_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release= single_release, +}; + +static void bdi_debug_register(struct backing_dev_info *bdi, const char *name) +{ + bdi-debug_dir = debugfs_create_dir(name, bdi_debug_root); + bdi-debug_stats = debugfs_create_file(stats, 0444, bdi-debug_dir, + bdi, bdi_debug_stats_fops); +} + +static void bdi_debug_unregister(struct backing_dev_info *bdi) +{ + debugfs_remove(bdi-debug_stats); + debugfs_remove(bdi-debug_dir); +} +#else +static inline void bdi_debug_init(void) +{ +} +static inline void bdi_debug_register(struct backing_dev_info *bdi, + const char *name) +{ +} +static inline void bdi_debug_unregister(struct backing_dev_info *bdi) +{ +} +#endif + static ssize_t read_ahead_kb_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -40,21 +114,6 @@ static ssize_t name##_show(struct device BDI_SHOW(read_ahead_kb, K(bdi-ra_pages)) -BDI_SHOW(reclaimable_kb, K(bdi_stat(bdi, BDI_RECLAIMABLE))) -BDI_SHOW(writeback_kb, K(bdi_stat(bdi, BDI_WRITEBACK))) - -static inline unsigned long get_dirty(struct backing_dev_info *bdi, int i) -{ - unsigned long thresh[3]; - - get_dirty_limits(thresh[0], thresh[1], thresh[2], bdi); - - return thresh[i]; -} - -BDI_SHOW(dirty_kb, K(get_dirty(bdi, 1))) -BDI_SHOW(bdi_dirty_kb, K(get_dirty(bdi, 2))) - static ssize_t min_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -95,10 +154,6 @@ BDI_SHOW(max_ratio, bdi-max_ratio) static struct device_attribute bdi_dev_attrs[] = { __ATTR_RW(read_ahead_kb), - __ATTR_RO(reclaimable_kb), - __ATTR_RO(writeback_kb), - __ATTR_RO(dirty_kb), - __ATTR_RO(bdi_dirty_kb), __ATTR_RW(min_ratio), __ATTR_RW(max_ratio), __ATTR_NULL, @@ -108,10 +163,11 @@ static __init int bdi_class_init(void) { bdi_class = class_create(THIS_MODULE, bdi); bdi_class-dev_attrs = 
bdi_dev_attrs; + bdi_debug_init(); return 0; } -core_initcall(bdi_class_init); +postcore_initcall(bdi_class_init); int bdi_register(struct backing_dev_info *bdi
[patch 1/3] mm: bdi: fix read_ahead_kb_store()
From: Miklos Szeredi [EMAIL PROTECTED] This managed to completely evade testing :( Fix return value to be count or -errno. Also bring the function in line with the other store functions on this object, which have more strict input checking. Also fix bdi_set_max_ratio() to actually return an error, instead of always zero. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/mm/backing-dev.c === --- linux.orig/mm/backing-dev.c 2008-02-02 23:21:50.0 +0100 +++ linux/mm/backing-dev.c 2008-02-02 23:26:01.0 +0100 @@ -16,10 +16,15 @@ static ssize_t read_ahead_kb_store(struc { struct backing_dev_info *bdi = dev_get_drvdata(dev); char *end; + unsigned long read_ahead_kb; + ssize_t ret = -EINVAL; - bdi-ra_pages = simple_strtoul(buf, end, 10) (PAGE_SHIFT - 10); - - return end - buf; + read_ahead_kb = simple_strtoul(buf, end, 10); + if (*buf (end[0] == '\0' || (end[0] == '\n' end[1] == '\0'))) { + bdi-ra_pages = read_ahead_kb (PAGE_SHIFT - 10); + ret = count; + } + return ret; } #define K(pages) ((pages) (PAGE_SHIFT - 10)) Index: linux/mm/page-writeback.c === --- linux.orig/mm/page-writeback.c 2008-02-02 20:51:26.0 +0100 +++ linux/mm/page-writeback.c 2008-02-02 23:26:15.0 +0100 @@ -288,7 +288,7 @@ int bdi_set_max_ratio(struct backing_dev } spin_unlock_irqrestore(bdi_lock, flags); - return 0; + return ret; } EXPORT_SYMBOL(bdi_set_max_ratio); -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 0/3] mm: bdi: updates
Here are incremental patches against the export BDI attributes in sysfs patchset, addressing the issues identified at the last submission: - the read-only attributes are only for debugging - more consistent naming needed in /sys/class/bdi - documentation problems I've also done some testing, and fixed some bugs. Including patches in -mm can do wonders, even before the kernel containing them is released :) Let me know if you prefer a resubmission of the original series with these changes folded in. Thanks, Miklos -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)
This doesn't apply to -mm, because the ro-mounts stuff touches a lot of the same places as this patch. You probably need to rebase this on top of those changes. This patch adds handling for the error, ESTALE, to the system calls which take pathnames as arguments. The algorithm used is to detect that an ESTALE error has occurred during an operation subsequent to the lookup process and then to unwind appropriately and then to perform the lookup process again. Eventually, either the lookup process will return an error or a valid dentry/inode combination and then operation can succeed or fail based on its own merits. If a broken NFS server or FUSE filesystem keeps returning ESTALE, this goes into an infinite loop. How are we planning to deal with that? And it has to be dealt with either in the VFS, or in the kernel parts of the relevant filesystems. We can't just say, fix the broken servers, especially not with FUSE, where the server is totally untrusted. Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)
This doesn't apply to -mm, because the ro-mounts stuff touches a lot of the same places as this patch. You probably need to rebase this on top of those changes. This patch adds handling for the error, ESTALE, to the system calls which take pathnames as arguments. The algorithm used is to detect that an ESTALE error has occurred during an operation subsequent to the lookup process and then to unwind appropriately and then to perform the lookup process again. Eventually, either the lookup process will return an error or a valid dentry/inode combination and then operation can succeed or fail based on its own merits. If a broken NFS server or FUSE filesystem keeps returning ESTALE, this goes into an infinite loop. How are we planning to deal with that? Would you describe the situation that would cause the kernel to go into an infinite loop, please? The patch basically does: do { ... error = inode->i_op->foo() ... } while (error == ESTALE); What is the guarantee, that ->foo() will not always return ESTALE? Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 6/6] mm: bdi: allow setting a maximum for the bdi dirty limit
On Tue, 29 Jan 2008 16:49:06 +0100 Miklos Szeredi [EMAIL PROTECTED] wrote: Add max_ratio to /sys/class/bdi. This indicates the maximum percentage of the global dirty threshold allocated to this bdi. Maybe I'm having a stupid day, but I don't understand the semantics of this min and max at all. I've read the code, and I've read the comments (well, I've hunted for some) and I've read the docs. I really don't know how anyone could use this in its current state without doing a lot of code-reading and complex experimentation. All of which would be unneeded if this tunable was properly documented. So. Please provide adequate documentation for this tunable. I'd suggest that it be pitched at the level of a reasonably competent system operator. It should help them understand why the tunable exists, why they might choose to alter it, and what effects they can expect to see. Hopefully a reasonably competent kernel developer can then understand it too. OK. I think what's missing from some docs, is a high level description of the per-bdi throttling algorithm, and how it affects writeback. Because with that info, I think the min and max ratios are trivially understandable: they just override the result of the algorithm, in case it would mean too high or too low threshold. Peter, could you write something about that? Thanks, Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 01/26] mount options: add documentation
- loop: how is the connection between file and loop device maintained? We also discussed this with Karel, maybe it didn't make it onto lkml. The proposed solution was to store the loop flag separately in a file under /var. It could just be an empty file for each such loop device: /var/lib/mount/loops/loop0 This file is created by mount(8) if the '-oloop' option is given. And umount(8) automatically tears down the loop device if it finds this file. It seems we don't need this solution. There is loop auto-destruction patch in -mm. Kernel part: http://marc.info/?l=linux-kernel&m=119361296818388&w=2 mount(8) part: http://marc.info/?l=util-linux-ng&m=119362955431694&w=2 So, with this patch mount(8) needn't maintain info about loops and umount(8) doesn't need to call LOOP_CLR_FD ioctl, because umount(2) is enough. Excellent! This is a very good example of how moving a functionality into the kernel can greatly simplify it. Thanks, Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 1/6] mm: bdi: tweak task dirty penalty
From: Peter Zijlstra [EMAIL PROTECTED] Penalizing heavy dirtiers with 1/8-th the total dirty limit might be rather excessive on large memory machines. Use sqrt to scale it sub-linearly. Update the comment while we're there. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/mm/page-writeback.c === --- linux.orig/mm/page-writeback.c 2008-01-17 19:00:56.0 +0100 +++ linux/mm/page-writeback.c 2008-01-18 13:07:16.0 +0100 @@ -219,17 +219,21 @@ static inline void task_dirties_fraction } /* - * scale the dirty limit + * Task specific dirty limit: * - * task specific dirty limit: + * dirty -= 8 * sqrt(dirty) * p_{t} * - * dirty -= (dirty/8) * p_{t} + * Penalize tasks that dirty a lot of pages by lowering their dirty limit. This + * avoids infrequent dirtiers from getting stuck in this other guys dirty + * pages. + * + * Use a sub-linear function to scale the penalty, we only need a little room. */ static void task_dirty_limit(struct task_struct *tsk, long *pdirty) { long numerator, denominator; long dirty = *pdirty; - u64 inv = dirty 3; + u64 inv = 8*int_sqrt(dirty); task_dirties_fraction(tsk, numerator, denominator); inv *= numerator; -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 5/6] mm: bdi: allow setting a minimum for the bdi dirty limit
From: Peter Zijlstra [EMAIL PROTECTED] Add min_ratio to /sys/class/bdi. This indicates the minimum percentage of the global dirty threshold allocated to this bdi. [EMAIL PROTECTED] - fix parsing in min_ratio_store() - document new sysfs attribute Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/include/linux/backing-dev.h === --- linux.orig/include/linux/backing-dev.h 2008-01-29 14:40:35.0 +0100 +++ linux/include/linux/backing-dev.h 2008-01-29 15:35:34.0 +0100 @@ -51,6 +51,8 @@ struct backing_dev_info { struct prop_local_percpu completions; int dirty_exceeded; + unsigned int min_ratio; + struct device *dev; }; @@ -136,6 +138,8 @@ static inline unsigned long bdi_stat_err #endif } +int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); + /* * Flags in backing_dev_info::capability * - The first two flags control whether dirty pages will contribute to the Index: linux/mm/backing-dev.c === --- linux.orig/mm/backing-dev.c 2008-01-29 14:40:35.0 +0100 +++ linux/mm/backing-dev.c 2008-01-29 15:36:35.0 +0100 @@ -50,6 +50,24 @@ static inline unsigned long get_dirty(st BDI_SHOW(dirty_kb, K(get_dirty(bdi, 1))) BDI_SHOW(bdi_dirty_kb, K(get_dirty(bdi, 2))) +static ssize_t min_ratio_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + char *end; + unsigned int ratio; + ssize_t ret = -EINVAL; + + ratio = simple_strtoul(buf, end, 10); + if (*buf (end[0] == '\0' || (end[0] == '\n' end[1] == '\0'))) { + ret = bdi_set_min_ratio(bdi, ratio); + if (!ret) + ret = count; + } + return ret; +} +BDI_SHOW(min_ratio, bdi-min_ratio) + #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) static struct device_attribute bdi_dev_attrs[] = { @@ -58,6 +76,7 @@ static struct device_attribute bdi_dev_a __ATTR_RO(writeback_kb), __ATTR_RO(dirty_kb), __ATTR_RO(bdi_dirty_kb), + __ATTR_RW(min_ratio), __ATTR_NULL, 
}; @@ -116,6 +135,8 @@ int bdi_init(struct backing_dev_info *bd bdi-dev = NULL; + bdi-min_ratio = 0; + for (i = 0; i NR_BDI_STAT_ITEMS; i++) { err = percpu_counter_init_irq(bdi-bdi_stat[i], 0); if (err) Index: linux/mm/page-writeback.c === --- linux.orig/mm/page-writeback.c 2008-01-29 14:40:35.0 +0100 +++ linux/mm/page-writeback.c 2008-01-29 15:35:34.0 +0100 @@ -247,6 +247,29 @@ static void task_dirty_limit(struct task } /* + * + */ +static DEFINE_SPINLOCK(bdi_lock); +static unsigned int bdi_min_ratio; + +int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) +{ + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(bdi_lock, flags); + min_ratio -= bdi-min_ratio; + if (bdi_min_ratio + min_ratio 100) { + bdi_min_ratio += min_ratio; + bdi-min_ratio += min_ratio; + } else + ret = -EINVAL; + spin_unlock_irqrestore(bdi_lock, flags); + + return ret; +} + +/* * Work out the current dirty-memory clamping and background writeout * thresholds. * @@ -334,7 +357,7 @@ get_dirty_limits(long *pbackground, long *pdirty = dirty; if (bdi) { - u64 bdi_dirty = dirty; + u64 bdi_dirty; long numerator, denominator; /* @@ -342,8 +365,10 @@ get_dirty_limits(long *pbackground, long */ bdi_writeout_fraction(bdi, numerator, denominator); + bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; bdi_dirty *= numerator; do_div(bdi_dirty, denominator); + bdi_dirty += (dirty * bdi-min_ratio) / 100; *pbdi_dirty = bdi_dirty; clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty); Index: linux/Documentation/ABI/testing/sysfs-class-bdi === --- linux.orig/Documentation/ABI/testing/sysfs-class-bdi2008-01-29 14:40:35.0 +0100 +++ linux/Documentation/ABI/testing/sysfs-class-bdi 2008-01-29 15:37:24.0 +0100 @@ -48,3 +48,9 @@ bdi_dirty_kb (read-only) Current threshold on this BDI for reclaimable + writeback memory +min_ratio (read-write) + + Minimal percentage of global dirty threshold allocated to this + bdi. 
If the value written to this file would make the sum of all min_ratio values exceed 100, then EINVAL
[patch 2/6] mm: bdi: export BDI attributes in sysfs
From: Peter Zijlstra [EMAIL PROTECTED] Provide a place in sysfs (/sys/class/bdi) for the backing_dev_info object. This allows us to see and set the various BDI specific variables. In particular this properly exposes the read-ahead window for all relevant users and /sys/block/block/queue/read_ahead_kb should be deprecated. With patient help from Kay Sievers and Greg KH [EMAIL PROTECTED] - split off NFS and FUSE changes into separate patches - document new sysfs attributes under Documentation/ABI - do bdi_class_init as a core_initcall, otherwise the default BDI won't be initialized - remove bdi_init_fmt macro, it's not used very much Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] CC: Kay Sievers [EMAIL PROTECTED] CC: Greg KH [EMAIL PROTECTED] CC: Trond Myklebust [EMAIL PROTECTED] Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/block/genhd.c === --- linux.orig/block/genhd.c2008-01-29 13:02:41.0 +0100 +++ linux/block/genhd.c 2008-01-29 13:02:46.0 +0100 @@ -183,6 +183,8 @@ void add_disk(struct gendisk *disk) disk-minors, NULL, exact_match, exact_lock, disk); register_disk(disk); blk_register_queue(disk); + bdi_register(disk-queue-backing_dev_info, NULL, + blk-%s, disk-disk_name); } EXPORT_SYMBOL(add_disk); @@ -191,6 +193,7 @@ EXPORT_SYMBOL(del_gendisk); /* in partit void unlink_gendisk(struct gendisk *disk) { blk_unregister_queue(disk); + bdi_unregister(disk-queue-backing_dev_info); blk_unregister_region(MKDEV(disk-major, disk-first_minor), disk-minors); } Index: linux/include/linux/backing-dev.h === --- linux.orig/include/linux/backing-dev.h 2008-01-29 13:02:41.0 +0100 +++ linux/include/linux/backing-dev.h 2008-01-29 13:02:46.0 +0100 @@ -11,6 +11,8 @@ #include linux/percpu_counter.h #include linux/log2.h #include linux/proportions.h +#include linux/kernel.h +#include linux/device.h #include asm/atomic.h struct page; @@ -48,11 +50,17 @@ struct backing_dev_info { struct prop_local_percpu completions; int dirty_exceeded; + + struct device *dev; }; int 
bdi_init(struct backing_dev_info *bdi); void bdi_destroy(struct backing_dev_info *bdi); +int bdi_register(struct backing_dev_info *bdi, struct device *parent, + const char *fmt, ...); +void bdi_unregister(struct backing_dev_info *bdi); + static inline void __add_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item, s64 amount) { Index: linux/include/linux/writeback.h === --- linux.orig/include/linux/writeback.h2008-01-29 13:02:41.0 +0100 +++ linux/include/linux/writeback.h 2008-01-29 13:02:46.0 +0100 @@ -113,6 +113,9 @@ struct file; int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); +void get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, +struct backing_dev_info *bdi); + void page_writeback_init(void); void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied); Index: linux/mm/backing-dev.c === --- linux.orig/mm/backing-dev.c 2008-01-29 13:02:41.0 +0100 +++ linux/mm/backing-dev.c 2008-01-29 13:03:23.0 +0100 @@ -4,12 +4,118 @@ #include linux/fs.h #include linux/sched.h #include linux/module.h +#include linux/writeback.h +#include linux/device.h + + +static struct class *bdi_class; + +static ssize_t read_ahead_kb_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + char *end; + + bdi-ra_pages = simple_strtoul(buf, end, 10) (PAGE_SHIFT - 10); + + return end - buf; +} + +#define K(pages) ((pages) (PAGE_SHIFT - 10)) + +#define BDI_SHOW(name, expr) \ +static ssize_t name##_show(struct device *dev, \ + struct device_attribute *attr, char *page) \ +{ \ + struct backing_dev_info *bdi = dev_get_drvdata(dev);\ + \ + return snprintf(page, PAGE_SIZE-1, %lld\n, (long long)expr); \ +} + +BDI_SHOW(read_ahead_kb, K(bdi-ra_pages)) + +BDI_SHOW(reclaimable_kb, K(bdi_stat(bdi, BDI_RECLAIMABLE))) +BDI_SHOW(writeback_kb, K(bdi_stat(bdi, BDI_WRITEBACK
[patch 0/6] mm: bdi: updates
This is a series from Peter Zijlstra, with various updates by me. The patchset mostly deals with exporting BDI attributes in sysfs. Should be in a mergeable state, at least into -mm. -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 4/6] mm: bdi: expose the BDI object in sysfs for FUSE
From: Miklos Szeredi [EMAIL PROTECTED] Register FUSE's backing_dev_info under sysfs with the name fuse-MAJOR:MINOR Make the fuse control filesystem use s_dev instead of a fuse specific ID. This makes it easier to match directories under /sys/fs/fuse/connections/ with directories under /sys/class/bdi, and with actual mounts. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] CC: Peter Zijlstra [EMAIL PROTECTED] --- Index: linux/fs/fuse/control.c === --- linux.orig/fs/fuse/control.c2008-01-29 10:26:47.0 +0100 +++ linux/fs/fuse/control.c 2008-01-29 12:16:06.0 +0100 @@ -117,7 +117,7 @@ int fuse_ctl_add_conn(struct fuse_conn * parent = fuse_control_sb-s_root; inc_nlink(parent-d_inode); - sprintf(name, %llu, (unsigned long long) fc-id); + sprintf(name, %u, fc-dev); parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2, simple_dir_inode_operations, simple_dir_operations); Index: linux/fs/fuse/fuse_i.h === --- linux.orig/fs/fuse/fuse_i.h 2008-01-29 10:26:47.0 +0100 +++ linux/fs/fuse/fuse_i.h 2008-01-29 12:16:06.0 +0100 @@ -384,8 +384,8 @@ struct fuse_conn { /** Entry on the fuse_conn_list */ struct list_head entry; - /** Unique ID */ - u64 id; + /** Device ID from super block */ + dev_t dev; /** Dentries in the control filesystem */ struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES]; Index: linux/fs/fuse/inode.c === --- linux.orig/fs/fuse/inode.c 2008-01-29 10:26:47.0 +0100 +++ linux/fs/fuse/inode.c 2008-01-29 12:57:26.0 +0100 @@ -448,7 +448,7 @@ static int fuse_show_options(struct seq_ return 0; } -static struct fuse_conn *new_conn(void) +static struct fuse_conn *new_conn(struct super_block *sb) { struct fuse_conn *fc; int err; @@ -468,19 +468,27 @@ static struct fuse_conn *new_conn(void) atomic_set(fc-num_waiting, 0); fc-bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc-bdi.unplug_io_fn = default_unplug_io_fn; + fc-dev = sb-s_dev; err = bdi_init(fc-bdi); - if (err) { - kfree(fc); - fc = NULL; - goto out; - } + if (err) + goto error_kfree; + err = 
bdi_register(fc-bdi, NULL, fuse-%u:%u, + MAJOR(fc-dev), MINOR(fc-dev)); + if (err) + goto error_bdi_destroy; fc-reqctr = 0; fc-blocked = 1; fc-attr_version = 1; get_random_bytes(fc-scramble_key, sizeof(fc-scramble_key)); } -out: return fc; + +error_bdi_destroy: + bdi_destroy(fc-bdi); +error_kfree: + mutex_destroy(fc-inst_mutex); + kfree(fc); + return NULL; } void fuse_conn_put(struct fuse_conn *fc) @@ -578,12 +586,6 @@ static void fuse_send_init(struct fuse_c request_send_background(fc, req); } -static u64 conn_id(void) -{ - static u64 ctr = 1; - return ctr++; -} - static int fuse_fill_super(struct super_block *sb, void *data, int silent) { struct fuse_conn *fc; @@ -621,7 +623,7 @@ static int fuse_fill_super(struct super_ if (file-f_op != fuse_dev_operations) return -EINVAL; - fc = new_conn(); + fc = new_conn(sb); if (!fc) return -ENOMEM; @@ -659,7 +661,6 @@ static int fuse_fill_super(struct super_ if (file-private_data) goto err_unlock; - fc-id = conn_id(); err = fuse_ctl_add_conn(fc); if (err) goto err_unlock; -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch] vfs: create /proc/pid/mountinfo
From: Ram Pai [EMAIL PROTECTED] /proc/mounts in its current state fails to disambiguate bind mounts, especially when the bind mount is subrooted. Also it does not capture propagation state of the mounts(shared-subtree). The following patch addresses the problem. The patch adds '/proc/pid/mountinfo' which contains a superset of information in '/proc/pid/mounts'. The following fields are added: mntid -- is a unique identifier of the mount parent -- the id of the parent mount major:minor -- value of st_dev for files on that filesystem dir -- the subdir in the filesystem which forms the root of this mount propagation-type in the form of propagation_flag[:mntid][,...] note: 'shared' flag is followed by the mntid of its peer mount 'slave' flag is followed by the mntid of its master mount 'private' flag stands by itself 'unbindable' flag stands by itself Also mount options are split into two fields, the first containing the per mount flags, the second the per super block options. Here is a sample cat /proc/mounts after executing the following commands: mount --bind /mnt /mnt mount --make-shared /mnt mount --bind /mnt/1 /var mount --make-slave /var mount --make-shared /var mount --bind /var/abc /tmp mount --make-unbindable /proc 2 2 0:1 rootfs rootfs / / rw rw private 16 2 98:0 ext2 /dev/root / / rw rw private 17 16 0:3 proc /proc / /proc rw rw unbindable 18 16 0:10 devpts devpts /dev/pts / rw rw private 19 16 98:0 ext2 /dev/root /mnt /mnt rw rw shared:19 20 16 98:0 ext2 /dev/root /mnt/1 /var rw rw shared:21,slave:19 21 16 98:0 ext2 /dev/root /mnt/1/abc /tmp rw rw shared:20,slave:19 For example, the last line indicates that : 1) The mount is a shared mount. 2) Its peer mount is the mount with id 20 3) It is also a slave mount of the master-mount with the id 19 4) The filesystem on device with major/minor number 98:0 and subdirectory mnt/1/abc makes the root directory of this mount. 5) And finally the mount with id 16 is its parent. 
[EMAIL PROTECTED] - new file, rearrange fields - for mount ID's use IDA (from the IDR library) instead of a 32bit counter, which could overflow - print canonical ID's (smallest one within the peer group) for peers and master, this is more useful, than a random ID within the same namespace - fix a couple of small bugs - remove inlines - style fixes Signed-off-by: Ram Pai [EMAIL PROTECTED] Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/dcache.c === --- linux.orig/fs/dcache.c 2008-01-28 14:54:19.0 +0100 +++ linux/fs/dcache.c 2008-01-28 14:54:50.0 +0100 @@ -1890,6 +1890,60 @@ char *dynamic_dname(struct dentry *dentr return memcpy(buffer, temp, sz); } +static int prepend(char **buffer, int *buflen, const char *str, + int namelen) +{ + *buflen -= namelen; + if (*buflen 0) + return 1; + *buffer -= namelen; + memcpy(*buffer, str, namelen); + return 0; +} + +/* + * Write full pathname from the root of the filesystem into the buffer. + */ +char *dentry_path(struct dentry *dentry, char *buf, int buflen) +{ + char *end = buf + buflen; + char *retval; + + spin_lock(dcache_lock); + prepend(end, buflen, \0, 1); + if (!IS_ROOT(dentry) d_unhashed(dentry)) { + if (prepend(end, buflen, //deleted, 9)) + goto Elong; + } + if (buflen 1) + goto Elong; + /* Get '/' right */ + retval = end-1; + *retval = '/'; + + for (;;) { + struct dentry *parent; + if (IS_ROOT(dentry)) + break; + + parent = dentry-d_parent; + prefetch(parent); + + if (prepend(end, buflen, dentry-d_name.name, + dentry-d_name.len) || + prepend(end, buflen, /, 1)) + goto Elong; + + retval = end; + dentry = parent; + } + spin_unlock(dcache_lock); + return retval; +Elong: + spin_unlock(dcache_lock); + return ERR_PTR(-ENAMETOOLONG); +} + /* * NOTE! The user-level library version returns a * character pointer. 
The kernel system call just Index: linux/fs/namespace.c === --- linux.orig/fs/namespace.c 2008-01-28 14:54:19.0 +0100 +++ linux/fs/namespace.c2008-01-28 14:54:50.0 +0100 @@ -27,6 +27,7 @@ #include linux/mount.h #include linux/ramfs.h #include linux/log2.h +#include linux/idr.h #include asm/uaccess.h #include asm/unistd.h #include pnode.h @@ -39,6 +40,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); static int event; +static DEFINE_IDA(mnt_id_ida); static struct list_head *mount_hashtable __read_mostly; static struct kmem_cache
[patch 6/6] mm: bdi: allow setting a maximum for the bdi dirty limit
From: Peter Zijlstra [EMAIL PROTECTED] Add max_ratio to /sys/class/bdi. This indicates the maximum percentage of the global dirty threshold allocated to this bdi. [EMAIL PROTECTED] - fix parsing in max_ratio_store(). - export bdi_set_max_ratio() to modules - limit bdi_dirty with bdi-max_ratio - document new sysfs attribute Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/include/linux/backing-dev.h === --- linux.orig/include/linux/backing-dev.h 2008-01-29 16:33:14.0 +0100 +++ linux/include/linux/backing-dev.h 2008-01-29 16:33:14.0 +0100 @@ -52,6 +52,7 @@ struct backing_dev_info { int dirty_exceeded; unsigned int min_ratio; + unsigned int max_ratio, max_prop_frac; struct device *dev; }; @@ -139,6 +140,7 @@ static inline unsigned long bdi_stat_err } int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); +int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); /* * Flags in backing_dev_info::capability Index: linux/include/linux/proportions.h === --- linux.orig/include/linux/proportions.h 2008-01-29 16:25:14.0 +0100 +++ linux/include/linux/proportions.h 2008-01-29 16:33:14.0 +0100 @@ -78,6 +78,19 @@ void prop_inc_percpu(struct prop_descrip } /* + * Limit the time part in order to ensure there are some bits left for the + * cycle counter and fraction multiply. + */ +#define PROP_MAX_SHIFT (3*BITS_PER_LONG/4) + +#define PROP_FRAC_SHIFT(BITS_PER_LONG - PROP_MAX_SHIFT - 1) +#define PROP_FRAC_BASE (1UL PROP_FRAC_SHIFT) + +void __prop_inc_percpu_max(struct prop_descriptor *pd, + struct prop_local_percpu *pl, long frac); + + +/* * - SINGLE -- */ Index: linux/lib/proportions.c === --- linux.orig/lib/proportions.c2008-01-29 16:25:14.0 +0100 +++ linux/lib/proportions.c 2008-01-29 16:33:14.0 +0100 @@ -73,12 +73,6 @@ #include linux/proportions.h #include linux/rcupdate.h -/* - * Limit the time part in order to ensure there are some bits left for the - * cycle counter. 
- */ -#define PROP_MAX_SHIFT (3*BITS_PER_LONG/4) - int prop_descriptor_init(struct prop_descriptor *pd, int shift) { int err; @@ -268,6 +262,38 @@ void __prop_inc_percpu(struct prop_descr } /* + * identical to __prop_inc_percpu, except that it limits this pl's fraction to + * @frac/PROP_FRAC_BASE by ignoring events when this limit has been exceeded. + */ +void __prop_inc_percpu_max(struct prop_descriptor *pd, + struct prop_local_percpu *pl, long frac) +{ + struct prop_global *pg = prop_get_global(pd); + + prop_norm_percpu(pg, pl); + + if (unlikely(frac != PROP_FRAC_BASE)) { + unsigned long period_2 = 1UL (pg-shift - 1); + unsigned long counter_mask = period_2 - 1; + unsigned long global_count; + long numerator, denominator; + + numerator = percpu_counter_read_positive(pl-events); + global_count = percpu_counter_read(pg-events); + denominator = period_2 + (global_count counter_mask); + + if (numerator ((denominator * frac) PROP_FRAC_SHIFT)) + goto out_put; + } + + percpu_counter_add(pl-events, 1); + percpu_counter_add(pg-events, 1); + +out_put: + prop_put_global(pd, pg); +} + +/* * Obtain a fraction of this proportion * * p_{j} = x_{j} / (period/2 + t % period/2) Index: linux/mm/backing-dev.c === --- linux.orig/mm/backing-dev.c 2008-01-29 16:33:14.0 +0100 +++ linux/mm/backing-dev.c 2008-01-29 16:33:14.0 +0100 @@ -68,6 +68,24 @@ static ssize_t min_ratio_store(struct de } BDI_SHOW(min_ratio, bdi-min_ratio) +static ssize_t max_ratio_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + char *end; + unsigned int ratio; + ssize_t ret = -EINVAL; + + ratio = simple_strtoul(buf, end, 10); + if (*buf (end[0] == '\0' || (end[0] == '\n' end[1] == '\0'))) { + ret = bdi_set_max_ratio(bdi, ratio); + if (!ret) + ret = count; + } + return ret; +} +BDI_SHOW(max_ratio, bdi-max_ratio) + #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) static struct device_attribute 
bdi_dev_attrs[] = { @@ -77,6 +95,7 @@ static struct device_attribute bdi_dev_a __ATTR_RO(dirty_kb), __ATTR_RO(bdi_dirty_kb), __ATTR_RW(min_ratio
Re: [patch 21/26] mount options: partially fix nfs
All mount options should be shown, which are needed to reconstruct a previous mount. Ah, OK. I'm happy to implement logic to display the all missing options. I should have updated nfs_show_mount_options() when I wrote the NFS mount option parser. Let me know your preference. You are more familiar with NFS, so I think it would be better if you updated nfs_show_mount_options(). Could you also queue my patch (updated) or incorporate it into a combined fix? Thanks, Miklos Subject: mount options: partially fix nfs From: Miklos Szeredi [EMAIL PROTECTED] Add posix, bsize=, namelen= options to /proc/mounts for nfs filesystems. Document several other options that are still missing. Changes: - display namelen= unconditionally - addr= isn't missing after all Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] CC: Trond Myklebust [EMAIL PROTECTED] --- Index: linux/fs/nfs/super.c === --- linux.orig/fs/nfs/super.c 2008-01-25 15:44:56.0 +0100 +++ linux/fs/nfs/super.c2008-01-25 15:57:32.0 +0100 @@ -449,6 +449,7 @@ static void nfs_show_mount_options(struc } nfs_info[] = { { NFS_MOUNT_SOFT, ,soft, ,hard }, { NFS_MOUNT_INTR, ,intr, ,nointr }, + { NFS_MOUNT_POSIX, ,posix, }, { NFS_MOUNT_NOCTO, ,nocto, }, { NFS_MOUNT_NOAC, ,noac, }, { NFS_MOUNT_NONLM, ,nolock, }, @@ -463,6 +464,9 @@ static void nfs_show_mount_options(struc seq_printf(m, ,vers=%d, clp-rpc_ops-version); seq_printf(m, ,rsize=%d, nfss-rsize); seq_printf(m, ,wsize=%d, nfss-wsize); + seq_printf(m, ,namelen=%d, nfss-namelen); + if (nfss-bsize != 0) + seq_printf(m, ,bsize=%d, nfss-bsize); if (nfss-acregmin != 3*HZ || showdefaults) seq_printf(m, ,acregmin=%d, nfss-acregmin/HZ); if (nfss-acregmax != 60*HZ || showdefaults) @@ -482,6 +486,17 @@ static void nfs_show_mount_options(struc seq_printf(m, ,timeo=%lu, 10U * nfss-client-cl_timeout-to_initval / HZ); seq_printf(m, ,retrans=%u, nfss-client-cl_timeout-to_retries); seq_printf(m, ,sec=%s, nfs_pseudoflavour_to_name(nfss-client-cl_auth-au_flavor)); + + /* +* Missing options: +* port= 
+* mountport= +* mountvers= +* mountproto= +* clientaddr= +* mounthost= +* mountaddr= +*/ } /* - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 00/26] mount options: fix filesystem's -show_options
On Thu, 24 Jan 2008 20:33:41 +0100 Miklos Szeredi [EMAIL PROTECTED] wrote: Andrew, Would you please consider these patches for -mm? Sure, but I'm too lazy to pick through them and work out which ones need updating, which ones got acked and which ones someone else merged, all on a very bumpy plane flight ;) Please resend when the dust has settled? Yes, I should have thought, it won't quite work in a single iteration :) I'll resend them in a moment. Thanks, Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 24/26] mount options: fix tmpfs
Thanks Miklos, that's a welcome enhancement, nicely done. I've only noticed one thing wrong (MPOL_PREFERRED shown as default); but thought shmem_config didn't add much value - I'd rather avoid those syntactic changes to unchanged code; and several tmpfs defaults being relative (e.g. to totalram_pages, or to mounter's fsuid), I ended up preferring to do real tests in shmem_show_options. I completely agree, this is much better than my version. Thus, for example, if memory is hotplugged in or out later, what started out as an unspecified size option will then get shown as explicit size. (I did think for a while that I wanted to show explicit size in all cases; but it looked pretty silly on udev.) I think that's the correct behaviour, that otherwise would be misleading; but I may be looking at this the wrong way round, what's your view? I agree, this is the correct way. I'll add functions for calculating the default max values, so the calculations won't accidentally become different for the initialization and the option showing. If you agree with the version below, please take it into your collection and insert your Signed-off-by. I should admit, I've not yet tested how the NUMA policies look: you'll hear from me again tomorrow morning if those turn out to be wrong. OK, I'll send this to Andrew. Maybe I'll wait until tomorrow to hear if it's working on NUMA. Thanks, Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 10/26] mount options: fix devpts
Also add minor fix: when parsing the mode option, mask with S_IALLUGO instead of ~S_IFMT, which could leave unused bits in the mask. umode_t is 16 bits, so it doesn't. The change is still good, of course. We still use 16 bit types? Strange ;) + if (config.mode != DEVPTS_DEFAULT_MODE) + seq_printf(seq, ,mode=%03o, config.mode); I would rather this be unconditional, than that it be conditional on something other than the user having specified it in the first place. Yeah, it's a matter of taste. I'll update the patch. Actually, a lot of filesystems share the options 'uid=X', 'gid=X', 'mode=X' (or 'umask=X'). This could be handled by the VFS, saving some code, and making things more consistent. One day maybe... Thanks, Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 01/26] mount options: add documentation
Where did you check for the existence of a ->show_options method for unionfs? Unionfs does implement ->show_options and supports all of the mount/remount options. See: http://git.kernel.org/?p=linux/kernel/git/ezk/unionfs.git;a=blob;f=fs/unionfs/super.c;h=986c980261a5b171147d66ac05bf08423e2fd6b6;hb=HEAD#l963 The unionfs ->remount code supports branch-management options which can add/del/change a branch, but we don't show those directly in ->show_options; it makes more sense to show the final (and thus most current) branch configuration. Could you update your records please? Sure. Sorry about that, I did actually look at unionfs, and it was just an administration error and bad memory (in my head). BTW, I should be able to use your save_mount_options(). It is probably better not to use save_mount_options(). Especially, since unionfs implements a remount, that changes the tree only partially AFAICS. Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 25/26] mount options: fix udf
| + /* is this correct? */ | + if (sbi-s_anchor[2] != 0) | + seq_printf(seq, ,anchor=%u, sbi-s_anchor[2]); you know, I would prefer to use form UDF_SB_ANCHOR(sb)[2] in sake of style unification but we should wait for Jan's decision (i'm not the expert in this area ;) I think UDF_SB_ANCHOR macro was removed by some patch in -mm. I'm more interested if the second element of the s_anchor array really does always have the value of the 'anchor=N' mount option. I haven't been able to verify that fully. Do you have some insight into that? Thanks, Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 21/26] mount options: partially fix nfs
Miklos Szeredi wrote: From: Miklos Szeredi [EMAIL PROTECTED] Add posix, bsize=, namelen= options to /proc/mounts for nfs filesystems. Document several other options that are still missing. NFS lists only some options in /proc/mounts on purpose: only the essential options are mentioned there to keep clutter down. The three you've added here are for all intents and purposes deprecated, which is why they are not supported. NFS lists a more complete set of mount options for a mount point in /proc/self/mountstats. See nfs_show_stats(). Since your cover letter does not explain why you are changing this code, can you refer me to a description of why you are doing this? Description is in the 01/26 patch. More below. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/nfs/super.c === --- linux.orig/fs/nfs/super.c 2008-01-19 11:56:34.0 +0100 +++ linux/fs/nfs/super.c2008-01-21 20:41:30.0 +0100 @@ -449,6 +449,7 @@ static void nfs_show_mount_options(struc } nfs_info[] = { { NFS_MOUNT_SOFT, ,soft, ,hard }, { NFS_MOUNT_INTR, ,intr, ,nointr }, + { NFS_MOUNT_POSIX, ,posix, }, { NFS_MOUNT_NOCTO, ,nocto, }, { NFS_MOUNT_NOAC, ,noac, }, { NFS_MOUNT_NONLM, ,nolock, }, @@ -459,10 +460,17 @@ static void nfs_show_mount_options(struc }; const struct proc_nfs_info *nfs_infop; struct nfs_client *clp = nfss-nfs_client; + unsigned int default_namelen = + clp-rpc_ops-version == 4 ? NFS4_MAXNAMLEN : + clp-rpc_ops-version == 3 ? 
NFS3_MAXNAMLEN : NFS2_MAXNAMLEN; seq_printf(m, ,vers=%d, clp-rpc_ops-version); seq_printf(m, ,rsize=%d, nfss-rsize); seq_printf(m, ,wsize=%d, nfss-wsize); + if (nfss-bsize != 0) + seq_printf(m, ,bsize=%d, nfss-bsize); + if (nfss-namelen != default_namelen) + seq_printf(m, ,namelen=%d, nfss-namelen); if (nfss-acregmin != 3*HZ || showdefaults) seq_printf(m, ,acregmin=%d, nfss-acregmin/HZ); if (nfss-acregmax != 60*HZ || showdefaults) @@ -482,6 +490,18 @@ static void nfs_show_mount_options(struc seq_printf(m, ,timeo=%lu, 10U * nfss-client-cl_timeout-to_initval / HZ); seq_printf(m, ,retrans=%u, nfss-client-cl_timeout-to_retries); seq_printf(m, ,sec=%s, nfs_pseudoflavour_to_name(nfss-client-cl_auth-au_flavor)); + + /* +* Missing options: +* port= Probably should be supported. +* addr= This one is already supported; see nfs_show_options(). Right, thanks. +* clientaddr= This one isn't, and should be... would be useful for tracking down certain NFSv4 problems. +* mounthost= +* mountaddr= + * mountport= + * mountvers= + * mountproto= And these mount* options are for the kernel's new mount protocol client. They aren't really useful for understanding steady-state NFS client behavior, they only effect mount-time behavior. All mount options should be shown, which are needed to reconstruct a previous mount. For example, if you copy options out from /proc/mount, umount the filesystem, and then create a new mount with the copied options, you should get the same mount. So not only those options are interesting which are useful for understanding steady state behavior. The only options, which should not be shown, are those which have a permanent effect at mount time, like journal creation, etc. And those which are meaningless across different mounts, like communication file descriptors. Thanks, Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 21/26] mount options: partially fix nfs
On Thu, 2008-01-24 at 20:34 +0100, Miklos Szeredi wrote: plain text document attachment (nfs_opts.patch) From: Miklos Szeredi [EMAIL PROTECTED] Add posix, bsize=, namelen= options to /proc/mounts for nfs filesystems. Document several other options that are still missing. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/nfs/super.c === --- linux.orig/fs/nfs/super.c 2008-01-19 11:56:34.0 +0100 +++ linux/fs/nfs/super.c2008-01-21 20:41:30.0 +0100 @@ -449,6 +449,7 @@ static void nfs_show_mount_options(struc } nfs_info[] = { { NFS_MOUNT_SOFT, ,soft, ,hard }, { NFS_MOUNT_INTR, ,intr, ,nointr }, + { NFS_MOUNT_POSIX, ,posix, }, { NFS_MOUNT_NOCTO, ,nocto, }, { NFS_MOUNT_NOAC, ,noac, }, { NFS_MOUNT_NONLM, ,nolock, }, @@ -459,10 +460,17 @@ static void nfs_show_mount_options(struc }; const struct proc_nfs_info *nfs_infop; struct nfs_client *clp = nfss-nfs_client; + unsigned int default_namelen = + clp-rpc_ops-version == 4 ? NFS4_MAXNAMLEN : + clp-rpc_ops-version == 3 ? NFS3_MAXNAMLEN : NFS2_MAXNAMLEN; seq_printf(m, ,vers=%d, clp-rpc_ops-version); seq_printf(m, ,rsize=%d, nfss-rsize); seq_printf(m, ,wsize=%d, nfss-wsize); + if (nfss-bsize != 0) + seq_printf(m, ,bsize=%d, nfss-bsize); + if (nfss-namelen != default_namelen) + seq_printf(m, ,namelen=%d, nfss-namelen); You really just want to look at the value of nfss-namelen. It should always be set. OK, I usually add the condition for (value != default_value) to avoid unnecessary clutter. But sure, there's no problem with showing the option unconditionally. 
if (nfss-acregmin != 3*HZ || showdefaults) seq_printf(m, ,acregmin=%d, nfss-acregmin/HZ); if (nfss-acregmax != 60*HZ || showdefaults) @@ -482,6 +490,18 @@ static void nfs_show_mount_options(struc seq_printf(m, ,timeo=%lu, 10U * nfss-client-cl_timeout-to_initval / HZ); seq_printf(m, ,retrans=%u, nfss-client-cl_timeout-to_retries); seq_printf(m, ,sec=%s, nfs_pseudoflavour_to_name(nfss-client-cl_auth-au_flavor)); + + /* +* Missing options: +* port= +* mountport= +* mountvers= +* mountproto= +* addr= +* clientaddr= +* mounthost= +* mountaddr= +*/ The new text mount interface actually does allow us to store these values if we really do need to. That should be a separate patch, however. OK. Thanks, Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 26/26] mount options: fix usbfs
From: Miklos Szeredi [EMAIL PROTECTED] Add a .show_options super operation to usbfs. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] Looks good to me. Do you want to take this through your tree, as it is dependent on other changes, or do you want me to take this through the USB tree? Whatever is easier for you is fine for me. Please take it, it should be independent of the other changes. Thanks, Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 25/26] mount options: fix udf
| + /* is this correct? */ | + if (sbi-s_anchor[2] != 0) | + seq_printf(seq, ,anchor=%u, sbi-s_anchor[2]); you know, I would prefer to use form UDF_SB_ANCHOR(sb)[2] in sake of style unification but we should wait for Jan's decision (i'm not the expert in this area ;) I think UDF_SB_ANCHOR macro was removed by some patch in -mm. Yes, it's going to be removed so don't use it. Actually, basing this patch on top of -mm is a good idea because there are quite some changes in Andrew's queue. I'm more interested if the second element of the s_anchor array really does always have the value of the 'anchor=N' mount option. I haven't been able to verify that fully. Do you have some insight into that? As Cyrill wrote, it could be zeroed out in case there is no anchor in the specified block. So I guess you have to store the passed value somewhere else.. But in that case, would the value of the anchor= option matter? This is actually a somewhat philosophical question about what the mount options in /proc/mounts mean: 1) Options _given_ by the user for the mount 2) Options which are _effective_ for the mount If we take interpretation 2) and there was no anchor (whatever that means), then the anchor=N option wasn't effective, and not giving it would have had the same effect. This could be confusing to the user, though... Thanks, Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 25/26] mount options: fix udf
On Fri 25-01-08 16:50:15, Miklos Szeredi wrote: | + /* is this correct? */ | + if (sbi-s_anchor[2] != 0) | + seq_printf(seq, ,anchor=%u, sbi-s_anchor[2]); you know, I would prefer to use form UDF_SB_ANCHOR(sb)[2] in sake of style unification but we should wait for Jan's decision (i'm not the expert in this area ;) I think UDF_SB_ANCHOR macro was removed by some patch in -mm. Yes, it's going to be removed so don't use it. Actually, basing this patch on top of -mm is a good idea because there are quite some changes in Andrew's queue. I'm more interested if the second element of the s_anchor array really does always have the value of the 'anchor=N' mount option. I haven't been able to verify that fully. Do you have some insight into that? As Cyrill wrote, it could be zeroed out in case there is no anchor in the specified block. So I guess you have to store the passed value somewhere else.. But in that case, would the value of the anchor= option matter? No, it would not. This is actually a somewhat philosophical question about what the mount options in /proc/mounts mean: 1) Options _given_ by the user for the mount 2) Options which are _effective_ for the mount If we take interpretation 2) and there was no anchor (whatever that means), then the anchor=N option wasn't effective, and not giving it would have had the same effect. This could be confusing to the user, though... Hmm, given that options are modified by remount for some filesystems, it's probably the best to display the effective state. So your code should display the right thing as it is. OK. Cyrill, Jan, thanks for the reviews. Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 00/26] mount options: fix filesystem's -show_options
Andrew, Would you please consider these patches for -mm? They should be relatively uncontroversial and straightforward fixes. They touch a lot of filesystems though, so not sure about the logistics... For the description, see first patch's header. Thanks, Miklos -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 03/26] mount options: fix adfs
From: Miklos Szeredi [EMAIL PROTECTED] Add a .show_options super operation to adfs. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/adfs/super.c === --- linux.orig/fs/adfs/super.c 2008-01-24 13:48:43.0 +0100 +++ linux/fs/adfs/super.c 2008-01-24 15:55:26.0 +0100 @@ -20,6 +20,8 @@ #include linux/vfs.h #include linux/parser.h #include linux/bitops.h +#include linux/mount.h +#include linux/seq_file.h #include asm/uaccess.h #include asm/system.h @@ -30,6 +32,9 @@ #include dir_f.h #include dir_fplus.h +#define ADFS_DEFAULT_OWNER_MASK S_IRWXU +#define ADFS_DEFAULT_OTHER_MASK (S_IRWXG | S_IRWXO) + void __adfs_error(struct super_block *sb, const char *function, const char *fmt, ...) { char error_buf[128]; @@ -134,6 +139,22 @@ static void adfs_put_super(struct super_ sb-s_fs_info = NULL; } +static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt) +{ + struct adfs_sb_info *asb = ADFS_SB(mnt-mnt_sb); + + if (asb-s_uid != 0) + seq_printf(seq, ,uid=%u, asb-s_uid); + if (asb-s_gid != 0) + seq_printf(seq, ,gid=%u, asb-s_gid); + if (asb-s_owner_mask != ADFS_DEFAULT_OWNER_MASK) + seq_printf(seq, ,ownmask=%o, asb-s_owner_mask); + if (asb-s_other_mask != ADFS_DEFAULT_OTHER_MASK) + seq_printf(seq, ,othmask=%o, asb-s_other_mask); + + return 0; +} + enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_err}; static match_table_t tokens = { @@ -259,6 +280,7 @@ static const struct super_operations adf .put_super = adfs_put_super, .statfs = adfs_statfs, .remount_fs = adfs_remount, + .show_options = adfs_show_options, }; static struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_discrecord *dr) @@ -344,8 +366,8 @@ static int adfs_fill_super(struct super_ /* set default options */ asb-s_uid = 0; asb-s_gid = 0; - asb-s_owner_mask = S_IRWXU; - asb-s_other_mask = S_IRWXG | S_IRWXO; + asb-s_owner_mask = ADFS_DEFAULT_OWNER_MASK; + asb-s_other_mask = ADFS_DEFAULT_OTHER_MASK; if (parse_options(sb, data)) goto error; -- - To unsubscribe from this 
list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 04/26] mount options: fix affs
From: Miklos Szeredi [EMAIL PROTECTED] Add a .show_options super operation to affs. Use generic_show_options() and save the complete option string in affs_fill_super() and affs_remount(). Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/affs/super.c === --- linux.orig/fs/affs/super.c 2008-01-24 18:57:19.0 +0100 +++ linux/fs/affs/super.c 2008-01-24 19:01:21.0 +0100 @@ -122,6 +122,7 @@ static const struct super_operations aff .write_super= affs_write_super, .statfs = affs_statfs, .remount_fs = affs_remount, + .show_options = generic_show_options, }; enum { @@ -272,6 +273,8 @@ static int affs_fill_super(struct super_ u8 sig[4]; int ret = -EINVAL; + save_mount_options(sb, data); + pr_debug(AFFS: read_super(%s)\n,data ? (const char *)data : no options); sb-s_magic = AFFS_SUPER_MAGIC; @@ -487,14 +490,21 @@ affs_remount(struct super_block *sb, int int root_block; unsigned longmount_flags; int res = 0; + char*new_opts = kstrdup(data, GFP_KERNEL); pr_debug(AFFS: remount(flags=0x%x,opts=\%s\)\n,*flags,data); *flags |= MS_NODIRATIME; - if (!parse_options(data,uid,gid,mode,reserved,root_block, - blocksize,sbi-s_prefix,sbi-s_volume,mount_flags)) + if (!parse_options(data, uid, gid, mode, reserved, root_block, + blocksize, sbi-s_prefix, sbi-s_volume, + mount_flags)) { + kfree(new_opts); return -EINVAL; + } + kfree(sb-s_options); + sb-s_options = new_opts; + sbi-s_flags = mount_flags; sbi-s_mode = mode; sbi-s_uid = uid; -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 05/26] mount options: fix afs
From: Miklos Szeredi [EMAIL PROTECTED] Add a .show_options super operation to afs. Use generic_show_options() and save the complete option string in afs_get_sb(). Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/afs/super.c === --- linux.orig/fs/afs/super.c 2008-01-24 11:42:44.0 +0100 +++ linux/fs/afs/super.c2008-01-24 12:05:50.0 +0100 @@ -52,6 +52,7 @@ static const struct super_operations afs .clear_inode= afs_clear_inode, .umount_begin = afs_umount_begin, .put_super = afs_put_super, + .show_options = generic_show_options, }; static struct kmem_cache *afs_inode_cachep; @@ -357,6 +358,7 @@ static int afs_get_sb(struct file_system struct super_block *sb; struct afs_volume *vol; struct key *key; + char *new_opts = kstrdup(options, GFP_KERNEL); int ret; _enter(,,%s,%p, dev_name, options); @@ -408,9 +410,11 @@ static int afs_get_sb(struct file_system deactivate_super(sb); goto error; } + sb-s_options = new_opts; sb-s_flags |= MS_ACTIVE; } else { _debug(reuse); + kfree(new_opts); ASSERTCMP(sb-s_flags, , MS_ACTIVE); } @@ -424,6 +428,7 @@ error: afs_put_volume(params.volume); afs_put_cell(params.cell); key_put(params.key); + kfree(new_opts); _leave( = %d, ret); return ret; } -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 06/26] mount options: fix autofs4
From: Miklos Szeredi [EMAIL PROTECTED] Add uid= and gid= options to /proc/mounts for autofs4 filesystems. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/autofs4/inode.c === --- linux.orig/fs/autofs4/inode.c 2008-01-22 15:52:42.0 +0100 +++ linux/fs/autofs4/inode.c2008-01-22 23:36:02.0 +0100 @@ -188,11 +188,16 @@ out_kill_sb: static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) { struct autofs_sb_info *sbi = autofs4_sbi(mnt-mnt_sb); + struct inode *root_inode = mnt-mnt_sb-s_root-d_inode; if (!sbi) return 0; seq_printf(m, ,fd=%d, sbi-pipefd); + if (root_inode-i_uid != 0) + seq_printf(m, ,uid=%u, root_inode-i_uid); + if (root_inode-i_gid != 0) + seq_printf(m, ,gid=%u, root_inode-i_gid); seq_printf(m, ,pgrp=%d, sbi-oz_pgrp); seq_printf(m, ,timeout=%lu, sbi-exp_timeout/HZ); seq_printf(m, ,minproto=%d, sbi-min_proto); -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 02/26] mount options: add generic_show_options()
From: Miklos Szeredi [EMAIL PROTECTED] Add a new s_options field to struct super_block. Filesystems can save mount options passed to them in mount or remount. It is automatically freed when the superblock is destroyed. A new helper function, generic_show_options() is introduced, which uses this field to display the mount options in /proc/mounts. Another helper function, save_mount_options() may be used by filesystems to save the options in the super block. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/namespace.c === --- linux.orig/fs/namespace.c 2008-01-24 17:07:46.0 +0100 +++ linux/fs/namespace.c2008-01-24 17:34:50.0 +0100 @@ -575,6 +575,50 @@ void mnt_unpin(struct vfsmount *mnt) EXPORT_SYMBOL(mnt_unpin); +static inline void mangle(struct seq_file *m, const char *s) +{ + seq_escape(m, s, \t\n\\); +} + +/* + * Simple .show_options callback for filesystems which don't want to + * implement more complex mount option showing. + * + * See also save_mount_options(). + */ +int generic_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + const char *options = mnt-mnt_sb-s_options; + + if (options != NULL options[0]) { + seq_putc(m, ','); + mangle(m, options); + } + + return 0; +} +EXPORT_SYMBOL(generic_show_options); + +/* + * If filesystem uses generic_show_options(), this function should be + * called from the fill_super() callback. + * + * The .remount_fs callback usually needs to be handled in a special + * way, to make sure, that previous options are not overwritten if the + * remount fails. + * + * Also note, that if the filesystem's .remount_fs function doesn't + * reset all options to their default value, but changes only newly + * given options, then the displayed options will not reflect reality + * any more. 
+ */ +void save_mount_options(struct super_block *sb, char *options) +{ + kfree(sb-s_options); + sb-s_options = kstrdup(options, GFP_KERNEL); +} +EXPORT_SYMBOL(save_mount_options); + /* iterator */ static void *m_start(struct seq_file *m, loff_t *pos) { @@ -596,11 +640,6 @@ static void m_stop(struct seq_file *m, v up_read(namespace_sem); } -static inline void mangle(struct seq_file *m, const char *s) -{ - seq_escape(m, s, \t\n\\); -} - static int show_vfsmnt(struct seq_file *m, void *v) { struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); Index: linux/fs/super.c === --- linux.orig/fs/super.c 2008-01-24 17:07:46.0 +0100 +++ linux/fs/super.c2008-01-24 17:12:33.0 +0100 @@ -105,6 +105,7 @@ static inline void destroy_super(struct { security_sb_free(s); kfree(s-s_subtype); + kfree(s-s_options); kfree(s); } Index: linux/include/linux/fs.h === --- linux.orig/include/linux/fs.h 2008-01-24 17:07:46.0 +0100 +++ linux/include/linux/fs.h2008-01-24 17:12:33.0 +0100 @@ -1042,6 +1042,12 @@ struct super_block { * in /proc/mounts will be type.subtype */ char *s_subtype; + + /* +* Saved mount options for lazy filesystems using +* generic_show_options() +*/ + char *s_options; }; extern struct timespec current_fs_time(struct super_block *sb); @@ -1992,6 +1998,9 @@ extern int __must_check inode_setattr(st extern void file_update_time(struct file *file); +extern int generic_show_options(struct seq_file *m, struct vfsmount *mnt); +extern void save_mount_options(struct super_block *sb, char *options); + static inline ino_t parent_ino(struct dentry *dentry) { ino_t res; -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 09/26] mount options: fix capifs
From: Miklos Szeredi [EMAIL PROTECTED] Add a .show_options super operation to capifs. Use generic_show_options() and save the complete option string in capifs_remount(). Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/drivers/isdn/capi/capifs.c === --- linux.orig/drivers/isdn/capi/capifs.c 2007-10-09 22:31:38.0 +0200 +++ linux/drivers/isdn/capi/capifs.c2008-01-24 11:37:42.0 +0100 @@ -52,6 +52,7 @@ static int capifs_remount(struct super_b gid_t gid = 0; umode_t mode = 0600; char *this_char; + char *new_opt = kstrdup(data, GFP_KERNEL); this_char = NULL; while ((this_char = strsep(data, ,)) != NULL) { @@ -72,11 +73,16 @@ static int capifs_remount(struct super_b return -EINVAL; } } + + kfree(s-s_options); + s-s_options = new_opt; + config.setuid = setuid; config.setgid = setgid; config.uid = uid; config.gid = gid; config.mode= mode; + return 0; } @@ -84,6 +90,7 @@ static struct super_operations capifs_so { .statfs = simple_statfs, .remount_fs = capifs_remount, + .show_options = generic_show_options, }; -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 08/26] mount options: fix befs
From: Miklos Szeredi [EMAIL PROTECTED] Add a .show_options super operation to befs. Use generic_show_options() and save the complete option string in befs_fill_super(). Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/befs/linuxvfs.c === --- linux.orig/fs/befs/linuxvfs.c 2008-01-17 19:00:54.0 +0100 +++ linux/fs/befs/linuxvfs.c2008-01-22 21:40:05.0 +0100 @@ -57,6 +57,7 @@ static const struct super_operations bef .put_super = befs_put_super, /* uninit super */ .statfs = befs_statfs, /* statfs */ .remount_fs = befs_remount, + .show_options = generic_show_options, }; /* slab cache for befs_inode_info objects */ @@ -759,10 +760,11 @@ befs_fill_super(struct super_block *sb, befs_super_block *disk_sb; struct inode *root; long ret = -EINVAL; - const unsigned long sb_block = 0; const off_t x86_sb_off = 512; + save_mount_options(sb, data); + sb-s_fs_info = kmalloc(sizeof (*befs_sb), GFP_KERNEL); if (sb-s_fs_info == NULL) { printk(KERN_ERR -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 10/26] mount options: fix devpts
From: Miklos Szeredi [EMAIL PROTECTED] Add a .show_options super operation to devpts. Also add minor fix: when parsing the mode option, mask with S_IALLUGO instead of ~S_IFMT, which could leave unused bits in the mask. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/devpts/inode.c === --- linux.orig/fs/devpts/inode.c2008-01-22 23:43:12.0 +0100 +++ linux/fs/devpts/inode.c 2008-01-23 13:01:05.0 +0100 @@ -20,9 +20,12 @@ #include linux/devpts_fs.h #include linux/parser.h #include linux/fsnotify.h +#include linux/seq_file.h #define DEVPTS_SUPER_MAGIC 0x1cd1 +#define DEVPTS_DEFAULT_MODE 0600 + static struct vfsmount *devpts_mnt; static struct dentry *devpts_root; @@ -32,7 +35,7 @@ static struct { uid_t uid; gid_t gid; umode_t mode; -} config = {.mode = 0600}; +} config = {.mode = DEVPTS_DEFAULT_MODE}; enum { Opt_uid, Opt_gid, Opt_mode, @@ -54,7 +57,7 @@ static int devpts_remount(struct super_b config.setgid = 0; config.uid = 0; config.gid = 0; - config.mode= 0600; + config.mode= DEVPTS_DEFAULT_MODE; while ((p = strsep(data, ,)) != NULL) { substring_t args[MAX_OPT_ARGS]; @@ -81,7 +84,7 @@ static int devpts_remount(struct super_b case Opt_mode: if (match_octal(args[0], option)) return -EINVAL; - config.mode = option ~S_IFMT; + config.mode = option S_IALLUGO; break; default: printk(KERN_ERR devpts: called with bogus options\n); @@ -92,9 +95,22 @@ static int devpts_remount(struct super_b return 0; } +static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + if (config.setuid) + seq_printf(seq, ,uid=%u, config.uid); + if (config.setgid) + seq_printf(seq, ,gid=%u, config.gid); + if (config.mode != DEVPTS_DEFAULT_MODE) + seq_printf(seq, ,mode=%03o, config.mode); + + return 0; +} + static const struct super_operations devpts_sops = { .statfs = simple_statfs, .remount_fs = devpts_remount, + .show_options = devpts_show_options, }; static int -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message 
to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 11/26] mount options: fix ext2
From: Miklos Szeredi [EMAIL PROTECTED] Add noreservation option to /proc/mounts for ext2 filesystems. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/ext2/super.c === --- linux.orig/fs/ext2/super.c 2008-01-17 19:00:55.0 +0100 +++ linux/fs/ext2/super.c 2008-01-23 21:38:08.0 +0100 @@ -285,6 +285,9 @@ static int ext2_show_options(struct seq_ seq_puts(seq, ,xip); #endif + if (!test_opt(sb, RESERVATION)) + seq_puts(seq, ,noreservation); + return 0; } -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 12/26] mount options: fix ext4
From: Miklos Szeredi [EMAIL PROTECTED] Add stripe= option to /proc/mounts for ext4 filesystems. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/ext4/super.c === --- linux.orig/fs/ext4/super.c 2008-01-23 12:57:07.0 +0100 +++ linux/fs/ext4/super.c 2008-01-23 21:43:51.0 +0100 @@ -742,7 +742,8 @@ static int ext4_show_options(struct seq_ seq_puts(seq, ,nomballoc); if (!test_opt(sb, DELALLOC)) seq_puts(seq, ,nodelalloc); - + if (sbi-s_stripe) + seq_printf(seq, ,stripe=%lu, sbi-s_stripe); /* * journal mode get enabled in different ways -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 01/26] mount options: add documentation
From: Miklos Szeredi [EMAIL PROTECTED] This series addresses the problem of showing mount options in /proc/mounts. Several filesystems which use mount options, have not implemented a .show_options superblock operation. Several others have implemented this callback, but have not kept it fully up to date with the parsed options. Q: Why do we need correct option showing in /proc/mounts? A: We want /proc/mounts to fully replace /etc/mtab. The reasons for this are: - unprivileged mounters won't be able to update /etc/mtab - /etc/mtab doesn't work with private mount namespaces - /etc/mtab can become out-of-sync with reality Q: Can't this be done, so that filesystems need not bother with implementing a .show_mounts callback, and keeping it up to date? A: Only in some cases. Certain filesystems allow modification of a subset of options in their remount_fs method. It is not possible to take this into account without knowing exactly how the filesystem handles options. For the simple case (no remount or remount resets all options) the patchset introduces two helpers: generic_show_options() save_mount_options() These can also be used to emulate the old /etc/mtab behavior, until proper support is added. Even if this is not 100% correct, it's still better than showing no options at all. The following patches fix up most in-tree filesystems, they have been compile tested only. I would like to ask maintainers (CC-d on respective patches) to please review, test and ACK these changes. The following filesystems still need fixing: CIFS, NFS, XFS, Unionfs, Reiser4. For CIFS, NFS and XFS I wasn't able to understand how some of the options are used. The last two are not yet in mainline, so I leave fixing those to their respective maintainers out of pure laziness. 
Table displaying status of all in-kernel filesystems: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - legend: none - fs has options, but doesn't define ->show_options() some - fs defines ->show_options(), but only some options are shown most - fs defines ->show_options(), and shows most of them good - fs shows all options noopt - fs does not have options patch - a patch will be posted 9p good adfs patch affs patch afs patch autofs patch autofs4 patch befs patch bfs noopt cifs some coda noopt configfs noopt cramfs noopt debugfs noopt devpts patch ecryptfs good efs noopt ext2 patch ext3 good ext4 patch fat patch freevxfs noopt fuse patch fusectl noopt gfs2 good gfs2meta noopt hfs good hfsplus good hostfs patch hpfs patch hppfs noopt hugetlbfs patch isofs patch jffs2 noopt jfs patch minix noopt msdos ->fat ncpfs patch nfs patch,most nfsd noopt ntfs good ocfs2 good ocfs2/dlmfs noopt openpromfs noopt proc noopt qnx4 noopt ramfs noopt reiserfs patch romfs noopt smbfs good sysfs noopt sysv noopt udf patch ufs good vfat ->fat xfs most mm/shmem.c patch drivers/oprofile/oprofilefs.c noopt drivers/infiniband/hw/ipath/ipath_fs.c noopt drivers/misc/ibmasm/ibmasmfs.c noopt drivers/usb/core (usbfs) patch drivers/usb/gadget (gadgetfs) noopt drivers/isdn/capi/capifs.c patch kernel/cpuset.c noopt fs/binfmt_misc.c noopt net/sunrpc/rpc_pipe.c noopt arch/powerpc/platforms/cell/spufs patch arch/s390/hypfs good ipc/mqueue.c noopt security (securityfs) noopt security/selinux/selinuxfs.c noopt kernel/cgroup.c good security/smack/smackfs.c noopt in -mm: reiser4 some unionfs none - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - This patch: Document the rules for handling mount options in the .show_options super operation. 
Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/Documentation/filesystems/vfs.txt === --- linux.orig/Documentation/filesystems/vfs.txt2008-01-24 11:42:48.0 +0100 +++ linux/Documentation/filesystems/vfs.txt 2008-01-24 17:12:25.0 +0100 @@ -151,7 +151,7 @@ The get_sb() method has the following ar const char *dev_name: the device name we are mounting. void *data: arbitrary mount options, usually comes as an ASCII - string + string (see Mount Options section) struct vfsmount *mnt: a vfs-internal representation of a mount point @@ -182,7
[patch 14/26] mount options: fix fuse
From: Miklos Szeredi [EMAIL PROTECTED] Add blksize= option to /proc/mounts for fuseblk filesystems. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/fuse/inode.c === --- linux.orig/fs/fuse/inode.c 2008-01-19 11:56:34.0 +0100 +++ linux/fs/fuse/inode.c 2008-01-21 17:53:06.0 +0100 @@ -29,6 +29,8 @@ DEFINE_MUTEX(fuse_mutex); #define FUSE_SUPER_MAGIC 0x65735546 +#define FUSE_DEFAULT_BLKSIZE 512 + struct fuse_mount_data { int fd; unsigned rootmode; @@ -355,7 +357,7 @@ static int parse_fuse_opt(char *opt, str char *p; memset(d, 0, sizeof(struct fuse_mount_data)); d-max_read = ~0; - d-blksize = 512; + d-blksize = FUSE_DEFAULT_BLKSIZE; while ((p = strsep(opt, ,)) != NULL) { int token; @@ -440,6 +442,9 @@ static int fuse_show_options(struct seq_ seq_puts(m, ,allow_other); if (fc-max_read != ~0) seq_printf(m, ,max_read=%u, fc-max_read); + if (mnt-mnt_sb-s_bdev + mnt-mnt_sb-s_blocksize != FUSE_DEFAULT_BLKSIZE) + seq_printf(m, ,blksize=%lu, mnt-mnt_sb-s_blocksize); return 0; } -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 15/26] mount options: fix hostfs
From: Miklos Szeredi [EMAIL PROTECTED] Add the host path option to /proc/mounts for UML hostfs filesystems. The mount source (mnt_devname) should really be used for this, but not easy to change now in a backward compatible way. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/hostfs/hostfs_kern.c === --- linux.orig/fs/hostfs/hostfs_kern.c 2008-01-17 19:00:55.0 +0100 +++ linux/fs/hostfs/hostfs_kern.c 2008-01-21 19:19:55.0 +0100 @@ -11,6 +11,7 @@ #include linux/mm.h #include linux/pagemap.h #include linux/statfs.h +#include linux/seq_file.h #include hostfs.h #include init.h #include kern.h @@ -322,12 +323,25 @@ static void hostfs_destroy_inode(struct kfree(HOSTFS_I(inode)); } +static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct inode *root = vfs-mnt_sb-s_root-d_inode; + const char *root_path = HOSTFS_I(root)-host_filename; + size_t offset = strlen(root_ino) + 1; + + if (strlen(root_path) offset) + seq_printf(seq, ,%s, root_path + offset); + + return 0; +} + static const struct super_operations hostfs_sbops = { .alloc_inode= hostfs_alloc_inode, .drop_inode = generic_delete_inode, .delete_inode = hostfs_delete_inode, .destroy_inode = hostfs_destroy_inode, .statfs = hostfs_statfs, + .show_options = hostfs_show_options, }; int hostfs_readdir(struct file *file, void *ent, filldir_t filldir) -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 16/26] mount options: fix hpfs
From: Miklos Szeredi [EMAIL PROTECTED] Add a .show_options super operation to hpfs. Use generic_show_options() and save the complete option string in hpfs_fill_super() and hpfs_remount_fs(). Also add a small fix: hpfs_remount_fs() should return -EINVAL on error, instead of 1, which is not an error value. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/hpfs/super.c === --- linux.orig/fs/hpfs/super.c 2008-01-17 19:00:14.0 +0100 +++ linux/fs/hpfs/super.c 2008-01-23 23:36:53.0 +0100 @@ -386,6 +386,7 @@ static int hpfs_remount_fs(struct super_ int lowercase, conv, eas, chk, errs, chkdsk, timeshift; int o; struct hpfs_sb_info *sbi = hpfs_sb(s); + char *new_opts = kstrdup(data, GFP_KERNEL); *flags |= MS_NOATIME; @@ -398,15 +399,15 @@ static int hpfs_remount_fs(struct super_ if (!(o = parse_opts(data, uid, gid, umask, lowercase, conv, eas, chk, errs, chkdsk, timeshift))) { printk(HPFS: bad mount options.\n); - return 1; + goto out_err; } if (o == 2) { hpfs_help(); - return 1; + goto out_err; } if (timeshift != sbi-sb_timeshift) { printk(HPFS: timeshift can't be changed using remount.\n); - return 1; + goto out_err; } unmark_dirty(s); @@ -419,7 +420,14 @@ static int hpfs_remount_fs(struct super_ if (!(*flags MS_RDONLY)) mark_dirty(s); + kfree(s-s_options); + s-s_options = new_opts; + return 0; + +out_err: + kfree(new_opts); + return -EINVAL; } /* Super operations */ @@ -432,6 +440,7 @@ static const struct super_operations hpf .put_super = hpfs_put_super, .statfs = hpfs_statfs, .remount_fs = hpfs_remount_fs, + .show_options = generic_show_options, }; static int hpfs_fill_super(struct super_block *s, void *options, int silent) @@ -454,6 +463,8 @@ static int hpfs_fill_super(struct super_ int o; + save_mount_options(s, options); + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at 
http://vger.kernel.org/majordomo-info.html
[patch 18/26] mount options: fix isofs
From: Miklos Szeredi [EMAIL PROTECTED] Add a .show_options super operation to isofs. Use generic_show_options() and save the complete option string in isofs_fill_super(). Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/isofs/inode.c === --- linux.orig/fs/isofs/inode.c 2008-01-17 19:00:55.0 +0100 +++ linux/fs/isofs/inode.c 2008-01-23 22:07:51.0 +0100 @@ -110,6 +110,7 @@ static const struct super_operations iso .put_super = isofs_put_super, .statfs = isofs_statfs, .remount_fs = isofs_remount, + .show_options = generic_show_options, }; @@ -554,6 +555,8 @@ static int isofs_fill_super(struct super int table, error = -EINVAL; unsigned int vol_desc_start; + save_mount_options(s, data); + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 21/26] mount options: partially fix nfs
From: Miklos Szeredi [EMAIL PROTECTED] Add posix, bsize=, namelen= options to /proc/mounts for nfs filesystems. Document several other options that are still missing. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/nfs/super.c === --- linux.orig/fs/nfs/super.c 2008-01-19 11:56:34.0 +0100 +++ linux/fs/nfs/super.c2008-01-21 20:41:30.0 +0100 @@ -449,6 +449,7 @@ static void nfs_show_mount_options(struc } nfs_info[] = { { NFS_MOUNT_SOFT, ,soft, ,hard }, { NFS_MOUNT_INTR, ,intr, ,nointr }, + { NFS_MOUNT_POSIX, ,posix, }, { NFS_MOUNT_NOCTO, ,nocto, }, { NFS_MOUNT_NOAC, ,noac, }, { NFS_MOUNT_NONLM, ,nolock, }, @@ -459,10 +460,17 @@ static void nfs_show_mount_options(struc }; const struct proc_nfs_info *nfs_infop; struct nfs_client *clp = nfss-nfs_client; + unsigned int default_namelen = + clp-rpc_ops-version == 4 ? NFS4_MAXNAMLEN : + clp-rpc_ops-version == 3 ? NFS3_MAXNAMLEN : NFS2_MAXNAMLEN; seq_printf(m, ,vers=%d, clp-rpc_ops-version); seq_printf(m, ,rsize=%d, nfss-rsize); seq_printf(m, ,wsize=%d, nfss-wsize); + if (nfss-bsize != 0) + seq_printf(m, ,bsize=%d, nfss-bsize); + if (nfss-namelen != default_namelen) + seq_printf(m, ,namelen=%d, nfss-namelen); if (nfss-acregmin != 3*HZ || showdefaults) seq_printf(m, ,acregmin=%d, nfss-acregmin/HZ); if (nfss-acregmax != 60*HZ || showdefaults) @@ -482,6 +490,18 @@ static void nfs_show_mount_options(struc seq_printf(m, ,timeo=%lu, 10U * nfss-client-cl_timeout-to_initval / HZ); seq_printf(m, ,retrans=%u, nfss-client-cl_timeout-to_retries); seq_printf(m, ,sec=%s, nfs_pseudoflavour_to_name(nfss-client-cl_auth-au_flavor)); + + /* +* Missing options: +* port= +* mountport= +* mountvers= +* mountproto= +* addr= +* clientaddr= +* mounthost= +* mountaddr= +*/ } /* -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 23/26] mount options: fix spufs
From: Miklos Szeredi [EMAIL PROTECTED] Add a .show_options super operation to spufs. Use generic_show_options() and save the complete option string in spufs_fill_super(). Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/arch/powerpc/platforms/cell/spufs/inode.c === --- linux.orig/arch/powerpc/platforms/cell/spufs/inode.c2008-01-17 19:00:52.0 +0100 +++ linux/arch/powerpc/platforms/cell/spufs/inode.c 2008-01-23 23:44:36.0 +0100 @@ -744,8 +744,11 @@ spufs_fill_super(struct super_block *sb, .statfs = simple_statfs, .delete_inode = spufs_delete_inode, .drop_inode = generic_delete_inode, + .show_options = generic_show_options, }; + save_mount_options(sb, data); + sb-s_maxbytes = MAX_LFS_FILESIZE; sb-s_blocksize = PAGE_CACHE_SIZE; sb-s_blocksize_bits = PAGE_CACHE_SHIFT; -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 07/26] mount options: fix autofs
From: Miklos Szeredi [EMAIL PROTECTED] Add a .show_options super operation to autofs. Use generic_show_options() and save the complete option string in autofs_fill_super(). Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/autofs/inode.c === --- linux.orig/fs/autofs/inode.c2008-01-17 19:00:54.0 +0100 +++ linux/fs/autofs/inode.c 2008-01-24 11:16:30.0 +0100 @@ -54,6 +54,7 @@ out_kill_sb: static const struct super_operations autofs_sops = { .statfs = simple_statfs, + .show_options = generic_show_options, }; enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto}; @@ -140,6 +141,8 @@ int autofs_fill_super(struct super_block int minproto, maxproto; pid_t pgid; + save_mount_options(s, data); + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) goto fail_unlock; -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 24/26] mount options: fix tmpfs
From: Miklos Szeredi [EMAIL PROTECTED] Add .show_options super operation to tmpfs. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/mm/shmem.c === --- linux.orig/mm/shmem.c 2008-01-21 21:20:04.0 +0100 +++ linux/mm/shmem.c2008-01-21 21:30:04.0 +0100 @@ -49,6 +49,7 @@ #include linux/ctype.h #include linux/migrate.h #include linux/highmem.h +#include linux/seq_file.h #include asm/uaccess.h #include asm/div64.h @@ -198,7 +199,7 @@ static DEFINE_MUTEX(shmem_swaplist_mutex static void shmem_free_blocks(struct inode *inode, long pages) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode-i_sb); - if (sbinfo-max_blocks) { + if (sbinfo-config.max_blocks) { spin_lock(sbinfo-stat_lock); sbinfo-free_blocks += pages; inode-i_blocks -= pages*BLOCKS_PER_PAGE; @@ -209,7 +210,7 @@ static void shmem_free_blocks(struct ino static int shmem_reserve_inode(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); - if (sbinfo-max_inodes) { + if (sbinfo-config.max_inodes) { spin_lock(sbinfo-stat_lock); if (!sbinfo-free_inodes) { spin_unlock(sbinfo-stat_lock); @@ -224,7 +225,7 @@ static int shmem_reserve_inode(struct su static void shmem_free_inode(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); - if (sbinfo-max_inodes) { + if (sbinfo-config.max_inodes) { spin_lock(sbinfo-stat_lock); sbinfo-free_inodes++; spin_unlock(sbinfo-stat_lock); @@ -388,7 +389,7 @@ static swp_entry_t *shmem_swp_alloc(stru * page (and perhaps indirect index pages) yet to allocate: * a waste to allocate index if we cannot allocate data. 
*/ - if (sbinfo-max_blocks) { + if (sbinfo-config.max_blocks) { spin_lock(sbinfo-stat_lock); if (sbinfo-free_blocks = 1) { spin_unlock(sbinfo-stat_lock); @@ -1338,7 +1339,7 @@ repeat: } else { shmem_swp_unmap(entry); sbinfo = SHMEM_SB(inode-i_sb); - if (sbinfo-max_blocks) { + if (sbinfo-config.max_blocks) { spin_lock(sbinfo-stat_lock); if (sbinfo-free_blocks == 0 || shmem_acct_block(info-flags)) { @@ -1519,8 +1520,9 @@ shmem_get_inode(struct super_block *sb, case S_IFREG: inode-i_op = shmem_inode_operations; inode-i_fop = shmem_file_operations; - mpol_shared_policy_init(info-policy, sbinfo-policy, - sbinfo-policy_nodes); + mpol_shared_policy_init(info-policy, + sbinfo-config.policy, + sbinfo-config.policy_nodes); break; case S_IFDIR: inc_nlink(inode); @@ -1720,12 +1722,12 @@ static int shmem_statfs(struct dentry *d buf-f_bsize = PAGE_CACHE_SIZE; buf-f_namelen = NAME_MAX; spin_lock(sbinfo-stat_lock); - if (sbinfo-max_blocks) { - buf-f_blocks = sbinfo-max_blocks; + if (sbinfo-config.max_blocks) { + buf-f_blocks = sbinfo-config.max_blocks; buf-f_bavail = buf-f_bfree = sbinfo-free_blocks; } - if (sbinfo-max_inodes) { - buf-f_files = sbinfo-max_inodes; + if (sbinfo-config.max_inodes) { + buf-f_files = sbinfo-config.max_inodes; buf-f_ffree = sbinfo-free_inodes; } /* else leave those fields 0 like simple_statfs */ @@ -2077,9 +2079,8 @@ static const struct export_operations sh .fh_to_dentry = shmem_fh_to_dentry, }; -static int shmem_parse_options(char *options, int *mode, uid_t *uid, - gid_t *gid, unsigned long *blocks, unsigned long *inodes, - int *policy, nodemask_t *policy_nodes) +static int shmem_parse_options(char *options, struct shmem_config *config, + bool remount) { char *this_char, *value, *rest; @@ -2122,35 +2123,43 @@ static int shmem_parse_options(char *opt } if (*rest) goto bad_val; - *blocks = DIV_ROUND_UP(size, PAGE_CACHE_SIZE); + config-max_blocks = + DIV_ROUND_UP(size, PAGE_CACHE_SIZE); + config-max_blocks_changed = 1; } else if 
(!strcmp(this_char,nr_blocks)) { - *blocks = memparse(value,rest
[patch 25/26] mount options: fix udf
From: Miklos Szeredi [EMAIL PROTECTED] Add a .show_options super operation to udf. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/udf/super.c === --- linux.orig/fs/udf/super.c 2008-01-24 13:48:37.0 +0100 +++ linux/fs/udf/super.c2008-01-24 15:58:21.0 +0100 @@ -53,6 +53,8 @@ #include linux/vfs.h #include linux/vmalloc.h #include linux/errno.h +#include linux/mount.h +#include linux/seq_file.h #include asm/byteorder.h #include linux/udf_fs.h @@ -71,6 +73,8 @@ #define VDS_POS_TERMINATING_DESC 6 #define VDS_POS_LENGTH 7 +#define UDF_DEFAULT_BLOCKSIZE 2048 + static char error_buf[1024]; /* These are the meat - everything else is stuffing */ @@ -95,6 +99,7 @@ static void udf_open_lvid(struct super_b static void udf_close_lvid(struct super_block *); static unsigned int udf_count_free(struct super_block *); static int udf_statfs(struct dentry *, struct kstatfs *); +static int udf_show_options(struct seq_file *, struct vfsmount *); struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi) { @@ -181,6 +186,7 @@ static const struct super_operations udf .write_super= udf_write_super, .statfs = udf_statfs, .remount_fs = udf_remount_fs, + .show_options = udf_show_options, }; struct udf_options { @@ -247,6 +253,56 @@ static int udf_sb_alloc_partition_maps(s return 0; } +static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt) +{ + struct super_block *sb = mnt-mnt_sb; + struct udf_sb_info *sbi = UDF_SB(sb); + + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) + seq_puts(seq, ,nostrict); + if (sb-s_blocksize != UDF_DEFAULT_BLOCKSIZE) + seq_printf(seq, ,bs=%lu, sb-s_blocksize); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) + seq_puts(seq, ,unhide); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE)) + seq_puts(seq, ,undelete); + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_USE_AD_IN_ICB)) + seq_puts(seq, ,noadinicb); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_USE_SHORT_AD)) + seq_puts(seq, ,shortad); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_FORGET)) + seq_puts(seq, 
,uid=forget); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_IGNORE)) + seq_puts(seq, ,uid=ignore); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_FORGET)) + seq_puts(seq, ,gid=forget); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_IGNORE)) + seq_puts(seq, ,gid=ignore); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET)) + seq_printf(seq, ,uid=%u, sbi-s_uid); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET)) + seq_printf(seq, ,gid=%u, sbi-s_gid); + if (sbi-s_umask != 0) + seq_printf(seq, ,umask=%o, sbi-s_umask); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET)) + seq_printf(seq, ,session=%u, sbi-s_session); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET)) + seq_printf(seq, ,lastblock=%u, sbi-s_last_block); + /* is this correct? */ + if (sbi-s_anchor[2] != 0) + seq_printf(seq, ,anchor=%u, sbi-s_anchor[2]); + /* +* volume, partition, fileset and rootdir seem to be ignored +* currently +*/ + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) + seq_puts(seq, ,utf8); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP) sbi-s_nls_map) + seq_printf(seq, ,iocharset=%s, sbi-s_nls_map-charset); + + return 0; +} + /* * udf_parse_options * @@ -339,13 +395,14 @@ static match_table_t tokens = { {Opt_err, NULL} }; -static int udf_parse_options(char *options, struct udf_options *uopt) +static int udf_parse_options(char *options, struct udf_options *uopt, +bool remount) { char *p; int option; uopt-novrs = 0; - uopt-blocksize = 2048; + uopt-blocksize = UDF_DEFAULT_BLOCKSIZE; uopt-partition = 0x; uopt-session = 0x; uopt-lastblock = 0; @@ -415,11 +472,15 @@ static int udf_parse_options(char *optio if (match_int(args, option)) return 0; uopt-session = option; + if (!remount) + uopt-flags |= (1 UDF_FLAG_SESSION_SET); break; case Opt_lastblock: if (match_int(args, option)) return 0; uopt-lastblock = option; + if (!remount) + uopt-flags |= (1 UDF_FLAG_LASTBLOCK_SET); break; case Opt_anchor: if (match_int(args, option)) @@ -497,7 +558,7 @@ static
[patch 26/26] mount options: fix usbfs
From: Miklos Szeredi [EMAIL PROTECTED] Add a .show_options super operation to usbfs. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/drivers/usb/core/inode.c === --- linux.orig/drivers/usb/core/inode.c 2008-01-24 13:48:37.0 +0100 +++ linux/drivers/usb/core/inode.c 2008-01-24 16:00:03.0 +0100 @@ -38,10 +38,15 @@ #include linux/usbdevice_fs.h #include linux/parser.h #include linux/notifier.h +#include linux/seq_file.h #include asm/byteorder.h #include usb.h #include hcd.h +#define USBFS_DEFAULT_DEVMODE (S_IWUSR | S_IRUGO) +#define USBFS_DEFAULT_BUSMODE (S_IXUGO | S_IRUGO) +#define USBFS_DEFAULT_LISTMODE S_IRUGO + static struct super_operations usbfs_ops; static const struct file_operations default_file_operations; static struct vfsmount *usbfs_mount; @@ -57,9 +62,33 @@ static uid_t listuid;/* = 0 */ static gid_t devgid; /* = 0 */ static gid_t busgid; /* = 0 */ static gid_t listgid; /* = 0 */ -static umode_t devmode = S_IWUSR | S_IRUGO; -static umode_t busmode = S_IXUGO | S_IRUGO; -static umode_t listmode = S_IRUGO; +static umode_t devmode = USBFS_DEFAULT_DEVMODE; +static umode_t busmode = USBFS_DEFAULT_BUSMODE; +static umode_t listmode = USBFS_DEFAULT_LISTMODE; + +static int usbfs_show_options(struct seq_file *seq, struct vfsmount *mnt) +{ + if (devuid != 0) + seq_printf(seq, ,devuid=%u, devuid); + if (devgid != 0) + seq_printf(seq, ,devgid=%u, devgid); + if (devmode != USBFS_DEFAULT_DEVMODE) + seq_printf(seq, ,devmode=%o, devmode); + if (busuid != 0) + seq_printf(seq, ,busuid=%u, busuid); + if (busgid != 0) + seq_printf(seq, ,busgid=%u, busgid); + if (busmode != USBFS_DEFAULT_BUSMODE) + seq_printf(seq, ,busmode=%o, busmode); + if (listuid != 0) + seq_printf(seq, ,listuid=%u, listuid); + if (listgid != 0) + seq_printf(seq, ,listgid=%u, listgid); + if (listmode != USBFS_DEFAULT_LISTMODE) + seq_printf(seq, ,listmode=%o, listmode); + + return 0; +} enum { Opt_devuid, Opt_devgid, Opt_devmode, @@ -93,9 +122,9 @@ static int parse_options(struct super_bl 
devgid = 0; busgid = 0; listgid = 0; - devmode = S_IWUSR | S_IRUGO; - busmode = S_IXUGO | S_IRUGO; - listmode = S_IRUGO; + devmode = USBFS_DEFAULT_DEVMODE; + busmode = USBFS_DEFAULT_BUSMODE; + listmode = USBFS_DEFAULT_LISTMODE; while ((p = strsep(data, ,)) != NULL) { substring_t args[MAX_OPT_ARGS]; @@ -418,6 +447,7 @@ static struct super_operations usbfs_ops .statfs = simple_statfs, .drop_inode = generic_delete_inode, .remount_fs = remount, + .show_options = usbfs_show_options, }; static int usbfs_fill_super(struct super_block *sb, void *data, int silent) -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 22/26] mount options: fix reiserfs
From: Miklos Szeredi [EMAIL PROTECTED] Add a .show_options super operation to reiserfs. Use generic_show_options() and save the complete option string in reiserfs_fill_super() and reiserfs_remount(). Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/reiserfs/super.c === --- linux.orig/fs/reiserfs/super.c 2008-01-17 19:00:55.0 +0100 +++ linux/fs/reiserfs/super.c 2008-01-22 21:20:33.0 +0100 @@ -617,6 +617,7 @@ static const struct super_operations rei .unlockfs = reiserfs_unlockfs, .statfs = reiserfs_statfs, .remount_fs = reiserfs_remount, + .show_options = generic_show_options, #ifdef CONFIG_QUOTA .quota_read = reiserfs_quota_read, .quota_write = reiserfs_quota_write, @@ -1138,6 +1139,7 @@ static int reiserfs_remount(struct super unsigned long safe_mask = 0; unsigned int commit_max_age = (unsigned int)-1; struct reiserfs_journal *journal = SB_JOURNAL(s); + char *new_opts = kstrdup(arg, GFP_KERNEL); int err; #ifdef CONFIG_QUOTA int i; @@ -1153,7 +1155,8 @@ static int reiserfs_remount(struct super REISERFS_SB(s)-s_qf_names[i] = NULL; } #endif - return -EINVAL; + err = -EINVAL; + goto out_err; } handle_attrs(s); @@ -1191,9 +1194,9 @@ static int reiserfs_remount(struct super } if (blocks) { - int rc = reiserfs_resize(s, blocks); - if (rc != 0) - return rc; + err = reiserfs_resize(s, blocks); + if (err != 0) + goto out_err; } if (*mount_flags MS_RDONLY) { @@ -1201,16 +1204,16 @@ static int reiserfs_remount(struct super /* remount read-only */ if (s-s_flags MS_RDONLY) /* it is read-only already */ - return 0; + goto out_ok; /* try to remount file system with read-only permissions */ if (sb_umount_state(rs) == REISERFS_VALID_FS || REISERFS_SB(s)-s_mount_state != REISERFS_VALID_FS) { - return 0; + goto out_ok; } err = journal_begin(th, s, 10); if (err) - return err; + goto out_err; /* Mounting a rw partition read-only. 
*/ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); @@ -1220,11 +1223,13 @@ static int reiserfs_remount(struct super /* remount read-write */ if (!(s-s_flags MS_RDONLY)) { reiserfs_xattr_init(s, *mount_flags); - return 0; /* We are read-write already */ + goto out_ok;/* We are read-write already */ } - if (reiserfs_is_journal_aborted(journal)) - return journal-j_errno; + if (reiserfs_is_journal_aborted(journal)) { + err = journal-j_errno; + goto out_err; + } handle_data_mode(s, mount_options); handle_barrier_mode(s, mount_options); @@ -1232,7 +1237,7 @@ static int reiserfs_remount(struct super s-s_flags = ~MS_RDONLY; /* now it is safe to call journal_begin */ err = journal_begin(th, s, 10); if (err) - return err; + goto out_err; /* Mount a partition which is read-only, read-write */ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); @@ -1247,7 +1252,7 @@ static int reiserfs_remount(struct super SB_JOURNAL(s)-j_must_wait = 1; err = journal_end(th, s, 10); if (err) - return err; + goto out_err; s-s_dirt = 0; if (!(*mount_flags MS_RDONLY)) { @@ -1255,7 +1260,14 @@ static int reiserfs_remount(struct super reiserfs_xattr_init(s, *mount_flags); } +out_ok: + kfree(s-s_options); + s-s_options = new_opts; return 0; + +out_err: + kfree(new_opts); + return err; } static int read_super_block(struct super_block *s, int offset) @@ -1559,6 +1571,8 @@ static int reiserfs_fill_super(struct su struct reiserfs_sb_info *sbi; int errval = -EINVAL; + save_mount_options(s, data); + sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL); if (!sbi) { errval = -ENOMEM; -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 20/26] mount options: fix ncpfs
From: Miklos Szeredi [EMAIL PROTECTED] Add a .show_options super operation to ncpfs. Small fix: add FS_BINARY_MOUNTDATA to the filesystem type flags, since it can take binary data, as well as text (similarly to NFS). Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/ncpfs/inode.c === --- linux.orig/fs/ncpfs/inode.c 2008-01-24 13:48:37.0 +0100 +++ linux/fs/ncpfs/inode.c 2008-01-24 15:57:17.0 +0100 @@ -28,6 +28,8 @@ #include linux/init.h #include linux/smp_lock.h #include linux/vfs.h +#include linux/mount.h +#include linux/seq_file.h #include linux/ncp_fs.h @@ -36,9 +38,15 @@ #include ncplib_kernel.h #include getopt.h +#define NCP_DEFAULT_FILE_MODE 0600 +#define NCP_DEFAULT_DIR_MODE 0700 +#define NCP_DEFAULT_TIME_OUT 10 +#define NCP_DEFAULT_RETRY_COUNT 20 + static void ncp_delete_inode(struct inode *); static void ncp_put_super(struct super_block *); static int ncp_statfs(struct dentry *, struct kstatfs *); +static int ncp_show_options(struct seq_file *, struct vfsmount *); static struct kmem_cache * ncp_inode_cachep; @@ -96,6 +104,7 @@ static const struct super_operations ncp .put_super = ncp_put_super, .statfs = ncp_statfs, .remount_fs = ncp_remount, + .show_options = ncp_show_options, }; extern struct dentry_operations ncp_root_dentry_operations; @@ -304,6 +313,37 @@ static void ncp_stop_tasks(struct ncp_se flush_scheduled_work(); } +static int ncp_show_options(struct seq_file *seq, struct vfsmount *mnt) +{ + struct ncp_server *server = NCP_SBP(mnt-mnt_sb); + unsigned int tmp; + + if (server-m.uid != 0) + seq_printf(seq, ,uid=%u, server-m.uid); + if (server-m.gid != 0) + seq_printf(seq, ,gid=%u, server-m.gid); + if (server-m.mounted_uid != 0) + seq_printf(seq, ,owner=%u, server-m.mounted_uid); + tmp = server-m.file_mode S_IALLUGO; + if (tmp != NCP_DEFAULT_FILE_MODE) + seq_printf(seq, ,mode=0%o, tmp); + tmp = server-m.dir_mode S_IALLUGO; + if (tmp != NCP_DEFAULT_DIR_MODE) + seq_printf(seq, ,dirmode=0%o, tmp); + if (server-m.time_out != 
NCP_DEFAULT_TIME_OUT * HZ / 100) { + tmp = server-m.time_out * 100 / HZ; + seq_printf(seq, ,timeout=%u, tmp); + } + if (server-m.retry_count != NCP_DEFAULT_RETRY_COUNT) + seq_printf(seq, ,retry=%u, server-m.retry_count); + if (server-m.flags != 0) + seq_printf(seq, ,flags=%lu, server-m.flags); + if (server-m.wdog_pid != NULL) + seq_printf(seq, ,wdogpid=%u, pid_vnr(server-m.wdog_pid)); + + return 0; +} + static const struct ncp_option ncp_opts[] = { { uid,OPT_INT,'u' }, { gid,OPT_INT,'g' }, @@ -331,12 +371,12 @@ static int ncp_parse_options(struct ncp_ data-mounted_uid = 0; data-wdog_pid = NULL; data-ncp_fd = ~0; - data-time_out = 10; - data-retry_count = 20; + data-time_out = NCP_DEFAULT_TIME_OUT; + data-retry_count = NCP_DEFAULT_RETRY_COUNT; data-uid = 0; data-gid = 0; - data-file_mode = 0600; - data-dir_mode = 0700; + data-file_mode = NCP_DEFAULT_FILE_MODE; + data-dir_mode = NCP_DEFAULT_DIR_MODE; data-info_fd = -1; data-mounted_vol[0] = 0; @@ -982,6 +1022,7 @@ static struct file_system_type ncp_fs_ty .name = ncpfs, .get_sb = ncp_get_sb, .kill_sb= kill_anon_super, + .fs_flags = FS_BINARY_MOUNTDATA, }; static int __init init_ncp_fs(void) -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 19/26] mount options: fix jfs
From: Miklos Szeredi [EMAIL PROTECTED] Add iocharset= and errors= options to /proc/mounts for jfs filesystems. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/jfs/super.c === --- linux.orig/fs/jfs/super.c 2008-01-17 19:00:55.0 +0100 +++ linux/fs/jfs/super.c2008-01-21 19:39:30.0 +0100 @@ -602,6 +602,12 @@ static int jfs_show_options(struct seq_f seq_printf(seq, ,umask=%03o, sbi-umask); if (sbi-flag JFS_NOINTEGRITY) seq_puts(seq, ,nointegrity); + if (sbi-nls_tab) + seq_printf(seq, ,iocharset=%s, sbi-nls_tab-charset); + if (sbi-flag JFS_ERR_CONTINUE) + seq_printf(seq, ,errors=continue); + if (sbi-flag JFS_ERR_PANIC) + seq_printf(seq, ,errors=panic); #ifdef CONFIG_QUOTA if (sbi-flag JFS_USRQUOTA) -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 17/26] mount options: fix hugetlbfs
From: Miklos Szeredi [EMAIL PROTECTED] Add a .show_options super operation to hugetlbfs. Use generic_show_options() and save the complete option string in hugetlbfs_fill_super(). Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/hugetlbfs/inode.c === --- linux.orig/fs/hugetlbfs/inode.c 2008-01-22 21:31:53.0 +0100 +++ linux/fs/hugetlbfs/inode.c 2008-01-22 21:32:20.0 +0100 @@ -734,6 +734,7 @@ static const struct super_operations hug .delete_inode = hugetlbfs_delete_inode, .drop_inode = hugetlbfs_drop_inode, .put_super = hugetlbfs_put_super, + .show_options = generic_show_options, }; static int @@ -817,6 +818,8 @@ hugetlbfs_fill_super(struct super_block struct hugetlbfs_config config; struct hugetlbfs_sb_info *sbinfo; + save_mount_options(sb, data); + config.nr_blocks = -1; /* No limit on size by default */ config.nr_inodes = -1; /* No limit on number of inodes by default */ config.uid = current-fsuid; -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH] VFS: create /proc/pid/mountinfo
Pavel Machek wrote: On Sun 2008-01-20 09:23:00, Miklos Szeredi wrote: Miklos Szeredi wrote: - for mount ID's use IDA (from the IDR library) instead of a 32bit counter, which could overflow IDAs tend to get reused quickly, which can cause race conditions. Any reason not to just use a 64-bit counter? They tend to become hard to parse/compare for humans after a while. And all this is basically only for humans, so race conditions don't really matter. Also a changed mount with a reused ID is easily identified by comparing the other fields. Hmm, smart humans only compare last few digits if they don't care about 100% reliability, and dumb software compares 64bits easily... Pavel Indeed. And this is most certainly NOT only for humans, and race conditions most certainly matter. Use case please? What will this info be used for, other than for feedback for humans about the state of the propagation tree? Face it, userspace is inherently racy. Inode numbers, device numbers, whatever are being reused all the time, we live with it, even if it's programs, and not just humans. But it's not even an important design decision, the ID allocation can be swapped at any time. If you insist, I'll change it to a 64bit counter, and it'll just suck a little more, but no permanent damage done ;) Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 07/10] unprivileged mounts: add sysctl tunable for safe property
What do you think about doing this only if FS_SAFE is also set, so for instance at first only FUSE would allow itself to be made user-mountable? A safe thing to do, or overly intrusive? It goes somewhat against the "no policy in kernel" policy ;). I think the warning in the documentation should be enough to make sysadmins think twice before doing anything foolish: +Care should be taken when enabling this, since most +filesystems haven't been designed with unprivileged mounting +in mind. + BTW, filesystems like 'proc' and 'sysfs' should also be safe, although the only use for them being marked safe is if the users are allowed to umount them from their private namespace (otherwise a 'mount --bind' has the same effect as a new mount). Thanks, Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH] VFS: create /proc/pid/mountinfo
On Mon, 2008-01-21 at 22:25 +0100, Miklos Szeredi wrote: You have removed the code that checked if the peer or master mount was in the same namespace before reporting their corresponding mount-ids. One downside of that approach is the user will see a mount_id in the output with no corresponding line to explain the details of the mount_id. Before the change, the peer and master IDs were basically randomly chosen from the peers, which means it wasn't always possible to determine that two mounts were peers, or that they were slaves to the same peer group. After the change, this is possible, since the peer ID will be the same for all mounts which are peers. This means that even though the peer ID might be in a different namespace, it is possible to determine all peers within the same namespace by comparing their peer IDs. I agree with your reasoning on the random id; showing a single id avoids clutter. But my point is, why not show an id for the master or peer residing in the same namespace? Because this way it is possible to see propagation between different namespaces as well, by looking at the mount information for processes in the different namespaces. Of course, this is only possible with sufficient privileges. Showing an id with no corresponding entry for that id can be intriguing. Not if it's clearly documented (will add documentation for the next submission). Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][PATCH] VFS: create /proc/pid/mountinfo
Miklos Szeredi wrote: - for mount ID's use IDA (from the IDR library) instead of a 32bit counter, which could overflow IDAs tend to get reused quickly, which can cause race conditions. Any reason not to just use a 64-bit counter? They tend to become hard to parse/compare for humans after a while. And all this is basically only for humans, so race conditions don't really matter. Also a changed mount with a reused ID is easily identified by comparing the other fields. Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC][PATCH] VFS: create /proc/pid/mountinfo
Seems, most people would be happier with a new file, instead of extending /proc/mounts. This patch is the first attempt at doing that, as well as fixing the issues found in the previous submission. Thanks, Miklos --- From: Ram Pai [EMAIL PROTECTED] /proc/mounts in its current state fails to disambiguate bind mounts, especially when the bind mount is subrooted. Also it does not capture propagation state of the mounts (shared-subtree). The following patch addresses the problem. The patch adds '/proc/pid/mountinfo' which contains a superset of the fields in '/proc/pid/mounts'. The following additional fields are added: mntid -- is a unique identifier of the mount parent -- the id of the parent mount major:minor -- value of st_dev for files on that filesystem dir -- the subdir in the filesystem which forms the root of this mount propagation-type in the form of propagation_flag[:mntid][,...] note: 'shared' flag is followed by the mntid of its peer mount 'slave' flag is followed by the mntid of its master mount 'private' flag stands by itself 'unbindable' flag stands by itself Also mount options are split into two fields, the first containing the per mount flags, the second the per super block options. Here is a sample cat /proc/mounts after executing the following commands: mount --bind /mnt /mnt mount --make-shared /mnt mount --bind /mnt/1 /var mount --make-slave /var mount --make-shared /var mount --bind /var/abc /tmp mount --make-unbindable /proc 2 2 0:1 rootfs rootfs / / rw rw private 16 2 98:0 ext2 /dev/root / / rw rw private 17 16 0:3 proc /proc / /proc rw rw unbindable 18 16 0:10 devpts devpts /dev/pts / rw rw private 19 16 98:0 ext2 /dev/root /mnt /mnt rw rw shared:19 20 16 98:0 ext2 /dev/root /mnt/1 /var rw rw shared:21,slave:19 21 16 98:0 ext2 /dev/root /mnt/1/abc /tmp rw rw shared:20,slave:19 For example, the last line indicates that: 1) The mount is a shared mount. 
2) Its peer mount of mount with id 20 3) It is also a slave mount of the master-mount with the id 19 4) The filesystem on device with major/minor number 98:0 and subdirectory mnt/1/abc makes the root directory of this mount. 5) And finally the mount with id 16 is its parent. [EMAIL PROTECTED]: - new file, rearrange fields - for mount ID's use IDA (from the IDR library) instead of a 32bit counter, which could overflow - print canonical ID's (smallest one within the peer group) for peers and master, this is more useful, than a random ID within the same namespace - fix a couple of small bugs - remove inlines - style fixes Signed-off-by: Ram Pai [EMAIL PROTECTED] Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/dcache.c === --- linux.orig/fs/dcache.c 2008-01-18 19:21:38.0 +0100 +++ linux/fs/dcache.c 2008-01-18 19:22:27.0 +0100 @@ -1890,6 +1890,60 @@ char *dynamic_dname(struct dentry *dentr return memcpy(buffer, temp, sz); } +static int prepend(char **buffer, int *buflen, const char *str, + int namelen) +{ + *buflen -= namelen; + if (*buflen 0) + return 1; + *buffer -= namelen; + memcpy(*buffer, str, namelen); + return 0; +} + +/* + * Write full pathname from the root of the filesystem into the buffer. 
+ */ +char *dentry_path(struct dentry *dentry, char *buf, int buflen) +{ + char *end = buf + buflen; + char *retval; + + spin_lock(dcache_lock); + prepend(end, buflen, \0, 1); + if (!IS_ROOT(dentry) d_unhashed(dentry)) { + if (prepend(end, buflen, //deleted, 9)) + goto Elong; + } + if (buflen 1) + goto Elong; + /* Get '/' right */ + retval = end-1; + *retval = '/'; + + for (;;) { + struct dentry *parent; + if (IS_ROOT(dentry)) + break; + + parent = dentry-d_parent; + prefetch(parent); + + if (prepend(end, buflen, dentry-d_name.name, + dentry-d_name.len) || + prepend(end, buflen, /, 1)) + goto Elong; + + retval = end; + dentry = parent; + } + spin_unlock(dcache_lock); + return retval; +Elong: + spin_unlock(dcache_lock); + return ERR_PTR(-ENAMETOOLONG); +} + /* * NOTE! The user-level library version returns a * character pointer. The kernel system call just Index: linux/fs/namespace.c === --- linux.orig/fs/namespace.c 2008-01-18 19:21:38.0 +0100 +++ linux/fs/namespace.c2008-01-18 23:39:35.0 +0100 @@ -27,6 +27,7 @@ #include linux/mount.h #include linux/ramfs.h #include linux/log2.h +#include linux/idr.h #include asm/uaccess.h #include asm/unistd.h #include
Re: [patch] util-linux-ng: unprivileged mounts support
This is an experimental patch for supporting unprivileged mounts and umounts. User unmount unfortunately still doesn't work if the kernel doesn't have the unprivileged mount support but, as we discussed last July, that shouldn't be needed for this case. % mount -t ntfs-3g /dev/hda10 /tmp/test % cat /proc/mounts | grep /tmp/test /dev/hda10 /tmp/test fuseblk rw,nosuid,nodev,user_id=501,group_id=501,allow_other 0 0 % mount | grep /tmp/test /dev/hda10 on /tmp/test type fuseblk (rw,nosuid,nodev,allow_other,blksize=1024,user=szaka) % umount /tmp/test umount: /dev/hda10: not mounted umount: /tmp/test: must be superuser to umount umount: /dev/hda10: not mounted umount: /tmp/test: must be superuser to umount But 'fusermount -u /tmp/test' does work, doesn't it? Yes, this should probably be fixed in umount(8), but it's an (almost) completely separate issue. Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch] VFS: extend /proc/mounts
The alternative (and completely safe) solution is to add another file to proc. Me no likey. Since we need saner layout, I would strongly suggest exactly that. I don't think there's all that much wrong with the current layout, except the two dummy zeroes at the end. Or, something else needs fixing in there? major:minor -- is the major minor number of the device hosting the filesystem Bad description. Value of st_dev for files on that filesystem, please - there might be no such thing as the device hosting the filesystem _and_ the value here may bloody well be unrelated to device actually holding all data (for things like ext2meta, etc.). Right. 1) The mount is a shared mount. 2) Its peer mount of mount with id 20 3) It is also a slave mount of the master-mount with the id 19 4) The filesystem on device with major/minor number 98:0 and subdirectory mnt/1/abc makes the root directory of this mount. 5) And finally the mount with id 16 is its parent. I'd suggest doing a new file that would *not* try to imitate /etc/mtab. Another thing is, how much of propagation information do we want to be exposed and what do we intend to do with it? I think the scheme devised by Ram is basically right. It shows the relationships (slave, peer) and the ID of a master/peer mount. What I changed, is to always show a canonical peer, because I think that is more useful in establishing relationships between mounts. Is this info sensitive? I can't see why it would be. Note that entire propagation tree is out of question - it spans many namespaces and contains potentially sensitive information. So we won't see all nodes. With multiple namespaces, of course you are only allowed to see a part of the tree, but you could have xterms for all of them, and can put together the big picture from the pieces. What do we want to *do* with the information about propagation? Just feedback about the state of the thing. It's very annoying, that after setting up propagation, it's impossible to check the result. 
Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 9/9] unprivileged mounts: add no submounts flag
Why not nosubmnt? Why not indeed. Maybe I should try to use my brain sometime. Well it really should have 'user' or 'unpriv' in the name somewhere. 'nosubmnt' is more confusing than 'nomnt' because 'no submounts' really sounds like a reasonable thing in itself... I slept on it, and I still think 'nosubmnt' might be the best compromise. Obviously the superuser has privileges that override what is normally allowed, and we don't find it strange when a read-only file is happily being written by root. It may feel wrong in the context of mounts, because we are so used to mounts being privileged-only. Objections? Once this goes in, it will stay the same forever, so now is the time to express any doubts... Thanks, Miklos - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 06/10] unprivileged mounts: allow unprivileged mounts
From: Miklos Szeredi [EMAIL PROTECTED] For safe filesystems allow unprivileged mounting and forced unmounting. A filesystem type is considered safe, if mounting it by an unprivileged user may not cause a security problem. This is somewhat subjective, so setting this property is left to userspace (implemented in the next patch). Since most filesystems haven't been designed with unprivileged mounting in mind, a thorough audit is recommended before setting this property. Make this a separate integer member in 'struct file_system_type' instead of a flag, since that is easier to handle by sysctl code. Move subtype handling from do_kern_mount() into do_new_mount(). All other callers are kernel-internal and do not need subtype support. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] Acked-by: Serge Hallyn [EMAIL PROTECTED] --- Index: linux/fs/namespace.c === --- linux.orig/fs/namespace.c 2008-01-16 13:25:08.0 +0100 +++ linux/fs/namespace.c2008-01-16 13:25:09.0 +0100 @@ -966,14 +966,16 @@ static bool is_mount_owner(struct vfsmou /* * umount is permitted for * - sysadmin - * - mount owner, if not forced umount + * - mount owner + *o if not forced umount, + *o if forced umount, and filesystem is safe */ static bool permit_umount(struct vfsmount *mnt, int flags) { if (capable(CAP_SYS_ADMIN)) return true; - if (flags MNT_FORCE) + if ((flags MNT_FORCE) !(mnt-mnt_sb-s_type-fs_safe)) return false; return is_mount_owner(mnt, current-fsuid); @@ -1031,13 +1033,17 @@ asmlinkage long sys_oldumount(char __use * - mountpoint is not a symlink * - mountpoint is in a mount owned by the user */ -static bool permit_mount(struct nameidata *nd, int *flags) +static bool permit_mount(struct nameidata *nd, struct file_system_type *type, +int *flags) { struct inode *inode = nd-path.dentry-d_inode; if (capable(CAP_SYS_ADMIN)) return true; + if (type !type-fs_safe) + return false; + if (S_ISLNK(inode-i_mode)) return false; @@ -1291,7 +1297,7 @@ static int do_loopback(struct nameidata struct vfsmount 
*mnt = NULL; int err; - if (!permit_mount(nd, flags)) + if (!permit_mount(nd, NULL, flags)) return -EPERM; if (!old_name || !*old_name) return -EINVAL; @@ -1472,30 +1478,76 @@ out: return err; } +static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) +{ + int err; + const char *subtype = strchr(fstype, '.'); + if (subtype) { + subtype++; + err = -EINVAL; + if (!subtype[0]) + goto err; + } else + subtype = ; + + mnt-mnt_sb-s_subtype = kstrdup(subtype, GFP_KERNEL); + err = -ENOMEM; + if (!mnt-mnt_sb-s_subtype) + goto err; + return mnt; + + err: + mntput(mnt); + return ERR_PTR(err); +} + /* * create a new mount for userspace and request it to be added into the * namespace's tree */ -static int do_new_mount(struct nameidata *nd, char *type, int flags, +static int do_new_mount(struct nameidata *nd, char *fstype, int flags, int mnt_flags, char *name, void *data) { + int err; struct vfsmount *mnt; + struct file_system_type *type; - if (!type || !memchr(type, 0, PAGE_SIZE)) + if (!fstype || !memchr(fstype, 0, PAGE_SIZE)) return -EINVAL; - /* we need capabilities... 
*/ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - mnt = do_kern_mount(type, flags ~MS_SETUSER, name, data); - if (IS_ERR(mnt)) + type = get_fs_type(fstype); + if (!type) + return -ENODEV; + + err = -EPERM; + if (!permit_mount(nd, type, flags)) + goto out_put_filesystem; + + if (flags MS_SETUSER) { + err = reserve_user_mount(); + if (err) + goto out_put_filesystem; + } + + mnt = vfs_kern_mount(type, flags ~MS_SETUSER, name, data); + if (!IS_ERR(mnt) (type-fs_flags FS_HAS_SUBTYPE) + !mnt-mnt_sb-s_subtype) + mnt = fs_set_subtype(mnt, fstype); + put_filesystem(type); + if (IS_ERR(mnt)) { + if (flags MS_SETUSER) + dec_nr_user_mounts(); return PTR_ERR(mnt); + } if (flags MS_SETUSER) - set_mnt_user(mnt); + __set_mnt_user(mnt); return do_add_mount(mnt, nd, mnt_flags, NULL); + + out_put_filesystem: + put_filesystem(type); + return err; } /* @@ -1526,7 +1578,7 @@ int do_add_mount(struct vfsmount *newmnt if (S_ISLNK(newmnt-mnt_root
[patch 08/10] unprivileged mounts: make fuse safe
From: Miklos Szeredi [EMAIL PROTECTED] Don't require the user_id= and group_id= options for unprivileged mounts, but if they are present, verify them for sanity. Disallow the allow_other option for unprivileged mounts. FUSE was designed from the beginning to be safe for unprivileged users. This has also been verified in practice over many years, with some distributions enabling unprivileged FUSE mounts by default. However there are some properties of FUSE, that could make it unsafe for certain situations (e.g. multiuser system with untrusted users): - It is not always possible to use kill(2) (not even with SIGKILL) to terminate a process using a FUSE filesystem. However it is possible to use any of the following instead: o kill the filesystem daemon o use forced umounting o use the fusectl control filesystem - As a special case of the above, killing a self-deadlocked FUSE process is not possible, and even killall5 will not terminate it. - Due to the design of the process freezer, a hanging (due to network problems, etc) or malicious filesystem may prevent suspending to ram or hibernation to succeed. This is not actually unique to FUSE, as any hanging network filesystem will have the same affect. If the above could pose a threat to the system, it is recommended, that the '/proc/sys/fs/types/fuse/safe' sysctl tunable is not turned on, and/or '/dev/fuse' is not made world-readable and writable. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/fuse/inode.c === --- linux.orig/fs/fuse/inode.c 2008-01-16 13:24:52.0 +0100 +++ linux/fs/fuse/inode.c 2008-01-16 13:25:10.0 +0100 @@ -357,6 +357,19 @@ static int parse_fuse_opt(char *opt, str d-max_read = ~0; d-blksize = 512; + /* +* For unprivileged mounts use current uid/gid. Still allow +* user_id and group_id options for compatibility, but +* only if they match these values. 
+*/ + if (!capable(CAP_SYS_ADMIN)) { + d-user_id = current-uid; + d-user_id_present = 1; + d-group_id = current-gid; + d-group_id_present = 1; + + } + while ((p = strsep(opt, ,)) != NULL) { int token; int value; @@ -385,6 +398,8 @@ static int parse_fuse_opt(char *opt, str case OPT_USER_ID: if (match_int(args[0], value)) return 0; + if (d-user_id_present d-user_id != value) + return 0; d-user_id = value; d-user_id_present = 1; break; @@ -392,6 +407,8 @@ static int parse_fuse_opt(char *opt, str case OPT_GROUP_ID: if (match_int(args[0], value)) return 0; + if (d-group_id_present d-group_id != value) + return 0; d-group_id = value; d-group_id_present = 1; break; @@ -596,6 +613,10 @@ static int fuse_fill_super(struct super_ if (!parse_fuse_opt((char *) data, d, is_bdev)) return -EINVAL; + /* This is a privileged option */ + if ((d.flags FUSE_ALLOW_OTHER) !capable(CAP_SYS_ADMIN)) + return -EPERM; + if (is_bdev) { #ifdef CONFIG_BLOCK if (!sb_set_blocksize(sb, d.blksize)) -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 07/10] unprivileged mounts: add sysctl tunable for safe property
From: Miklos Szeredi [EMAIL PROTECTED] Add the following: /proc/sys/fs/types/${FS_TYPE}/usermount_safe Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/filesystems.c === --- linux.orig/fs/filesystems.c 2008-01-16 13:24:52.0 +0100 +++ linux/fs/filesystems.c 2008-01-16 13:25:09.0 +0100 @@ -12,6 +12,7 @@ #include linux/kmod.h #include linux/init.h #include linux/module.h +#include linux/sysctl.h #include asm/uaccess.h /* @@ -51,6 +52,57 @@ static struct file_system_type **find_fi return p; } +#define MAX_FILESYSTEM_VARS 1 + +struct filesystem_sysctl_table { + struct ctl_table_header *header; + struct ctl_table table[MAX_FILESYSTEM_VARS + 1]; +}; + +/* + * Create /sys/fs/types/${FSNAME} directory with per fs-type tunables. + */ +static int filesystem_sysctl_register(struct file_system_type *fs) +{ + struct filesystem_sysctl_table *t; + struct ctl_path path[] = { + { .procname = fs, .ctl_name = CTL_FS }, + { .procname = types, .ctl_name = CTL_UNNUMBERED }, + { .procname = fs-name, .ctl_name = CTL_UNNUMBERED }, + { } + }; + + t = kzalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return -ENOMEM; + + + t-table[0].ctl_name = CTL_UNNUMBERED; + t-table[0].procname = usermount_safe; + t-table[0].maxlen = sizeof(int); + t-table[0].data = fs-fs_safe; + t-table[0].mode = 0644; + t-table[0].proc_handler = proc_dointvec; + + t-header = register_sysctl_paths(path, t-table); + if (!t-header) { + kfree(t); + return -ENOMEM; + } + + fs-sysctl_table = t; + + return 0; +} + +static void filesystem_sysctl_unregister(struct file_system_type *fs) +{ + struct filesystem_sysctl_table *t = fs-sysctl_table; + + unregister_sysctl_table(t-header); + kfree(t); +} + /** * register_filesystem - register a new filesystem * @fs: the file system structure @@ -80,6 +132,13 @@ int register_filesystem(struct file_syst else *p = fs; write_unlock(file_systems_lock); + + if (res == 0) { + res = filesystem_sysctl_register(fs); + if (res != 0) + unregister_filesystem(fs); + } + return res; } @@ 
-108,6 +167,7 @@ int unregister_filesystem(struct file_sy *tmp = fs-next; fs-next = NULL; write_unlock(file_systems_lock); + filesystem_sysctl_unregister(fs); return 0; } tmp = (*tmp)-next; Index: linux/include/linux/fs.h === --- linux.orig/include/linux/fs.h 2008-01-16 13:25:09.0 +0100 +++ linux/include/linux/fs.h2008-01-16 13:25:09.0 +0100 @@ -1437,6 +1437,7 @@ struct file_system_type { struct module *owner; struct file_system_type * next; struct list_head fs_supers; + struct filesystem_sysctl_table *sysctl_table; struct lock_class_key s_lock_key; struct lock_class_key s_umount_key; Index: linux/Documentation/filesystems/proc.txt === --- linux.orig/Documentation/filesystems/proc.txt 2008-01-16 13:25:07.0 +0100 +++ linux/Documentation/filesystems/proc.txt2008-01-16 13:25:09.0 +0100 @@ -43,6 +43,7 @@ Table of Contents 2.13 /proc/pid/oom_score - Display current oom-killer score 2.14 /proc/pid/io - Display the IO accounting fields 2.15 /proc/pid/coredump_filter - Core dump filtering settings + 2.16 /proc/sys/fs/types - File system type specific parameters -- Preface @@ -2283,4 +2284,21 @@ For example: $ echo 0x7 /proc/self/coredump_filter $ ./some_program +2.16 /proc/sys/fs/types/ - File system type specific parameters + + +There's a separate directory /proc/sys/fs/types/type/ for each +filesystem type, containing the following files: + +usermount_safe +-- + +Setting this to non-zero will allow filesystems of this type to be +mounted by unprivileged users (note, that there are other +prerequisites as well). + +Care should be taken when enabling this, since most +filesystems haven't been designed with unprivileged mounting +in mind. + -- -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 10/10] unprivileged mounts: add no submounts flag
From: Miklos Szeredi [EMAIL PROTECTED] Add a new mount flag nosubmnt, which denies submounts for the owner. This would be useful, if we want to support traditional /etc/fstab based user mounts. In this case mount(8) would still have to be suid-root, to check the mountpoint against the user/users flag in /etc/fstab, but /etc/mtab would no longer be mandatory for storing the actual owner of the mount. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] Acked-by: Serge Hallyn [EMAIL PROTECTED] --- Index: linux/fs/namespace.c === --- linux.orig/fs/namespace.c 2008-01-16 13:25:11.0 +0100 +++ linux/fs/namespace.c2008-01-16 13:25:12.0 +0100 @@ -700,6 +700,7 @@ static int show_vfsmnt(struct seq_file * { MNT_NOATIME, ,noatime }, { MNT_NODIRATIME, ,nodiratime }, { MNT_RELATIME, ,relatime }, + { MNT_NOSUBMNT, ,nosubmnt }, { 0, NULL } }; struct proc_fs_info *fs_infop; @@ -1050,6 +1051,9 @@ static bool permit_mount(struct nameidat if (S_ISLNK(inode-i_mode)) return false; + if (nd-path.mnt-mnt_flags MNT_NOSUBMNT) + return false; + if (!is_mount_owner(nd-path.mnt, current-fsuid)) return false; @@ -1894,9 +1898,11 @@ long do_mount(char *dev_name, char *dir_ mnt_flags |= MNT_RELATIME; if (flags MS_RDONLY) mnt_flags |= MNT_READONLY; + if (flags MS_NOSUBMNT) + mnt_flags |= MNT_NOSUBMNT; - flags = ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | - MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT); + flags = ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_NOATIME | + MS_NODIRATIME | MS_RELATIME | MS_KERNMOUNT | MS_NOSUBMNT); /* ... 
and get the mountpoint */ retval = path_lookup(dir_name, LOOKUP_FOLLOW, nd); Index: linux/include/linux/fs.h === --- linux.orig/include/linux/fs.h 2008-01-16 13:25:11.0 +0100 +++ linux/include/linux/fs.h2008-01-16 13:25:12.0 +0100 @@ -129,6 +129,7 @@ extern int dir_notify_enable; #define MS_KERNMOUNT (122) /* this is a kern_mount call */ #define MS_I_VERSION (123) /* Update inode I_version field */ #define MS_SETUSER (124) /* set mnt_uid to current user */ +#define MS_NOSUBMNT(125) /* don't allow unprivileged submounts */ #define MS_ACTIVE (130) #define MS_NOUSER (131) Index: linux/include/linux/mount.h === --- linux.orig/include/linux/mount.h2008-01-16 13:25:05.0 +0100 +++ linux/include/linux/mount.h 2008-01-16 13:25:12.0 +0100 @@ -30,6 +30,7 @@ struct mnt_namespace; #define MNT_NODIRATIME 0x10 #define MNT_RELATIME 0x20 #define MNT_READONLY 0x40/* does the user want this to be r/o? */ +#define MNT_NOSUBMNT 0x80 #define MNT_SHRINKABLE 0x100 #define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */ -- - To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 09/10] unprivileged mounts: propagation: inherit owner from parent
From: Miklos Szeredi [EMAIL PROTECTED] On mount propagation, let the owner of the clone be inherited from the parent into which it has been propagated. If the parent has the nosuid flag, set this flag for the child as well. This is needed for the suid-less namespace (use case #2 in the first patch header), where all mounts are owned by the user and have the nosuid flag set. In this case the propagated mount needs to have nosuid, otherwise a suid executable may be misused by the user. Similar treatment is not needed for nodev, because devices can't be abused this way: the user is not able to gain privileges to devices by rearranging the mount namespace. Signed-off-by: Miklos Szeredi [EMAIL PROTECTED] --- Index: linux/fs/namespace.c === --- linux.orig/fs/namespace.c 2008-01-16 13:25:09.0 +0100 +++ linux/fs/namespace.c2008-01-16 13:25:11.0 +0100 @@ -506,10 +506,10 @@ static int reserve_user_mount(void) return err; } -static void __set_mnt_user(struct vfsmount *mnt) +static void __set_mnt_user(struct vfsmount *mnt, uid_t owner) { WARN_ON(mnt-mnt_flags MNT_USER); - mnt-mnt_uid = current-fsuid; + mnt-mnt_uid = owner; mnt-mnt_flags |= MNT_USER; if (!capable(CAP_SETUID)) @@ -520,7 +520,7 @@ static void __set_mnt_user(struct vfsmou static void set_mnt_user(struct vfsmount *mnt) { - __set_mnt_user(mnt); + __set_mnt_user(mnt, current-fsuid); spin_lock(vfsmount_lock); nr_user_mounts++; spin_unlock(vfsmount_lock); @@ -536,7 +536,7 @@ static void clear_mnt_user(struct vfsmou } static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, - int flag) + int flag, uid_t owner) { struct super_block *sb = old-mnt_sb; struct vfsmount *mnt; @@ -560,7 +560,10 @@ static struct vfsmount *clone_mnt(struct /* don't copy the MNT_USER flag */ mnt-mnt_flags = ~MNT_USER; if (flag CL_SETUSER) - __set_mnt_user(mnt); + __set_mnt_user(mnt, owner); + + if (flag CL_NOSUID) + mnt-mnt_flags |= MNT_NOSUID; if (flag CL_SLAVE) { list_add(mnt-mnt_slave, old-mnt_slave_list); @@ -1066,7 
+1069,7 @@ static int lives_below_in_same_fs(struct } struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, - int flag) + int flag, uid_t owner) { struct vfsmount *res, *p, *q, *r, *s; struct nameidata nd; @@ -1074,7 +1077,7 @@ struct vfsmount *copy_tree(struct vfsmou if (!(flag CL_COPY_ALL) IS_MNT_UNBINDABLE(mnt)) return ERR_PTR(-EPERM); - res = q = clone_mnt(mnt, dentry, flag); + res = q = clone_mnt(mnt, dentry, flag, owner); if (IS_ERR(q)) goto error; q-mnt_mountpoint = mnt-mnt_mountpoint; @@ -1096,7 +1099,7 @@ struct vfsmount *copy_tree(struct vfsmou p = s; nd.path.mnt = q; nd.path.dentry = p-mnt_mountpoint; - q = clone_mnt(p, p-mnt_root, flag); + q = clone_mnt(p, p-mnt_root, flag, owner); if (IS_ERR(q)) goto error; spin_lock(vfsmount_lock); @@ -1121,7 +1124,7 @@ struct vfsmount *collect_mounts(struct v { struct vfsmount *tree; down_read(namespace_sem); - tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE); + tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE, 0); up_read(namespace_sem); return tree; } @@ -1292,7 +1295,8 @@ static int do_change_type(struct nameida */ static int do_loopback(struct nameidata *nd, char *old_name, int flags) { - int clone_fl; + int clone_fl = 0; + uid_t owner = 0; struct nameidata old_nd; struct vfsmount *mnt = NULL; int err; @@ -1313,11 +1317,17 @@ static int do_loopback(struct nameidata if (!check_mnt(nd-path.mnt) || !check_mnt(old_nd.path.mnt)) goto out; - clone_fl = (flags MS_SETUSER) ? CL_SETUSER : 0; + if (flags MS_SETUSER) { + clone_fl |= CL_SETUSER; + owner = current-fsuid; + } + if (flags MS_REC) - mnt = copy_tree(old_nd.path.mnt, old_nd.path.dentry, clone_fl); + mnt = copy_tree(old_nd.path.mnt, old_nd.path.dentry, clone_fl, + owner); else - mnt = clone_mnt(old_nd.path.mnt, old_nd.path.dentry, clone_fl); + mnt = clone_mnt(old_nd.path.mnt, old_nd.path.dentry, clone_fl, + owner); err = PTR_ERR(mnt); if (IS_ERR(mnt)) @@ -1541,7 +1551,7