Re: [PATCH 22/28] mm: add support for non block device backed swap files

2008-02-26 Thread Miklos Szeredi
Starting review in the middle, because this is the part I'm most
familiar with.

 New address_space_operations methods are added:
   int swapfile(struct address_space *, int);

Separate ->swapon() and ->swapoff() methods would be so much cleaner IMO.

Also is there a reason why 'struct file *' cannot be supplied to these
functions?

[snip]

 +int swap_set_page_dirty(struct page *page)
 +{
 + struct swap_info_struct *sis = page_swap_info(page);
 +
 + if (sis-flags  SWP_FILE) {
 + const struct address_space_operations *a_ops =
 + sis-swap_file-f_mapping-a_ops;
 + int (*spd)(struct page *) = a_ops-set_page_dirty;
 +#ifdef CONFIG_BLOCK
 + if (!spd)
 + spd = __set_page_dirty_buffers;
 +#endif

This ifdef is not really needed.  Just require ->set_page_dirty() be
filled in by filesystems which want swapfiles (and others too, in the
longer term, the fallback is just historical crud).

Here's an incremental patch addressing these issues and beautifying
the new code.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]

Index: linux/mm/page_io.c
===
--- linux.orig/mm/page_io.c 2008-02-26 11:15:58.0 +0100
+++ linux/mm/page_io.c  2008-02-26 13:40:55.0 +0100
@@ -106,8 +106,10 @@ int swap_writepage(struct page *page, st
}
 
if (sis-flags  SWP_FILE) {
-   ret = sis-swap_file-f_mapping-
-   a_ops-swap_out(sis-swap_file, page, wbc);
+   struct file *swap_file = sis-swap_file;
+   struct address_space *mapping = swap_file-f_mapping;
+
+   ret = mapping-a_ops-swap_out(swap_file, page, wbc);
if (!ret)
count_vm_event(PSWPOUT);
return ret;
@@ -136,12 +138,13 @@ void swap_sync_page(struct page *page)
struct swap_info_struct *sis = page_swap_info(page);
 
if (sis-flags  SWP_FILE) {
-   const struct address_space_operations *a_ops =
-   sis-swap_file-f_mapping-a_ops;
-   if (a_ops-sync_page)
-   a_ops-sync_page(page);
-   } else
+   struct address_space *mapping = sis-swap_file-f_mapping;
+
+   if (mapping-a_ops-sync_page)
+   mapping-a_ops-sync_page(page);
+   } else {
block_sync_page(page);
+   }
 }
 
 int swap_set_page_dirty(struct page *page)
@@ -149,17 +152,12 @@ int swap_set_page_dirty(struct page *pag
struct swap_info_struct *sis = page_swap_info(page);
 
if (sis-flags  SWP_FILE) {
-   const struct address_space_operations *a_ops =
-   sis-swap_file-f_mapping-a_ops;
-   int (*spd)(struct page *) = a_ops-set_page_dirty;
-#ifdef CONFIG_BLOCK
-   if (!spd)
-   spd = __set_page_dirty_buffers;
-#endif
-   return (*spd)(page);
-   }
+   struct address_space *mapping = sis-swap_file-f_mapping;
 
-   return __set_page_dirty_nobuffers(page);
+   return mapping-a_ops-set_page_dirty(page);
+   } else {
+   return __set_page_dirty_nobuffers(page);
+   }
 }
 
 int swap_readpage(struct file *file, struct page *page)
@@ -172,8 +170,10 @@ int swap_readpage(struct file *file, str
BUG_ON(PageUptodate(page));
 
if (sis-flags  SWP_FILE) {
-   ret = sis-swap_file-f_mapping-
-   a_ops-swap_in(sis-swap_file, page);
+   struct file *swap_file = sis-swap_file;
+   struct address_space *mapping = swap_file-f_mapping;
+
+   ret = mapping-a_ops-swap_in(swap_file, page);
if (!ret)
count_vm_event(PSWPIN);
return ret;
Index: linux/include/linux/fs.h
===
--- linux.orig/include/linux/fs.h   2008-02-26 11:15:58.0 +0100
+++ linux/include/linux/fs.h2008-02-26 13:29:40.0 +0100
@@ -485,7 +485,8 @@ struct address_space_operations {
/*
 * swapfile support
 */
-   int (*swapfile)(struct address_space *, int);
+   int (*swapon)(struct file *file);
+   int (*swapoff)(struct file *file);
int (*swap_out)(struct file *file, struct page *page,
struct writeback_control *wbc);
int (*swap_in)(struct file *file, struct page *page);
Index: linux/mm/swapfile.c
===
--- linux.orig/mm/swapfile.c2008-02-26 12:43:57.0 +0100
+++ linux/mm/swapfile.c 2008-02-26 13:34:57.0 +0100
@@ -1014,9 +1014,11 @@ static void destroy_swap_extents(struct 
}
 
if (sis-flags  SWP_FILE) {
+   struct file *swap_file = sis-swap_file;
+   struct address_space *mapping = swap_file

Re: [patch 00/10] mount ownership and unprivileged mount syscall (v8)

2008-02-23 Thread Miklos Szeredi
 On Sat, Feb 23, 2008 at 06:33:13PM +0100, Miklos Szeredi wrote:
 c) just what is limited by that sysctl?  AFAICS, rbind is allowed
   if mountpoint is on user vfsmount and it seems to create vfsmounts without
   eating into that limit just fine...  What's the point of limiting the
   amount of vfsmounts marked user when you do not limit the number of 
   vfsmount
   one can allocate?
  
  The limit is there, so that unprivileged users cannot create insane
  number of mounts.  It's just a safety thing, analogous to
  /proc/sys/fs/file-max.
 
 Can't they?  Looks like one can create any number of vfsmounts without
 getting more than one marked MNT_USER...

permit_mount() will set MS_SETUSER in flags, and do_loopback() will
set CL_SETUSER based on that flag.

 If you are trying to limit the number of superblocks (i.e. active instances
 of filesystems), then I'd say that vfsmounts make piss-poor proxies for
 those and it would be better to count the objects you really want to count...

I think I really want to limit vfsmounts.  But not because these take
so much memory or anything, just to be safe against stupid users
playing with rbind and propagation, and things like that.

Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[rfc patch] how to show propagation state for mounts

2008-02-22 Thread Miklos Szeredi
  If you get down to it, the thing is about delegating control over part
  of namespace to somebody, without letting them control, see, etc. the
  rest of it.  So I'd rather be very conservative about extra information
  we allow to piggyback on that.  I don't know... perhaps with stable peer
  group IDs it would be OK to show peer group ID by (our) vfsmount + peer
  group ID of master + peer group ID of nearest dominating group that has
  intersection with our namespace.  Then we don't leak information (AFAICS),
  get full propagation information between our vfsmounts and cooperating
  tasks in different namespaces can figure the things out as much as possible
  without leaking 3rd-party information to either.
 

Here's a patch against current -mm implementing this (with some
cleanups thrown in).  Done some testing on it as well, it wasn't
entirely trivial to figure out a setup where propagation goes out of
the namespace first, then comes back in:

  mount --bind /mnt1 /mnt1
  mount --make-shared /mnt1
  mount --bind /mnt2 /mnt2
  mount --make-shared /mnt2
  newns
  mount --make-slave /mnt1

old ns:
  mount --make-slave /mnt2
  mount --bind /mnt1/tmp /mnt1/tmp

new ns:
  mount --make-shared /mnt1/tmp
  mount --bind /mnt1/tmp /mnt2/tmp

Voila.


Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/pnode.c
===
--- linux.orig/fs/pnode.c   2008-02-22 15:27:23.0 +0100
+++ linux/fs/pnode.c2008-02-22 15:27:26.0 +0100
@@ -9,8 +9,12 @@
 #include linux/mnt_namespace.h
 #include linux/mount.h
 #include linux/fs.h
+#include linux/idr.h
 #include pnode.h
 
+static DEFINE_SPINLOCK(mnt_pgid_lock);
+static DEFINE_IDA(mnt_pgid_ida);
+
 /* return the next shared peer mount of @p */
 static inline struct vfsmount *next_peer(struct vfsmount *p)
 {
@@ -27,36 +31,90 @@ static inline struct vfsmount *next_slav
return list_entry(p-mnt_slave.next, struct vfsmount, mnt_slave);
 }
 
-static int __peer_group_id(struct vfsmount *mnt)
+static void __set_mnt_shared(struct vfsmount *mnt)
 {
-   struct vfsmount *m;
-   int id = mnt-mnt_id;
+   mnt-mnt_flags = ~MNT_PNODE_MASK;
+   mnt-mnt_flags |= MNT_SHARED;
+}
+
+void set_mnt_shared(struct vfsmount *mnt)
+{
+   int res;
 
-   for (m = next_peer(mnt); m != mnt; m = next_peer(m))
-   id = min(id, m-mnt_id);
+ retry:
+   spin_lock(mnt_pgid_lock);
+   if (IS_MNT_SHARED(mnt)) {
+   spin_unlock(mnt_pgid_lock);
+   return;
+   }
 
-   return id;
+   res = ida_get_new(mnt_pgid_ida, mnt-mnt_pgid);
+   spin_unlock(mnt_pgid_lock);
+   if (res == -EAGAIN) {
+   if (ida_pre_get(mnt_pgid_ida, GFP_KERNEL))
+   goto retry;
+   }
+   __set_mnt_shared(mnt);
+}
+
+void clear_mnt_shared(struct vfsmount *mnt)
+{
+   if (IS_MNT_SHARED(mnt)) {
+   mnt-mnt_flags = ~MNT_SHARED;
+   mnt-mnt_pgid = -1;
+   }
+}
+
+void make_mnt_peer(struct vfsmount *old, struct vfsmount *mnt)
+{
+   mnt-mnt_pgid = old-mnt_pgid;
+   list_add(mnt-mnt_share, old-mnt_share);
+   __set_mnt_shared(mnt);
 }
 
-/* return the smallest ID within the peer group */
 int get_peer_group_id(struct vfsmount *mnt)
 {
+   return mnt-mnt_pgid;
+}
+
+int get_master_id(struct vfsmount *mnt)
+{
int id;
 
spin_lock(vfsmount_lock);
-   id = __peer_group_id(mnt);
+   id = get_peer_group_id(mnt-mnt_master);
spin_unlock(vfsmount_lock);
 
return id;
 }
 
-/* return the smallest ID within the master's peer group */
-int get_master_id(struct vfsmount *mnt)
+static struct vfsmount *get_peer_in_ns(struct vfsmount *mnt,
+  struct mnt_namespace *ns)
 {
-   int id;
+   struct vfsmount *m = mnt;
+
+   do {
+   if (m-mnt_ns == ns)
+   return m;
+   m = next_peer(m);
+   } while (m != mnt);
+
+   return NULL;
+}
+
+int get_dominator_id_same_ns(struct vfsmount *mnt)
+{
+   int id = -1;
+   struct vfsmount *m;
 
spin_lock(vfsmount_lock);
-   id = __peer_group_id(mnt-mnt_master);
+   for (m = mnt-mnt_master; m != NULL; m = m-mnt_master) {
+   struct vfsmount *d = get_peer_in_ns(m, mnt-mnt_ns);
+   if (d) {
+   id = d-mnt_pgid;
+   break;
+   }
+   }
spin_unlock(vfsmount_lock);
 
return id;
@@ -80,7 +138,13 @@ static int do_make_slave(struct vfsmount
if (peer_mnt == mnt)
peer_mnt = NULL;
}
-   list_del_init(mnt-mnt_share);
+   if (!list_empty(mnt-mnt_share))
+   list_del_init(mnt-mnt_share);
+   else if (IS_MNT_SHARED(mnt)) {
+   spin_lock(mnt_pgid_lock);
+   ida_remove(mnt_pgid_ida, mnt-mnt_pgid);
+   spin_unlock

Re: NFS/LSM: allow NFS to control all of its own mount options

2008-02-20 Thread Miklos Szeredi
 Please don't introduce a special case for just nfs.  All filesystems
 should control their mount options, so please provide some library
 helpers for context= handling and move it into all filesystems that
 can support selinux.

Hmm, looks like selinux is not showing its mount options in
/proc/mounts.  Well, actually there's no infrastructure for it either.
Here's a template patch (completely untested).

Selinux guys, please fill in the details and submit, thanks.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]

Index: linux/fs/namespace.c
===
--- linux.orig/fs/namespace.c   2008-02-20 10:51:11.0 +0100
+++ linux/fs/namespace.c2008-02-20 10:51:25.0 +0100
@@ -385,6 +385,7 @@ static int show_vfsmnt(struct seq_file *
if (mnt-mnt_flags  fs_infop-flag)
seq_puts(m, fs_infop-str);
}
+   security_sb_show_options(m, mnt-mnt_sb);
if (mnt-mnt_sb-s_op-show_options)
err = mnt-mnt_sb-s_op-show_options(m, mnt);
seq_puts(m,  0 0\n);
Index: linux/include/linux/security.h
===
--- linux.orig/include/linux/security.h 2008-02-18 21:20:03.0 +0100
+++ linux/include/linux/security.h  2008-02-20 11:02:04.0 +0100
@@ -80,6 +80,7 @@ struct xfrm_selector;
 struct xfrm_policy;
 struct xfrm_state;
 struct xfrm_user_sec_ctx;
+struct seq_file;
 
 extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb);
 extern int cap_netlink_recv(struct sk_buff *skb, int cap);
@@ -1226,6 +1227,7 @@ struct security_operations {
int (*sb_copy_data)(struct file_system_type *type,
void *orig, void *copy);
int (*sb_kern_mount) (struct super_block *sb, void *data);
+   int (*sb_show_options) (struct seq_file *, struct super_block *sb);
int (*sb_statfs) (struct dentry *dentry);
int (*sb_mount) (char *dev_name, struct nameidata * nd,
 char *type, unsigned long flags, void *data);
@@ -1487,6 +1489,7 @@ int security_sb_alloc(struct super_block
 void security_sb_free(struct super_block *sb);
 int security_sb_copy_data(struct file_system_type *type, void *orig, void 
*copy);
 int security_sb_kern_mount(struct super_block *sb, void *data);
+int security_sb_show_options(struct seq_file *, struct super_block *sb);
 int security_sb_statfs(struct dentry *dentry);
 int security_sb_mount(char *dev_name, struct nameidata *nd,
char *type, unsigned long flags, void *data);
@@ -1744,6 +1747,12 @@ static inline int security_sb_kern_mount
return 0;
 }
 
+static inline int security_sb_show_options (struct seq_file *m,
+   struct super_block *sb)
+{
+   return 0;
+}
+
 static inline int security_sb_statfs (struct dentry *dentry)
 {
return 0;
Index: linux/security/security.c
===
--- linux.orig/security/security.c  2008-02-18 21:20:06.0 +0100
+++ linux/security/security.c   2008-02-20 10:56:16.0 +0100
@@ -252,6 +252,14 @@ int security_sb_kern_mount(struct super_
return security_ops-sb_kern_mount(sb, data);
 }
 
+int security_sb_show_options (struct seq_file *m, struct super_block *sb)
+{
+   if (security_ops-sb_show_options)
+   return security_ops-sb_show_options(m, sb);
+   else
+   return 0;
+}
+
 int security_sb_statfs(struct dentry *dentry)
 {
return security_ops-sb_statfs(dentry);
Index: linux/security/selinux/hooks.c
===
--- linux.orig/security/selinux/hooks.c 2008-02-18 21:20:06.0 +0100
+++ linux/security/selinux/hooks.c  2008-02-20 10:58:57.0 +0100
@@ -590,6 +590,12 @@ out:
return rc;
 }
 
+static int selinux_sb_show_options(struct seq_file *m, struct super_block *sb)
+{
+   /* ... */
+   return 0;
+}
+
 static int superblock_doinit(struct super_block *sb, void *data)
 {
struct superblock_security_struct *sbsec = sb-s_security;
@@ -4797,6 +4803,7 @@ static struct security_operations selinu
.sb_free_security = selinux_sb_free_security,
.sb_copy_data = selinux_sb_copy_data,
.sb_kern_mount =selinux_sb_kern_mount,
+   .sb_show_options =  selinux_sb_show_options,
.sb_statfs =selinux_sb_statfs,
.sb_mount = selinux_mount,
.sb_umount =selinux_umount,

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


how to show propagation state for mounts

2008-02-20 Thread Miklos Szeredi
 mountinfo - IMO needs a sane discussion of what and how should be shown
 wrt propagation state

Here's my take on the matter.

The propagation tree can either be represented

 1) from root to leaf listing members of peer groups and their
 slaves explicitly,

 2) or from leaf to root by identifying each peer group and then for
 each mount showing the id of its own group and the id of the group's
 master.

2) can have two variants:

 2a) id of peer group is constant in time

 2b) id of peer group may change

The current patch does 2b).  Having a fixed id for each peer group
would mean introducing a new object to anchor the peer group into,
which would add complexity to the whole thing.

All of these are implementable, just need to decide which one we want.

Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: how to show propagation state for mounts

2008-02-20 Thread Miklos Szeredi
 On Wed, Feb 20, 2008 at 04:39:15PM +0100, Miklos Szeredi wrote:
   mountinfo - IMO needs a sane discussion of what and how should be shown
   wrt propagation state
  
  Here's my take on the matter.
  
  The propagation tree can either be represented
  
   1) from root to leaf listing members of peer groups and their
   slaves explicitly,
  
   2) or from leaf to root by identifying each peer group and then for
   each mount showing the id of its own group and the id of the group's
   master.
  
  2) can have two variants:
  
   2a) id of peer group is constant in time
  
   2b) id of peer group may change
  
  The current patch does 2b).  Having a fixed id for each peer group
  would mean introducing a new object to anchor the peer group into,
  which would add complexity to the whole thing.
  
  All of these are implementable, just need to decide which one we want.
 
 Eh...  Much more interesting question: since the propagation tree spans
 multiple namespaces in a lot of normal uses, how do we deal with
 reconstructing propagation through the parts that are not present in
 our namespace?  Moreover, what should and what should not be kept private
 to namespace?  Full exposure of mount trees is definitely over the top
 (it shows potentially sensitive information), so we probably want less
 than that.
 
 FWIW, my gut feeling is that for each peer group that intersects with our
 namespace we ought to expose in some form
   * all vfsmounts belonging to that intersection
   * the nearest dominating peer group (== master (of master ...) of)
 that also has a non-empty intersection with our namespace
 
 It's less about the form of representation (after all, we generate poll
 events when contents of that sucker changes, so one *can* get a consistent
 snapshot of the entire thing) and more about having it self-contained
 when we have namespaces in the play.
 
 IOW, the data in there should give answers to questions that make sense.
 "Do events get propagated from this vfsmount I have to that vfsmount I have?"
 is a meaningful one; ditto for "are events here propagated to somewhere I
 don't see?" or "are events getting propagated here from somewhere I don't
 see?".

Well, assuming you see only one namespace.  When I'm experimenting
with namespaces and propagations, I see both (each in a separate
xterm) and I do want to know how propagation between them happens.

Your suggestion doesn't deal with that problem.

Otherwise, yes it makes sense to have a consistent view of the tree
shown for each namespace.  Perhaps the solution is to restrict viewing
the whole tree to privileged processes.

Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: how to show propagation state for mounts

2008-02-20 Thread Miklos Szeredi
  I wonder, what is wrong in reporting mounts in other namespaces that
  either receive and send propagation to mounts in our namespace?
 
 A plenty.  E.g. if foo trusts control over /var/blah to bar, it's not
 obvious that foo has any business knowing if bar gets it from somebody
 else in turn.  And I'm not sure that bar has any business knowing that
 foo has the damn thing attached in five places instead of just one,
 let alone _where_ it has been attached.
 
 If you get down to it, the thing is about delegating control over part
 of namespace to somebody, without letting them control, see, etc. the
 rest of it.  So I'd rather be very conservative about extra information
 we allow to piggyback on that.  I don't know... perhaps with stable peer
 group IDs it would be OK to show peer group ID by (our) vfsmount + peer
 group ID of master + peer group ID of nearest dominating group that has
 intersection with our namespace.  Then we don't leak information (AFAICS),
 get full propagation information between our vfsmounts and cooperating
 tasks in different namespaces can figure the things out as much as possible
 without leaking 3rd-party information to either.

This sounds fine.

I'll have a look at implementing a stable peer group ID (it doesn't
need a separate object, I realized that now).

Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


git tree with VFS stuff

2008-02-19 Thread Miklos Szeredi
I've created a git tree with the following mounts related stuff:

  - read-only bind mounts
  - /proc/pid/mountinfo
  - unprivileged mounts

git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfsstuff.git master

I guess, giving these a spin in linux-next wouldn't hurt?

Thanks,
Miklos


Dave Hansen (33):
  reiserfs: eliminate private use of struct file in xattr
  hppfs pass vfsmount to dentry_open()
  check for null vfsmount in dentry_open()
  fix up new filp allocators
  do namei_flags calculation inside open_namei()
  merge open_namei() and do_filp_open()
  r/o bind mounts: stub functions
  r/o bind mounts: create helper to drop file write access
  r/o bind mounts: drop write during emergency remount
  r/o bind mounts: elevate write count for vfs_rmdir()
  r/o bind mounts: elevate write count for callers of vfs_mkdir()
  r/o bind mounts: elevate mnt_writers for unlink callers
  r/o bind mounts: elevate write count for xattr_permission() callers
  r/o bind mounts: elevate write count for ncp_ioctl()
  r/o bind mounts: write counts for time functions
  r/o bind mounts: elevate write count for do_utimes()
  r/o bind mounts: write count for file_update_time()
  r/o bind mounts: write counts for link/symlink
  r/o bind mounts: elevate write count for ioctls()
  r/o bind mounts: elevate write count for open()s
  r/o bind mounts: get write access for vfs_rename() callers
  r/o bind mounts: elevate write count for chmod/chown callers
  r/o bind mounts: write counts for truncate()
  r/o bind mounts: elevate count for xfs timestamp updates
  r/o bind mounts: make access() use new r/o helper
  r/o bind mounts: check mnt instead of superblock directly
  r/o bind mounts: get callers of vfs_mknod/create()
  r/o bind mounts: track numbers of writers to mounts
  r/o bind mounts: honor mount writer counts at remount
  r/o bind mounts: debugging for missed calls
  ehea-fix
  fixes for missed struct paths from akpm
  Revert ehea-fix

Miklos Szeredi (10):
  unprivileged mounts: add user mounts to the kernel
  unprivileged mounts: allow unprivileged umount
  unprivileged mounts: propagate error values from clone_mnt
  unprivileged mounts: account user mounts
  unprivileged mounts: allow unprivileged bind mounts
  unprivileged mounts: allow unprivileged mounts
  unprivileged mounts: add sysctl tunable for safe property
  unprivileged mounts: make fuse safe
  unprivileged mounts: propagation: inherit owner from parent
  unprivileged mounts: add no submounts flag

Ram Pai (1):
  vfs-create-proc-pid-mountinfo

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Add MS_BIND_FLAGS mount flag

2008-02-14 Thread Miklos Szeredi
   Maybe instead of messing with masks, it's better to introduce a
   get_flags() or a more general mount_stat() operation, and let
   userspace deal with setting and clearing flags, just as we do for
   stat/chmod?
 
   So we'd have
 
mount_stat(path, stat);
mount_bind(from, to, flags);
mount_set_flags(path, flags);
mount_move(from, to);
 
   and perhaps
 
mount_remount(path, opt_string, flags);
 
 Sounds reasonable to me. But it wouldn't directly solve the "do a
 recursive bind mount setting the MS_READONLY flag on all children"
 problem, so we'd need some of the earlier suggestions too.

Doh, you're right.

Let's try the original idea, but a bit cleaner:

/* flags: */
#define MNT_CTRL_RECURSE (1 << 0)

/* mnt_flags: */
#define MNT_NOSUID  0x01
#define MNT_NODEV   0x02
#define MNT_NOEXEC  0x04
#define MNT_NOATIME 0x08
#define MNT_NODIRATIME  0x10
#define MNT_RELATIME0x20

#define MNT_SHARED  0x1000
#define MNT_UNBINDABLE  0x2000
#define MNT_PNODE_MASK  0x3000

struct mount_param {
u64 flags;  /* control flags */
u64 mnt_flags;  /* new mount flags */
u64 mnt_flags_mask; /* mask for new mount flags */
};

int mount_bindat(int fromfd, const char *frompath,
int tofd, const char *topath,
struct mount_param *param);

int mount_setflagsat(int fd, const char *path,
struct mount_param *param);

int mount_moveat(int fromfd, const char *frompath,
 int tofd, const char *topath);

...

I deliberately did not use the MS_* flags, which are currently a messy mix
of things with totally different meanings.

Does this solve all the issues?

Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Add MS_BIND_FLAGS mount flag

2008-02-14 Thread Miklos Szeredi
  And I'm not against doing it with the at* variants, as Trond
  suggested.
 
 If you're going to change the syscall, then you should ensure that it
 solves _all_ the problems that are known at this time. Ignoring the
 automounter issue is just going to force us to redo the syscall in a
 couple of months...

Sure.

Although, an (almost) equivalent userspace code would be:

mount_fooat(int fd, const char *path)
{
char tmpbuf[64];
int tmpfd = openat(fd, path);

sprintf(tmpbuf, "/proc/self/fd/%i", tmpfd);
return mount_foo(tmpbuf, ...);
}

Or is there something (other than not requiring proc) that the *at
variant gives?

Miklos

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Add MS_BIND_FLAGS mount flag

2008-02-14 Thread Miklos Szeredi
 On Thu, Feb 14, 2008 at 9:31 AM, Miklos Szeredi [EMAIL PROTECTED] wrote:
 
   I deliberately did not use the MS_* flags, which are currently a messy mix
   of things with totally different meanings.
 
   Does this solve all the issues?
 
 We should add a size parameter either in the mount_params or as a
 final argument, for future extensibility.

OK, let's add it to mount_params then.

 And we might as well include MNT_READONLY in the API on the assumption
 that per-mount readonly will be available soon.

Right.  That patch-set should already have been merged into 2.6.25...

Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 01/26] mount options: add documentation

2008-02-08 Thread Miklos Szeredi
   Could also please explain why you want to go via user
   mounts. Other OS use a daemon for that, which e.g. can maintain
   access controls. How do you want to manage this?
  
  The unprivileged mounts patches do contain a simple form of access
  control.  I don't think anything more is needed, but of course, having
  unprivileged mounts in the kernel does not prevent the use of a more
  sophisticated access control daemon in userspace, if that becomes
  necessary.
 
 An "I don't think anything more is needed" sets off all sorts of warning 
 lights. Most things start out simple, so IMO it's very worth it to check 
 where it might go to to know the limits beforehand. The main question here 
 is why should a kernel based solution be preferable over a daemon based 
 solution?

A daemon based solution would work for the normal case, where we
have a single mount namespace and a single /etc/mtab file, and we hope
it doesn't get too much out of sync with what is actually in the
kernel (on remount the mount options do get out of sync, but hey, we
seem to be able to live with that).

However, once you start using multiple namespaces, the daemon based
solution quickly becomes unusable, because you would need a separate
daemon for each namespace, and it would have to somehow keep track of
mount propagations in userspace (which is basically impossible), etc,
etc...

Does that answer your question?

Thanks,
Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 07/10] unprivileged mounts: add sysctl tunable for safe property

2008-02-07 Thread Miklos Szeredi
Maybe sysctls just need to check capabilities, instead of uids.  I
think that would make a lot of sense anyway.
   
   Would it be as simple as tagging the inodes with capability sets?  One
   set for writing, or one each for reading and writing?
  
  Yes, or something even simpler, like mapping the owner permission bits
  to CAP_SYS_ADMIN.  There seem to be very few different permissions
  under /proc/sys:
  
  --w---
  -r--r--r--
  -rw---
  -rw-r--r--
  
  As long as the group and other bits are always the same, and we accept
  that the owner bits really mean CAP_SYS_ADMIN and not something else,
 
 But I would assume some things under /proc/sys/net/ipv4 or
 /proc/sys/net/ath0 require CAP_NET_ADMIN rather than CAP_SYS_ADMIN?

I guess so.  I'm not very familiar with the different capabilities :)

How about this patch then: a hybrid solution between just relying on
permission bits, and specifying separate capability sets for read and
write in addition to the permission bits.

Untested, the 'cap' field obviously still needs to be filled in where
appropriate.

Miklos


Index: linux/include/linux/sysctl.h
===
--- linux.orig/include/linux/sysctl.h   2008-02-04 12:29:01.0 +0100
+++ linux/include/linux/sysctl.h2008-02-07 15:19:06.0 +0100
@@ -1041,6 +1041,7 @@ struct ctl_table 
void *data;
int maxlen;
mode_t mode;
+   int cap;/* Capability needed to read/write */
struct ctl_table *child;
struct ctl_table *parent;   /* Automatically set */
proc_handler *proc_handler; /* Callback for text formatting */
Index: linux/kernel/sysctl.c
===
--- linux.orig/kernel/sysctl.c  2008-02-05 22:17:05.0 +0100
+++ linux/kernel/sysctl.c   2008-02-07 15:30:45.0 +0100
@@ -1527,14 +1527,26 @@ out:
  * some sysctl variables are readonly even to root.
  */
 
-static int test_perm(int mode, int op)
+static int test_perm(struct ctl_table *table, int op)
 {
-   if (!current-euid)
-   mode = 6;
-   else if (in_egroup_p(0))
-   mode = 3;
+   int cap = table-cap;
+   mode_t mode = table-mode;
+
+   if (!cap)
+   cap = CAP_SYS_ADMIN;
+
+   if ((op  MAY_READ)  !(mode  S_IRUGO))
+   return -EACCES;
+
+   if ((op  MAY_WRITE)  !(mode  S_IWUGO))
+   return -EACCES;
+
+   if (capable(cap))
+   return 0;
+
if ((mode  op  0007) == op)
return 0;
+
return -EACCES;
 }
 
@@ -1544,7 +1556,7 @@ int sysctl_perm(struct ctl_table *table,
error = security_sysctl(table, op);
if (error)
return error;
-   return test_perm(table-mode, op);
+   return test_perm(table, op);
 }
 
 #ifdef CONFIG_SYSCTL_SYSCALL
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 07/10] unprivileged mounts: add sysctl tunable for safe property

2008-02-06 Thread Miklos Szeredi
  +   t-table[0].mode = 0644;
 
 Yikes, this could be a problem for containers, as it's simply tied to
 uid 0, whereas tying it to a capability would let us solve it with
 capability bounds.
 
 This might mean more urgency to get user namespaces working at least
 with sysfs, else this is a quick way around having CAP_SYS_ADMIN taken
 out of a container's capability bounding set.

I think I understand the problem, but not the solution.  How are user
namespaces going to help?

Maybe sysctls just need to check capabilities, instead of uids.  I
think that would make a lot of sense anyway.

Thanks,
Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 02/10] unprivileged mounts: allow unprivileged umount

2008-02-05 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

The owner doesn't need sysadmin capabilities to call umount().

Similar behavior as umount(8) on mounts having user=UID option in /etc/mtab.
The difference is that umount also checks /etc/fstab, presumably to exclude
another mount on the same mountpoint.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
Acked-by: Serge Hallyn [EMAIL PROTECTED]
---

Index: linux/fs/namespace.c
===
--- linux.orig/fs/namespace.c   2008-02-04 23:47:50.0 +0100
+++ linux/fs/namespace.c2008-02-04 23:47:53.0 +0100
@@ -1033,6 +1033,27 @@ static int do_umount(struct vfsmount *mn
return retval;
 }
 
+static bool is_mount_owner(struct vfsmount *mnt, uid_t uid)
+{
+   return (mnt-mnt_flags  MNT_USER)  mnt-mnt_uid == uid;
+}
+
+/*
+ * umount is permitted for
+ *  - sysadmin
+ *  - mount owner, if not forced umount
+ */
+static bool permit_umount(struct vfsmount *mnt, int flags)
+{
+   if (capable(CAP_SYS_ADMIN))
+   return true;
+
+   if (flags  MNT_FORCE)
+   return false;
+
+   return is_mount_owner(mnt, current-fsuid);
+}
+
 /*
  * Now umount can handle mount points as well as block devices.
  * This is important for filesystems which use unnamed block devices.
@@ -1056,7 +1077,7 @@ asmlinkage long sys_umount(char __user *
goto dput_and_out;
 
retval = -EPERM;
-   if (!capable(CAP_SYS_ADMIN))
+   if (!permit_umount(nd.path.mnt, flags))
goto dput_and_out;
 
retval = do_umount(nd.path.mnt, flags);

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 05/10] unprivileged mounts: allow unprivileged bind mounts

2008-02-05 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Allow bind mounts to unprivileged users if the following conditions are met:

  - mountpoint is not a symlink
  - parent mount is owned by the user
  - the number of user mounts is below the maximum

Unprivileged mounts imply MS_SETUSER, and will also have the nosuid and
nodev mount flags set.

In particular, if mounting process doesn't have CAP_SETUID capability,
then the nosuid flag will be added, and if it doesn't have CAP_MKNOD
capability, then the nodev flag will be added.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
Acked-by: Serge Hallyn [EMAIL PROTECTED]
---

Index: linux/fs/namespace.c
===
--- linux.orig/fs/namespace.c   2008-02-04 23:47:58.0 +0100
+++ linux/fs/namespace.c2008-02-04 23:48:00.0 +0100
@@ -545,6 +545,11 @@ static void __set_mnt_user(struct vfsmou
WARN_ON(mnt-mnt_flags  MNT_USER);
mnt-mnt_uid = current-fsuid;
mnt-mnt_flags |= MNT_USER;
+
+   if (!capable(CAP_SETUID))
+   mnt-mnt_flags |= MNT_NOSUID;
+   if (!capable(CAP_MKNOD))
+   mnt-mnt_flags |= MNT_NODEV;
 }
 
 static void set_mnt_user(struct vfsmount *mnt)
@@ -1160,22 +1165,26 @@ asmlinkage long sys_oldumount(char __use
 
 #endif
 
-static int mount_is_safe(struct nameidata *nd)
+/*
+ * Conditions for unprivileged mounts are:
+ * - mountpoint is not a symlink
+ * - mountpoint is in a mount owned by the user
+ */
+static bool permit_mount(struct nameidata *nd, int *flags)
 {
+   struct inode *inode = nd-path.dentry-d_inode;
+
if (capable(CAP_SYS_ADMIN))
-   return 0;
-   return -EPERM;
-#ifdef notyet
-   if (S_ISLNK(nd-path.dentry-d_inode-i_mode))
-   return -EPERM;
-   if (nd-path.dentry-d_inode-i_mode  S_ISVTX) {
-   if (current-uid != nd-path.dentry-d_inode-i_uid)
-   return -EPERM;
-   }
-   if (vfs_permission(nd, MAY_WRITE))
-   return -EPERM;
-   return 0;
-#endif
+   return true;
+
+   if (S_ISLNK(inode-i_mode))
+   return false;
+
+   if (!is_mount_owner(nd-path.mnt, current-fsuid))
+   return false;
+
+   *flags |= MS_SETUSER;
+   return true;
 }
 
 static int lives_below_in_same_fs(struct dentry *d, struct dentry *dentry)
@@ -1419,9 +1428,10 @@ static int do_loopback(struct nameidata 
int clone_fl;
struct nameidata old_nd;
struct vfsmount *mnt = NULL;
-   int err = mount_is_safe(nd);
-   if (err)
-   return err;
+   int err;
+
+   if (!permit_mount(nd, flags))
+   return -EPERM;
if (!old_name || !*old_name)
return -EINVAL;
err = path_lookup(old_name, LOOKUP_FOLLOW, old_nd);

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 08/10] unprivileged mounts: make fuse safe

2008-02-05 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Don't require the user_id= and group_id= options for unprivileged mounts,
but if they are present, verify them for sanity.

Disallow the allow_other option for unprivileged mounts.

Document new way of enabling unprivileged mounts for fuse.

Document problems with unprivileged mounts.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
Acked-by: Serge Hallyn [EMAIL PROTECTED]
---

Index: linux/fs/fuse/inode.c
===
--- linux.orig/fs/fuse/inode.c  2008-02-04 23:47:46.0 +0100
+++ linux/fs/fuse/inode.c   2008-02-04 23:48:06.0 +0100
@@ -359,6 +359,19 @@ static int parse_fuse_opt(char *opt, str
d-max_read = ~0;
d-blksize = FUSE_DEFAULT_BLKSIZE;
 
+   /*
+* For unprivileged mounts use current uid/gid.  Still allow
+* user_id and group_id options for compatibility, but
+* only if they match these values.
+*/
+   if (!capable(CAP_SYS_ADMIN)) {
+   d-user_id = current-uid;
+   d-user_id_present = 1;
+   d-group_id = current-gid;
+   d-group_id_present = 1;
+
+   }
+
while ((p = strsep(opt, ,)) != NULL) {
int token;
int value;
@@ -387,6 +400,8 @@ static int parse_fuse_opt(char *opt, str
case OPT_USER_ID:
if (match_int(args[0], value))
return 0;
+   if (d-user_id_present  d-user_id != value)
+   return 0;
d-user_id = value;
d-user_id_present = 1;
break;
@@ -394,6 +409,8 @@ static int parse_fuse_opt(char *opt, str
case OPT_GROUP_ID:
if (match_int(args[0], value))
return 0;
+   if (d-group_id_present  d-group_id != value)
+   return 0;
d-group_id = value;
d-group_id_present = 1;
break;
@@ -603,6 +620,10 @@ static int fuse_fill_super(struct super_
if (!parse_fuse_opt((char *) data, d, is_bdev))
return -EINVAL;
 
+   /* This is a privileged option */
+   if ((d.flags  FUSE_ALLOW_OTHER)  !capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
if (is_bdev) {
 #ifdef CONFIG_BLOCK
if (!sb_set_blocksize(sb, d.blksize))
Index: linux/Documentation/filesystems/fuse.txt
===
--- linux.orig/Documentation/filesystems/fuse.txt   2008-01-24 
23:58:37.0 +0100
+++ linux/Documentation/filesystems/fuse.txt2008-02-05 19:34:24.0 
+0100
@@ -215,11 +215,87 @@ the filesystem.  There are several ways 
   - Abort filesystem through the FUSE control filesystem.  Most
 powerful method, always works.
 
-How do non-privileged mounts work?
-~~
+Unprivileged fuse mounts
+
 
-Since the mount() system call is a privileged operation, a helper
-program (fusermount) is needed, which is installed setuid root.
+Possible problems with unprivileged fuse mounts
+---
+
+FUSE was designed from the beginning to be safe for unprivileged
+users.  This has also been verified in practice over many years, with
+some distributions enabling unprivileged FUSE mounts by default.
+
+However, there are cases when unprivileged mounting a fuse filesystem
+may be problematic, particularly for multi-user systems with untrusted
+users.  So here are a few words of warning:
+
+Due to the design of the process freezer, a hanging (due to network
+problems, etc) or malicious filesystem may prevent suspend to ram
+or hibernation from succeeding.  This is not actually unique to FUSE, as
+any hanging network filesystem will have the same effect.
+
+It is not always possible to use kill(2) (not even with SIGKILL) to
+terminate a process using a FUSE filesystem (see section "Interrupting
+filesystem operations" above).  As a special case of the above,
+killing a self-deadlocked FUSE process is not possible, and even
+killall5 will not terminate it.
+
+If the above could pose a threat to the system, it is recommended
+that unprivileged fuse mounts not be enabled.
+
+Ways of enabling user mounts
+
+
+Now there are two different ways of allowing unprivileged fuse mounts:
+
+ 1) new way: unprivileged mount syscall
+
+ 2) old way: suid-root fusermount utility
+
+Unprivileged mount syscall
+--
+
+To enable this do
+
+  echo 1 > /proc/sys/fs/types/fuse/usermount_safe
+
+or add this line to /etc/sysctl.conf:
+
+  fs.types.fuse.usermount_safe = 1
+
+More information can be found in Documentation/filesystems/proc.txt
+under the /proc/sys/fs/types/ heading.  Also see

[patch 04/10] unprivileged mounts: account user mounts

2008-02-05 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add sysctl variables for accounting and limiting the number of user
mounts.

The maximum number of user mounts is set to 1024 by default.  This
won't in itself enable user mounts; a mount must first be set to be
owned by a user.

[akpm]
 - don't use enumerated sysctls

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
Acked-by: Serge Hallyn [EMAIL PROTECTED]
---

Index: linux/Documentation/filesystems/proc.txt
===
--- linux.orig/Documentation/filesystems/proc.txt   2008-02-04 
23:47:47.0 +0100
+++ linux/Documentation/filesystems/proc.txt2008-02-04 23:47:58.0 
+0100
@@ -1052,6 +1052,15 @@ reaches aio-max-nr then io_setup will fa
 raising aio-max-nr does not result in the pre-allocation or re-sizing
 of any kernel data structures.
 
+nr_user_mounts and max_user_mounts
+--
+
+These represent the number of user mounts and the maximum number of
+user mounts respectively.  User mounts may be created by
+unprivileged users.  User mounts may also be created with sysadmin
+privileges on behalf of a user, in which case nr_user_mounts may
+exceed max_user_mounts.
+
 2.2 /proc/sys/fs/binfmt_misc - Miscellaneous binary formats
 ---
 
Index: linux/fs/namespace.c
===
--- linux.orig/fs/namespace.c   2008-02-04 23:47:56.0 +0100
+++ linux/fs/namespace.c2008-02-04 23:47:58.0 +0100
@@ -46,6 +46,9 @@ static struct list_head *mount_hashtable
 static struct kmem_cache *mnt_cache __read_mostly;
 static struct rw_semaphore namespace_sem;
 
+int nr_user_mounts;
+int max_user_mounts = 1024;
+
 /* /sys/fs */
 struct kobject *fs_kobj;
 EXPORT_SYMBOL_GPL(fs_kobj);
@@ -511,21 +514,70 @@ static struct vfsmount *skip_mnt_tree(st
return p;
 }
 
-static void set_mnt_user(struct vfsmount *mnt)
+static void dec_nr_user_mounts(void)
+{
+   spin_lock(vfsmount_lock);
+   nr_user_mounts--;
+   spin_unlock(vfsmount_lock);
+}
+
+static int reserve_user_mount(void)
+{
+   int err = 0;
+
+   spin_lock(vfsmount_lock);
+   /*
+* EMFILE was error returned by mount(2) in the old days, when
+* the mount count was limited.  Reuse this error value to
+* mean, that the maximum number of user mounts has been
+* exceeded.
+*/
+   if (nr_user_mounts = max_user_mounts  !capable(CAP_SYS_ADMIN))
+   err = -EMFILE;
+   else
+   nr_user_mounts++;
+   spin_unlock(vfsmount_lock);
+   return err;
+}
+
+static void __set_mnt_user(struct vfsmount *mnt)
 {
WARN_ON(mnt-mnt_flags  MNT_USER);
mnt-mnt_uid = current-fsuid;
mnt-mnt_flags |= MNT_USER;
 }
 
+static void set_mnt_user(struct vfsmount *mnt)
+{
+   __set_mnt_user(mnt);
+   spin_lock(vfsmount_lock);
+   nr_user_mounts++;
+   spin_unlock(vfsmount_lock);
+}
+
+static void clear_mnt_user(struct vfsmount *mnt)
+{
+   if (mnt-mnt_flags  MNT_USER) {
+   mnt-mnt_uid = 0;
+   mnt-mnt_flags = ~MNT_USER;
+   dec_nr_user_mounts();
+   }
+}
+
 static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
int flag)
 {
struct super_block *sb = old-mnt_sb;
-   struct vfsmount *mnt = alloc_vfsmnt(old-mnt_devname);
+   struct vfsmount *mnt;
 
+   if (flag  CL_SETUSER) {
+   int err = reserve_user_mount();
+   if (err)
+   return ERR_PTR(err);
+   }
+   mnt = alloc_vfsmnt(old-mnt_devname);
if (!mnt)
-   return ERR_PTR(-ENOMEM);
+   goto alloc_failed;
 
mnt-mnt_flags = old-mnt_flags;
atomic_inc(sb-s_active);
@@ -537,7 +589,7 @@ static struct vfsmount *clone_mnt(struct
/* don't copy the MNT_USER flag */
mnt-mnt_flags = ~MNT_USER;
if (flag  CL_SETUSER)
-   set_mnt_user(mnt);
+   __set_mnt_user(mnt);
 
if (flag  CL_SLAVE) {
list_add(mnt-mnt_slave, old-mnt_slave_list);
@@ -562,6 +614,11 @@ static struct vfsmount *clone_mnt(struct
spin_unlock(vfsmount_lock);
}
return mnt;
+
+ alloc_failed:
+   if (flag  CL_SETUSER)
+   dec_nr_user_mounts();
+   return ERR_PTR(-ENOMEM);
 }
 
 static inline void __mntput(struct vfsmount *mnt)
@@ -577,6 +634,7 @@ static inline void __mntput(struct vfsmo
 */
WARN_ON(atomic_read(mnt-__mnt_writers));
dput(mnt-mnt_root);
+   clear_mnt_user(mnt);
free_vfsmnt(mnt);
deactivate_super(sb);
 }
@@ -1446,6 +1504,7 @@ static int do_remount(struct nameidata *
else
err = do_remount_sb(sb, flags, data, 0);
if (!err) {
+   clear_mnt_user(nd-path.mnt

[patch 06/10] unprivileged mounts: allow unprivileged mounts

2008-02-05 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

For safe filesystems allow unprivileged mounting and forced
unmounting.

A filesystem type is considered safe, if mounting it by an
unprivileged user may not cause a security problem.  This is somewhat
subjective, so setting this property is left to userspace (implemented
in the next patch).

Since most filesystems haven't been designed with unprivileged
mounting in mind, a thorough audit is recommended before setting this
property.

Make this a separate integer member in 'struct file_system_type'
instead of a flag, since that is easier to handle by sysctl code.

Move subtype handling from do_kern_mount() into do_new_mount().  All
other callers are kernel-internal and do not need subtype support.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
Acked-by: Serge Hallyn [EMAIL PROTECTED]
---

Index: linux/fs/namespace.c
===
--- linux.orig/fs/namespace.c   2008-02-04 23:48:00.0 +0100
+++ linux/fs/namespace.c2008-02-04 23:48:02.0 +0100
@@ -1105,14 +1105,16 @@ static bool is_mount_owner(struct vfsmou
 /*
  * umount is permitted for
  *  - sysadmin
- *  - mount owner, if not forced umount
+ *  - mount owner
+ *o if not forced umount,
+ *o if forced umount, and filesystem is safe
  */
 static bool permit_umount(struct vfsmount *mnt, int flags)
 {
if (capable(CAP_SYS_ADMIN))
return true;
 
-   if (flags  MNT_FORCE)
+   if ((flags  MNT_FORCE)  !(mnt-mnt_sb-s_type-fs_safe))
return false;
 
return is_mount_owner(mnt, current-fsuid);
@@ -1170,13 +1172,17 @@ asmlinkage long sys_oldumount(char __use
  * - mountpoint is not a symlink
  * - mountpoint is in a mount owned by the user
  */
-static bool permit_mount(struct nameidata *nd, int *flags)
+static bool permit_mount(struct nameidata *nd, struct file_system_type *type,
+int *flags)
 {
struct inode *inode = nd-path.dentry-d_inode;
 
if (capable(CAP_SYS_ADMIN))
return true;
 
+   if (type  !type-fs_safe)
+   return false;
+
if (S_ISLNK(inode-i_mode))
return false;
 
@@ -1430,7 +1436,7 @@ static int do_loopback(struct nameidata 
struct vfsmount *mnt = NULL;
int err;
 
-   if (!permit_mount(nd, flags))
+   if (!permit_mount(nd, NULL, flags))
return -EPERM;
if (!old_name || !*old_name)
return -EINVAL;
@@ -1611,30 +1617,76 @@ out:
return err;
 }
 
+static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char 
*fstype)
+{
+   int err;
+   const char *subtype = strchr(fstype, '.');
+   if (subtype) {
+   subtype++;
+   err = -EINVAL;
+   if (!subtype[0])
+   goto err;
+   } else
+   subtype = ;
+
+   mnt-mnt_sb-s_subtype = kstrdup(subtype, GFP_KERNEL);
+   err = -ENOMEM;
+   if (!mnt-mnt_sb-s_subtype)
+   goto err;
+   return mnt;
+
+ err:
+   mntput(mnt);
+   return ERR_PTR(err);
+}
+
 /*
  * create a new mount for userspace and request it to be added into the
  * namespace's tree
  */
-static int do_new_mount(struct nameidata *nd, char *type, int flags,
+static int do_new_mount(struct nameidata *nd, char *fstype, int flags,
int mnt_flags, char *name, void *data)
 {
+   int err;
struct vfsmount *mnt;
+   struct file_system_type *type;
 
-   if (!type || !memchr(type, 0, PAGE_SIZE))
+   if (!fstype || !memchr(fstype, 0, PAGE_SIZE))
return -EINVAL;
 
-   /* we need capabilities... */
-   if (!capable(CAP_SYS_ADMIN))
-   return -EPERM;
-
-   mnt = do_kern_mount(type, flags  ~MS_SETUSER, name, data);
-   if (IS_ERR(mnt))
+   type = get_fs_type(fstype);
+   if (!type)
+   return -ENODEV;
+
+   err = -EPERM;
+   if (!permit_mount(nd, type, flags))
+   goto out_put_filesystem;
+
+   if (flags  MS_SETUSER) {
+   err = reserve_user_mount();
+   if (err)
+   goto out_put_filesystem;
+   }
+
+   mnt = vfs_kern_mount(type, flags  ~MS_SETUSER, name, data);
+   if (!IS_ERR(mnt)  (type-fs_flags  FS_HAS_SUBTYPE) 
+   !mnt-mnt_sb-s_subtype)
+   mnt = fs_set_subtype(mnt, fstype);
+   put_filesystem(type);
+   if (IS_ERR(mnt)) {
+   if (flags  MS_SETUSER)
+   dec_nr_user_mounts();
return PTR_ERR(mnt);
+   }
 
if (flags  MS_SETUSER)
-   set_mnt_user(mnt);
+   __set_mnt_user(mnt);
 
return do_add_mount(mnt, nd, mnt_flags, NULL);
+
+ out_put_filesystem:
+   put_filesystem(type);
+   return err;
 }
 
 /*
@@ -1665,7 +1717,7 @@ int do_add_mount(struct vfsmount *newmnt
if (S_ISLNK(newmnt-mnt_root

[patch 10/10] unprivileged mounts: add no submounts flag

2008-02-05 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add a new mount flag nosubmnt, which denies submounts for the owner.
This would be useful, if we want to support traditional /etc/fstab
based user mounts.

In this case mount(8) would still have to be suid-root, to check the
mountpoint against the user/users flag in /etc/fstab, but /etc/mtab
would no longer be mandatory for storing the actual owner of the
mount.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
Acked-by: Serge Hallyn [EMAIL PROTECTED]
---

Index: linux/fs/namespace.c
===
--- linux.orig/fs/namespace.c   2008-02-04 23:48:08.0 +0100
+++ linux/fs/namespace.c2008-02-04 23:48:10.0 +0100
@@ -783,6 +783,7 @@ static void show_mnt_opts(struct seq_fil
{ MNT_NOATIME, ,noatime },
{ MNT_NODIRATIME, ,nodiratime },
{ MNT_RELATIME, ,relatime },
+   { MNT_NOSUBMNT, ,nosubmnt },
{ 0, NULL }
};
const struct proc_fs_info *fs_infop;
@@ -1189,6 +1190,9 @@ static bool permit_mount(struct nameidat
if (S_ISLNK(inode-i_mode))
return false;
 
+   if (nd-path.mnt-mnt_flags  MNT_NOSUBMNT)
+   return false;
+
if (!is_mount_owner(nd-path.mnt, current-fsuid))
return false;
 
@@ -2033,9 +2037,11 @@ long do_mount(char *dev_name, char *dir_
mnt_flags |= MNT_RELATIME;
if (flags  MS_RDONLY)
mnt_flags |= MNT_READONLY;
+   if (flags  MS_NOSUBMNT)
+   mnt_flags |= MNT_NOSUBMNT;
 
-   flags = ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
-  MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT);
+   flags = ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_NOATIME |
+  MS_NODIRATIME | MS_RELATIME | MS_KERNMOUNT | MS_NOSUBMNT);
 
/* ... and get the mountpoint */
retval = path_lookup(dir_name, LOOKUP_FOLLOW, nd);
Index: linux/include/linux/fs.h
===
--- linux.orig/include/linux/fs.h   2008-02-04 23:48:08.0 +0100
+++ linux/include/linux/fs.h2008-02-04 23:48:10.0 +0100
@@ -129,6 +129,7 @@ extern int dir_notify_enable;
 #define MS_KERNMOUNT   (122) /* this is a kern_mount call */
 #define MS_I_VERSION   (123) /* Update inode I_version field */
 #define MS_SETUSER (124) /* set mnt_uid to current user */
+#define MS_NOSUBMNT(125) /* don't allow unprivileged submounts */
 #define MS_ACTIVE  (130)
 #define MS_NOUSER  (131)
 
Index: linux/include/linux/mount.h
===
--- linux.orig/include/linux/mount.h2008-02-04 23:47:50.0 +0100
+++ linux/include/linux/mount.h 2008-02-04 23:48:10.0 +0100
@@ -30,6 +30,7 @@ struct mnt_namespace;
 #define MNT_NODIRATIME 0x10
 #define MNT_RELATIME   0x20
 #define MNT_READONLY   0x40/* does the user want this to be r/o? */
+#define MNT_NOSUBMNT   0x80
 
 #define MNT_SHRINKABLE 0x100
 #define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 01/10] unprivileged mounts: add user mounts to the kernel

2008-02-05 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

This patchset adds support for keeping mount ownership information in the
kernel, and allows unprivileged mount(2) and umount(2) in certain cases.

The mount owner has the following privileges:

  - unmount the owned mount
  - create a submount under the owned mount

The sysadmin can set the owner explicitly on mount and remount.  When an
unprivileged user creates a mount, then the owner is automatically set to the
user.

The following use cases are envisioned:

1) Private namespace, with selected mounts owned by user.  E.g.
   /home/$USER is a good candidate for allowing unpriv mounts and unmounts
   within.

2) Private namespace, with all mounts owned by user and having the nosuid
   flag.  User can mount and umount anywhere within the namespace, but suid
   programs will not work.

3) Global namespace, with a designated directory, which is a mount owned by
   the user.  E.g.  /mnt/users/$USER is set up so that it is bind mounted onto
   itself, and set to be owned by $USER.  The user can add/remove mounts only
   under this directory.

The following extra security measures are taken for unprivileged mounts:

 - usermounts are limited by a sysctl tunable
 - force nosuid,nodev mount options on the created mount

This series increases the size of vmlinux by about 1.5k on x86_64.

For testing unprivileged mounts (and for other purposes) simple
mount/umount utilities are available from:

  http://www.kernel.org/pub/linux/kernel/people/mszeredi/mmount/

A preliminary patch for util-linux-ng to add the same functionality to
mount(8) and umount(8) is available here:

  http://lkml.org/lkml/2008/1/16/103


This patch:

A new mount flag, MS_SETUSER is used to make a mount owned by a user.  If this
flag is specified, then the owner will be set to the current fsuid and the
mount will be marked with the MNT_USER flag.  On remount don't preserve
previous owner, and treat MS_SETUSER as for a new mount.  The MS_SETUSER flag
is ignored on mount move.

The MNT_USER flag is not copied on any kind of mount cloning: namespace
creation, binding or propagation.  For bind mounts the cloned mount(s) are set
to MNT_USER depending on the MS_SETUSER mount flag.  In all the other cases
MNT_USER is always cleared.

For MNT_USER mounts a user=UID option is added to /proc/PID/mounts.  This is
compatible with how mount ownership is stored in /etc/mtab.

The rationale for using MS_SETUSER and MNT_USER to distinguish user
mounts from non-user or legacy mounts is as follows:

  a) Mount(2) and umount(2) on legacy mounts always need CAP_SYS_ADMIN
 capability.  As opposed to user mounts, which will only require,
 that the mount owner matches the current fsuid.  So a process
 with fsuid=0 should not be able to mount/umount legacy mounts
 without the CAP_SYS_ADMIN capability.

  b) Legacy userspace programs may set fsuid to nonzero before calling
 mount(2).  In such an unlikely case, this patchset would cause
 an unintended side effect of making the mount owned by the fsuid.

  c) For legacy mounts, no user=UID option should be shown in
 /proc/mounts for backwards compatibility.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
Acked-by: Serge Hallyn [EMAIL PROTECTED]
---

Index: linux/fs/namespace.c
===
--- linux.orig/fs/namespace.c   2008-02-04 23:47:47.0 +0100
+++ linux/fs/namespace.c2008-02-04 23:47:50.0 +0100
@@ -511,6 +511,13 @@ static struct vfsmount *skip_mnt_tree(st
return p;
 }
 
+static void set_mnt_user(struct vfsmount *mnt)
+{
+   WARN_ON(mnt-mnt_flags  MNT_USER);
+   mnt-mnt_uid = current-fsuid;
+   mnt-mnt_flags |= MNT_USER;
+}
+
 static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
int flag)
 {
@@ -525,6 +532,11 @@ static struct vfsmount *clone_mnt(struct
mnt-mnt_mountpoint = mnt-mnt_root;
mnt-mnt_parent = mnt;
 
+   /* don't copy the MNT_USER flag */
+   mnt-mnt_flags = ~MNT_USER;
+   if (flag  CL_SETUSER)
+   set_mnt_user(mnt);
+
if (flag  CL_SLAVE) {
list_add(mnt-mnt_slave, old-mnt_slave_list);
mnt-mnt_master = old;
@@ -712,6 +724,8 @@ static void show_mnt_opts(struct seq_fil
if (mnt-mnt_flags  fs_infop-flag)
seq_puts(m, fs_infop-str);
}
+   if (mnt-mnt_flags  MNT_USER)
+   seq_printf(m, ,user=%i, mnt-mnt_uid);
 }
 
 static void show_type(struct seq_file *m, struct super_block *sb)
@@ -1320,8 +1334,9 @@ static int do_change_type(struct nameida
 /*
  * do loopback mount.
  */
-static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
+static int do_loopback(struct nameidata *nd, char *old_name, int flags)
 {
+   int clone_fl;
struct nameidata old_nd

[patch 07/10] unprivileged mounts: add sysctl tunable for safe property

2008-02-05 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add the following:

  /proc/sys/fs/types/${FS_TYPE}/usermount_safe

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/filesystems.c
===
--- linux.orig/fs/filesystems.c 2008-02-04 23:47:46.0 +0100
+++ linux/fs/filesystems.c  2008-02-04 23:48:04.0 +0100
@@ -12,6 +12,7 @@
 #include linux/kmod.h
 #include linux/init.h
 #include linux/module.h
+#include linux/sysctl.h
 #include asm/uaccess.h
 
 /*
@@ -51,6 +52,57 @@ static struct file_system_type **find_fi
return p;
 }
 
+#define MAX_FILESYSTEM_VARS 1
+
+struct filesystem_sysctl_table {
+   struct ctl_table_header *header;
+   struct ctl_table table[MAX_FILESYSTEM_VARS + 1];
+};
+
+/*
+ * Create /proc/sys/fs/types/${FSNAME} directory with per fs-type tunables.
+ */
+static int filesystem_sysctl_register(struct file_system_type *fs)
+{
+   struct filesystem_sysctl_table *t;
+   struct ctl_path path[] = {
+   { .procname = fs, .ctl_name = CTL_FS },
+   { .procname = types, .ctl_name = CTL_UNNUMBERED },
+   { .procname = fs-name, .ctl_name = CTL_UNNUMBERED },
+   { }
+   };
+
+   t = kzalloc(sizeof(*t), GFP_KERNEL);
+   if (!t)
+   return -ENOMEM;
+
+
+   t-table[0].ctl_name = CTL_UNNUMBERED;
+   t-table[0].procname = usermount_safe;
+   t-table[0].maxlen = sizeof(int);
+   t-table[0].data = fs-fs_safe;
+   t-table[0].mode = 0644;
+   t-table[0].proc_handler = proc_dointvec;
+
+   t-header = register_sysctl_paths(path, t-table);
+   if (!t-header) {
+   kfree(t);
+   return -ENOMEM;
+   }
+
+   fs-sysctl_table = t;
+
+   return 0;
+}
+
+static void filesystem_sysctl_unregister(struct file_system_type *fs)
+{
+   struct filesystem_sysctl_table *t = fs-sysctl_table;
+
+   unregister_sysctl_table(t-header);
+   kfree(t);
+}
+
 /**
  * register_filesystem - register a new filesystem
  * @fs: the file system structure
@@ -80,6 +132,13 @@ int register_filesystem(struct file_syst
else
*p = fs;
write_unlock(file_systems_lock);
+
+   if (res == 0) {
+   res = filesystem_sysctl_register(fs);
+   if (res != 0)
+   unregister_filesystem(fs);
+   }
+
return res;
 }
 
@@ -108,6 +167,7 @@ int unregister_filesystem(struct file_sy
*tmp = fs-next;
fs-next = NULL;
write_unlock(file_systems_lock);
+   filesystem_sysctl_unregister(fs);
return 0;
}
tmp = (*tmp)-next;
Index: linux/include/linux/fs.h
===
--- linux.orig/include/linux/fs.h   2008-02-04 23:48:02.0 +0100
+++ linux/include/linux/fs.h2008-02-04 23:48:04.0 +0100
@@ -1444,6 +1444,7 @@ struct file_system_type {
struct module *owner;
struct file_system_type * next;
struct list_head fs_supers;
+   struct filesystem_sysctl_table *sysctl_table;
 
struct lock_class_key s_lock_key;
struct lock_class_key s_umount_key;
Index: linux/Documentation/filesystems/proc.txt
===
--- linux.orig/Documentation/filesystems/proc.txt   2008-02-04 
23:47:58.0 +0100
+++ linux/Documentation/filesystems/proc.txt2008-02-04 23:48:04.0 
+0100
@@ -44,6 +44,7 @@ Table of Contents
   2.14 /proc/pid/io - Display the IO accounting fields
   2.15 /proc/pid/coredump_filter - Core dump filtering settings
   2.16 /proc/pid/mountinfo - Information about mounts
+  2.17 /proc/sys/fs/types - File system type specific parameters
 
 --
 Preface
@@ -2392,4 +2393,34 @@ For more information see:
   Documentation/filesystems/sharedsubtree.txt
 
 
+2.17 /proc/sys/fs/types/ - File system type specific parameters
+
+
+There's a separate directory /proc/sys/fs/types/<type>/ for each
+filesystem type, containing the following files:
+
+usermount_safe
+--
+
+Setting this to non-zero will allow filesystems of this type to be
+mounted by unprivileged users (note, that there are other
+prerequisites as well).
+
+Fuse has been designed to be as safe as possible, and some
+distributions already ship with unprivileged fuse mounts enabled by
+default.  There are still some situations (multi-user systems with
+untrusted users in particular), where enabling this for fuse might not
+be appropriate.  For more details, see Documentation/filesystems/fuse.txt
+
+Procfs is also safe, but unprivileged mounting of it is not usually
+necessary (bind mounting is equivalent).
+
+Most

[patch 2/3] mm: Add NR_WRITEBACK_TEMP counter

2008-02-04 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Fuse will use temporary buffers to write back dirty data from memory
mappings (normal writes are done synchronously).  This is needed,
because there cannot be any guarantee about the time in which a write
will complete.

By using temporary buffers, from the MM's point of view the page is
written back immediately.  If the writeout was due to memory pressure,
this effectively migrates data from a full zone to a less full zone.

This patch adds a new counter (NR_WRITEBACK_TEMP) for the number of
pages used as temporary buffers.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/proc/proc_misc.c
===
--- linux.orig/fs/proc/proc_misc.c  2008-02-04 12:29:00.0 +0100
+++ linux/fs/proc/proc_misc.c   2008-02-04 13:01:35.0 +0100
@@ -178,6 +178,7 @@ static int meminfo_read_proc(char *page,
PageTables:   %8lu kB\n
NFS_Unstable: %8lu kB\n
Bounce:   %8lu kB\n
+   WritebackTmp: %8lu kB\n
CommitLimit:  %8lu kB\n
Committed_AS: %8lu kB\n
VmallocTotal: %8lu kB\n
@@ -209,6 +210,7 @@ static int meminfo_read_proc(char *page,
K(global_page_state(NR_PAGETABLE)),
K(global_page_state(NR_UNSTABLE_NFS)),
K(global_page_state(NR_BOUNCE)),
+   K(global_page_state(NR_WRITEBACK_TEMP)),
K(allowed),
K(committed),
(unsigned long)VMALLOC_TOTAL  10,
Index: linux/include/linux/mmzone.h
===
--- linux.orig/include/linux/mmzone.h   2008-02-04 12:29:01.0 +0100
+++ linux/include/linux/mmzone.h2008-02-04 13:01:35.0 +0100
@@ -95,6 +95,7 @@ enum zone_stat_item {
NR_UNSTABLE_NFS,/* NFS unstable pages */
NR_BOUNCE,
NR_VMSCAN_WRITE,
+   NR_WRITEBACK_TEMP,  /* Writeback using temporary buffers */
 #ifdef CONFIG_NUMA
NUMA_HIT,   /* allocated in intended node */
NUMA_MISS,  /* allocated in non intended node */
Index: linux/drivers/base/node.c
===
--- linux.orig/drivers/base/node.c  2008-02-04 12:28:53.0 +0100
+++ linux/drivers/base/node.c   2008-02-04 13:01:35.0 +0100
@@ -64,6 +64,7 @@ static ssize_t node_read_meminfo(struct 
   Node %d PageTables:   %8lu kB\n
   Node %d NFS_Unstable: %8lu kB\n
   Node %d Bounce:   %8lu kB\n
+  Node %d WritebackTmp: %8lu kB\n
   Node %d Slab: %8lu kB\n
   Node %d SReclaimable: %8lu kB\n
   Node %d SUnreclaim:   %8lu kB\n,
@@ -86,6 +87,7 @@ static ssize_t node_read_meminfo(struct 
   nid, K(node_page_state(nid, NR_PAGETABLE)),
   nid, K(node_page_state(nid, NR_UNSTABLE_NFS)),
   nid, K(node_page_state(nid, NR_BOUNCE)),
+  nid, K(node_page_state(nid, NR_WRITEBACK_TEMP)),
   nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) +
node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
   nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)),
Index: linux/mm/page-writeback.c
===
--- linux.orig/mm/page-writeback.c  2008-02-04 13:01:23.0 +0100
+++ linux/mm/page-writeback.c   2008-02-04 13:01:35.0 +0100
@@ -211,7 +211,8 @@ clip_bdi_dirty_limit(struct backing_dev_
avail_dirty = dirty -
(global_page_state(NR_FILE_DIRTY) +
 global_page_state(NR_WRITEBACK) +
-global_page_state(NR_UNSTABLE_NFS));
+global_page_state(NR_UNSTABLE_NFS) +
+global_page_state(NR_WRITEBACK_TEMP));
 
if (avail_dirty  0)
avail_dirty = 0;

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 3/3] fuse: support writable mmap

2008-02-04 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Quoting Linus (3 years ago, FUSE inclusion discussions):

  User-space filesystems are hard to get right. I'd claim that they
   are almost impossible, unless you limit them somehow (shared
   writable mappings are the nastiest part - if you don't have those,
   you can reasonably limit your problems by limiting the number of
   dirty pages you accept through normal write() calls).

Instead of attempting the impossible, I've just waited for the dirty
page accounting infrastructure to materialize (thanks to Peter
Zijlstra and others).  This nicely solved the biggest problem:
limiting the number of pages used for write caching.

Some small details remained, however, which this largish patch
attempts to address.  It provides a page writeback implementation for
fuse, which is completely safe against VM related deadlocks.
Performance may not be very good for certain usage patterns, but
generally it should be acceptable.

It has been tested extensively with fsx-linux and bash-shared-mapping.

This patch depends on
mm-bdi-allow-setting-a-maximum-for-the-bdi-dirty-limit-fix.patch


Fuse page writeback design
--

fuse_writepage() allocates a new temporary page with
GFP_NOFS|__GFP_HIGHMEM.  It copies the contents of the original page,
and queues a WRITE request to the userspace filesystem using this temp
page.

The writeback is finished instantly from the MM's point of view: the
page is removed from the radix trees, and the PageDirty and
PageWriteback flags are cleared.

For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented.  The per-bdi writeback count is not decremented until the
actual write completes.

On dirtying the page, fuse waits for a previous write to finish before
proceeding.  This ensures that there can only be one temporary page used
at a time for one cached page.

This approach is wasteful in both memory and CPU bandwidth, so why is
this complication needed?

The basic problem is that there can be no guarantee about the time in
which the userspace filesystem will complete a write.  It may be buggy
or even malicious, and fail to complete WRITE requests.  We don't want
unrelated parts of the system to grind to a halt in such cases.

Also a filesystem may need additional resources (particularly memory)
to complete a WRITE request.  There's a great danger of a deadlock if
that allocation may wait for the writepage to finish.

Currently there are several cases where the kernel can block on page
writeback:

  - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
  - page migration
  - throttle_vm_writeout (through NR_WRITEBACK)
  - sync(2)

Of course in some cases (fsync, msync) we explicitly want to allow
blocking.  So for these cases new code has to be added to fuse, since
the VM is not tracking writeback pages for us any more.

As an extra safety measure, the maximum dirty ratio allocated to a
single fuse filesystem is set to 1% by default.  This way one (or
several) buggy or malicious fuse filesystems cannot slow down the rest
of the system by hogging dirty memory.

With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/fuse/dev.c
===
--- linux.orig/fs/fuse/dev.c2008-02-04 15:24:03.0 +0100
+++ linux/fs/fuse/dev.c 2008-02-04 15:24:47.0 +0100
@@ -47,6 +47,14 @@ struct fuse_req *fuse_request_alloc(void
return req;
 }
 
+struct fuse_req *fuse_request_alloc_nofs(void)
+{
+   struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_NOFS);
+   if (req)
+   fuse_request_init(req);
+   return req;
+}
+
 void fuse_request_free(struct fuse_req *req)
 {
kmem_cache_free(fuse_req_cachep, req);
@@ -430,6 +438,17 @@ void request_send_background(struct fuse
 }
 
 /*
+ * Called under fc-lock
+ *
+ * fc-connected must have been checked previously
+ */
+void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req)
+{
+   req-isreply = 1;
+   request_send_nowait_locked(fc, req);
+}
+
+/*
  * Lock the request.  Up to the next unlock_request() there mustn't be
  * anything that could cause a page-fault.  If the request was already
  * aborted bail out.
Index: linux/fs/fuse/dir.c
===
--- linux.orig/fs/fuse/dir.c2008-02-04 15:24:03.0 +0100
+++ linux/fs/fuse/dir.c 2008-02-04 15:24:47.0 +0100
@@ -1107,6 +1107,50 @@ static void iattr_to_fattr(struct iattr 
 }
 
 /*
+ * Prevent concurrent writepages on inode
+ *
+ * This is done by adding a negative bias to the inode write counter
+ * and waiting for all pending writes to finish.
+ */
+void fuse_set_nowrite(struct inode *inode)
+{
+   struct fuse_conn *fc = get_fuse_conn(inode);
+   struct fuse_inode *fi = get_fuse_inode(inode

[patch 0/3] fuse: writable mmap

2008-02-04 Thread Miklos Szeredi
This is a short series for fuse writable mmap support.

The first two patches are small additions to mm infrastructure.  The
third is a large patch for fuse.  It also depends on the "mm: bdi:
export BDI attributes in sysfs" series.

I don't mind if this goes into 2.6.25 (guess, that depends on whether
the bdi things go).

Thanks,
Miklos

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 0/3] add perform_write to a_ops

2008-02-04 Thread Miklos Szeredi
a_ops->perform_write() was left out from Nick Piggin's new a_ops
patchset, as it was non-essential, and postponed for later inclusion.

This short series reintroduces it, but only adds the fuse
implementation and not simple_perform_write(), which I'm not sure
would be a significant improvement.

This allows larger than 4k buffered writes for fuse, which is one of
the most requested features.

This goes on top of the "fuse: writable mmap" patches.

Thanks,
Miklos

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 3/3] fuse: implement perform_write

2008-02-04 Thread Miklos Szeredi
From: Nick Piggin [EMAIL PROTECTED]

Introduce fuse_perform_write. With fusexmp (a passthrough filesystem), large
(1MB) writes into a backing tmpfs filesystem are sped up by almost 4 times
(256MB/s vs 71MB/s).

[EMAIL PROTECTED]:

 - split into smaller functions
 - testing

Signed-off-by: Nick Piggin [EMAIL PROTECTED]
Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/fuse/file.c
===
--- linux.orig/fs/fuse/file.c   2008-02-04 17:11:18.0 +0100
+++ linux/fs/fuse/file.c2008-02-04 17:11:59.0 +0100
@@ -677,6 +677,148 @@ static int fuse_write_end(struct file *f
return res;
 }
 
+static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
+   struct inode *inode, loff_t pos,
+   size_t count)
+{
+   size_t res;
+   unsigned offset;
+   unsigned i;
+
+   for (i = 0; i  req-num_pages; i++)
+   fuse_wait_on_page_writeback(inode, req-pages[i]-index);
+
+   res = fuse_send_write(req, file, inode, pos, count, NULL);
+
+   offset = req-page_offset;
+   count = res;
+   for (i = 0; i  req-num_pages; i++) {
+   struct page *page = req-pages[i];
+
+   if (!req-out.h.error  !offset  count = PAGE_CACHE_SIZE)
+   SetPageUptodate(page);
+
+   /* Just ignore count underflow on last page */
+   count -= PAGE_CACHE_SIZE - offset;
+   offset = 0;
+
+   unlock_page(page);
+   page_cache_release(page);
+   }
+
+   return res;
+}
+
+static ssize_t fuse_fill_write_pages(struct fuse_req *req,
+  struct address_space *mapping,
+  struct iov_iter *ii, loff_t pos)
+{
+   struct fuse_conn *fc = get_fuse_conn(mapping-host);
+   unsigned offset = pos  (PAGE_CACHE_SIZE - 1);
+   size_t count = 0;
+   int err;
+
+   req-page_offset = offset;
+
+   do {
+   size_t tmp;
+   struct page *page;
+   pgoff_t index = pos  PAGE_CACHE_SHIFT;
+   size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset,
+iov_iter_count(ii));
+
+   bytes = min_t(size_t, bytes, fc-max_write - count);
+
+ again:
+   err = -EFAULT;
+   if (iov_iter_fault_in_readable(ii, bytes))
+   break;
+
+   err = -ENOMEM;
+   page = __grab_cache_page(mapping, index);
+   if (!page)
+   break;
+
+   pagefault_disable();
+   tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
+   pagefault_enable();
+   flush_dcache_page(page);
+
+   if (!tmp) {
+   unlock_page(page);
+   page_cache_release(page);
+   bytes = min(bytes, iov_iter_single_seg_count(ii));
+   goto again;
+   }
+
+   err = 0;
+   req-pages[req-num_pages] = page;
+   req-num_pages++;
+
+   iov_iter_advance(ii, tmp);
+   count += tmp;
+   pos += tmp;
+   offset += tmp;
+   if (offset == PAGE_CACHE_SIZE)
+   offset = 0;
+
+   } while (iov_iter_count(ii)  count  fc-max_write 
+req-num_pages  FUSE_MAX_PAGES_PER_REQ  offset == 0);
+
+   return count  0 ? count : err;
+}
+
+static ssize_t fuse_perform_write(struct file *file,
+ struct address_space *mapping,
+ struct iov_iter *ii, loff_t pos)
+{
+   struct inode *inode = mapping-host;
+   struct fuse_conn *fc = get_fuse_conn(inode);
+   int err = 0;
+   ssize_t res = 0;
+
+   if (is_bad_inode(inode))
+   return -EIO;
+
+   do {
+   struct fuse_req *req;
+   ssize_t count;
+
+   req = fuse_get_req(fc);
+   if (IS_ERR(req)) {
+   err = PTR_ERR(req);
+   break;
+   }
+
+   count = fuse_fill_write_pages(req, mapping, ii, pos);
+   if (count = 0) {
+   err = count;
+   } else {
+   size_t num_written;
+
+   num_written = fuse_send_write_pages(req, file, inode,
+   pos, count);
+   err = req-out.h.error;
+   if (!err) {
+   res += num_written;
+   pos += num_written;
+
+   /* break out of the loop on short write */
+   if (num_written != count

Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)

2008-02-04 Thread Miklos Szeredi
  In FUSE interrupts are sent to userspace, and the filesystem decides
  what to do with them.  So it is entirely possible and valid for a
  filesystem to ignore an interrupt.  If an operation was non-blocking
  (such as one returning an error), then there would in fact be no
  purpose in checking interrupts.
 

 
 Why do you think that it is valid to ignore pending signals?
 You seem to be asserting that it okay for processes to hang,
 uninterruptibly, when accessing files on fuse mounted file
 systems?
 
 Perhaps the right error to return when there is a signal
 pending is EINTR and not ESTALE or some other error?  There
 has to be some way for the application to detect that its
 system call was interrupted due to a signal pending.

Traditionally a lot of filesystem related system calls are not
interruptible, and for good reason.  For example what happens, if an
app receives a signal, while the filesystem is performing a rename()
request?  It would be very confusing if the call returned EINTR, but
the rename would successfully complete regardless.

We had a related problem with the open(O_CREAT) call in fuse, which
was interruptible between the creation and the actual open because of
a design mistake.  So it could return EINTR, after the file was
created, and this broke a real world application (don't have details
at hand, but could dig them out if you are interested).

I don't know what NFS does, but returning EINTR without actually
canceling an operation in the server is generally not a good idea.

  So while sending a signal might reliably work in NFS to break out of
  the loop, it does not necessarily work for other filesystems, and fuse
  may not be the only one affected.
 

 
 Have you noticed another one?  I would be happy to chat with the
 developers for that file system to see if this support would
 negatively impact them.

Oh, I have no idea.  And I wouldn't want to do a full audit of all the
filesystems to find out.  But if you do, please go ahead.

  A few solutions come to mind, perhaps the best is to introduce a
  kernel internal errno value (ERETRYSTALE), that forces the relevant
  system calls to be retried.
 
  NFS could transform ESTALE errors to ERETRYSTALE and get the desired
  behavior, while other filesystems would not be affected.
 
 We don't need more error numbers, we've got plenty already.  :-)

That's a rather poor excuse against a simple solution which would
spare us some backward compatibility problems.

 Do you have anything more specific about any real problems?
 I see lots of mays and coulds, but I don't see anything
 that I can do to make this support better.

Implement the above suggestion?  Or something else.

Otherwise I have to NAK this patch due to the possibility of it
breaking existing fuse installations.

Thanks,
Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)

2008-02-04 Thread Miklos Szeredi
  I don't know what NFS does, but returning EINTR without actually
  canceling an operation in the server is generally not a good idea.
 

 
 This is what NFS has been doing, for several decades, and no one
 has complained yet.

Is it really?  Man nfs says something quite different (emphasis mine):

   intrIf an NFS file operation has a *major timeout* and  it  is
   hard  mounted,  then  allow signals to interupt the file
   operation and cause it to return EINTR  to  the  calling
   program.  The *default* is to *not* allow file operations to
   be *interrupted*.

  Have you noticed another one?  I would be happy to chat with the
  developers for that file system to see if this support would
  negatively impact them.
  
 
  Oh, I have no idea.  And I wouldn't want to do a full audit of all the
  filesystems to find out.  But if you do, please go ahead.
 

 
 Well, you brought it up.  I thought that perhaps you had something
 other than FUD.

It's not FUD, it's being careful not to break an implementation when
changing an API in a backward incompatible way.

 Please describe this real and existing fuse installation so that I can
 better understand the situation and the real requirements here.

I have already done so:

  Also up till now, returning ESTALE in a fuse filesystem was a
   perfectly valid thing to do.  This patch changes the behavior of
   that rather drastically.  There might be installed systems that
   rely on current behavior, and we want to avoid breaking those on a
   kernel upgrade.

Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 1/3] mm: bdi: export bdi_writeout_inc()

2008-02-04 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Fuse needs this for writable mmap support.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/include/linux/backing-dev.h
===
--- linux.orig/include/linux/backing-dev.h  2008-02-04 12:29:01.0 
+0100
+++ linux/include/linux/backing-dev.h   2008-02-04 13:01:23.0 +0100
@@ -149,6 +149,8 @@ static inline unsigned long bdi_stat_err
 int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio);
 int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
 
+extern void bdi_writeout_inc(struct backing_dev_info *bdi);
+
 /*
  * Flags in backing_dev_info::capability
  * - The first two flags control whether dirty pages will contribute to the
Index: linux/mm/page-writeback.c
===
--- linux.orig/mm/page-writeback.c  2008-02-04 12:29:01.0 +0100
+++ linux/mm/page-writeback.c   2008-02-04 13:01:23.0 +0100
@@ -168,6 +168,16 @@ static inline void __bdi_writeout_inc(st
  bdi-max_prop_frac);
 }
 
+void bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+   unsigned long flags;
+
+   local_irq_save(flags);
+   __bdi_writeout_inc(bdi);
+   local_irq_restore(flags);
+}
+EXPORT_SYMBOL(bdi_writeout_inc);
+
 static inline void task_dirty_inc(struct task_struct *tsk)
 {
prop_inc_single(vm_dirties, tsk-dirties);

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 1/3] vfs: introduce perform_write in a_ops

2008-02-04 Thread Miklos Szeredi
From: Nick Piggin [EMAIL PROTECTED]

Introduce a new perform_write() address space operation.

This is a single-call, bulk version of write_begin/write_end
operations.  It is only used in the buffered write path (write_begin
must still be implemented), and not for in-kernel writes to pagecache.

For some filesystems, using this can provide significant speedups.

Signed-off-by: Nick Piggin [EMAIL PROTECTED]
Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/include/linux/fs.h
===
--- linux.orig/include/linux/fs.h   2008-02-04 15:24:03.0 +0100
+++ linux/include/linux/fs.h2008-02-04 16:24:19.0 +0100
@@ -469,6 +469,9 @@ struct address_space_operations {
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
 
+   ssize_t (*perform_write)(struct file *, struct address_space *mapping,
+   struct iov_iter *i, loff_t pos);
+
/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidatepage) (struct page *, unsigned long);
Index: linux/mm/filemap.c
===
--- linux.orig/mm/filemap.c 2008-02-04 15:24:03.0 +0100
+++ linux/mm/filemap.c  2008-02-04 16:22:55.0 +0100
@@ -2312,7 +2312,9 @@ generic_file_buffered_write(struct kiocb
struct iov_iter i;
 
iov_iter_init(i, iov, nr_segs, count, written);
-   if (a_ops-write_begin)
+   if (a_ops-perform_write)
+   status = a_ops-perform_write(file, mapping, i, pos);
+   else if (a_ops-write_begin)
status = generic_perform_write(file, i, pos);
else
status = generic_perform_write_2copy(file, i, pos);
Index: linux/Documentation/filesystems/vfs.txt
===
--- linux.orig/Documentation/filesystems/vfs.txt2008-02-04 
12:28:50.0 +0100
+++ linux/Documentation/filesystems/vfs.txt 2008-02-04 16:23:44.0 
+0100
@@ -533,6 +533,9 @@ struct address_space_operations {
int (*write_end)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
+   ssize_t (*perform_write)(struct file *, struct address_space *mapping,
+   struct iov_iter *i, loff_t pos);
+
sector_t (*bmap)(struct address_space *, sector_t);
int (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, int);
@@ -664,6 +667,17 @@ struct address_space_operations {
 Returns  0 on failure, otherwise the number of bytes (= 'copied')
 that were able to be copied into pagecache.
 
+  perform_write: This is a single-call, bulk version of write_begin/write_end
+operations. It is only used in the buffered write path (write_begin
+must still be implemented), and not for in-kernel writes to pagecache.
+It takes an iov_iter structure, which provides a descriptor for the
+source data (and has associated iov_iter_xxx helpers to operate on
+that data). There are also file, mapping, and pos arguments, which
+specify the destination of the data.
+
+Returns  0 on failure if nothing was written out, otherwise returns
+the number of bytes copied into pagecache.
+
   bmap: called by the VFS to map a logical block offset within object to
physical block number. This method is used by the FIBMAP
ioctl and for working with swap-files.  To be able to swap to

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH] vfs: optimization to /proc/pid/mountinfo patch

2008-02-04 Thread Miklos Szeredi
 1) reports deleted inode in dentry_path() consistent with that in __d_path()
 2) modified __d_path() to use prepend(), reducing the size of __d_path()
 3) moved all the functionality that reports mount information in /proc under
   CONFIG_PROC_FS.
 
 Could not verify if the code would work with CONFIG_PROC_FS=n, since it was
 impossible to disable CONFIG_PROC_FS. Looking for ideas on how to disable
 CONFIG_PROC_FS.
   
 
 
 Signed-off-by: Ram Pai [EMAIL PROTECTED]
 ---
  fs/dcache.c  |   59 
 +++
  fs/namespace.c   |2 +
  fs/seq_file.c|2 +
  include/linux/dcache.h   |3 ++
  include/linux/seq_file.h |3 ++
  5 files changed, 34 insertions(+), 35 deletions(-)
 
 Index: linux-2.6.23/fs/dcache.c
 ===
 --- linux-2.6.23.orig/fs/dcache.c
 +++ linux-2.6.23/fs/dcache.c
 @@ -1747,6 +1747,17 @@ shouldnt_be_hashed:
   goto shouldnt_be_hashed;
  }
  
 +static int prepend(char **buffer, int *buflen, const char *str,
 +   int namelen)
 +{
 + *buflen -= namelen;
 + if (*buflen  0)
 + return 1;

This is confusing.  Should return -ENAMETOOLONG instead (see Chapter 16
in Documentation/CodingStyle).

 + *buffer -= namelen;
 + memcpy(*buffer, str, namelen);
 + return 0;
 +}
 +
  /**
   * d_path - return the path of a dentry
   * @dentry: dentry to report
 @@ -1768,17 +1779,11 @@ static char *__d_path(struct dentry *den
  {
   char * end = buffer+buflen;
   char * retval;
 - int namelen;
  
 - *--end = '\0';
 - buflen--;
 - if (!IS_ROOT(dentry)  d_unhashed(dentry)) {
 - buflen -= 10;
 - end -= 10;
 - if (buflen  0)
 + prepend(end, buflen, \0, 1);
 + if (!IS_ROOT(dentry)  d_unhashed(dentry) 
 + prepend(end, buflen,  (deleted), 10))

And this should test for "prepend() != 0" or "prepend() < 0" instead,
otherwise it could easily be misread as "if prepend() succeeded,
then ...".

And similarly for all the later calls.

Thanks,
Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 2/3] fuse: clean up setting i_size in write

2008-02-04 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Extract common code for setting i_size in write functions into a
common helper.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/fuse/file.c
===
--- linux.orig/fs/fuse/file.c   2008-02-04 13:01:39.0 +0100
+++ linux/fs/fuse/file.c2008-02-04 13:02:03.0 +0100
@@ -610,13 +610,24 @@ static int fuse_write_begin(struct file 
return 0;
 }
 
+static void fuse_write_update_size(struct inode *inode, loff_t pos)
+{
+   struct fuse_conn *fc = get_fuse_conn(inode);
+   struct fuse_inode *fi = get_fuse_inode(inode);
+
+   spin_lock(fc-lock);
+   fi-attr_version = ++fc-attr_version;
+   if (pos  inode-i_size)
+   i_size_write(inode, pos);
+   spin_unlock(fc-lock);
+}
+
 static int fuse_buffered_write(struct file *file, struct inode *inode,
   loff_t pos, unsigned count, struct page *page)
 {
int err;
size_t nres;
struct fuse_conn *fc = get_fuse_conn(inode);
-   struct fuse_inode *fi = get_fuse_inode(inode);
unsigned offset = pos  (PAGE_CACHE_SIZE - 1);
struct fuse_req *req;
 
@@ -643,12 +654,7 @@ static int fuse_buffered_write(struct fi
err = -EIO;
if (!err) {
pos += nres;
-   spin_lock(fc-lock);
-   fi-attr_version = ++fc-attr_version;
-   if (pos  inode-i_size)
-   i_size_write(inode, pos);
-   spin_unlock(fc-lock);
-
+   fuse_write_update_size(inode, pos);
if (count == PAGE_CACHE_SIZE)
SetPageUptodate(page);
}
@@ -766,12 +772,8 @@ static ssize_t fuse_direct_io(struct fil
}
fuse_put_request(fc, req);
if (res  0) {
-   if (write) {
-   spin_lock(fc-lock);
-   if (pos  inode-i_size)
-   i_size_write(inode, pos);
-   spin_unlock(fc-lock);
-   }
+   if (write)
+   fuse_write_update_size(inode, pos);
*ppos = pos;
}
fuse_invalidate_attr(inode);

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 0/3] add perform_write to a_ops

2008-02-04 Thread Miklos Szeredi
  a_ops-perform_write() was left out from Nick Piggin's new a_ops
  patchset, as it was non-essential, and postponed for later inclusion.
  
  This short series reintroduces it, but only adds the fuse
  implementation and not simple_perform_write(), which I'm not sure
  would be a significant improvement.
  
  This allows larger than 4k buffered writes for fuse, which is one of
  the most requested features.
  
  This goes on top of the fuse: writable mmap patches.
 
 Please don't do this, but rather implement your own .aio_write.  There's
 very little in generic_file_aio_write that wouldn't be handled by
 ->perform_write and we should rather factor those up or move to higher
 layers than adding this ill-defined abstraction.
 

Moving up to higher layers might not be possible, due to lock/unlock
of i_mutex being inside generic_file_aio_write().

But with fuse being the only user, it's not a huge issue duplicating
some code.

Nick, were there any other candidates, that would want to use such an
interface in the future?

Thanks,
Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)

2008-02-02 Thread Miklos Szeredi

  Would you describe the situation that would cause the kernel to
  go into an infinite loop, please?
  
 
  The patch basically does:
 
  do {
  ...
  error = inode-i_op-foo()
  ...
  } while (error == ESTALE);
 
  What is the guarantee, that -foo() will not always return ESTALE?
 
 You skimmed over some stuff, like the pathname lookup component
 contained in the first set of dots...
 
 I can't guarantee that -foo() won't always return ESTALE.
 
 That said, the loop is not unbreakable.  At least for NFS, a signal
 to the process will interrupt the loop because the error returned
 will change from ESTALE to EINTR.

In FUSE interrupts are sent to userspace, and the filesystem decides
what to do with them.  So it is entirely possible and valid for a
filesystem to ignore an interrupt.  If an operation was non-blocking
(such as one returning an error), then there would in fact be no
purpose in checking interrupts.

So while sending a signal might reliably work in NFS to break out of
the loop, it does not necessarily work for other filesystems, and fuse
may not be the only one affected.

Also up till now, returning ESTALE in a fuse filesystem was a
perfectly valid thing to do.  This patch changes the behavior of that
rather drastically.  There might be installed systems that rely on
current behavior, and we want to avoid breaking those on a kernel
upgrade.

A few solutions come to mind, perhaps the best is to introduce a
kernel internal errno value (ERETRYSTALE), that forces the relevant
system calls to be retried.

NFS could transform ESTALE errors to ERETRYSTALE and get the desired
behavior, while other filesystems would not be affected.

Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 2/3] mm: bdi: use MAJOR:MINOR in /sys/class/bdi

2008-02-02 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Uniformly use MAJOR:MINOR in /sys/class/bdi/ for both block devices
and non-block device backed filesystems: FUSE and NFS.

Add symlink for block devices:

/sys/block/<name>/bdi -> /sys/class/bdi/<bdi>

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/block/genhd.c
===
--- linux.orig/block/genhd.c2008-02-02 22:41:03.0 +0100
+++ linux/block/genhd.c 2008-02-02 22:50:03.0 +0100
@@ -178,13 +178,17 @@ static int exact_lock(dev_t devt, void *
  */
 void add_disk(struct gendisk *disk)
 {
+   struct backing_dev_info *bdi;
+
disk-flags |= GENHD_FL_UP;
blk_register_region(MKDEV(disk-major, disk-first_minor),
disk-minors, NULL, exact_match, exact_lock, disk);
register_disk(disk);
blk_register_queue(disk);
-   bdi_register(disk-queue-backing_dev_info, NULL,
-   blk-%s, disk-disk_name);
+
+   bdi = disk-queue-backing_dev_info;
+   bdi_register_dev(bdi, MKDEV(disk-major, disk-first_minor));
+   sysfs_create_link(disk-dev.kobj, bdi-dev-kobj, bdi);
 }
 
 EXPORT_SYMBOL(add_disk);
@@ -192,8 +196,9 @@ EXPORT_SYMBOL(del_gendisk); /* in partit
 
 void unlink_gendisk(struct gendisk *disk)
 {
-   blk_unregister_queue(disk);
+   sysfs_remove_link(disk-dev.kobj, bdi);
bdi_unregister(disk-queue-backing_dev_info);
+   blk_unregister_queue(disk);
blk_unregister_region(MKDEV(disk-major, disk-first_minor),
  disk-minors);
 }
Index: linux/include/linux/backing-dev.h
===
--- linux.orig/include/linux/backing-dev.h  2008-02-02 22:41:03.0 
+0100
+++ linux/include/linux/backing-dev.h   2008-02-02 22:50:03.0 +0100
@@ -62,6 +62,7 @@ void bdi_destroy(struct backing_dev_info
 
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...);
+int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 void bdi_unregister(struct backing_dev_info *bdi);
 
 static inline void __add_bdi_stat(struct backing_dev_info *bdi,
Index: linux/mm/backing-dev.c
===
--- linux.orig/mm/backing-dev.c 2008-02-02 22:43:36.0 +0100
+++ linux/mm/backing-dev.c  2008-02-02 22:50:03.0 +0100
@@ -143,6 +143,12 @@ exit:
 }
 EXPORT_SYMBOL(bdi_register);
 
+int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
+{
+   return bdi_register(bdi, NULL, %u:%u, MAJOR(dev), MINOR(dev));
+}
+EXPORT_SYMBOL(bdi_register_dev);
+
 void bdi_unregister(struct backing_dev_info *bdi)
 {
if (bdi-dev) {
Index: linux/fs/fuse/inode.c
===
--- linux.orig/fs/fuse/inode.c  2008-02-02 22:41:03.0 +0100
+++ linux/fs/fuse/inode.c   2008-02-02 22:50:03.0 +0100
@@ -472,8 +472,7 @@ static struct fuse_conn *new_conn(struct
err = bdi_init(fc-bdi);
if (err)
goto error_kfree;
-   err = bdi_register(fc-bdi, NULL, fuse-%u:%u,
-  MAJOR(fc-dev), MINOR(fc-dev));
+   err = bdi_register_dev(fc-bdi, fc-dev);
if (err)
goto error_bdi_destroy;
fc-reqctr = 0;
Index: linux/fs/nfs/super.c
===
--- linux.orig/fs/nfs/super.c   2008-02-02 22:41:03.0 +0100
+++ linux/fs/nfs/super.c2008-02-02 22:50:03.0 +0100
@@ -1477,8 +1477,7 @@ static int nfs_compare_super(struct supe
 
 static int nfs_bdi_register(struct nfs_server *server)
 {
-   return bdi_register(server-backing_dev_info, NULL, nfs-%u:%u,
-   MAJOR(server-s_dev), MINOR(server-s_dev));
+   return bdi_register_dev(server-backing_dev_info, server-s_dev);
 }
 
 static int nfs_get_sb(struct file_system_type *fs_type,
Index: linux/Documentation/ABI/testing/sysfs-class-bdi
===
--- linux.orig/Documentation/ABI/testing/sysfs-class-bdi2008-02-02 
22:41:03.0 +0100
+++ linux/Documentation/ABI/testing/sysfs-class-bdi 2008-02-02 
22:50:03.0 +0100
@@ -6,17 +6,13 @@ Description:
 Provide a place in sysfs for the backing_dev_info object.
 This allows us to see and set the various BDI specific variables.
 
-The bdi identifyer can take the following forms:
+The bdi identifier can be either of the following:
 
-blk-NAME
+MAJOR:MINOR
 
-   Block devices, NAME is 'sda', 'loop0', etc...
-
-FSTYPE-MAJOR:MINOR
-
-   Non-block device backed filesystems which provide their own
-   BDI, such as NFS and FUSE.  MAJOR:MINOR is the value of st_dev
-   for files on this filesystem.
+   Device number for block devices

[patch 3/3] mm: bdi: move statistics to debugfs

2008-02-02 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Move BDI statistics to debugfs:

   /sys/kernel/debug/bdi/<bdi>/stats

Use postcore_initcall() to initialize the sysfs class and debugfs,
because debugfs is initialized in core_initcall().

Update descriptions in ABI documentation.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/include/linux/backing-dev.h
===
--- linux.orig/include/linux/backing-dev.h  2008-02-02 23:08:41.0 
+0100
+++ linux/include/linux/backing-dev.h   2008-02-02 23:08:41.0 +0100
@@ -16,6 +16,7 @@
 #include asm/atomic.h
 
 struct page;
+struct dentry;
 
 /*
  * Bits in backing_dev_info.state
@@ -55,6 +56,11 @@ struct backing_dev_info {
unsigned int max_ratio, max_prop_frac;
 
struct device *dev;
+
+#ifdef CONFIG_DEBUG_FS
+   struct dentry *debug_dir;
+   struct dentry *debug_stats;
+#endif
 };
 
 int bdi_init(struct backing_dev_info *bdi);
Index: linux/mm/backing-dev.c
===
--- linux.orig/mm/backing-dev.c 2008-02-02 23:08:41.0 +0100
+++ linux/mm/backing-dev.c  2008-02-02 23:12:47.0 +0100
@@ -10,6 +10,80 @@
 
 static struct class *bdi_class;
 
+#ifdef CONFIG_DEBUG_FS
+#include linux/debugfs.h
+#include linux/seq_file.h
+
+static struct dentry *bdi_debug_root;
+
+static void bdi_debug_init(void)
+{
+   bdi_debug_root = debugfs_create_dir(bdi, NULL);
+}
+
+static int bdi_debug_stats_show(struct seq_file *m, void *v)
+{
+   struct backing_dev_info *bdi = m-private;
+   long background_thresh;
+   long dirty_thresh;
+   long bdi_thresh;
+
+   get_dirty_limits(background_thresh, dirty_thresh, bdi_thresh, bdi);
+
+#define K(x) ((x)  (PAGE_SHIFT - 10))
+   seq_printf(m,
+  BdiWriteback: %8lu kB\n
+  BdiReclaimable:   %8lu kB\n
+  BdiDirtyThresh:   %8lu kB\n
+  DirtyThresh:  %8lu kB\n
+  BackgroundThresh: %8lu kB\n,
+  (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
+  (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
+  K(bdi_thresh),
+  K(dirty_thresh),
+  K(background_thresh));
+#undef K
+
+   return 0;
+}
+
+static int bdi_debug_stats_open(struct inode *inode, struct file *file)
+{
+   return single_open(file, bdi_debug_stats_show, inode-i_private);
+}
+
+static const struct file_operations bdi_debug_stats_fops = {
+   .open   = bdi_debug_stats_open,
+   .read   = seq_read,
+   .llseek = seq_lseek,
+   .release= single_release,
+};
+
+static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
+{
+   bdi-debug_dir = debugfs_create_dir(name, bdi_debug_root);
+   bdi-debug_stats = debugfs_create_file(stats, 0444, bdi-debug_dir,
+  bdi, bdi_debug_stats_fops);
+}
+
+static void bdi_debug_unregister(struct backing_dev_info *bdi)
+{
+   debugfs_remove(bdi-debug_stats);
+   debugfs_remove(bdi-debug_dir);
+}
+#else
+static inline void bdi_debug_init(void)
+{
+}
+static inline void bdi_debug_register(struct backing_dev_info *bdi,
+ const char *name)
+{
+}
+static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
+{
+}
+#endif
+
 static ssize_t read_ahead_kb_store(struct device *dev,
  struct device_attribute *attr,
  const char *buf, size_t count)
@@ -40,21 +114,6 @@ static ssize_t name##_show(struct device
 
 BDI_SHOW(read_ahead_kb, K(bdi-ra_pages))
 
-BDI_SHOW(reclaimable_kb, K(bdi_stat(bdi, BDI_RECLAIMABLE)))
-BDI_SHOW(writeback_kb, K(bdi_stat(bdi, BDI_WRITEBACK)))
-
-static inline unsigned long get_dirty(struct backing_dev_info *bdi, int i)
-{
-   unsigned long thresh[3];
-
-   get_dirty_limits(thresh[0], thresh[1], thresh[2], bdi);
-
-   return thresh[i];
-}
-
-BDI_SHOW(dirty_kb, K(get_dirty(bdi, 1)))
-BDI_SHOW(bdi_dirty_kb, K(get_dirty(bdi, 2)))
-
 static ssize_t min_ratio_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
 {
@@ -95,10 +154,6 @@ BDI_SHOW(max_ratio, bdi-max_ratio)
 
 static struct device_attribute bdi_dev_attrs[] = {
__ATTR_RW(read_ahead_kb),
-   __ATTR_RO(reclaimable_kb),
-   __ATTR_RO(writeback_kb),
-   __ATTR_RO(dirty_kb),
-   __ATTR_RO(bdi_dirty_kb),
__ATTR_RW(min_ratio),
__ATTR_RW(max_ratio),
__ATTR_NULL,
@@ -108,10 +163,11 @@ static __init int bdi_class_init(void)
 {
bdi_class = class_create(THIS_MODULE, bdi);
bdi_class-dev_attrs = bdi_dev_attrs;
+   bdi_debug_init();
return 0;
 }
 
-core_initcall(bdi_class_init);
+postcore_initcall(bdi_class_init);
 
 int bdi_register(struct backing_dev_info *bdi

[patch 1/3] mm: bdi: fix read_ahead_kb_store()

2008-02-02 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

This managed to completely evade testing :(

Fix return value to be count or -errno.  Also bring the function in
line with the other store functions on this object, which have more
strict input checking.

Also fix bdi_set_max_ratio() to actually return an error, instead of
always zero.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/mm/backing-dev.c
===
--- linux.orig/mm/backing-dev.c 2008-02-02 23:21:50.0 +0100
+++ linux/mm/backing-dev.c  2008-02-02 23:26:01.0 +0100
@@ -16,10 +16,15 @@ static ssize_t read_ahead_kb_store(struc
 {
struct backing_dev_info *bdi = dev_get_drvdata(dev);
char *end;
+   unsigned long read_ahead_kb;
+   ssize_t ret = -EINVAL;
 
-   bdi-ra_pages = simple_strtoul(buf, end, 10)  (PAGE_SHIFT - 10);
-
-   return end - buf;
+   read_ahead_kb = simple_strtoul(buf, end, 10);
+   if (*buf  (end[0] == '\0' || (end[0] == '\n'  end[1] == '\0'))) {
+   bdi-ra_pages = read_ahead_kb  (PAGE_SHIFT - 10);
+   ret = count;
+   }
+   return ret;
 }
 
 #define K(pages) ((pages)  (PAGE_SHIFT - 10))
Index: linux/mm/page-writeback.c
===
--- linux.orig/mm/page-writeback.c  2008-02-02 20:51:26.0 +0100
+++ linux/mm/page-writeback.c   2008-02-02 23:26:15.0 +0100
@@ -288,7 +288,7 @@ int bdi_set_max_ratio(struct backing_dev
}
spin_unlock_irqrestore(bdi_lock, flags);
 
-   return 0;
+   return ret;
 }
 EXPORT_SYMBOL(bdi_set_max_ratio);
 

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 0/3] mm: bdi: updates

2008-02-02 Thread Miklos Szeredi
Here are incremental patches against the "export BDI attributes in
sysfs" patchset, addressing the issues identified at the last
submission:

  - the read-only attributes are only for debugging
  - more consistent naming needed in /sys/class/bdi
  - documentation problems

I've also done some testing, and fixed some bugs.  Including patches
in -mm can do wonders, even before the kernel containing them is
released :)

Let me know if you prefer a resubmission of the original series with
these changes folded in.

Thanks,
Miklos

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)

2008-02-01 Thread Miklos Szeredi
This doesn't apply to -mm, because the ro-mounts stuff touches a lot
of the same places as this patch.  You probably need to rebase this on
top of those changes.

 This patch adds handling for the error, ESTALE, to the system
 calls which take pathnames as arguments.  The algorithm used
 is to detect that an ESTALE error has occurred during an
 operation subsequent to the lookup process and then to unwind
 appropriately and then to perform the lookup process again.
 Eventually, either the lookup process will return an error
 or a valid dentry/inode combination and then operation can
 succeed or fail based on its own merits.

If a broken NFS server or FUSE filesystem keeps returning ESTALE, this
goes into an infinite loop.  How are we planning to deal with that?

And it has to be dealt with either in the VFS, or in the kernel parts
of the relevant filesystems.  We can't just say, fix the broken
servers, especially not with FUSE, where the server is totally
untrusted.

Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)

2008-02-01 Thread Miklos Szeredi
  This doesn't apply to -mm, because the ro-mounts stuff touches a lot
  of the same places as this patch.  You probably need to rebase this on
  top of those changes.
 

  This patch adds handling for the error, ESTALE, to the system
  calls which take pathnames as arguments.  The algorithm used
  is to detect that an ESTALE error has occurred during an
  operation subsequent to the lookup process and then to unwind
  appropriately and then to perform the lookup process again.
  Eventually, either the lookup process will return an error
  or a valid dentry/inode combination and then operation can
  succeed or fail based on its own merits.
  
 
  If a broken NFS server or FUSE filesystem keeps returning ESTALE, this
  goes into an infinite loop.  How are we planning to deal with that?
 

 
 Would you describe the situation that would cause the kernel to
 go into an infinite loop, please?

The patch basically does:

do {
...
error = inode->i_op->foo()
...
} while (error == ESTALE);

What is the guarantee that ->foo() will not always return ESTALE?

Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 6/6] mm: bdi: allow setting a maximum for the bdi dirty limit

2008-01-31 Thread Miklos Szeredi
 On Tue, 29 Jan 2008 16:49:06 +0100
 Miklos Szeredi [EMAIL PROTECTED] wrote:
 
  Add max_ratio to /sys/class/bdi.  This indicates the maximum
  percentage of the global dirty threshold allocated to this bdi.
 
 Maybe I'm having a stupid day, but I don't understand the semantics of this
 min and max at all.  I've read the code, and I've read the comments (well,
 I've hunted for some) and I've read the docs.
 
 I really don't know how anyone could use this in its current state without
 doing a lot of code-reading and complex experimentation.  All of which
 would be unneeded if this tunable was properly documented.
 
 So.  Please provide adequate documentation for this tunable.  I'd suggest
 that it be pitched at the level of a reasonably competent system operator. 
 It should help them understand why the tunable exists, why they might
 choose to alter it, and what effects they can expect to see.  Hopefully a
 reasonably competent kernel developer can then understand it too.

OK.  I think what's missing from some docs, is a high level
description of the per-bdi throttling algorithm, and how it affects
writeback.  Because with that info, I think the min and max ratios are
trivially understandable: they just override the result of the
algorithm, in case it would yield too high or too low a threshold.

Peter, could you write something about that?

Thanks,
Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 01/26] mount options: add documentation

2008-01-31 Thread Miklos Szeredi
   - loop: how is the connection between file and loop device maintained?
  
  We also discussed this with Karel, maybe it didn't make it onto lkml.
  
  The proposed solution was to store the loop flag separately in a
  file under /var.  It could just be an empty file for each such loop
  device:
  
/var/lib/mount/loops/loop0
  
  This file is created by mount(8) if the '-oloop' option is given.  And
  umount(8) automatically tears down the loop device if it finds this
  file.
 
  It seems we needn't this solution. There is loop auto-destruction
  patch in -mm.
 
  Kernel part:
     http://marc.info/?l=linux-kernel&m=119361296818388&w=2
 
  mount(8) part:
     http://marc.info/?l=util-linux-ng&m=119362955431694&w=2
 
  So, with this patch mount(8) needn't to maintain info about loops and
  umount(8) doesn't need to call LOOP_CLR_FD ioctl, because umount(2)
  is enough.

Excellent!  This is a very good example of how moving functionality
into the kernel can greatly simplify it.

Thanks,
Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 1/6] mm: bdi: tweak task dirty penalty

2008-01-29 Thread Miklos Szeredi
From: Peter Zijlstra [EMAIL PROTECTED]

Penalizing heavy dirtiers with 1/8-th the total dirty limit might be rather
excessive on large memory machines. Use sqrt to scale it sub-linearly.

Update the comment while we're there.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/mm/page-writeback.c
===
--- linux.orig/mm/page-writeback.c  2008-01-17 19:00:56.0 +0100
+++ linux/mm/page-writeback.c   2008-01-18 13:07:16.0 +0100
@@ -219,17 +219,21 @@ static inline void task_dirties_fraction
 }
 
 /*
- * scale the dirty limit
+ * Task specific dirty limit:
  *
- * task specific dirty limit:
+ *   dirty -= 8 * sqrt(dirty) * p_{t}
  *
- *   dirty -= (dirty/8) * p_{t}
+ * Penalize tasks that dirty a lot of pages by lowering their dirty limit. This
+ * avoids infrequent dirtiers from getting stuck in this other guys dirty
+ * pages.
+ *
+ * Use a sub-linear function to scale the penalty, we only need a little room.
  */
 static void task_dirty_limit(struct task_struct *tsk, long *pdirty)
 {
long numerator, denominator;
long dirty = *pdirty;
-   u64 inv = dirty  3;
+   u64 inv = 8*int_sqrt(dirty);
 
task_dirties_fraction(tsk, numerator, denominator);
inv *= numerator;

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 5/6] mm: bdi: allow setting a minimum for the bdi dirty limit

2008-01-29 Thread Miklos Szeredi
From: Peter Zijlstra [EMAIL PROTECTED]

Add min_ratio to /sys/class/bdi.  This indicates the minimum
percentage of the global dirty threshold allocated to this bdi.

[EMAIL PROTECTED]

 - fix parsing in min_ratio_store()
 - document new sysfs attribute

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/include/linux/backing-dev.h
===
--- linux.orig/include/linux/backing-dev.h  2008-01-29 14:40:35.0 
+0100
+++ linux/include/linux/backing-dev.h   2008-01-29 15:35:34.0 +0100
@@ -51,6 +51,8 @@ struct backing_dev_info {
struct prop_local_percpu completions;
int dirty_exceeded;
 
+   unsigned int min_ratio;
+
struct device *dev;
 };
 
@@ -136,6 +138,8 @@ static inline unsigned long bdi_stat_err
 #endif
 }
 
+int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio);
+
 /*
  * Flags in backing_dev_info::capability
  * - The first two flags control whether dirty pages will contribute to the
Index: linux/mm/backing-dev.c
===
--- linux.orig/mm/backing-dev.c 2008-01-29 14:40:35.0 +0100
+++ linux/mm/backing-dev.c  2008-01-29 15:36:35.0 +0100
@@ -50,6 +50,24 @@ static inline unsigned long get_dirty(st
 BDI_SHOW(dirty_kb, K(get_dirty(bdi, 1)))
 BDI_SHOW(bdi_dirty_kb, K(get_dirty(bdi, 2)))
 
+static ssize_t min_ratio_store(struct device *dev,
+   struct device_attribute *attr, const char *buf, size_t count)
+{
+   struct backing_dev_info *bdi = dev_get_drvdata(dev);
+   char *end;
+   unsigned int ratio;
+   ssize_t ret = -EINVAL;
+
+   ratio = simple_strtoul(buf, end, 10);
+   if (*buf  (end[0] == '\0' || (end[0] == '\n'  end[1] == '\0'))) {
+   ret = bdi_set_min_ratio(bdi, ratio);
+   if (!ret)
+   ret = count;
+   }
+   return ret;
+}
+BDI_SHOW(min_ratio, bdi-min_ratio)
+
 #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
 
 static struct device_attribute bdi_dev_attrs[] = {
@@ -58,6 +76,7 @@ static struct device_attribute bdi_dev_a
__ATTR_RO(writeback_kb),
__ATTR_RO(dirty_kb),
__ATTR_RO(bdi_dirty_kb),
+   __ATTR_RW(min_ratio),
__ATTR_NULL,
 };
 
@@ -116,6 +135,8 @@ int bdi_init(struct backing_dev_info *bd
 
bdi-dev = NULL;
 
+   bdi-min_ratio = 0;
+
for (i = 0; i  NR_BDI_STAT_ITEMS; i++) {
err = percpu_counter_init_irq(bdi-bdi_stat[i], 0);
if (err)
Index: linux/mm/page-writeback.c
===
--- linux.orig/mm/page-writeback.c  2008-01-29 14:40:35.0 +0100
+++ linux/mm/page-writeback.c   2008-01-29 15:35:34.0 +0100
@@ -247,6 +247,29 @@ static void task_dirty_limit(struct task
 }
 
 /*
+ *
+ */
+static DEFINE_SPINLOCK(bdi_lock);
+static unsigned int bdi_min_ratio;
+
+int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
+{
+   int ret = 0;
+   unsigned long flags;
+
+   spin_lock_irqsave(bdi_lock, flags);
+   min_ratio -= bdi-min_ratio;
+   if (bdi_min_ratio + min_ratio  100) {
+   bdi_min_ratio += min_ratio;
+   bdi-min_ratio += min_ratio;
+   } else
+   ret = -EINVAL;
+   spin_unlock_irqrestore(bdi_lock, flags);
+
+   return ret;
+}
+
+/*
  * Work out the current dirty-memory clamping and background writeout
  * thresholds.
  *
@@ -334,7 +357,7 @@ get_dirty_limits(long *pbackground, long
*pdirty = dirty;
 
if (bdi) {
-   u64 bdi_dirty = dirty;
+   u64 bdi_dirty;
long numerator, denominator;
 
/*
@@ -342,8 +365,10 @@ get_dirty_limits(long *pbackground, long
 */
bdi_writeout_fraction(bdi, numerator, denominator);
 
+   bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
bdi_dirty *= numerator;
do_div(bdi_dirty, denominator);
+   bdi_dirty += (dirty * bdi-min_ratio) / 100;
 
*pbdi_dirty = bdi_dirty;
clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
Index: linux/Documentation/ABI/testing/sysfs-class-bdi
===
--- linux.orig/Documentation/ABI/testing/sysfs-class-bdi2008-01-29 
14:40:35.0 +0100
+++ linux/Documentation/ABI/testing/sysfs-class-bdi 2008-01-29 
15:37:24.0 +0100
@@ -48,3 +48,9 @@ bdi_dirty_kb (read-only)
Current threshold on this BDI for reclaimable + writeback
memory
 
+min_ratio (read-write)
+
+   Minimal percentage of global dirty threshold allocated to this
+   bdi.  If the value written to this file would make the the sum
+   of all min_ratio values exceed 100, then EINVAL

[patch 2/6] mm: bdi: export BDI attributes in sysfs

2008-01-29 Thread Miklos Szeredi
From: Peter Zijlstra [EMAIL PROTECTED]

Provide a place in sysfs (/sys/class/bdi) for the backing_dev_info
object.  This allows us to see and set the various BDI specific
variables.

In particular this properly exposes the read-ahead window for all
relevant users and /sys/block/<block>/queue/read_ahead_kb should be
deprecated.

With patient help from Kay Sievers and Greg KH

[EMAIL PROTECTED]

 - split off NFS and FUSE changes into separate patches
 - document new sysfs attributes under Documentation/ABI
 - do bdi_class_init as a core_initcall, otherwise the default BDI
   won't be initialized
 - remove bdi_init_fmt macro, it's not used very much

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
CC: Kay Sievers [EMAIL PROTECTED]
CC: Greg KH [EMAIL PROTECTED]
CC: Trond Myklebust [EMAIL PROTECTED]
Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/block/genhd.c
===
--- linux.orig/block/genhd.c2008-01-29 13:02:41.0 +0100
+++ linux/block/genhd.c 2008-01-29 13:02:46.0 +0100
@@ -183,6 +183,8 @@ void add_disk(struct gendisk *disk)
disk-minors, NULL, exact_match, exact_lock, disk);
register_disk(disk);
blk_register_queue(disk);
+   bdi_register(disk-queue-backing_dev_info, NULL,
+   blk-%s, disk-disk_name);
 }
 
 EXPORT_SYMBOL(add_disk);
@@ -191,6 +193,7 @@ EXPORT_SYMBOL(del_gendisk); /* in partit
 void unlink_gendisk(struct gendisk *disk)
 {
blk_unregister_queue(disk);
+   bdi_unregister(disk-queue-backing_dev_info);
blk_unregister_region(MKDEV(disk-major, disk-first_minor),
  disk-minors);
 }
Index: linux/include/linux/backing-dev.h
===
--- linux.orig/include/linux/backing-dev.h  2008-01-29 13:02:41.0 
+0100
+++ linux/include/linux/backing-dev.h   2008-01-29 13:02:46.0 +0100
@@ -11,6 +11,8 @@
 #include linux/percpu_counter.h
 #include linux/log2.h
 #include linux/proportions.h
+#include linux/kernel.h
+#include linux/device.h
 #include asm/atomic.h
 
 struct page;
@@ -48,11 +50,17 @@ struct backing_dev_info {
 
struct prop_local_percpu completions;
int dirty_exceeded;
+
+   struct device *dev;
 };
 
 int bdi_init(struct backing_dev_info *bdi);
 void bdi_destroy(struct backing_dev_info *bdi);
 
+int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+   const char *fmt, ...);
+void bdi_unregister(struct backing_dev_info *bdi);
+
 static inline void __add_bdi_stat(struct backing_dev_info *bdi,
enum bdi_stat_item item, s64 amount)
 {
Index: linux/include/linux/writeback.h
===
--- linux.orig/include/linux/writeback.h2008-01-29 13:02:41.0 
+0100
+++ linux/include/linux/writeback.h 2008-01-29 13:02:46.0 +0100
@@ -113,6 +113,9 @@ struct file;
 int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *,
  void __user *, size_t *, loff_t *);
 
+void get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+struct backing_dev_info *bdi);
+
 void page_writeback_init(void);
 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
unsigned long nr_pages_dirtied);
Index: linux/mm/backing-dev.c
===
--- linux.orig/mm/backing-dev.c 2008-01-29 13:02:41.0 +0100
+++ linux/mm/backing-dev.c  2008-01-29 13:03:23.0 +0100
@@ -4,12 +4,118 @@
 #include linux/fs.h
 #include linux/sched.h
 #include linux/module.h
+#include linux/writeback.h
+#include linux/device.h
+
+
+static struct class *bdi_class;
+
+static ssize_t read_ahead_kb_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+   struct backing_dev_info *bdi = dev_get_drvdata(dev);
+   char *end;
+
+   bdi-ra_pages = simple_strtoul(buf, end, 10)  (PAGE_SHIFT - 10);
+
+   return end - buf;
+}
+
+#define K(pages) ((pages)  (PAGE_SHIFT - 10))
+
+#define BDI_SHOW(name, expr)   \
+static ssize_t name##_show(struct device *dev, \
+  struct device_attribute *attr, char *page)   \
+{  \
+   struct backing_dev_info *bdi = dev_get_drvdata(dev);\
+   \
+   return snprintf(page, PAGE_SIZE-1, %lld\n, (long long)expr);  \
+}
+
+BDI_SHOW(read_ahead_kb, K(bdi-ra_pages))
+
+BDI_SHOW(reclaimable_kb, K(bdi_stat(bdi, BDI_RECLAIMABLE)))
+BDI_SHOW(writeback_kb, K(bdi_stat(bdi, BDI_WRITEBACK

[patch 0/6] mm: bdi: updates

2008-01-29 Thread Miklos Szeredi
This is a series from Peter Zijlstra, with various updates by me.  The
patchset mostly deals with exporting BDI attributes in sysfs.

Should be in a mergeable state, at least into -mm.

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 4/6] mm: bdi: expose the BDI object in sysfs for FUSE

2008-01-29 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Register FUSE's backing_dev_info under sysfs with the name
fuse-MAJOR:MINOR

Make the fuse control filesystem use s_dev instead of a fuse specific
ID.  This makes it easier to match directories under
/sys/fs/fuse/connections/ with directories under /sys/class/bdi, and
with actual mounts.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
CC: Peter Zijlstra [EMAIL PROTECTED]
---

Index: linux/fs/fuse/control.c
===
--- linux.orig/fs/fuse/control.c2008-01-29 10:26:47.0 +0100
+++ linux/fs/fuse/control.c 2008-01-29 12:16:06.0 +0100
@@ -117,7 +117,7 @@ int fuse_ctl_add_conn(struct fuse_conn *
 
parent = fuse_control_sb-s_root;
inc_nlink(parent-d_inode);
-   sprintf(name, %llu, (unsigned long long) fc-id);
+   sprintf(name, %u, fc-dev);
parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2,
 simple_dir_inode_operations,
 simple_dir_operations);
Index: linux/fs/fuse/fuse_i.h
===
--- linux.orig/fs/fuse/fuse_i.h 2008-01-29 10:26:47.0 +0100
+++ linux/fs/fuse/fuse_i.h  2008-01-29 12:16:06.0 +0100
@@ -384,8 +384,8 @@ struct fuse_conn {
/** Entry on the fuse_conn_list */
struct list_head entry;
 
-   /** Unique ID */
-   u64 id;
+   /** Device ID from super block */
+   dev_t dev;
 
/** Dentries in the control filesystem */
struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES];
Index: linux/fs/fuse/inode.c
===
--- linux.orig/fs/fuse/inode.c  2008-01-29 10:26:47.0 +0100
+++ linux/fs/fuse/inode.c   2008-01-29 12:57:26.0 +0100
@@ -448,7 +448,7 @@ static int fuse_show_options(struct seq_
return 0;
 }
 
-static struct fuse_conn *new_conn(void)
+static struct fuse_conn *new_conn(struct super_block *sb)
 {
struct fuse_conn *fc;
int err;
@@ -468,19 +468,27 @@ static struct fuse_conn *new_conn(void)
atomic_set(fc-num_waiting, 0);
fc-bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
fc-bdi.unplug_io_fn = default_unplug_io_fn;
+   fc-dev = sb-s_dev;
err = bdi_init(fc-bdi);
-   if (err) {
-   kfree(fc);
-   fc = NULL;
-   goto out;
-   }
+   if (err)
+   goto error_kfree;
+   err = bdi_register(fc-bdi, NULL, fuse-%u:%u,
+  MAJOR(fc-dev), MINOR(fc-dev));
+   if (err)
+   goto error_bdi_destroy;
fc-reqctr = 0;
fc-blocked = 1;
fc-attr_version = 1;
get_random_bytes(fc-scramble_key, sizeof(fc-scramble_key));
}
-out:
return fc;
+
+error_bdi_destroy:
+   bdi_destroy(fc-bdi);
+error_kfree:
+   mutex_destroy(fc-inst_mutex);
+   kfree(fc);
+   return NULL;
 }
 
 void fuse_conn_put(struct fuse_conn *fc)
@@ -578,12 +586,6 @@ static void fuse_send_init(struct fuse_c
request_send_background(fc, req);
 }
 
-static u64 conn_id(void)
-{
-   static u64 ctr = 1;
-   return ctr++;
-}
-
 static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 {
struct fuse_conn *fc;
@@ -621,7 +623,7 @@ static int fuse_fill_super(struct super_
if (file-f_op != fuse_dev_operations)
return -EINVAL;
 
-   fc = new_conn();
+   fc = new_conn(sb);
if (!fc)
return -ENOMEM;
 
@@ -659,7 +661,6 @@ static int fuse_fill_super(struct super_
if (file-private_data)
goto err_unlock;
 
-   fc-id = conn_id();
err = fuse_ctl_add_conn(fc);
if (err)
goto err_unlock;

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch] vfs: create /proc/pid/mountinfo

2008-01-29 Thread Miklos Szeredi
From: Ram Pai [EMAIL PROTECTED]

/proc/mounts in its current state fails to disambiguate bind mounts,
especially when the bind mount is subrooted.  Also it does not capture
propagation state of the mounts(shared-subtree).  The following patch
addresses the problem.

The patch adds '/proc/<pid>/mountinfo' which contains a superset of
information in '/proc/<pid>/mounts'. The following fields are added:

mntid -- is a unique identifier of the mount
parent -- the id of the parent mount
major:minor -- value of st_dev for files on that filesystem
dir -- the subdir in the filesystem which forms the root of this mount
propagation-type in the form of propagation_flag[:mntid][,...]
note: 'shared' flag is followed by the mntid of its peer mount
  'slave' flag is followed by the mntid of its master mount
  'private' flag stands by itself
  'unbindable' flag stands by itself

Also mount options are split into two fields, the first containing the
per mount flags, the second the per super block options.

Here is a sample cat /proc/mounts after executing the following commands:

mount --bind /mnt /mnt
mount --make-shared /mnt
mount --bind /mnt/1 /var
mount --make-slave /var
mount --make-shared /var
mount --bind /var/abc /tmp
mount --make-unbindable /proc

2 2 0:1 rootfs rootfs / / rw rw private
16 2 98:0 ext2 /dev/root / / rw rw private
17 16 0:3 proc /proc / /proc rw rw unbindable
18 16 0:10 devpts devpts /dev/pts / rw rw private
19 16 98:0 ext2 /dev/root /mnt /mnt rw rw shared:19
20 16 98:0 ext2 /dev/root /mnt/1 /var rw rw shared:21,slave:19
21 16 98:0 ext2 /dev/root /mnt/1/abc /tmp rw rw shared:20,slave:19

For example, the last line indicates that :

1) The mount is a shared mount.
2) It is a peer of the mount with id 20
3) It is also a slave mount of the master-mount with the id  19
4) The filesystem on device with major/minor number 98:0 and subdirectory
mnt/1/abc makes the root directory of this mount.
5) And finally the mount with id 16 is its parent.


[EMAIL PROTECTED]

- new file, rearrange fields
- for mount ID's use IDA (from the IDR library) instead of a 32bit
  counter, which could overflow
- print canonical ID's (smallest one within the peer group) for peers
  and master, this is more useful, than a random ID within the same namespace
- fix a couple of small bugs
- remove inlines
- style fixes

Signed-off-by: Ram Pai [EMAIL PROTECTED]
Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/dcache.c
===
--- linux.orig/fs/dcache.c  2008-01-28 14:54:19.0 +0100
+++ linux/fs/dcache.c   2008-01-28 14:54:50.0 +0100
@@ -1890,6 +1890,60 @@ char *dynamic_dname(struct dentry *dentr
return memcpy(buffer, temp, sz);
 }
 
+static int prepend(char **buffer, int *buflen, const char *str,
+ int namelen)
+{
+   *buflen -= namelen;
+   if (*buflen  0)
+   return 1;
+   *buffer -= namelen;
+   memcpy(*buffer, str, namelen);
+   return 0;
+}
+
+/*
+ * Write full pathname from the root of the filesystem into the buffer.
+ */
+char *dentry_path(struct dentry *dentry, char *buf, int buflen)
+{
+   char *end = buf + buflen;
+   char *retval;
+
+   spin_lock(dcache_lock);
+   prepend(end, buflen, \0, 1);
+   if (!IS_ROOT(dentry)  d_unhashed(dentry)) {
+   if (prepend(end, buflen, //deleted, 9))
+   goto Elong;
+   }
+   if (buflen  1)
+   goto Elong;
+   /* Get '/' right */
+   retval = end-1;
+   *retval = '/';
+
+   for (;;) {
+   struct dentry *parent;
+   if (IS_ROOT(dentry))
+   break;
+
+   parent = dentry-d_parent;
+   prefetch(parent);
+
+   if (prepend(end, buflen, dentry-d_name.name,
+   dentry-d_name.len) ||
+   prepend(end, buflen, /, 1))
+   goto Elong;
+
+   retval = end;
+   dentry = parent;
+   }
+   spin_unlock(dcache_lock);
+   return retval;
+Elong:
+   spin_unlock(dcache_lock);
+   return ERR_PTR(-ENAMETOOLONG);
+}
+
 /*
  * NOTE! The user-level library version returns a
  * character pointer. The kernel system call just
Index: linux/fs/namespace.c
===
--- linux.orig/fs/namespace.c   2008-01-28 14:54:19.0 +0100
+++ linux/fs/namespace.c2008-01-28 14:54:50.0 +0100
@@ -27,6 +27,7 @@
 #include linux/mount.h
 #include linux/ramfs.h
 #include linux/log2.h
+#include linux/idr.h
 #include asm/uaccess.h
 #include asm/unistd.h
 #include pnode.h
@@ -39,6 +40,7 @@
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
 
 static int event;
+static DEFINE_IDA(mnt_id_ida);
 
 static struct list_head *mount_hashtable __read_mostly;
 static struct kmem_cache

[patch 6/6] mm: bdi: allow setting a maximum for the bdi dirty limit

2008-01-29 Thread Miklos Szeredi
From: Peter Zijlstra [EMAIL PROTECTED]

Add max_ratio to /sys/class/bdi.  This indicates the maximum
percentage of the global dirty threshold allocated to this bdi.

[EMAIL PROTECTED]

 - fix parsing in max_ratio_store().
 - export bdi_set_max_ratio() to modules
 - limit bdi_dirty with bdi-max_ratio
 - document new sysfs attribute

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/include/linux/backing-dev.h
===
--- linux.orig/include/linux/backing-dev.h  2008-01-29 16:33:14.0 
+0100
+++ linux/include/linux/backing-dev.h   2008-01-29 16:33:14.0 +0100
@@ -52,6 +52,7 @@ struct backing_dev_info {
int dirty_exceeded;
 
unsigned int min_ratio;
+   unsigned int max_ratio, max_prop_frac;
 
struct device *dev;
 };
@@ -139,6 +140,7 @@ static inline unsigned long bdi_stat_err
 }
 
 int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio);
+int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
 
 /*
  * Flags in backing_dev_info::capability
Index: linux/include/linux/proportions.h
===
--- linux.orig/include/linux/proportions.h  2008-01-29 16:25:14.0 
+0100
+++ linux/include/linux/proportions.h   2008-01-29 16:33:14.0 +0100
@@ -78,6 +78,19 @@ void prop_inc_percpu(struct prop_descrip
 }
 
 /*
+ * Limit the time part in order to ensure there are some bits left for the
+ * cycle counter and fraction multiply.
+ */
+#define PROP_MAX_SHIFT (3*BITS_PER_LONG/4)
+
+#define PROP_FRAC_SHIFT(BITS_PER_LONG - PROP_MAX_SHIFT - 1)
+#define PROP_FRAC_BASE (1UL  PROP_FRAC_SHIFT)
+
+void __prop_inc_percpu_max(struct prop_descriptor *pd,
+  struct prop_local_percpu *pl, long frac);
+
+
+/*
  * - SINGLE --
  */
 
Index: linux/lib/proportions.c
===
--- linux.orig/lib/proportions.c2008-01-29 16:25:14.0 +0100
+++ linux/lib/proportions.c 2008-01-29 16:33:14.0 +0100
@@ -73,12 +73,6 @@
 #include linux/proportions.h
 #include linux/rcupdate.h
 
-/*
- * Limit the time part in order to ensure there are some bits left for the
- * cycle counter.
- */
-#define PROP_MAX_SHIFT (3*BITS_PER_LONG/4)
-
 int prop_descriptor_init(struct prop_descriptor *pd, int shift)
 {
int err;
@@ -268,6 +262,38 @@ void __prop_inc_percpu(struct prop_descr
 }
 
 /*
+ * identical to __prop_inc_percpu, except that it limits this pl's fraction to
+ * @frac/PROP_FRAC_BASE by ignoring events when this limit has been exceeded.
+ */
+void __prop_inc_percpu_max(struct prop_descriptor *pd,
+  struct prop_local_percpu *pl, long frac)
+{
+   struct prop_global *pg = prop_get_global(pd);
+
+   prop_norm_percpu(pg, pl);
+
+   if (unlikely(frac != PROP_FRAC_BASE)) {
+   unsigned long period_2 = 1UL  (pg-shift - 1);
+   unsigned long counter_mask = period_2 - 1;
+   unsigned long global_count;
+   long numerator, denominator;
+
+   numerator = percpu_counter_read_positive(pl-events);
+   global_count = percpu_counter_read(pg-events);
+   denominator = period_2 + (global_count  counter_mask);
+
+   if (numerator  ((denominator * frac)  PROP_FRAC_SHIFT))
+   goto out_put;
+   }
+
+   percpu_counter_add(pl-events, 1);
+   percpu_counter_add(pg-events, 1);
+
+out_put:
+   prop_put_global(pd, pg);
+}
+
+/*
  * Obtain a fraction of this proportion
  *
  *   p_{j} = x_{j} / (period/2 + t % period/2)
Index: linux/mm/backing-dev.c
===
--- linux.orig/mm/backing-dev.c 2008-01-29 16:33:14.0 +0100
+++ linux/mm/backing-dev.c  2008-01-29 16:33:14.0 +0100
@@ -68,6 +68,24 @@ static ssize_t min_ratio_store(struct de
 }
 BDI_SHOW(min_ratio, bdi-min_ratio)
 
+static ssize_t max_ratio_store(struct device *dev,
+   struct device_attribute *attr, const char *buf, size_t count)
+{
+   struct backing_dev_info *bdi = dev_get_drvdata(dev);
+   char *end;
+   unsigned int ratio;
+   ssize_t ret = -EINVAL;
+
+   ratio = simple_strtoul(buf, end, 10);
+   if (*buf  (end[0] == '\0' || (end[0] == '\n'  end[1] == '\0'))) {
+   ret = bdi_set_max_ratio(bdi, ratio);
+   if (!ret)
+   ret = count;
+   }
+   return ret;
+}
+BDI_SHOW(max_ratio, bdi-max_ratio)
+
 #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
 
 static struct device_attribute bdi_dev_attrs[] = {
@@ -77,6 +95,7 @@ static struct device_attribute bdi_dev_a
__ATTR_RO(dirty_kb),
__ATTR_RO(bdi_dirty_kb),
__ATTR_RW(min_ratio

Re: [patch 21/26] mount options: partially fix nfs

2008-01-28 Thread Miklos Szeredi
  All mount options should be shown, which are needed to reconstruct a
  previous mount.
 
 Ah, OK.
 
 I'm happy to implement logic to display the all missing options.  I  
 should have updated nfs_show_mount_options() when I wrote the NFS  
 mount option parser.
 
 Let me know your preference.

You are more familiar with NFS, so I think it would be better if you
updated nfs_show_mount_options().

Could you also queue my patch (updated) or incorporate it into a
combined fix?

Thanks,
Miklos


Subject: mount options: partially fix nfs

From: Miklos Szeredi [EMAIL PROTECTED]

Add posix, bsize=, namelen= options to /proc/mounts for nfs
filesystems.

Document several other options that are still missing.

Changes:

 - display namelen= unconditionally
 - addr= isn't missing after all

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
CC: Trond Myklebust [EMAIL PROTECTED]
---

Index: linux/fs/nfs/super.c
===
--- linux.orig/fs/nfs/super.c   2008-01-25 15:44:56.0 +0100
+++ linux/fs/nfs/super.c2008-01-25 15:57:32.0 +0100
@@ -449,6 +449,7 @@ static void nfs_show_mount_options(struc
} nfs_info[] = {
{ NFS_MOUNT_SOFT, ,soft, ,hard },
{ NFS_MOUNT_INTR, ,intr, ,nointr },
+   { NFS_MOUNT_POSIX, ,posix,  },
{ NFS_MOUNT_NOCTO, ,nocto,  },
{ NFS_MOUNT_NOAC, ,noac,  },
{ NFS_MOUNT_NONLM, ,nolock,  },
@@ -463,6 +464,9 @@ static void nfs_show_mount_options(struc
seq_printf(m, ,vers=%d, clp-rpc_ops-version);
seq_printf(m, ,rsize=%d, nfss-rsize);
seq_printf(m, ,wsize=%d, nfss-wsize);
+   seq_printf(m, ,namelen=%d, nfss-namelen);
+   if (nfss-bsize != 0)
+   seq_printf(m, ,bsize=%d, nfss-bsize);
if (nfss-acregmin != 3*HZ || showdefaults)
seq_printf(m, ,acregmin=%d, nfss-acregmin/HZ);
if (nfss-acregmax != 60*HZ || showdefaults)
@@ -482,6 +486,17 @@ static void nfs_show_mount_options(struc
seq_printf(m, ,timeo=%lu, 10U * nfss-client-cl_timeout-to_initval 
/ HZ);
seq_printf(m, ,retrans=%u, nfss-client-cl_timeout-to_retries);
seq_printf(m, ,sec=%s, 
nfs_pseudoflavour_to_name(nfss-client-cl_auth-au_flavor));
+
+   /*
+* Missing options:
+* port=
+* mountport=
+* mountvers=
+* mountproto=
+* clientaddr=
+* mounthost=
+* mountaddr=
+*/
 }
 
 /*
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 00/26] mount options: fix filesystem's -show_options

2008-01-28 Thread Miklos Szeredi
  On Thu, 24 Jan 2008 20:33:41 +0100 Miklos Szeredi [EMAIL PROTECTED] wrote:
  Andrew,
  
  Would you please consider these patches for -mm?
 
 Sure, but I'm too lazy to pick through them and work out which ones need
 updating, which ones got acked and which ones someone else merged, all on a
 very bumpy plane flight ;)
 
 Please resend when the dust has settled?

Yes, I should have realized that it wouldn't quite work in a single iteration :)

I'll resend them in a moment.

Thanks,
Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 24/26] mount options: fix tmpfs

2008-01-28 Thread Miklos Szeredi
 
 Thanks Miklos, that's a welcome enhancement, nicely done.  I've only
 noticed one thing wrong (MPOL_PREFERRED shown as default); but thought
 shmem_config didn't add much value - I'd rather avoid those syntactic
 changes to unchanged code; and several tmpfs defaults being relative
 (e.g. to totalram_pages, or to mounter's fsuid), I ended up preferring
 to do real tests in shmem_show_options.

I completely agree, this is much better than my version.

 Thus, for example, if memory is hotplugged in or out later, what started
 out as an unspecified size option will then get shown as explicit size.
 (I did think for a while that I wanted to show explicit size in all
 cases; but it looked pretty silly on udev.)  I think that's the correct
 behaviour, that otherwise would be misleading; but I may be looking at
 this the wrong way round, what's your view?

I agree, this is the correct way.

I'll add functions for calculating the default max values, so the
calculations won't accidentally become different for the
initialization and the option showing.

 If you agree with the version below, please take it into your collection
 and insert your Signed-off-by.  I should admit, I've not yet tested how
 the NUMA policies look: you'll hear from me again tomorrow morning if
 those turn out to be wrong.

OK, I'll send this to Andrew.  Maybe I'll wait until tomorrow to hear
if it's working on NUMA.

Thanks,
Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 10/26] mount options: fix devpts

2008-01-25 Thread Miklos Szeredi
  Also add minor fix: when parsing the mode option, mask with
  S_IALLUGO instead of ~S_IFMT, which could leave unused bits in the
  mask.
 
 umode_t is 16 bits, so it doesn't.  The change is still good, of course.

We still use 16 bit types?  Strange ;)

 
  +   if (config.mode != DEVPTS_DEFAULT_MODE)
  +   seq_printf(seq, ,mode=%03o, config.mode);
 
 I would rather this be unconditional, than that it be conditional on 
 something other than the user having specified it in the first place.

Yeah, it's a matter of taste.  I'll update the patch.

Actually, a lot of filesystems share the options 'uid=X', 'gid=X',
'mode=X' (or 'umask=X').  This could be handled by the VFS, saving
some code, and making things more consistent.  One day maybe...

Thanks,
Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 01/26] mount options: add documentation

2008-01-25 Thread Miklos Szeredi
 Where did you check for the existence of a -show_options method for
 unionfs?  Unionfs does implement -show_options and supports all of the
 mount/remount options.  See:
 
 http://git.kernel.org/?p=linux/kernel/git/ezk/unionfs.git;a=blob;f=fs/unionfs/super.c;h=986c980261a5b171147d66ac05bf08423e2fd6b6;hb=HEAD#l963
 
 The unionfs -remount code supports branch-management options which can
 add/del/change a branch, but we don't show those directly in -show_options;
 it makes more sense to show the final (and thus most current) branch
 configuration.
 
 Could you update your records please?

Sure.  Sorry about that, I did actually look at unionfs, and it was
just an administration error and bad memory (in my head).

 BTW, I should be able to use your save_mount_options().

It is probably better not to use save_mount_options(), especially
since unionfs implements a remount that changes the tree only
partially, AFAICS.

Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 25/26] mount options: fix udf

2008-01-25 Thread Miklos Szeredi
 | +   /* is this correct? */
 | +   if (sbi-s_anchor[2] != 0)
 | +   seq_printf(seq, ,anchor=%u, sbi-s_anchor[2]);
 
 you know, I would prefer to use form UDF_SB_ANCHOR(sb)[2]
 in sake of style unification but we should wait for Jan's
 decision (i'm not the expert in this area ;)

I think UDF_SB_ANCHOR macro was removed by some patch in -mm.

I'm more interested if the second element of the s_anchor array really
does always have the value of the 'anchor=N' mount option.  I haven't
been able to verify that fully.  Do you have some insight into that?

Thanks,
Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 21/26] mount options: partially fix nfs

2008-01-25 Thread Miklos Szeredi
 Miklos Szeredi wrote:
  From: Miklos Szeredi [EMAIL PROTECTED]
  
  Add posix, bsize=, namelen= options to /proc/mounts for nfs
  filesystems.
  
  Document several other options that are still missing.
 
 NFS lists only some options in /proc/mounts on purpose: only the 
 essential options are mentioned there to keep clutter down.  The three 
 you've added here are for all intents and purposes deprecated, which is 
 why they are not supported.
 
 NFS lists a more complete set of mount options for a mount point in 
 /proc/self/mountstats.  See nfs_show_stats().
 
 Since your cover letter does not explain why you are changing this code, 
 can you refer me to a description of why you are doing this?

The description is in the 01/26 patch.

 More below.
 
  Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
  ---
  
  Index: linux/fs/nfs/super.c
  ===
  --- linux.orig/fs/nfs/super.c   2008-01-19 11:56:34.0 +0100
  +++ linux/fs/nfs/super.c2008-01-21 20:41:30.0 +0100
  @@ -449,6 +449,7 @@ static void nfs_show_mount_options(struc
  } nfs_info[] = {
  { NFS_MOUNT_SOFT, ,soft, ,hard },
  { NFS_MOUNT_INTR, ,intr, ,nointr },
  +   { NFS_MOUNT_POSIX, ,posix,  },
  { NFS_MOUNT_NOCTO, ,nocto,  },
  { NFS_MOUNT_NOAC, ,noac,  },
  { NFS_MOUNT_NONLM, ,nolock,  },
  @@ -459,10 +460,17 @@ static void nfs_show_mount_options(struc
  };
  const struct proc_nfs_info *nfs_infop;
  struct nfs_client *clp = nfss-nfs_client;
  +   unsigned int default_namelen =
  +   clp-rpc_ops-version == 4 ? NFS4_MAXNAMLEN :
  +   clp-rpc_ops-version == 3 ? NFS3_MAXNAMLEN : NFS2_MAXNAMLEN;
   
  seq_printf(m, ,vers=%d, clp-rpc_ops-version);
  seq_printf(m, ,rsize=%d, nfss-rsize);
  seq_printf(m, ,wsize=%d, nfss-wsize);
  +   if (nfss-bsize != 0)
  +   seq_printf(m, ,bsize=%d, nfss-bsize);
  +   if (nfss-namelen != default_namelen)
  +   seq_printf(m, ,namelen=%d, nfss-namelen);
  if (nfss-acregmin != 3*HZ || showdefaults)
  seq_printf(m, ,acregmin=%d, nfss-acregmin/HZ);
  if (nfss-acregmax != 60*HZ || showdefaults)
  @@ -482,6 +490,18 @@ static void nfs_show_mount_options(struc
  seq_printf(m, ,timeo=%lu, 10U * nfss-client-cl_timeout-to_initval 
  / HZ);
  seq_printf(m, ,retrans=%u, nfss-client-cl_timeout-to_retries);
  seq_printf(m, ,sec=%s, 
  nfs_pseudoflavour_to_name(nfss-client-cl_auth-au_flavor));
  +
  +   /*
  +* Missing options:
  +* port=
 
 Probably should be supported.
 
  +* addr=
 
 This one is already supported; see nfs_show_options().

Right, thanks.

 
  +* clientaddr=
 
 This one isn't, and should be... would be useful for tracking down 
 certain NFSv4 problems.
 
  +* mounthost=
  +* mountaddr=
   +   * mountport=
   +   * mountvers=
   +   * mountproto=
 
 And these mount* options are for the kernel's new mount protocol client. 
   They aren't really useful for understanding steady-state NFS client 
 behavior, they only affect mount-time behavior.

All mount options should be shown, which are needed to reconstruct a
previous mount.

For example, if you copy options out from /proc/mounts, umount the
filesystem, and then create a new mount with the copied options, you
should get the same mount.

So it is not only the options useful for understanding steady-state
behavior that are interesting.

The only options which should not be shown are those which have a
permanent effect at mount time (like journal creation), and those
which are meaningless across different mounts (like communication
file descriptors).

Thanks,
Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 21/26] mount options: partially fix nfs

2008-01-25 Thread Miklos Szeredi
 On Thu, 2008-01-24 at 20:34 +0100, Miklos Szeredi wrote:
  plain text document attachment (nfs_opts.patch)
  From: Miklos Szeredi [EMAIL PROTECTED]
  
  Add posix, bsize=, namelen= options to /proc/mounts for nfs
  filesystems.
  
  Document several other options that are still missing.
  
  Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
  ---
  
  Index: linux/fs/nfs/super.c
  ===
  --- linux.orig/fs/nfs/super.c   2008-01-19 11:56:34.0 +0100
  +++ linux/fs/nfs/super.c2008-01-21 20:41:30.0 +0100
  @@ -449,6 +449,7 @@ static void nfs_show_mount_options(struc
  } nfs_info[] = {
  { NFS_MOUNT_SOFT, ,soft, ,hard },
  { NFS_MOUNT_INTR, ,intr, ,nointr },
  +   { NFS_MOUNT_POSIX, ,posix,  },
  { NFS_MOUNT_NOCTO, ,nocto,  },
  { NFS_MOUNT_NOAC, ,noac,  },
  { NFS_MOUNT_NONLM, ,nolock,  },
  @@ -459,10 +460,17 @@ static void nfs_show_mount_options(struc
  };
  const struct proc_nfs_info *nfs_infop;
  struct nfs_client *clp = nfss-nfs_client;
  +   unsigned int default_namelen =
  +   clp-rpc_ops-version == 4 ? NFS4_MAXNAMLEN :
  +   clp-rpc_ops-version == 3 ? NFS3_MAXNAMLEN : NFS2_MAXNAMLEN;
  seq_printf(m, ,vers=%d, clp-rpc_ops-version);
  seq_printf(m, ,rsize=%d, nfss-rsize);
  seq_printf(m, ,wsize=%d, nfss-wsize);
  +   if (nfss-bsize != 0)
  +   seq_printf(m, ,bsize=%d, nfss-bsize);
  +   if (nfss-namelen != default_namelen)
  +   seq_printf(m, ,namelen=%d, nfss-namelen);
 
 You really just want to look at the value of nfss-namelen. It should
 always be set.

OK, I usually add the condition for (value != default_value) to avoid
unnecessary clutter.  But sure, there's no problem with showing the
option unconditionally.

 
  if (nfss-acregmin != 3*HZ || showdefaults)
  seq_printf(m, ,acregmin=%d, nfss-acregmin/HZ);
  if (nfss-acregmax != 60*HZ || showdefaults)
  @@ -482,6 +490,18 @@ static void nfs_show_mount_options(struc
  seq_printf(m, ,timeo=%lu, 10U * nfss-client-cl_timeout-to_initval 
  / HZ);
  seq_printf(m, ,retrans=%u, nfss-client-cl_timeout-to_retries);
  seq_printf(m, ,sec=%s, 
  nfs_pseudoflavour_to_name(nfss-client-cl_auth-au_flavor));
  +
  +   /*
  +* Missing options:
  +* port=
  +* mountport=
  +* mountvers=
  +* mountproto=
  +* addr=
  +* clientaddr=
  +* mounthost=
  +* mountaddr=
  +*/
 
 The new text mount interface actually does allow us to store these
 values if we really do need to. That should be a separate patch,
 however.

OK.

Thanks,
Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 26/26] mount options: fix usbfs

2008-01-25 Thread Miklos Szeredi
  From: Miklos Szeredi [EMAIL PROTECTED]
  
  Add a .show_options super operation to usbfs.
  
  Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
 
 Looks good to me.  Do you want to take this through your tree, as it is
 dependent on other changes, or do you want me to take this through the
 USB tree?  Whatever is easier for you is fine for me.

Please take it, it should be independent of the other changes.

Thanks,
Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 25/26] mount options: fix udf

2008-01-25 Thread Miklos Szeredi
   | +   /* is this correct? */
   | +   if (sbi-s_anchor[2] != 0)
   | +   seq_printf(seq, ,anchor=%u, sbi-s_anchor[2]);
   
   you know, I would prefer to use form UDF_SB_ANCHOR(sb)[2]
   in sake of style unification but we should wait for Jan's
   decision (i'm not the expert in this area ;)
  
  I think UDF_SB_ANCHOR macro was removed by some patch in -mm.
   Yes, it's going to be removed so don't use it. Actually, basing this
 patch on top of -mm is a good idea because there are quite some changes
 in Andrew's queue.
 
  I'm more interested if the second element of the s_anchor array really
  does always have the value of the 'anchor=N' mount option.  I haven't
  been able to verify that fully.  Do you have some insight into that?
   As Cyrill wrote, it could be zeroed out in case there is no anchor in
 the specified block. So I guess you have to store the passed value
 somewhere else..

But in that case, would the value of the anchor= option matter?

This is actually a somewhat philosophical question about what the
mount options in /proc/mounts mean:

 1) Options _given_ by the user for the mount
 2) Options which are _effective_ for the mount

If we take interpretation 2) and there was no anchor (whatever that
means), then the anchor=N option wasn't effective, and not giving it
would have had the same effect.

This could be confusing to the user, though...

Thanks,
Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 25/26] mount options: fix udf

2008-01-25 Thread Miklos Szeredi
 On Fri 25-01-08 16:50:15, Miklos Szeredi wrote:
 | +   /* is this correct? */
 | +   if (sbi-s_anchor[2] != 0)
 | +   seq_printf(seq, ,anchor=%u, sbi-s_anchor[2]);
 
 you know, I would prefer to use form UDF_SB_ANCHOR(sb)[2]
 in sake of style unification but we should wait for Jan's
 decision (i'm not the expert in this area ;)

I think UDF_SB_ANCHOR macro was removed by some patch in -mm.
 Yes, it's going to be removed so don't use it. Actually, basing this
   patch on top of -mm is a good idea because there are quite some changes
   in Andrew's queue.
   
I'm more interested if the second element of the s_anchor array really
does always have the value of the 'anchor=N' mount option.  I haven't
been able to verify that fully.  Do you have some insight into that?
 As Cyrill wrote, it could be zeroed out in case there is no anchor in
   the specified block. So I guess you have to store the passed value
   somewhere else..
  
  But in that case, would the value of the anchor= option matter?
   No, it would not.
 
  This is actually a somewhat philosophical question about what the
  mount options in /proc/mounts mean:
  
   1) Options _given_ by the user for the mount
   2) Options which are _effective_ for the mount
  
  If we take interpretation 2) and there was no anchor (whatever that
  means), then the anchor=N option wasn't effective, and not giving it
  would have had the same effect.
  
  This could be confusing to the user, though...
   Hmm, given that options are modified by remount for some filesystems,
 it's probably the best to display the effective state. So your code should
 display the right thing as it is.

OK.  Cyrill, Jan, thanks for the reviews.

Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 00/26] mount options: fix filesystem's -show_options

2008-01-24 Thread Miklos Szeredi
Andrew,

Would you please consider these patches for -mm?  They should be
relatively uncontroversial and straightforward fixes.

They touch a lot of filesystems though, so not sure about the
logistics...

For the description, see first patch's header.

Thanks,
Miklos

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 03/26] mount options: fix adfs

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add a .show_options super operation to adfs.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/adfs/super.c
===
--- linux.orig/fs/adfs/super.c  2008-01-24 13:48:43.0 +0100
+++ linux/fs/adfs/super.c   2008-01-24 15:55:26.0 +0100
@@ -20,6 +20,8 @@
 #include linux/vfs.h
 #include linux/parser.h
 #include linux/bitops.h
+#include linux/mount.h
+#include linux/seq_file.h
 
 #include asm/uaccess.h
 #include asm/system.h
@@ -30,6 +32,9 @@
 #include dir_f.h
 #include dir_fplus.h
 
+#define ADFS_DEFAULT_OWNER_MASK S_IRWXU
+#define ADFS_DEFAULT_OTHER_MASK (S_IRWXG | S_IRWXO)
+
 void __adfs_error(struct super_block *sb, const char *function, const char 
*fmt, ...)
 {
char error_buf[128];
@@ -134,6 +139,22 @@ static void adfs_put_super(struct super_
sb-s_fs_info = NULL;
 }
 
+static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
+{
+   struct adfs_sb_info *asb = ADFS_SB(mnt-mnt_sb);
+
+   if (asb-s_uid != 0)
+   seq_printf(seq, ,uid=%u, asb-s_uid);
+   if (asb-s_gid != 0)
+   seq_printf(seq, ,gid=%u, asb-s_gid);
+   if (asb-s_owner_mask != ADFS_DEFAULT_OWNER_MASK)
+   seq_printf(seq, ,ownmask=%o, asb-s_owner_mask);
+   if (asb-s_other_mask != ADFS_DEFAULT_OTHER_MASK)
+   seq_printf(seq, ,othmask=%o, asb-s_other_mask);
+
+   return 0;
+}
+
 enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_err};
 
 static match_table_t tokens = {
@@ -259,6 +280,7 @@ static const struct super_operations adf
.put_super  = adfs_put_super,
.statfs = adfs_statfs,
.remount_fs = adfs_remount,
+   .show_options   = adfs_show_options,
 };
 
 static struct adfs_discmap *adfs_read_map(struct super_block *sb, struct 
adfs_discrecord *dr)
@@ -344,8 +366,8 @@ static int adfs_fill_super(struct super_
/* set default options */
asb-s_uid = 0;
asb-s_gid = 0;
-   asb-s_owner_mask = S_IRWXU;
-   asb-s_other_mask = S_IRWXG | S_IRWXO;
+   asb-s_owner_mask = ADFS_DEFAULT_OWNER_MASK;
+   asb-s_other_mask = ADFS_DEFAULT_OTHER_MASK;
 
if (parse_options(sb, data))
goto error;

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 04/26] mount options: fix affs

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add a .show_options super operation to affs.

Use generic_show_options() and save the complete option string in
affs_fill_super() and affs_remount().

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/affs/super.c
===
--- linux.orig/fs/affs/super.c  2008-01-24 18:57:19.0 +0100
+++ linux/fs/affs/super.c   2008-01-24 19:01:21.0 +0100
@@ -122,6 +122,7 @@ static const struct super_operations aff
.write_super= affs_write_super,
.statfs = affs_statfs,
.remount_fs = affs_remount,
+   .show_options   = generic_show_options,
 };
 
 enum {
@@ -272,6 +273,8 @@ static int affs_fill_super(struct super_
u8   sig[4];
int  ret = -EINVAL;
 
+   save_mount_options(sb, data);
+
pr_debug(AFFS: read_super(%s)\n,data ? (const char *)data : no 
options);
 
sb-s_magic = AFFS_SUPER_MAGIC;
@@ -487,14 +490,21 @@ affs_remount(struct super_block *sb, int
int  root_block;
unsigned longmount_flags;
int  res = 0;
+   char*new_opts = kstrdup(data, GFP_KERNEL);
 
pr_debug(AFFS: remount(flags=0x%x,opts=\%s\)\n,*flags,data);
 
*flags |= MS_NODIRATIME;
 
-   if (!parse_options(data,uid,gid,mode,reserved,root_block,
-   blocksize,sbi-s_prefix,sbi-s_volume,mount_flags))
+   if (!parse_options(data, uid, gid, mode, reserved, root_block,
+  blocksize, sbi-s_prefix, sbi-s_volume,
+  mount_flags)) {
+   kfree(new_opts);
return -EINVAL;
+   }
+   kfree(sb-s_options);
+   sb-s_options = new_opts;
+
sbi-s_flags = mount_flags;
sbi-s_mode  = mode;
sbi-s_uid   = uid;

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 05/26] mount options: fix afs

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add a .show_options super operation to afs.

Use generic_show_options() and save the complete option string in
afs_get_sb().

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/afs/super.c
===
--- linux.orig/fs/afs/super.c   2008-01-24 11:42:44.0 +0100
+++ linux/fs/afs/super.c2008-01-24 12:05:50.0 +0100
@@ -52,6 +52,7 @@ static const struct super_operations afs
.clear_inode= afs_clear_inode,
.umount_begin   = afs_umount_begin,
.put_super  = afs_put_super,
+   .show_options   = generic_show_options,
 };
 
 static struct kmem_cache *afs_inode_cachep;
@@ -357,6 +358,7 @@ static int afs_get_sb(struct file_system
struct super_block *sb;
struct afs_volume *vol;
struct key *key;
+   char *new_opts = kstrdup(options, GFP_KERNEL);
int ret;
 
_enter(,,%s,%p, dev_name, options);
@@ -408,9 +410,11 @@ static int afs_get_sb(struct file_system
deactivate_super(sb);
goto error;
}
+   sb-s_options = new_opts;
sb-s_flags |= MS_ACTIVE;
} else {
_debug(reuse);
+   kfree(new_opts);
ASSERTCMP(sb-s_flags, , MS_ACTIVE);
}
 
@@ -424,6 +428,7 @@ error:
afs_put_volume(params.volume);
afs_put_cell(params.cell);
key_put(params.key);
+   kfree(new_opts);
_leave( = %d, ret);
return ret;
 }

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 06/26] mount options: fix autofs4

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add uid= and gid= options to /proc/mounts for autofs4 filesystems.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/autofs4/inode.c
===
--- linux.orig/fs/autofs4/inode.c   2008-01-22 15:52:42.0 +0100
+++ linux/fs/autofs4/inode.c2008-01-22 23:36:02.0 +0100
@@ -188,11 +188,16 @@ out_kill_sb:
 static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
 {
struct autofs_sb_info *sbi = autofs4_sbi(mnt-mnt_sb);
+   struct inode *root_inode = mnt-mnt_sb-s_root-d_inode;
 
if (!sbi)
return 0;
 
seq_printf(m, ,fd=%d, sbi-pipefd);
+   if (root_inode-i_uid != 0)
+   seq_printf(m, ,uid=%u, root_inode-i_uid);
+   if (root_inode-i_gid != 0)
+   seq_printf(m, ,gid=%u, root_inode-i_gid);
seq_printf(m, ,pgrp=%d, sbi-oz_pgrp);
seq_printf(m, ,timeout=%lu, sbi-exp_timeout/HZ);
seq_printf(m, ,minproto=%d, sbi-min_proto);

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 02/26] mount options: add generic_show_options()

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add a new s_options field to struct super_block.  Filesystems can save
mount options passed to them in mount or remount.  It is automatically
freed when the superblock is destroyed.

A new helper function, generic_show_options() is introduced, which uses
this field to display the mount options in /proc/mounts.

Another helper function, save_mount_options() may be used by
filesystems to save the options in the super block.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/namespace.c
===
--- linux.orig/fs/namespace.c   2008-01-24 17:07:46.0 +0100
+++ linux/fs/namespace.c2008-01-24 17:34:50.0 +0100
@@ -575,6 +575,50 @@ void mnt_unpin(struct vfsmount *mnt)
 
 EXPORT_SYMBOL(mnt_unpin);
 
+static inline void mangle(struct seq_file *m, const char *s)
+{
+   seq_escape(m, s,  \t\n\\);
+}
+
+/*
+ * Simple .show_options callback for filesystems which don't want to
+ * implement more complex mount option showing.
+ *
+ * See also save_mount_options().
+ */
+int generic_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+   const char *options = mnt-mnt_sb-s_options;
+
+   if (options != NULL  options[0]) {
+   seq_putc(m, ',');
+   mangle(m, options);
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL(generic_show_options);
+
+/*
+ * If filesystem uses generic_show_options(), this function should be
+ * called from the fill_super() callback.
+ *
+ * The .remount_fs callback usually needs to be handled in a special
+ * way, to make sure, that previous options are not overwritten if the
+ * remount fails.
+ *
+ * Also note, that if the filesystem's .remount_fs function doesn't
+ * reset all options to their default value, but changes only newly
+ * given options, then the displayed options will not reflect reality
+ * any more.
+ */
+void save_mount_options(struct super_block *sb, char *options)
+{
+   kfree(sb-s_options);
+   sb-s_options = kstrdup(options, GFP_KERNEL);
+}
+EXPORT_SYMBOL(save_mount_options);
+
 /* iterator */
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
@@ -596,11 +640,6 @@ static void m_stop(struct seq_file *m, v
up_read(namespace_sem);
 }
 
-static inline void mangle(struct seq_file *m, const char *s)
-{
-   seq_escape(m, s,  \t\n\\);
-}
-
 static int show_vfsmnt(struct seq_file *m, void *v)
 {
struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
Index: linux/fs/super.c
===
--- linux.orig/fs/super.c   2008-01-24 17:07:46.0 +0100
+++ linux/fs/super.c2008-01-24 17:12:33.0 +0100
@@ -105,6 +105,7 @@ static inline void destroy_super(struct 
 {
security_sb_free(s);
kfree(s-s_subtype);
+   kfree(s-s_options);
kfree(s);
 }
 
Index: linux/include/linux/fs.h
===
--- linux.orig/include/linux/fs.h   2008-01-24 17:07:46.0 +0100
+++ linux/include/linux/fs.h2008-01-24 17:12:33.0 +0100
@@ -1042,6 +1042,12 @@ struct super_block {
 * in /proc/mounts will be type.subtype
 */
char *s_subtype;
+
+   /*
+* Saved mount options for lazy filesystems using
+* generic_show_options()
+*/
+   char *s_options;
 };
 
 extern struct timespec current_fs_time(struct super_block *sb);
@@ -1992,6 +1998,9 @@ extern int __must_check inode_setattr(st
 
 extern void file_update_time(struct file *file);
 
+extern int generic_show_options(struct seq_file *m, struct vfsmount *mnt);
+extern void save_mount_options(struct super_block *sb, char *options);
+
 static inline ino_t parent_ino(struct dentry *dentry)
 {
ino_t res;

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 09/26] mount options: fix capifs

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add a .show_options super operation to capifs.

Use generic_show_options() and save the complete option string in
capifs_remount().

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/drivers/isdn/capi/capifs.c
===
--- linux.orig/drivers/isdn/capi/capifs.c   2007-10-09 22:31:38.0 
+0200
+++ linux/drivers/isdn/capi/capifs.c2008-01-24 11:37:42.0 +0100
@@ -52,6 +52,7 @@ static int capifs_remount(struct super_b
gid_t gid = 0;
umode_t mode = 0600;
char *this_char;
+   char *new_opt = kstrdup(data, GFP_KERNEL);
 
this_char = NULL;
while ((this_char = strsep(data, ,)) != NULL) {
@@ -72,11 +73,16 @@ static int capifs_remount(struct super_b
return -EINVAL;
}
}
+
+   kfree(s-s_options);
+   s-s_options = new_opt;
+
config.setuid  = setuid;
config.setgid  = setgid;
config.uid = uid;
config.gid = gid;
config.mode= mode;
+
return 0;
 }
 
@@ -84,6 +90,7 @@ static struct super_operations capifs_so
 {
.statfs = simple_statfs,
.remount_fs = capifs_remount,
+   .show_options   = generic_show_options,
 };
 
 

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 08/26] mount options: fix befs

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add a .show_options super operation to befs.

Use generic_show_options() and save the complete option string in
befs_fill_super().

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/befs/linuxvfs.c
===
--- linux.orig/fs/befs/linuxvfs.c   2008-01-17 19:00:54.0 +0100
+++ linux/fs/befs/linuxvfs.c2008-01-22 21:40:05.0 +0100
@@ -57,6 +57,7 @@ static const struct super_operations bef
.put_super  = befs_put_super,   /* uninit super */
.statfs = befs_statfs,  /* statfs */
.remount_fs = befs_remount,
+   .show_options   = generic_show_options,
 };
 
 /* slab cache for befs_inode_info objects */
@@ -759,10 +760,11 @@ befs_fill_super(struct super_block *sb, 
befs_super_block *disk_sb;
struct inode *root;
long ret = -EINVAL;
-
const unsigned long sb_block = 0;
const off_t x86_sb_off = 512;
 
+   save_mount_options(sb, data);
+
sb-s_fs_info = kmalloc(sizeof (*befs_sb), GFP_KERNEL);
if (sb-s_fs_info == NULL) {
printk(KERN_ERR

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 10/26] mount options: fix devpts

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add a .show_options super operation to devpts.

Also add a minor fix: when parsing the mode option, mask with
S_IALLUGO instead of ~S_IFMT, which could leave unused bits in the
mask.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/devpts/inode.c
===
--- linux.orig/fs/devpts/inode.c2008-01-22 23:43:12.0 +0100
+++ linux/fs/devpts/inode.c 2008-01-23 13:01:05.0 +0100
@@ -20,9 +20,12 @@
 #include linux/devpts_fs.h
 #include linux/parser.h
 #include linux/fsnotify.h
+#include linux/seq_file.h
 
 #define DEVPTS_SUPER_MAGIC 0x1cd1
 
+#define DEVPTS_DEFAULT_MODE 0600
+
 static struct vfsmount *devpts_mnt;
 static struct dentry *devpts_root;
 
@@ -32,7 +35,7 @@ static struct {
uid_t   uid;
gid_t   gid;
umode_t mode;
-} config = {.mode = 0600};
+} config = {.mode = DEVPTS_DEFAULT_MODE};
 
 enum {
Opt_uid, Opt_gid, Opt_mode,
@@ -54,7 +57,7 @@ static int devpts_remount(struct super_b
config.setgid  = 0;
config.uid = 0;
config.gid = 0;
-   config.mode= 0600;
+   config.mode= DEVPTS_DEFAULT_MODE;
 
while ((p = strsep(data, ,)) != NULL) {
substring_t args[MAX_OPT_ARGS];
@@ -81,7 +84,7 @@ static int devpts_remount(struct super_b
case Opt_mode:
if (match_octal(args[0], option))
return -EINVAL;
-   config.mode = option  ~S_IFMT;
+   config.mode = option  S_IALLUGO;
break;
default:
printk(KERN_ERR devpts: called with bogus options\n);
@@ -92,9 +95,22 @@ static int devpts_remount(struct super_b
return 0;
 }
 
+static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+   if (config.setuid)
+   seq_printf(seq, ,uid=%u, config.uid);
+   if (config.setgid)
+   seq_printf(seq, ,gid=%u, config.gid);
+   if (config.mode != DEVPTS_DEFAULT_MODE)
+   seq_printf(seq, ,mode=%03o, config.mode);
+
+   return 0;
+}
+
 static const struct super_operations devpts_sops = {
.statfs = simple_statfs,
.remount_fs = devpts_remount,
+   .show_options   = devpts_show_options,
 };
 
 static int

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 11/26] mount options: fix ext2

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add noreservation option to /proc/mounts for ext2 filesystems.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/ext2/super.c
===
--- linux.orig/fs/ext2/super.c  2008-01-17 19:00:55.0 +0100
+++ linux/fs/ext2/super.c   2008-01-23 21:38:08.0 +0100
@@ -285,6 +285,9 @@ static int ext2_show_options(struct seq_
seq_puts(seq, ,xip);
 #endif
 
+   if (!test_opt(sb, RESERVATION))
+   seq_puts(seq, ,noreservation);
+
return 0;
 }
 

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 12/26] mount options: fix ext4

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add stripe= option to /proc/mounts for ext4 filesystems.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/ext4/super.c
===
--- linux.orig/fs/ext4/super.c  2008-01-23 12:57:07.0 +0100
+++ linux/fs/ext4/super.c   2008-01-23 21:43:51.0 +0100
@@ -742,7 +742,8 @@ static int ext4_show_options(struct seq_
seq_puts(seq, ,nomballoc);
if (!test_opt(sb, DELALLOC))
seq_puts(seq, ,nodelalloc);
-
+   if (sbi-s_stripe)
+   seq_printf(seq, ,stripe=%lu, sbi-s_stripe);
 
/*
 * journal mode get enabled in different ways

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 01/26] mount options: add documentation

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

This series addresses the problem of showing mount options in
/proc/mounts.

Several filesystems which use mount options, have not implemented a
.show_options superblock operation.  Several others have implemented
this callback, but have not kept it fully up to date with the parsed
options.

Q: Why do we need correct option showing in /proc/mounts?
A: We want /proc/mounts to fully replace /etc/mtab.  The reasons for
   this are:
- unprivileged mounters won't be able to update /etc/mtab
- /etc/mtab doesn't work with private mount namespaces
- /etc/mtab can become out-of-sync with reality

Q: Can't this be done so that filesystems need not bother with
   implementing a .show_options callback and keeping it up to date?
A: Only in some cases.  Certain filesystems allow modification of a
   subset of options in their remount_fs method.  It is not possible
   to take this into account without knowing exactly how the
   filesystem handles options.

For the simple case (no remount or remount resets all options) the
patchset introduces two helpers:

  generic_show_options()
  save_mount_options()

These can also be used to emulate the old /etc/mtab behavior, until
proper support is added.  Even if this is not 100% correct, it's still
better than showing no options at all.

The following patches fix up most in-tree filesystems; they have been
compile-tested only.  I would like to ask maintainers (CC-d on
respective patches) to please review, test and ACK these changes.

The following filesystems still need fixing: CIFS, NFS, XFS, Unionfs,
Reiser4.  For CIFS, NFS and XFS I wasn't able to understand how some
of the options are used.  The last two are not yet in mainline, so I
leave fixing those to their respective maintainers out of pure
laziness.

Table displaying status of all in-kernel filesystems:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
legend:

  none - fs has options, but doesn't define ->show_options()
  some - fs defines ->show_options(), but only some options are shown
  most - fs defines ->show_options(), and shows most of them
  good - fs shows all options
  noopt - fs does not have options
  patch - a patch will be posted

9p              good
adfs            patch
affs            patch
afs             patch
autofs          patch
autofs4         patch
befs            patch
bfs             noopt
cifs            some
coda            noopt
configfs        noopt
cramfs          noopt
debugfs         noopt
devpts          patch
ecryptfs        good
efs             noopt
ext2            patch
ext3            good
ext4            patch
fat             patch
freevxfs        noopt
fuse            patch
fusectl         noopt
gfs2            good
gfs2meta        noopt
hfs             good
hfsplus         good
hostfs          patch
hpfs            patch
hppfs           noopt
hugetlbfs       patch
isofs           patch
jffs2           noopt
jfs             patch
minix           noopt
msdos           -> fat
ncpfs           patch
nfs             patch,most
nfsd            noopt
ntfs            good
ocfs2           good
ocfs2/dlmfs     noopt
openpromfs      noopt
proc            noopt
qnx4            noopt
ramfs           noopt
reiserfs        patch
romfs           noopt
smbfs           good
sysfs           noopt
sysv            noopt
udf             patch
ufs             good
vfat            -> fat
xfs             most

mm/shmem.c                                patch
drivers/oprofile/oprofilefs.c             noopt
drivers/infiniband/hw/ipath/ipath_fs.c    noopt
drivers/misc/ibmasm/ibmasmfs.c            noopt
drivers/usb/core (usbfs)                  patch
drivers/usb/gadget (gadgetfs)             noopt
drivers/isdn/capi/capifs.c                patch
kernel/cpuset.c                           noopt
fs/binfmt_misc.c                          noopt
net/sunrpc/rpc_pipe.c                     noopt
arch/powerpc/platforms/cell/spufs         patch
arch/s390/hypfs                           good
ipc/mqueue.c                              noopt
security (securityfs)                     noopt
security/selinux/selinuxfs.c              noopt
kernel/cgroup.c                           good
security/smack/smackfs.c                  noopt

in -mm:

reiser4 some
unionfs none
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

This patch:

Document the rules for handling mount options in the .show_options
super operation.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/Documentation/filesystems/vfs.txt
===
--- linux.orig/Documentation/filesystems/vfs.txt2008-01-24 
11:42:48.0 +0100
+++ linux/Documentation/filesystems/vfs.txt 2008-01-24 17:12:25.0 
+0100
@@ -151,7 +151,7 @@ The get_sb() method has the following ar
   const char *dev_name: the device name we are mounting.
 
   void *data: arbitrary mount options, usually comes as an ASCII
-   string
+   string (see Mount Options section)
 
   struct vfsmount *mnt: a vfs-internal representation of a mount point
 
@@ -182,7

[patch 14/26] mount options: fix fuse

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add blksize= option to /proc/mounts for fuseblk filesystems.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/fuse/inode.c
===
--- linux.orig/fs/fuse/inode.c  2008-01-19 11:56:34.0 +0100
+++ linux/fs/fuse/inode.c   2008-01-21 17:53:06.0 +0100
@@ -29,6 +29,8 @@ DEFINE_MUTEX(fuse_mutex);
 
 #define FUSE_SUPER_MAGIC 0x65735546
 
+#define FUSE_DEFAULT_BLKSIZE 512
+
 struct fuse_mount_data {
int fd;
unsigned rootmode;
@@ -355,7 +357,7 @@ static int parse_fuse_opt(char *opt, str
char *p;
memset(d, 0, sizeof(struct fuse_mount_data));
d-max_read = ~0;
-   d-blksize = 512;
+   d-blksize = FUSE_DEFAULT_BLKSIZE;
 
while ((p = strsep(opt, ,)) != NULL) {
int token;
@@ -440,6 +442,9 @@ static int fuse_show_options(struct seq_
seq_puts(m, ,allow_other);
if (fc-max_read != ~0)
seq_printf(m, ,max_read=%u, fc-max_read);
+   if (mnt-mnt_sb-s_bdev 
+   mnt-mnt_sb-s_blocksize != FUSE_DEFAULT_BLKSIZE)
+   seq_printf(m, ,blksize=%lu, mnt-mnt_sb-s_blocksize);
return 0;
 }
 

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 15/26] mount options: fix hostfs

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add the host path option to /proc/mounts for UML hostfs filesystems.

The mount source (mnt_devname) should really be used for this, but that
is not easy to change now in a backward-compatible way.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/hostfs/hostfs_kern.c
===
--- linux.orig/fs/hostfs/hostfs_kern.c  2008-01-17 19:00:55.0 +0100
+++ linux/fs/hostfs/hostfs_kern.c   2008-01-21 19:19:55.0 +0100
@@ -11,6 +11,7 @@
 #include linux/mm.h
 #include linux/pagemap.h
 #include linux/statfs.h
+#include linux/seq_file.h
 #include hostfs.h
 #include init.h
 #include kern.h
@@ -322,12 +323,25 @@ static void hostfs_destroy_inode(struct 
kfree(HOSTFS_I(inode));
 }
 
+static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+   struct inode *root = vfs-mnt_sb-s_root-d_inode;
+   const char *root_path = HOSTFS_I(root)-host_filename;
+   size_t offset = strlen(root_ino) + 1;
+
+   if (strlen(root_path)  offset)
+   seq_printf(seq, ,%s, root_path + offset);
+
+   return 0;
+}
+
 static const struct super_operations hostfs_sbops = {
.alloc_inode= hostfs_alloc_inode,
.drop_inode = generic_delete_inode,
.delete_inode   = hostfs_delete_inode,
.destroy_inode  = hostfs_destroy_inode,
.statfs = hostfs_statfs,
+   .show_options   = hostfs_show_options,
 };
 
 int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 16/26] mount options: fix hpfs

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add a .show_options super operation to hpfs.

Use generic_show_options() and save the complete option string in
hpfs_fill_super() and hpfs_remount_fs().

Also add a small fix: hpfs_remount_fs() should return -EINVAL on
error, instead of 1, which is not an error value.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/hpfs/super.c
===
--- linux.orig/fs/hpfs/super.c  2008-01-17 19:00:14.0 +0100
+++ linux/fs/hpfs/super.c   2008-01-23 23:36:53.0 +0100
@@ -386,6 +386,7 @@ static int hpfs_remount_fs(struct super_
int lowercase, conv, eas, chk, errs, chkdsk, timeshift;
int o;
struct hpfs_sb_info *sbi = hpfs_sb(s);
+   char *new_opts = kstrdup(data, GFP_KERNEL);

*flags |= MS_NOATIME;

@@ -398,15 +399,15 @@ static int hpfs_remount_fs(struct super_
if (!(o = parse_opts(data, uid, gid, umask, lowercase, conv,
eas, chk, errs, chkdsk, timeshift))) {
printk(HPFS: bad mount options.\n);
-   return 1;
+   goto out_err;
}
if (o == 2) {
hpfs_help();
-   return 1;
+   goto out_err;
}
if (timeshift != sbi-sb_timeshift) {
printk(HPFS: timeshift can't be changed using remount.\n);
-   return 1;
+   goto out_err;
}
 
unmark_dirty(s);
@@ -419,7 +420,14 @@ static int hpfs_remount_fs(struct super_
 
if (!(*flags  MS_RDONLY)) mark_dirty(s);
 
+   kfree(s-s_options);
+   s-s_options = new_opts;
+
return 0;
+
+out_err:
+   kfree(new_opts);
+   return -EINVAL;
 }
 
 /* Super operations */
@@ -432,6 +440,7 @@ static const struct super_operations hpf
.put_super  = hpfs_put_super,
.statfs = hpfs_statfs,
.remount_fs = hpfs_remount_fs,
+   .show_options   = generic_show_options,
 };
 
 static int hpfs_fill_super(struct super_block *s, void *options, int silent)
@@ -454,6 +463,8 @@ static int hpfs_fill_super(struct super_
 
int o;
 
+   save_mount_options(s, options);
+
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
return -ENOMEM;

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 18/26] mount options: fix isofs

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add a .show_options super operation to isofs.

Use generic_show_options() and save the complete option string in
isofs_fill_super().

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/isofs/inode.c
===
--- linux.orig/fs/isofs/inode.c 2008-01-17 19:00:55.0 +0100
+++ linux/fs/isofs/inode.c  2008-01-23 22:07:51.0 +0100
@@ -110,6 +110,7 @@ static const struct super_operations iso
.put_super  = isofs_put_super,
.statfs = isofs_statfs,
.remount_fs = isofs_remount,
+   .show_options   = generic_show_options,
 };
 
 
@@ -554,6 +555,8 @@ static int isofs_fill_super(struct super
int table, error = -EINVAL;
unsigned int vol_desc_start;
 
+   save_mount_options(s, data);
+
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
return -ENOMEM;

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 21/26] mount options: partially fix nfs

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add posix, bsize=, namelen= options to /proc/mounts for nfs
filesystems.

Document several other options that are still missing.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/nfs/super.c
===
--- linux.orig/fs/nfs/super.c   2008-01-19 11:56:34.0 +0100
+++ linux/fs/nfs/super.c2008-01-21 20:41:30.0 +0100
@@ -449,6 +449,7 @@ static void nfs_show_mount_options(struc
} nfs_info[] = {
{ NFS_MOUNT_SOFT, ,soft, ,hard },
{ NFS_MOUNT_INTR, ,intr, ,nointr },
+   { NFS_MOUNT_POSIX, ,posix,  },
{ NFS_MOUNT_NOCTO, ,nocto,  },
{ NFS_MOUNT_NOAC, ,noac,  },
{ NFS_MOUNT_NONLM, ,nolock,  },
@@ -459,10 +460,17 @@ static void nfs_show_mount_options(struc
};
const struct proc_nfs_info *nfs_infop;
struct nfs_client *clp = nfss-nfs_client;
+   unsigned int default_namelen =
+   clp-rpc_ops-version == 4 ? NFS4_MAXNAMLEN :
+   clp-rpc_ops-version == 3 ? NFS3_MAXNAMLEN : NFS2_MAXNAMLEN;
 
seq_printf(m, ,vers=%d, clp-rpc_ops-version);
seq_printf(m, ,rsize=%d, nfss-rsize);
seq_printf(m, ,wsize=%d, nfss-wsize);
+   if (nfss-bsize != 0)
+   seq_printf(m, ,bsize=%d, nfss-bsize);
+   if (nfss-namelen != default_namelen)
+   seq_printf(m, ,namelen=%d, nfss-namelen);
if (nfss-acregmin != 3*HZ || showdefaults)
seq_printf(m, ,acregmin=%d, nfss-acregmin/HZ);
if (nfss-acregmax != 60*HZ || showdefaults)
@@ -482,6 +490,18 @@ static void nfs_show_mount_options(struc
seq_printf(m, ,timeo=%lu, 10U * nfss-client-cl_timeout-to_initval 
/ HZ);
seq_printf(m, ,retrans=%u, nfss-client-cl_timeout-to_retries);
seq_printf(m, ,sec=%s, 
nfs_pseudoflavour_to_name(nfss-client-cl_auth-au_flavor));
+
+   /*
+* Missing options:
+* port=
+* mountport=
+* mountvers=
+* mountproto=
+* addr=
+* clientaddr=
+* mounthost=
+* mountaddr=
+*/
 }
 
 /*

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 23/26] mount options: fix spufs

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add a .show_options super operation to spufs.

Use generic_show_options() and save the complete option string in
spufs_fill_super().

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/arch/powerpc/platforms/cell/spufs/inode.c
===
--- linux.orig/arch/powerpc/platforms/cell/spufs/inode.c2008-01-17 
19:00:52.0 +0100
+++ linux/arch/powerpc/platforms/cell/spufs/inode.c 2008-01-23 
23:44:36.0 +0100
@@ -744,8 +744,11 @@ spufs_fill_super(struct super_block *sb,
.statfs = simple_statfs,
.delete_inode = spufs_delete_inode,
.drop_inode = generic_delete_inode,
+   .show_options = generic_show_options,
};
 
+   save_mount_options(sb, data);
+
sb-s_maxbytes = MAX_LFS_FILESIZE;
sb-s_blocksize = PAGE_CACHE_SIZE;
sb-s_blocksize_bits = PAGE_CACHE_SHIFT;

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 07/26] mount options: fix autofs

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add a .show_options super operation to autofs.

Use generic_show_options() and save the complete option string in
autofs_fill_super().

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/autofs/inode.c
===
--- linux.orig/fs/autofs/inode.c2008-01-17 19:00:54.0 +0100
+++ linux/fs/autofs/inode.c 2008-01-24 11:16:30.0 +0100
@@ -54,6 +54,7 @@ out_kill_sb:
 
 static const struct super_operations autofs_sops = {
.statfs = simple_statfs,
+   .show_options   = generic_show_options,
 };
 
 enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto};
@@ -140,6 +141,8 @@ int autofs_fill_super(struct super_block
int minproto, maxproto;
pid_t pgid;
 
+   save_mount_options(s, data);
+
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
goto fail_unlock;

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 24/26] mount options: fix tmpfs

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add .show_options super operation to tmpfs.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/mm/shmem.c
===
--- linux.orig/mm/shmem.c   2008-01-21 21:20:04.0 +0100
+++ linux/mm/shmem.c2008-01-21 21:30:04.0 +0100
@@ -49,6 +49,7 @@
 #include linux/ctype.h
 #include linux/migrate.h
 #include linux/highmem.h
+#include linux/seq_file.h
 
 #include asm/uaccess.h
 #include asm/div64.h
@@ -198,7 +199,7 @@ static DEFINE_MUTEX(shmem_swaplist_mutex
 static void shmem_free_blocks(struct inode *inode, long pages)
 {
struct shmem_sb_info *sbinfo = SHMEM_SB(inode-i_sb);
-   if (sbinfo-max_blocks) {
+   if (sbinfo-config.max_blocks) {
spin_lock(sbinfo-stat_lock);
sbinfo-free_blocks += pages;
inode-i_blocks -= pages*BLOCKS_PER_PAGE;
@@ -209,7 +210,7 @@ static void shmem_free_blocks(struct ino
 static int shmem_reserve_inode(struct super_block *sb)
 {
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
-   if (sbinfo-max_inodes) {
+   if (sbinfo-config.max_inodes) {
spin_lock(sbinfo-stat_lock);
if (!sbinfo-free_inodes) {
spin_unlock(sbinfo-stat_lock);
@@ -224,7 +225,7 @@ static int shmem_reserve_inode(struct su
 static void shmem_free_inode(struct super_block *sb)
 {
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
-   if (sbinfo-max_inodes) {
+   if (sbinfo-config.max_inodes) {
spin_lock(sbinfo-stat_lock);
sbinfo-free_inodes++;
spin_unlock(sbinfo-stat_lock);
@@ -388,7 +389,7 @@ static swp_entry_t *shmem_swp_alloc(stru
 * page (and perhaps indirect index pages) yet to allocate:
 * a waste to allocate index if we cannot allocate data.
 */
-   if (sbinfo-max_blocks) {
+   if (sbinfo-config.max_blocks) {
spin_lock(sbinfo-stat_lock);
if (sbinfo-free_blocks = 1) {
spin_unlock(sbinfo-stat_lock);
@@ -1338,7 +1339,7 @@ repeat:
} else {
shmem_swp_unmap(entry);
sbinfo = SHMEM_SB(inode-i_sb);
-   if (sbinfo-max_blocks) {
+   if (sbinfo-config.max_blocks) {
spin_lock(sbinfo-stat_lock);
if (sbinfo-free_blocks == 0 ||
shmem_acct_block(info-flags)) {
@@ -1519,8 +1520,9 @@ shmem_get_inode(struct super_block *sb, 
case S_IFREG:
inode-i_op = shmem_inode_operations;
inode-i_fop = shmem_file_operations;
-   mpol_shared_policy_init(info-policy, sbinfo-policy,
-   sbinfo-policy_nodes);
+   mpol_shared_policy_init(info-policy,
+   sbinfo-config.policy,
+   sbinfo-config.policy_nodes);
break;
case S_IFDIR:
inc_nlink(inode);
@@ -1720,12 +1722,12 @@ static int shmem_statfs(struct dentry *d
buf-f_bsize = PAGE_CACHE_SIZE;
buf-f_namelen = NAME_MAX;
spin_lock(sbinfo-stat_lock);
-   if (sbinfo-max_blocks) {
-   buf-f_blocks = sbinfo-max_blocks;
+   if (sbinfo-config.max_blocks) {
+   buf-f_blocks = sbinfo-config.max_blocks;
buf-f_bavail = buf-f_bfree = sbinfo-free_blocks;
}
-   if (sbinfo-max_inodes) {
-   buf-f_files = sbinfo-max_inodes;
+   if (sbinfo-config.max_inodes) {
+   buf-f_files = sbinfo-config.max_inodes;
buf-f_ffree = sbinfo-free_inodes;
}
/* else leave those fields 0 like simple_statfs */
@@ -2077,9 +2079,8 @@ static const struct export_operations sh
.fh_to_dentry   = shmem_fh_to_dentry,
 };
 
-static int shmem_parse_options(char *options, int *mode, uid_t *uid,
-   gid_t *gid, unsigned long *blocks, unsigned long *inodes,
-   int *policy, nodemask_t *policy_nodes)
+static int shmem_parse_options(char *options, struct shmem_config *config,
+  bool remount)
 {
char *this_char, *value, *rest;
 
@@ -2122,35 +2123,43 @@ static int shmem_parse_options(char *opt
}
if (*rest)
goto bad_val;
-   *blocks = DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
+   config-max_blocks =
+   DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
+   config-max_blocks_changed = 1;
} else if (!strcmp(this_char,nr_blocks)) {
-   *blocks = memparse(value,rest

[patch 25/26] mount options: fix udf

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add a .show_options super operation to udf.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/udf/super.c
===
--- linux.orig/fs/udf/super.c   2008-01-24 13:48:37.0 +0100
+++ linux/fs/udf/super.c2008-01-24 15:58:21.0 +0100
@@ -53,6 +53,8 @@
 #include linux/vfs.h
 #include linux/vmalloc.h
 #include linux/errno.h
+#include linux/mount.h
+#include linux/seq_file.h
 #include asm/byteorder.h
 
 #include linux/udf_fs.h
@@ -71,6 +73,8 @@
 #define VDS_POS_TERMINATING_DESC   6
 #define VDS_POS_LENGTH 7
 
+#define UDF_DEFAULT_BLOCKSIZE 2048
+
 static char error_buf[1024];
 
 /* These are the meat - everything else is stuffing */
@@ -95,6 +99,7 @@ static void udf_open_lvid(struct super_b
 static void udf_close_lvid(struct super_block *);
 static unsigned int udf_count_free(struct super_block *);
 static int udf_statfs(struct dentry *, struct kstatfs *);
+static int udf_show_options(struct seq_file *, struct vfsmount *);
 
 struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi)
 {
@@ -181,6 +186,7 @@ static const struct super_operations udf
.write_super= udf_write_super,
.statfs = udf_statfs,
.remount_fs = udf_remount_fs,
+   .show_options   = udf_show_options,
 };
 
 struct udf_options {
@@ -247,6 +253,56 @@ static int udf_sb_alloc_partition_maps(s
return 0;
 }
 
+static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
+{
+   struct super_block *sb = mnt-mnt_sb;
+   struct udf_sb_info *sbi = UDF_SB(sb);
+
+   if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT))
+   seq_puts(seq, ,nostrict);
+   if (sb-s_blocksize != UDF_DEFAULT_BLOCKSIZE)
+   seq_printf(seq, ,bs=%lu, sb-s_blocksize);
+   if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE))
+   seq_puts(seq, ,unhide);
+   if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE))
+   seq_puts(seq, ,undelete);
+   if (!UDF_QUERY_FLAG(sb, UDF_FLAG_USE_AD_IN_ICB))
+   seq_puts(seq, ,noadinicb);
+   if (UDF_QUERY_FLAG(sb, UDF_FLAG_USE_SHORT_AD))
+   seq_puts(seq, ,shortad);
+   if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_FORGET))
+   seq_puts(seq, ,uid=forget);
+   if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_IGNORE))
+   seq_puts(seq, ,uid=ignore);
+   if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_FORGET))
+   seq_puts(seq, ,gid=forget);
+   if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_IGNORE))
+   seq_puts(seq, ,gid=ignore);
+   if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET))
+   seq_printf(seq, ,uid=%u, sbi-s_uid);
+   if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET))
+   seq_printf(seq, ,gid=%u, sbi-s_gid);
+   if (sbi-s_umask != 0)
+   seq_printf(seq, ,umask=%o, sbi-s_umask);
+   if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET))
+   seq_printf(seq, ,session=%u, sbi-s_session);
+   if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET))
+   seq_printf(seq, ,lastblock=%u, sbi-s_last_block);
+   /* is this correct? */
+   if (sbi-s_anchor[2] != 0)
+   seq_printf(seq, ,anchor=%u, sbi-s_anchor[2]);
+   /*
+* volume, partition, fileset and rootdir seem to be ignored
+* currently
+*/
+   if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8))
+   seq_puts(seq, ,utf8);
+   if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)  sbi-s_nls_map)
+   seq_printf(seq, ,iocharset=%s, sbi-s_nls_map-charset);
+
+   return 0;
+}
+
 /*
  * udf_parse_options
  *
@@ -339,13 +395,14 @@ static match_table_t tokens = {
{Opt_err,   NULL}
 };
 
-static int udf_parse_options(char *options, struct udf_options *uopt)
+static int udf_parse_options(char *options, struct udf_options *uopt,
+bool remount)
 {
char *p;
int option;
 
uopt-novrs = 0;
-   uopt-blocksize = 2048;
+   uopt-blocksize = UDF_DEFAULT_BLOCKSIZE;
uopt-partition = 0x;
uopt-session = 0x;
uopt-lastblock = 0;
@@ -415,11 +472,15 @@ static int udf_parse_options(char *optio
if (match_int(args, option))
return 0;
uopt-session = option;
+   if (!remount)
+   uopt-flags |= (1  UDF_FLAG_SESSION_SET);
break;
case Opt_lastblock:
if (match_int(args, option))
return 0;
uopt-lastblock = option;
+   if (!remount)
+   uopt-flags |= (1  UDF_FLAG_LASTBLOCK_SET);
break;
case Opt_anchor:
if (match_int(args, option))
@@ -497,7 +558,7 @@ static

[patch 26/26] mount options: fix usbfs

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add a .show_options super operation to usbfs.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/drivers/usb/core/inode.c
===
--- linux.orig/drivers/usb/core/inode.c 2008-01-24 13:48:37.0 +0100
+++ linux/drivers/usb/core/inode.c  2008-01-24 16:00:03.0 +0100
@@ -38,10 +38,15 @@
 #include linux/usbdevice_fs.h
 #include linux/parser.h
 #include linux/notifier.h
+#include linux/seq_file.h
 #include asm/byteorder.h
 #include usb.h
 #include hcd.h
 
+#define USBFS_DEFAULT_DEVMODE (S_IWUSR | S_IRUGO)
+#define USBFS_DEFAULT_BUSMODE (S_IXUGO | S_IRUGO)
+#define USBFS_DEFAULT_LISTMODE S_IRUGO
+
 static struct super_operations usbfs_ops;
 static const struct file_operations default_file_operations;
 static struct vfsmount *usbfs_mount;
@@ -57,9 +62,33 @@ static uid_t listuid;/* = 0 */
 static gid_t devgid;   /* = 0 */
 static gid_t busgid;   /* = 0 */
 static gid_t listgid;  /* = 0 */
-static umode_t devmode = S_IWUSR | S_IRUGO;
-static umode_t busmode = S_IXUGO | S_IRUGO;
-static umode_t listmode = S_IRUGO;
+static umode_t devmode = USBFS_DEFAULT_DEVMODE;
+static umode_t busmode = USBFS_DEFAULT_BUSMODE;
+static umode_t listmode = USBFS_DEFAULT_LISTMODE;
+
+static int usbfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
+{
+   if (devuid != 0)
+   seq_printf(seq, ,devuid=%u, devuid);
+   if (devgid != 0)
+   seq_printf(seq, ,devgid=%u, devgid);
+   if (devmode != USBFS_DEFAULT_DEVMODE)
+   seq_printf(seq, ,devmode=%o, devmode);
+   if (busuid != 0)
+   seq_printf(seq, ,busuid=%u, busuid);
+   if (busgid != 0)
+   seq_printf(seq, ,busgid=%u, busgid);
+   if (busmode != USBFS_DEFAULT_BUSMODE)
+   seq_printf(seq, ,busmode=%o, busmode);
+   if (listuid != 0)
+   seq_printf(seq, ,listuid=%u, listuid);
+   if (listgid != 0)
+   seq_printf(seq, ,listgid=%u, listgid);
+   if (listmode != USBFS_DEFAULT_LISTMODE)
+   seq_printf(seq, ,listmode=%o, listmode);
+
+   return 0;
+}
 
 enum {
Opt_devuid, Opt_devgid, Opt_devmode,
@@ -93,9 +122,9 @@ static int parse_options(struct super_bl
devgid = 0;
busgid = 0;
listgid = 0;
-   devmode = S_IWUSR | S_IRUGO;
-   busmode = S_IXUGO | S_IRUGO;
-   listmode = S_IRUGO;
+   devmode = USBFS_DEFAULT_DEVMODE;
+   busmode = USBFS_DEFAULT_BUSMODE;
+   listmode = USBFS_DEFAULT_LISTMODE;
 
while ((p = strsep(data, ,)) != NULL) {
substring_t args[MAX_OPT_ARGS];
@@ -418,6 +447,7 @@ static struct super_operations usbfs_ops
.statfs =   simple_statfs,
.drop_inode =   generic_delete_inode,
.remount_fs =   remount,
+   .show_options = usbfs_show_options,
 };
 
 static int usbfs_fill_super(struct super_block *sb, void *data, int silent)

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 22/26] mount options: fix reiserfs

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add a .show_options super operation to reiserfs.

Use generic_show_options() and save the complete option string in
reiserfs_fill_super() and reiserfs_remount().

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/reiserfs/super.c
===
--- linux.orig/fs/reiserfs/super.c  2008-01-17 19:00:55.0 +0100
+++ linux/fs/reiserfs/super.c   2008-01-22 21:20:33.0 +0100
@@ -617,6 +617,7 @@ static const struct super_operations rei
.unlockfs = reiserfs_unlockfs,
.statfs = reiserfs_statfs,
.remount_fs = reiserfs_remount,
+   .show_options = generic_show_options,
 #ifdef CONFIG_QUOTA
.quota_read = reiserfs_quota_read,
.quota_write = reiserfs_quota_write,
@@ -1138,6 +1139,7 @@ static int reiserfs_remount(struct super
unsigned long safe_mask = 0;
unsigned int commit_max_age = (unsigned int)-1;
struct reiserfs_journal *journal = SB_JOURNAL(s);
+   char *new_opts = kstrdup(arg, GFP_KERNEL);
int err;
 #ifdef CONFIG_QUOTA
int i;
@@ -1153,7 +1155,8 @@ static int reiserfs_remount(struct super
REISERFS_SB(s)-s_qf_names[i] = NULL;
}
 #endif
-   return -EINVAL;
+   err = -EINVAL;
+   goto out_err;
}
 
handle_attrs(s);
@@ -1191,9 +1194,9 @@ static int reiserfs_remount(struct super
}
 
if (blocks) {
-   int rc = reiserfs_resize(s, blocks);
-   if (rc != 0)
-   return rc;
+   err = reiserfs_resize(s, blocks);
+   if (err != 0)
+   goto out_err;
}
 
if (*mount_flags  MS_RDONLY) {
@@ -1201,16 +1204,16 @@ static int reiserfs_remount(struct super
/* remount read-only */
if (s-s_flags  MS_RDONLY)
/* it is read-only already */
-   return 0;
+   goto out_ok;
/* try to remount file system with read-only permissions */
if (sb_umount_state(rs) == REISERFS_VALID_FS
|| REISERFS_SB(s)-s_mount_state != REISERFS_VALID_FS) {
-   return 0;
+   goto out_ok;
}
 
err = journal_begin(th, s, 10);
if (err)
-   return err;
+   goto out_err;
 
/* Mounting a rw partition read-only. */
reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
@@ -1220,11 +1223,13 @@ static int reiserfs_remount(struct super
/* remount read-write */
if (!(s-s_flags  MS_RDONLY)) {
reiserfs_xattr_init(s, *mount_flags);
-   return 0;   /* We are read-write already */
+   goto out_ok;/* We are read-write already */
}
 
-   if (reiserfs_is_journal_aborted(journal))
-   return journal-j_errno;
+   if (reiserfs_is_journal_aborted(journal)) {
+   err = journal-j_errno;
+   goto out_err;
+   }
 
handle_data_mode(s, mount_options);
handle_barrier_mode(s, mount_options);
@@ -1232,7 +1237,7 @@ static int reiserfs_remount(struct super
s-s_flags = ~MS_RDONLY;   /* now it is safe to call 
journal_begin */
err = journal_begin(th, s, 10);
if (err)
-   return err;
+   goto out_err;
 
/* Mount a partition which is read-only, read-write */
reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
@@ -1247,7 +1252,7 @@ static int reiserfs_remount(struct super
SB_JOURNAL(s)-j_must_wait = 1;
err = journal_end(th, s, 10);
if (err)
-   return err;
+   goto out_err;
s-s_dirt = 0;
 
if (!(*mount_flags  MS_RDONLY)) {
@@ -1255,7 +1260,14 @@ static int reiserfs_remount(struct super
reiserfs_xattr_init(s, *mount_flags);
}
 
+out_ok:
+   kfree(s-s_options);
+   s-s_options = new_opts;
return 0;
+
+out_err:
+   kfree(new_opts);
+   return err;
 }
 
 static int read_super_block(struct super_block *s, int offset)
@@ -1559,6 +1571,8 @@ static int reiserfs_fill_super(struct su
struct reiserfs_sb_info *sbi;
int errval = -EINVAL;
 
+   save_mount_options(s, data);
+
sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
if (!sbi) {
errval = -ENOMEM;

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 20/26] mount options: fix ncpfs

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add a .show_options super operation to ncpfs.

Small fix: add FS_BINARY_MOUNTDATA to the filesystem type flags, since
it can take binary data, as well as text (similarly to NFS).

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/ncpfs/inode.c
===
--- linux.orig/fs/ncpfs/inode.c 2008-01-24 13:48:37.0 +0100
+++ linux/fs/ncpfs/inode.c  2008-01-24 15:57:17.0 +0100
@@ -28,6 +28,8 @@
 #include linux/init.h
 #include linux/smp_lock.h
 #include linux/vfs.h
+#include linux/mount.h
+#include linux/seq_file.h
 
 #include linux/ncp_fs.h
 
@@ -36,9 +38,15 @@
 #include ncplib_kernel.h
 #include getopt.h
 
+#define NCP_DEFAULT_FILE_MODE 0600
+#define NCP_DEFAULT_DIR_MODE 0700
+#define NCP_DEFAULT_TIME_OUT 10
+#define NCP_DEFAULT_RETRY_COUNT 20
+
 static void ncp_delete_inode(struct inode *);
 static void ncp_put_super(struct super_block *);
 static int  ncp_statfs(struct dentry *, struct kstatfs *);
+static int  ncp_show_options(struct seq_file *, struct vfsmount *);
 
 static struct kmem_cache * ncp_inode_cachep;
 
@@ -96,6 +104,7 @@ static const struct super_operations ncp
.put_super  = ncp_put_super,
.statfs = ncp_statfs,
.remount_fs = ncp_remount,
+   .show_options   = ncp_show_options,
 };
 
 extern struct dentry_operations ncp_root_dentry_operations;
@@ -304,6 +313,37 @@ static void ncp_stop_tasks(struct ncp_se
flush_scheduled_work();
 }
 
+static int  ncp_show_options(struct seq_file *seq, struct vfsmount *mnt)
+{
+   struct ncp_server *server = NCP_SBP(mnt-mnt_sb);
+   unsigned int tmp;
+
+   if (server-m.uid != 0)
+   seq_printf(seq, ,uid=%u, server-m.uid);
+   if (server-m.gid != 0)
+   seq_printf(seq, ,gid=%u, server-m.gid);
+   if (server-m.mounted_uid != 0)
+   seq_printf(seq, ,owner=%u, server-m.mounted_uid);
+   tmp = server-m.file_mode  S_IALLUGO;
+   if (tmp != NCP_DEFAULT_FILE_MODE)
+   seq_printf(seq, ,mode=0%o, tmp);
+   tmp = server-m.dir_mode  S_IALLUGO;
+   if (tmp != NCP_DEFAULT_DIR_MODE)
+   seq_printf(seq, ,dirmode=0%o, tmp);
+   if (server-m.time_out != NCP_DEFAULT_TIME_OUT * HZ / 100) {
+   tmp = server-m.time_out * 100 / HZ;
+   seq_printf(seq, ,timeout=%u, tmp);
+   }
+   if (server-m.retry_count != NCP_DEFAULT_RETRY_COUNT)
+   seq_printf(seq, ,retry=%u, server-m.retry_count);
+   if (server-m.flags != 0)
+   seq_printf(seq, ,flags=%lu, server-m.flags);
+   if (server-m.wdog_pid != NULL)
+   seq_printf(seq, ,wdogpid=%u, pid_vnr(server-m.wdog_pid));
+
+   return 0;
+}
+
 static const struct ncp_option ncp_opts[] = {
{ uid,OPT_INT,'u' },
{ gid,OPT_INT,'g' },
@@ -331,12 +371,12 @@ static int ncp_parse_options(struct ncp_
data-mounted_uid = 0;
data-wdog_pid = NULL;
data-ncp_fd = ~0;
-   data-time_out = 10;
-   data-retry_count = 20;
+   data-time_out = NCP_DEFAULT_TIME_OUT;
+   data-retry_count = NCP_DEFAULT_RETRY_COUNT;
data-uid = 0;
data-gid = 0;
-   data-file_mode = 0600;
-   data-dir_mode = 0700;
+   data-file_mode = NCP_DEFAULT_FILE_MODE;
+   data-dir_mode = NCP_DEFAULT_DIR_MODE;
data-info_fd = -1;
data-mounted_vol[0] = 0;

@@ -982,6 +1022,7 @@ static struct file_system_type ncp_fs_ty
.name   = ncpfs,
.get_sb = ncp_get_sb,
.kill_sb= kill_anon_super,
+   .fs_flags   = FS_BINARY_MOUNTDATA,
 };
 
 static int __init init_ncp_fs(void)

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 19/26] mount options: fix jfs

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add iocharset= and errors= options to /proc/mounts for jfs
filesystems.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/jfs/super.c
===
--- linux.orig/fs/jfs/super.c   2008-01-17 19:00:55.0 +0100
+++ linux/fs/jfs/super.c2008-01-21 19:39:30.0 +0100
@@ -602,6 +602,12 @@ static int jfs_show_options(struct seq_f
seq_printf(seq, ,umask=%03o, sbi-umask);
if (sbi-flag  JFS_NOINTEGRITY)
seq_puts(seq, ,nointegrity);
+   if (sbi-nls_tab)
+   seq_printf(seq, ,iocharset=%s, sbi-nls_tab-charset);
+   if (sbi-flag  JFS_ERR_CONTINUE)
+   seq_printf(seq, ,errors=continue);
+   if (sbi-flag  JFS_ERR_PANIC)
+   seq_printf(seq, ,errors=panic);
 
 #ifdef CONFIG_QUOTA
if (sbi-flag  JFS_USRQUOTA)

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 17/26] mount options: fix hugetlbfs

2008-01-24 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add a .show_options super operation to hugetlbfs.

Use generic_show_options() and save the complete option string in
hugetlbfs_fill_super().

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/hugetlbfs/inode.c
===
--- linux.orig/fs/hugetlbfs/inode.c 2008-01-22 21:31:53.0 +0100
+++ linux/fs/hugetlbfs/inode.c  2008-01-22 21:32:20.0 +0100
@@ -734,6 +734,7 @@ static const struct super_operations hug
.delete_inode   = hugetlbfs_delete_inode,
.drop_inode = hugetlbfs_drop_inode,
.put_super  = hugetlbfs_put_super,
+   .show_options   = generic_show_options,
 };
 
 static int
@@ -817,6 +818,8 @@ hugetlbfs_fill_super(struct super_block 
struct hugetlbfs_config config;
struct hugetlbfs_sb_info *sbinfo;
 
+   save_mount_options(sb, data);
+
config.nr_blocks = -1; /* No limit on size by default */
config.nr_inodes = -1; /* No limit on number of inodes by default */
config.uid = current-fsuid;

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] VFS: create /proc/pid/mountinfo

2008-01-23 Thread Miklos Szeredi
 Pavel Machek wrote:
  On Sun 2008-01-20 09:23:00, Miklos Szeredi wrote:
  Miklos Szeredi wrote:
  - for mount ID's use IDA (from the IDR library) instead of a 32bit
counter, which could overflow
  IDAs tend to get reused quickly, which can cause race conditions.  Any 
  reason not to just use a 64-bit counter?
  They tend to become hard to parse/compare for humans after a while.
  And all this is basically only for humans, so race conditions don't
  really matter.  Also a changed mount with a reused ID is easily
  identified by comparing the other fields.
  
  Hmm, smart humans only compare last few digits if they don't care
  about 100% reliability, and dumb software compares 64bits easily...
  Pavel
 
 Indeed.
 
 And this is most certainly NOT only for humans, and race conditions most 
 certainly matter.

Use case please?  What will this info be used for, other than for
feedback for humans about the state of the propagation tree?

Face it, userspace is inherently racy.  Inode numbers, device numbers,
whatever are being reused all the time, we live with it, even if it's
programs, and not just humans.

But it's not even an important design decision, the ID allocation can
be swapped at any time.  If you insist, I'll change it to a 64bit
counter, and it'll just suck a little more, but no permanent damage
done ;)

Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 07/10] unprivileged mounts: add sysctl tunable for safe property

2008-01-21 Thread Miklos Szeredi
 What do you think about doing this only if FS_SAFE is also set,
 so for instance at first only FUSE would allow itself to be
 made user-mountable?
 
 A safe thing to do, or overly intrusive?

It goes somewhat against the "no policy in kernel" policy ;).  I think
the warning in the documentation should be enough to make sysadmins
think twice before doing anything foolish:

 +Care should be taken when enabling this, since most
 +filesystems haven't been designed with unprivileged mounting
 +in mind.
 +

BTW, filesystems like 'proc' and 'sysfs' should also be safe, although
the only use for them being marked safe is if the users are allowed to
umount them from their private namespace (otherwise a 'mount --bind'
has the same effect as a new mount).

Thanks,
Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] VFS: create /proc/pid/mountinfo

2008-01-21 Thread Miklos Szeredi
 On Mon, 2008-01-21 at 22:25 +0100, Miklos Szeredi wrote:
 You have removed the code that checked if the peer or
 master mount was in the same namespace before reporting their
 corresponding mount-ids. One downside of that approach is the
 user will see an mount_id in the output with no corresponding
 line to explain the details of the mount_id.  
  
  Before the change, the peer and master ID's were basically randomly
  chosen from the peers, which means, it wasn't possible to always
  determine, that two mounts were peers, or that they were slaves to the
  same peer group.
  
  After the change, this is possible, since the peer ID will be the same
  for all mounts which are peers.  This means, that even though the peer
  ID might be in a different namespace, it is possible to determine all
  peers within the same namespace by comparing their peer ID's.
 
 
  I agree with your reasoning on the random id; showing a single
  id avoids clutter. But my point is, why not show a
  id for the master or peer residing in the same namespace?

Because this way it is possible see propagation between different
namespaces as well, by looking at the mount information for processes
in the different namespaces.  Of course, this is only possible with
sufficient privileges.

  Showing a id with no corresponding entry for that id, can be
  intriguing.

Not if it's clearly documented (will add documentation for the next
submission).

Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] VFS: create /proc/pid/mountinfo

2008-01-20 Thread Miklos Szeredi
 Miklos Szeredi wrote:
  - for mount ID's use IDA (from the IDR library) instead of a 32bit
counter, which could overflow
 
 IDAs tend to get reused quickly, which can cause race conditions.  Any 
 reason not to just use a 64-bit counter?

They tend to become hard to parse/compare for humans after a while.
And all this is basically only for humans, so race conditions don't
really matter.  Also a changed mount with a reused ID is easily
identified by comparing the other fields.

Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC][PATCH] VFS: create /proc/pid/mountinfo

2008-01-19 Thread Miklos Szeredi
Seems, most people would be happier with a new file, instead of
extending /proc/mounts.

This patch is the first attempt at doing that, as well as fixing the
issues found in the previous submission.

Thanks,
Miklos

---
From: Ram Pai [EMAIL PROTECTED]

/proc/mounts in its current state fails to disambiguate bind mounts, especially
when the bind mount is subrooted. Also it does not capture the propagation state
of the mounts (shared-subtree). The following patch addresses the problem.

The patch adds '/proc/pid/mountinfo' which contains a superset of
the fields in '/proc/pid/mounts'. The following additional fields
are added:

mntid -- is a unique identifier of the mount
parent -- the id of the parent mount
major:minor -- value of st_dev for files on that filesystem
dir -- the subdir in the filesystem which forms the root of this mount
propagation-type in the form of propagation_flag[:mntid][,...]
note: 'shared' flag is followed by the mntid of its peer mount
  'slave' flag is followed by the mntid of its master mount
  'private' flag stands by itself
  'unbindable' flag stands by itself

Also mount options are split into two fields, the first containing the
per mount flags, the second the per super block options.

Here is a sample cat /proc/mounts after executing the following commands:

mount --bind /mnt /mnt
mount --make-shared /mnt
mount --bind /mnt/1 /var
mount --make-slave /var
mount --make-shared /var
mount --bind /var/abc /tmp
mount --make-unbindable /proc

2 2 0:1 rootfs rootfs / / rw rw private
16 2 98:0 ext2 /dev/root / / rw rw private
17 16 0:3 proc /proc / /proc rw rw unbindable
18 16 0:10 devpts devpts /dev/pts / rw rw private
19 16 98:0 ext2 /dev/root /mnt /mnt rw rw shared:19
20 16 98:0 ext2 /dev/root /mnt/1 /var rw rw shared:21,slave:19
21 16 98:0 ext2 /dev/root /mnt/1/abc /tmp rw rw shared:20,slave:19

For example, the last line indicates that:

1) The mount is a shared mount.
2) Its peer mount is the mount with id 20
3) It is also a slave mount of the master-mount with the id  19
4) The filesystem on device with major/minor number 98:0 and subdirectory
mnt/1/abc makes the root directory of this mount.
5) And finally the mount with id 16 is its parent.


[EMAIL PROTECTED]:

- new file, rearrange fields
- for mount ID's use IDA (from the IDR library) instead of a 32bit
  counter, which could overflow
- print canonical ID's (smallest one within the peer group) for peers
  and master, this is more useful, than a random ID within the same namespace
- fix a couple of small bugs
- remove inlines
- style fixes

Signed-off-by: Ram Pai [EMAIL PROTECTED]
Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/dcache.c
===
--- linux.orig/fs/dcache.c  2008-01-18 19:21:38.0 +0100
+++ linux/fs/dcache.c   2008-01-18 19:22:27.0 +0100
@@ -1890,6 +1890,60 @@ char *dynamic_dname(struct dentry *dentr
return memcpy(buffer, temp, sz);
 }
 
+static int prepend(char **buffer, int *buflen, const char *str,
+ int namelen)
+{
+   *buflen -= namelen;
+   if (*buflen  0)
+   return 1;
+   *buffer -= namelen;
+   memcpy(*buffer, str, namelen);
+   return 0;
+}
+
+/*
+ * Write full pathname from the root of the filesystem into the buffer.
+ */
+char *dentry_path(struct dentry *dentry, char *buf, int buflen)
+{
+   char *end = buf + buflen;
+   char *retval;
+
+   spin_lock(dcache_lock);
+   prepend(end, buflen, \0, 1);
+   if (!IS_ROOT(dentry)  d_unhashed(dentry)) {
+   if (prepend(end, buflen, //deleted, 9))
+   goto Elong;
+   }
+   if (buflen  1)
+   goto Elong;
+   /* Get '/' right */
+   retval = end-1;
+   *retval = '/';
+
+   for (;;) {
+   struct dentry *parent;
+   if (IS_ROOT(dentry))
+   break;
+
+   parent = dentry-d_parent;
+   prefetch(parent);
+
+   if (prepend(end, buflen, dentry-d_name.name,
+   dentry-d_name.len) ||
+   prepend(end, buflen, /, 1))
+   goto Elong;
+
+   retval = end;
+   dentry = parent;
+   }
+   spin_unlock(dcache_lock);
+   return retval;
+Elong:
+   spin_unlock(dcache_lock);
+   return ERR_PTR(-ENAMETOOLONG);
+}
+
 /*
  * NOTE! The user-level library version returns a
  * character pointer. The kernel system call just
Index: linux/fs/namespace.c
===
--- linux.orig/fs/namespace.c   2008-01-18 19:21:38.0 +0100
+++ linux/fs/namespace.c2008-01-18 23:39:35.0 +0100
@@ -27,6 +27,7 @@
 #include linux/mount.h
 #include linux/ramfs.h
 #include linux/log2.h
+#include linux/idr.h
 #include asm/uaccess.h
 #include asm/unistd.h
 #include

Re: [patch] util-linux-ng: unprivileged mounts support

2008-01-19 Thread Miklos Szeredi
  This is an experimental patch for supporting unprivileged mounts and
  umounts.  
 
 User unmount unfortunately still doesn't work if the kernel doesn't have 
 the unprivileged mount support but as we discussed this in last July that 
 shouldn't be needed for this case.
 
   % mount -t ntfs-3g /dev/hda10 /tmp/test
   % cat /proc/mounts | grep /tmp/test 
 
   /dev/hda10 /tmp/test fuseblk 
 rw,nosuid,nodev,user_id=501,group_id=501,allow_other 0 0
   % mount | grep /tmp/test
   /dev/hda10 on /tmp/test type fuseblk 
 (rw,nosuid,nodev,allow_other,blksize=1024,user=szaka)
   % umount /tmp/test
   umount: /dev/hda10: not mounted
   umount: /tmp/test: must be superuser to umount
   umount: /dev/hda10: not mounted
   umount: /tmp/test: must be superuser to umount

But 'fusermount -u /tmp/test' does work, doesn't it?

Yes, this should probably be fixed in umount(8), but it's an (almost)
completely separate issue.

Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch] VFS: extend /proc/mounts

2008-01-17 Thread Miklos Szeredi
  The alternative (and completely safe) solution is to add another file
  to proc.  Me no likey.
 
 Since we need saner layout, I would strongly suggest exactly that.

I don't think there's all that much wrong with the current layout,
except the two dummy zeroes at the end.  Or, something else needs
fixing in there?

  major:minor -- is the major minor number of the device hosting the 
  filesystem
 
 Bad description.  Value of st_dev for files on that filesystem, please -
 there might be no such thing as the device hosting the filesystem _and_
 the value here may bloody well be unrelated to device actually holding
 all data (for things like ext2meta, etc.).

Right.

  1) The mount is a shared mount.
  2) Its peer mount of mount with id 20
  3) It is also a slave mount of the master-mount with the id  19
  4) The filesystem on device with major/minor number 98:0 and subdirectory
  mnt/1/abc makes the root directory of this mount.
  5) And finally the mount with id 16 is its parent.
 
 I'd suggest doing a new file that would *not* try to imitate /etc/mtab.
 Another thing is, how much of propagation information do we want to
 be exposed and what do we intend to do with it?

I think the scheme devised by Ram is basically right.  It shows the
relationships (slave, peer) and the ID of a master/peer mount.

What I changed, is to always show a canonical peer, because I think
that is more useful in establishing relationships between mounts.  Is
this info sensitive?  I can't see why it would be.

  Note that entire
 propagation tree is out of question - it spans many namespaces and
 contains potentially sensitive information.  So we won't see all nodes.

With multiple namespaces, of course you are only allowed to see a part
of the tree, but you could have xterms for all of them, and can put
together the big picture from the pieces.

 What do we want to *do* with the information about propagation?

Just feedback about the state of the thing.  It's very annoying, that
after setting up propagation, it's impossible to check the result.

Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 9/9] unprivileged mounts: add no submounts flag

2008-01-16 Thread Miklos Szeredi
   Why not nosubmnt?
  
  Why not indeed.  Maybe I should try to use my brain sometime.
 
 Well it really should have 'user' or 'unpriv' in the name
 somewhere.  'nosubmnt' is more confusing than 'nomnt' because
 it no submounts really sounds like a reasonable thing in
 itself...

I slept on it, and I still think 'nosubmnt' might be the best
compromise.  Obviously the superuser has privileges, that override
what is normally allowed, and we don't find it strange when a
read-only file is happily being written by root.

It may feel wrong in the context of mounts, because we are so used to
mounts being privileged-only.

Objections?  Once this goes in, it will stay the same forever, so now
is the time to express any doubts...

Thanks,
Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 06/10] unprivileged mounts: allow unprivileged mounts

2008-01-16 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

For safe filesystems allow unprivileged mounting and forced
unmounting.

A filesystem type is considered safe, if mounting it by an
unprivileged user may not cause a security problem.  This is somewhat
subjective, so setting this property is left to userspace (implemented
in the next patch).

Since most filesystems haven't been designed with unprivileged
mounting in mind, a thorough audit is recommended before setting this
property.

Make this a separate integer member in 'struct file_system_type'
instead of a flag, since that is easier to handle by sysctl code.

Move subtype handling from do_kern_mount() into do_new_mount().  All
other callers are kernel-internal and do not need subtype support.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
Acked-by: Serge Hallyn [EMAIL PROTECTED]
---

Index: linux/fs/namespace.c
===
--- linux.orig/fs/namespace.c   2008-01-16 13:25:08.0 +0100
+++ linux/fs/namespace.c2008-01-16 13:25:09.0 +0100
@@ -966,14 +966,16 @@ static bool is_mount_owner(struct vfsmou
 /*
  * umount is permitted for
  *  - sysadmin
- *  - mount owner, if not forced umount
+ *  - mount owner
+ *o if not forced umount,
+ *o if forced umount, and filesystem is safe
  */
 static bool permit_umount(struct vfsmount *mnt, int flags)
 {
if (capable(CAP_SYS_ADMIN))
return true;
 
-   if (flags  MNT_FORCE)
+   if ((flags  MNT_FORCE)  !(mnt-mnt_sb-s_type-fs_safe))
return false;
 
return is_mount_owner(mnt, current-fsuid);
@@ -1031,13 +1033,17 @@ asmlinkage long sys_oldumount(char __use
  * - mountpoint is not a symlink
  * - mountpoint is in a mount owned by the user
  */
-static bool permit_mount(struct nameidata *nd, int *flags)
+static bool permit_mount(struct nameidata *nd, struct file_system_type *type,
+int *flags)
 {
struct inode *inode = nd-path.dentry-d_inode;
 
if (capable(CAP_SYS_ADMIN))
return true;
 
+   if (type  !type-fs_safe)
+   return false;
+
if (S_ISLNK(inode-i_mode))
return false;
 
@@ -1291,7 +1297,7 @@ static int do_loopback(struct nameidata 
struct vfsmount *mnt = NULL;
int err;
 
-   if (!permit_mount(nd, flags))
+   if (!permit_mount(nd, NULL, flags))
return -EPERM;
if (!old_name || !*old_name)
return -EINVAL;
@@ -1472,30 +1478,76 @@ out:
return err;
 }
 
+static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char 
*fstype)
+{
+   int err;
+   const char *subtype = strchr(fstype, '.');
+   if (subtype) {
+   subtype++;
+   err = -EINVAL;
+   if (!subtype[0])
+   goto err;
+   } else
+   subtype = ;
+
+   mnt-mnt_sb-s_subtype = kstrdup(subtype, GFP_KERNEL);
+   err = -ENOMEM;
+   if (!mnt-mnt_sb-s_subtype)
+   goto err;
+   return mnt;
+
+ err:
+   mntput(mnt);
+   return ERR_PTR(err);
+}
+
 /*
  * create a new mount for userspace and request it to be added into the
  * namespace's tree
  */
-static int do_new_mount(struct nameidata *nd, char *type, int flags,
+static int do_new_mount(struct nameidata *nd, char *fstype, int flags,
int mnt_flags, char *name, void *data)
 {
+   int err;
struct vfsmount *mnt;
+   struct file_system_type *type;
 
-   if (!type || !memchr(type, 0, PAGE_SIZE))
+   if (!fstype || !memchr(fstype, 0, PAGE_SIZE))
return -EINVAL;
 
-   /* we need capabilities... */
-   if (!capable(CAP_SYS_ADMIN))
-   return -EPERM;
-
-   mnt = do_kern_mount(type, flags  ~MS_SETUSER, name, data);
-   if (IS_ERR(mnt))
+   type = get_fs_type(fstype);
+   if (!type)
+   return -ENODEV;
+
+   err = -EPERM;
+   if (!permit_mount(nd, type, flags))
+   goto out_put_filesystem;
+
+   if (flags  MS_SETUSER) {
+   err = reserve_user_mount();
+   if (err)
+   goto out_put_filesystem;
+   }
+
+   mnt = vfs_kern_mount(type, flags  ~MS_SETUSER, name, data);
+   if (!IS_ERR(mnt)  (type-fs_flags  FS_HAS_SUBTYPE) 
+   !mnt-mnt_sb-s_subtype)
+   mnt = fs_set_subtype(mnt, fstype);
+   put_filesystem(type);
+   if (IS_ERR(mnt)) {
+   if (flags  MS_SETUSER)
+   dec_nr_user_mounts();
return PTR_ERR(mnt);
+   }
 
if (flags  MS_SETUSER)
-   set_mnt_user(mnt);
+   __set_mnt_user(mnt);
 
return do_add_mount(mnt, nd, mnt_flags, NULL);
+
+ out_put_filesystem:
+   put_filesystem(type);
+   return err;
 }
 
 /*
@@ -1526,7 +1578,7 @@ int do_add_mount(struct vfsmount *newmnt
if (S_ISLNK(newmnt-mnt_root

[patch 08/10] unprivileged mounts: make fuse safe

2008-01-16 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Don't require the user_id= and group_id= options for unprivileged mounts,
but if they are present, verify them for sanity.

Disallow the allow_other option for unprivileged mounts.

FUSE was designed from the beginning to be safe for unprivileged
users.  This has also been verified in practice over many years, with
some distributions enabling unprivileged FUSE mounts by default.

However there are some properties of FUSE, that could make it unsafe
for certain situations (e.g. multiuser system with untrusted users):

 - It is not always possible to use kill(2) (not even with SIGKILL) to
   terminate a process using a FUSE filesystem.  However it is
   possible to use any of the following instead:
 o kill the filesystem daemon
 o use forced umounting
 o use the fusectl control filesystem

 - As a special case of the above, killing a self-deadlocked FUSE
   process is not possible, and even killall5 will not terminate it.

 - Due to the design of the process freezer, a hanging (due to network
   problems, etc) or malicious filesystem may prevent suspend to
   RAM or hibernation from succeeding.  This is not actually unique to
   FUSE, as any hanging network filesystem will have the same effect.

If the above could pose a threat to the system, it is recommended
that the '/proc/sys/fs/types/fuse/usermount_safe' sysctl tunable is not
turned on, and/or '/dev/fuse' is not made world-readable and writable.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/fuse/inode.c
===
--- linux.orig/fs/fuse/inode.c  2008-01-16 13:24:52.0 +0100
+++ linux/fs/fuse/inode.c   2008-01-16 13:25:10.0 +0100
@@ -357,6 +357,19 @@ static int parse_fuse_opt(char *opt, str
d-max_read = ~0;
d-blksize = 512;
 
+   /*
+* For unprivileged mounts use current uid/gid.  Still allow
+* user_id and group_id options for compatibility, but
+* only if they match these values.
+*/
+   if (!capable(CAP_SYS_ADMIN)) {
+   d-user_id = current-uid;
+   d-user_id_present = 1;
+   d-group_id = current-gid;
+   d-group_id_present = 1;
+
+   }
+
while ((p = strsep(opt, ,)) != NULL) {
int token;
int value;
@@ -385,6 +398,8 @@ static int parse_fuse_opt(char *opt, str
case OPT_USER_ID:
if (match_int(args[0], value))
return 0;
+   if (d-user_id_present  d-user_id != value)
+   return 0;
d-user_id = value;
d-user_id_present = 1;
break;
@@ -392,6 +407,8 @@ static int parse_fuse_opt(char *opt, str
case OPT_GROUP_ID:
if (match_int(args[0], value))
return 0;
+   if (d-group_id_present  d-group_id != value)
+   return 0;
d-group_id = value;
d-group_id_present = 1;
break;
@@ -596,6 +613,10 @@ static int fuse_fill_super(struct super_
if (!parse_fuse_opt((char *) data, d, is_bdev))
return -EINVAL;
 
+   /* This is a privileged option */
+   if ((d.flags  FUSE_ALLOW_OTHER)  !capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
if (is_bdev) {
 #ifdef CONFIG_BLOCK
if (!sb_set_blocksize(sb, d.blksize))

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 07/10] unprivileged mounts: add sysctl tunable for safe property

2008-01-16 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add the following:

  /proc/sys/fs/types/${FS_TYPE}/usermount_safe

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/filesystems.c
===
--- linux.orig/fs/filesystems.c 2008-01-16 13:24:52.0 +0100
+++ linux/fs/filesystems.c  2008-01-16 13:25:09.0 +0100
@@ -12,6 +12,7 @@
 #include linux/kmod.h
 #include linux/init.h
 #include linux/module.h
+#include linux/sysctl.h
 #include asm/uaccess.h
 
 /*
@@ -51,6 +52,57 @@ static struct file_system_type **find_fi
return p;
 }
 
+#define MAX_FILESYSTEM_VARS 1
+
+struct filesystem_sysctl_table {
+   struct ctl_table_header *header;
+   struct ctl_table table[MAX_FILESYSTEM_VARS + 1];
+};
+
+/*
+ * Create /proc/sys/fs/types/${FSNAME} directory with per fs-type tunables.
+ */
+static int filesystem_sysctl_register(struct file_system_type *fs)
+{
+   struct filesystem_sysctl_table *t;
+   struct ctl_path path[] = {
+   { .procname = fs, .ctl_name = CTL_FS },
+   { .procname = types, .ctl_name = CTL_UNNUMBERED },
+   { .procname = fs-name, .ctl_name = CTL_UNNUMBERED },
+   { }
+   };
+
+   t = kzalloc(sizeof(*t), GFP_KERNEL);
+   if (!t)
+   return -ENOMEM;
+
+
+   t-table[0].ctl_name = CTL_UNNUMBERED;
+   t-table[0].procname = usermount_safe;
+   t-table[0].maxlen = sizeof(int);
+   t-table[0].data = fs-fs_safe;
+   t-table[0].mode = 0644;
+   t-table[0].proc_handler = proc_dointvec;
+
+   t-header = register_sysctl_paths(path, t-table);
+   if (!t-header) {
+   kfree(t);
+   return -ENOMEM;
+   }
+
+   fs-sysctl_table = t;
+
+   return 0;
+}
+
+static void filesystem_sysctl_unregister(struct file_system_type *fs)
+{
+   struct filesystem_sysctl_table *t = fs-sysctl_table;
+
+   unregister_sysctl_table(t-header);
+   kfree(t);
+}
+
 /**
  * register_filesystem - register a new filesystem
  * @fs: the file system structure
@@ -80,6 +132,13 @@ int register_filesystem(struct file_syst
else
*p = fs;
write_unlock(file_systems_lock);
+
+   if (res == 0) {
+   res = filesystem_sysctl_register(fs);
+   if (res != 0)
+   unregister_filesystem(fs);
+   }
+
return res;
 }
 
@@ -108,6 +167,7 @@ int unregister_filesystem(struct file_sy
*tmp = fs-next;
fs-next = NULL;
write_unlock(file_systems_lock);
+   filesystem_sysctl_unregister(fs);
return 0;
}
tmp = (*tmp)-next;
Index: linux/include/linux/fs.h
===
--- linux.orig/include/linux/fs.h   2008-01-16 13:25:09.0 +0100
+++ linux/include/linux/fs.h2008-01-16 13:25:09.0 +0100
@@ -1437,6 +1437,7 @@ struct file_system_type {
struct module *owner;
struct file_system_type * next;
struct list_head fs_supers;
+   struct filesystem_sysctl_table *sysctl_table;
 
struct lock_class_key s_lock_key;
struct lock_class_key s_umount_key;
Index: linux/Documentation/filesystems/proc.txt
===
--- linux.orig/Documentation/filesystems/proc.txt   2008-01-16 13:25:07.0 +0100
+++ linux/Documentation/filesystems/proc.txt   2008-01-16 13:25:09.0 +0100
@@ -43,6 +43,7 @@ Table of Contents
   2.13 /proc/pid/oom_score - Display current oom-killer score
   2.14 /proc/pid/io - Display the IO accounting fields
   2.15 /proc/pid/coredump_filter - Core dump filtering settings
+  2.16 /proc/sys/fs/types - File system type specific parameters
 
 --
 Preface
@@ -2283,4 +2284,21 @@ For example:
   $ echo 0x7 > /proc/self/coredump_filter
   $ ./some_program
 
+2.16 /proc/sys/fs/types/ - File system type specific parameters
+
+
+There's a separate directory /proc/sys/fs/types/<type>/ for each
+filesystem type, containing the following files:
+
+usermount_safe
+--
+
+Setting this to non-zero will allow filesystems of this type to be
+mounted by unprivileged users (note that there are other
+prerequisites as well).
+
+Care should be taken when enabling this, since most
+filesystems haven't been designed with unprivileged mounting
+in mind.
+
 --

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 10/10] unprivileged mounts: add no submounts flag

2008-01-16 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Add a new mount flag "nosubmnt", which denies submounts for the owner.
This would be useful if we want to support traditional /etc/fstab
based user mounts.

In this case mount(8) would still have to be suid-root, to check the
mountpoint against the user/users flag in /etc/fstab, but /etc/mtab
would no longer be mandatory for storing the actual owner of the
mount.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
Acked-by: Serge Hallyn [EMAIL PROTECTED]
---

Index: linux/fs/namespace.c
===
--- linux.orig/fs/namespace.c   2008-01-16 13:25:11.0 +0100
+++ linux/fs/namespace.c2008-01-16 13:25:12.0 +0100
@@ -700,6 +700,7 @@ static int show_vfsmnt(struct seq_file *
{ MNT_NOATIME, ,noatime },
{ MNT_NODIRATIME, ,nodiratime },
{ MNT_RELATIME, ,relatime },
+   { MNT_NOSUBMNT, ,nosubmnt },
{ 0, NULL }
};
struct proc_fs_info *fs_infop;
@@ -1050,6 +1051,9 @@ static bool permit_mount(struct nameidat
if (S_ISLNK(inode-i_mode))
return false;
 
+   if (nd-path.mnt-mnt_flags  MNT_NOSUBMNT)
+   return false;
+
if (!is_mount_owner(nd-path.mnt, current-fsuid))
return false;
 
@@ -1894,9 +1898,11 @@ long do_mount(char *dev_name, char *dir_
mnt_flags |= MNT_RELATIME;
if (flags  MS_RDONLY)
mnt_flags |= MNT_READONLY;
+   if (flags  MS_NOSUBMNT)
+   mnt_flags |= MNT_NOSUBMNT;
 
-   flags = ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
-  MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT);
+   flags = ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_NOATIME |
+  MS_NODIRATIME | MS_RELATIME | MS_KERNMOUNT | MS_NOSUBMNT);
 
/* ... and get the mountpoint */
retval = path_lookup(dir_name, LOOKUP_FOLLOW, nd);
Index: linux/include/linux/fs.h
===
--- linux.orig/include/linux/fs.h   2008-01-16 13:25:11.0 +0100
+++ linux/include/linux/fs.h2008-01-16 13:25:12.0 +0100
@@ -129,6 +129,7 @@ extern int dir_notify_enable;
 #define MS_KERNMOUNT   (122) /* this is a kern_mount call */
 #define MS_I_VERSION   (123) /* Update inode I_version field */
 #define MS_SETUSER (124) /* set mnt_uid to current user */
+#define MS_NOSUBMNT(125) /* don't allow unprivileged submounts */
 #define MS_ACTIVE  (130)
 #define MS_NOUSER  (131)
 
Index: linux/include/linux/mount.h
===
--- linux.orig/include/linux/mount.h2008-01-16 13:25:05.0 +0100
+++ linux/include/linux/mount.h 2008-01-16 13:25:12.0 +0100
@@ -30,6 +30,7 @@ struct mnt_namespace;
 #define MNT_NODIRATIME 0x10
 #define MNT_RELATIME   0x20
 #define MNT_READONLY   0x40/* does the user want this to be r/o? */
+#define MNT_NOSUBMNT   0x80
 
 #define MNT_SHRINKABLE 0x100
 #define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 09/10] unprivileged mounts: propagation: inherit owner from parent

2008-01-16 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

On mount propagation, let the owner of the clone be inherited from the
parent into which it has been propagated.

If the parent has the nosuid flag, set this flag for the child as
well.  This is needed for the suid-less namespace (use case #2 in the
first patch header), where all mounts are owned by the user and have
the nosuid flag set.  In this case the propagated mount needs to have
nosuid, otherwise a suid executable may be misused by the user.

Similar treatment is not needed for nodev, because devices can't be
abused this way: the user is not able to gain privileges to devices by
rearranging the mount namespace.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/namespace.c
===
--- linux.orig/fs/namespace.c   2008-01-16 13:25:09.0 +0100
+++ linux/fs/namespace.c2008-01-16 13:25:11.0 +0100
@@ -506,10 +506,10 @@ static int reserve_user_mount(void)
return err;
 }
 
-static void __set_mnt_user(struct vfsmount *mnt)
+static void __set_mnt_user(struct vfsmount *mnt, uid_t owner)
 {
WARN_ON(mnt-mnt_flags  MNT_USER);
-   mnt-mnt_uid = current-fsuid;
+   mnt-mnt_uid = owner;
mnt-mnt_flags |= MNT_USER;
 
if (!capable(CAP_SETUID))
@@ -520,7 +520,7 @@ static void __set_mnt_user(struct vfsmou
 
 static void set_mnt_user(struct vfsmount *mnt)
 {
-   __set_mnt_user(mnt);
+   __set_mnt_user(mnt, current-fsuid);
spin_lock(vfsmount_lock);
nr_user_mounts++;
spin_unlock(vfsmount_lock);
@@ -536,7 +536,7 @@ static void clear_mnt_user(struct vfsmou
 }
 
 static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
-   int flag)
+   int flag, uid_t owner)
 {
struct super_block *sb = old-mnt_sb;
struct vfsmount *mnt;
@@ -560,7 +560,10 @@ static struct vfsmount *clone_mnt(struct
/* don't copy the MNT_USER flag */
mnt-mnt_flags = ~MNT_USER;
if (flag  CL_SETUSER)
-   __set_mnt_user(mnt);
+   __set_mnt_user(mnt, owner);
+
+   if (flag  CL_NOSUID)
+   mnt-mnt_flags |= MNT_NOSUID;
 
if (flag  CL_SLAVE) {
list_add(mnt-mnt_slave, old-mnt_slave_list);
@@ -1066,7 +1069,7 @@ static int lives_below_in_same_fs(struct
 }
 
 struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
-   int flag)
+   int flag, uid_t owner)
 {
struct vfsmount *res, *p, *q, *r, *s;
struct nameidata nd;
@@ -1074,7 +1077,7 @@ struct vfsmount *copy_tree(struct vfsmou
if (!(flag  CL_COPY_ALL)  IS_MNT_UNBINDABLE(mnt))
return ERR_PTR(-EPERM);
 
-   res = q = clone_mnt(mnt, dentry, flag);
+   res = q = clone_mnt(mnt, dentry, flag, owner);
if (IS_ERR(q))
goto error;
q-mnt_mountpoint = mnt-mnt_mountpoint;
@@ -1096,7 +1099,7 @@ struct vfsmount *copy_tree(struct vfsmou
p = s;
nd.path.mnt = q;
nd.path.dentry = p-mnt_mountpoint;
-   q = clone_mnt(p, p-mnt_root, flag);
+   q = clone_mnt(p, p-mnt_root, flag, owner);
if (IS_ERR(q))
goto error;
spin_lock(vfsmount_lock);
@@ -1121,7 +1124,7 @@ struct vfsmount *collect_mounts(struct v
 {
struct vfsmount *tree;
down_read(namespace_sem);
-   tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE);
+   tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE, 0);
up_read(namespace_sem);
return tree;
 }
@@ -1292,7 +1295,8 @@ static int do_change_type(struct nameida
  */
 static int do_loopback(struct nameidata *nd, char *old_name, int flags)
 {
-   int clone_fl;
+   int clone_fl = 0;
+   uid_t owner = 0;
struct nameidata old_nd;
struct vfsmount *mnt = NULL;
int err;
@@ -1313,11 +1317,17 @@ static int do_loopback(struct nameidata 
if (!check_mnt(nd-path.mnt) || !check_mnt(old_nd.path.mnt))
goto out;
 
-   clone_fl = (flags  MS_SETUSER) ? CL_SETUSER : 0;
+   if (flags  MS_SETUSER) {
+   clone_fl |= CL_SETUSER;
+   owner = current-fsuid;
+   }
+
if (flags  MS_REC)
-   mnt = copy_tree(old_nd.path.mnt, old_nd.path.dentry, clone_fl);
+   mnt = copy_tree(old_nd.path.mnt, old_nd.path.dentry, clone_fl,
+   owner);
else
-   mnt = clone_mnt(old_nd.path.mnt, old_nd.path.dentry, clone_fl);
+   mnt = clone_mnt(old_nd.path.mnt, old_nd.path.dentry, clone_fl,
+   owner);
 
err = PTR_ERR(mnt);
if (IS_ERR(mnt))
@@ -1541,7 +1551,7

  1   2   3   4   5   >