file locks: Split flock_find_conflict out of flock_lock_file

2008-01-14 Thread Matthew Wilcox

Reduce the spaghetti-like nature of flock_lock_file by making the chunk
of code labelled find_conflict into its own function.  Also allocate
memory before taking the kernel lock in preparation for switching to a
normal spinlock.

Signed-off-by: Matthew Wilcox <[EMAIL PROTECTED]>

diff --git a/fs/locks.c b/fs/locks.c
index b681459..bc691e5 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -699,6 +699,33 @@ next_task:
return 0;
 }
 
+/*
+ * A helper function for flock_lock_file().  It searches the list of locks
+ * for @inode looking for one which would conflict with @request.
+ * If it does not find one, it returns the address where the requested lock
+ * should be inserted.  If it does find a conflicting lock, it returns NULL.
+ */
+static struct file_lock **
+flock_find_conflict(struct inode *inode, struct file_lock *request)
+{
+   struct file_lock **before;
+
+   for_each_lock(inode, before) {
+   struct file_lock *fl = *before;
+   if (IS_POSIX(fl))
+   break;
+   if (IS_LEASE(fl))
+   continue;
+   if (!flock_locks_conflict(request, fl))
+   continue;
+
+   if (request->fl_flags & FL_SLEEP)
+   locks_insert_block(fl, request);
+   return NULL;
+   }
+   return before;
+}
+
 /* Try to create a FLOCK lock on filp. We always insert new FLOCK locks
  * after any leases, but before any posix locks.
  *
@@ -714,18 +741,21 @@ static int flock_lock_file(struct file *filp, struct 
file_lock *request)
int error = 0;
int found = 0;
 
-   lock_kernel();
-   if (request->fl_flags & FL_ACCESS)
-   goto find_conflict;
+   if (request->fl_flags & FL_ACCESS) {
+   lock_kernel();
+   before = flock_find_conflict(inode, request);
+   unlock_kernel();
+   return before ? 0 : -EAGAIN;
+   }
 
if (request->fl_type != F_UNLCK) {
-   error = -ENOMEM;
new_fl = locks_alloc_lock();
-   if (new_fl == NULL)
-   goto out;
-   error = 0;
+   if (!new_fl)
+   return -ENOMEM;
}
 
+   lock_kernel();
+
for_each_lock(inode, before) {
struct file_lock *fl = *before;
if (IS_POSIX(fl))
@@ -754,22 +784,12 @@ static int flock_lock_file(struct file *filp, struct 
file_lock *request)
if (found)
cond_resched();
 
-find_conflict:
-   for_each_lock(inode, before) {
-   struct file_lock *fl = *before;
-   if (IS_POSIX(fl))
-   break;
-   if (IS_LEASE(fl))
-   continue;
-   if (!flock_locks_conflict(request, fl))
-   continue;
+   before = flock_find_conflict(inode, request);
+   if (!before) {
error = -EAGAIN;
-   if (request->fl_flags & FL_SLEEP)
-   locks_insert_block(fl, request);
goto out;
}
-   if (request->fl_flags & FL_ACCESS)
-   goto out;
+
locks_copy_lock(new_fl, request);
locks_insert_lock(before, new_fl);
new_fl = NULL;
-- 
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."
-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


file locks: Use wait_event_interruptible_timeout()

2008-01-14 Thread Matthew Wilcox

interruptible_sleep_on_locked() is just an open-coded
wait_event_interruptible_timeout() with a few assumptions since we know
we hold the BKL.  locks_block_on_timeout() is only used in one place, so
it's actually simpler to inline it into its caller.

Signed-off-by: Matthew Wilcox <[EMAIL PROTECTED]>

 locks.c |   33 -
 1 file changed, 4 insertions(+), 29 deletions(-)

diff --git a/fs/locks.c b/fs/locks.c
index 8b8388e..b681459 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -634,33 +634,6 @@ static int flock_locks_conflict(struct file_lock 
*caller_fl, struct file_lock *s
return (locks_conflict(caller_fl, sys_fl));
 }
 
-static int interruptible_sleep_on_locked(wait_queue_head_t *fl_wait, int 
timeout)
-{
-   int result = 0;
-   DECLARE_WAITQUEUE(wait, current);
-
-   __set_current_state(TASK_INTERRUPTIBLE);
-   add_wait_queue(fl_wait, &wait);
-   if (timeout == 0)
-   schedule();
-   else
-   result = schedule_timeout(timeout);
-   if (signal_pending(current))
-   result = -ERESTARTSYS;
-   remove_wait_queue(fl_wait, &wait);
-   __set_current_state(TASK_RUNNING);
-   return result;
-}
-
-static int locks_block_on_timeout(struct file_lock *blocker, struct file_lock 
*waiter, int time)
-{
-   int result;
-   locks_insert_block(blocker, waiter);
-   result = interruptible_sleep_on_locked(&waiter->fl_wait, time);
-   __locks_delete_block(waiter);
-   return result;
-}
-
 void
 posix_test_lock(struct file *filp, struct file_lock *fl)
 {
@@ -1256,7 +1229,10 @@ restart:
if (break_time == 0)
break_time++;
}
-   error = locks_block_on_timeout(flock, new_fl, break_time);
+   locks_insert_block(flock, new_fl);
+   error = wait_event_interruptible_timeout(new_fl->fl_wait,
+   !new_fl->fl_next, break_time);
+   __locks_delete_block(new_fl);
if (error >= 0) {
if (error == 0)
time_out_leases(inode);

-- 
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."
-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Leak in nlmsvc_testlock for async GETFL case

2008-01-14 Thread Matthew Wilcox
On Mon, Jan 14, 2008 at 03:44:19PM -0500, J. Bruce Fields wrote:
> Thanks!  I've queued it up for 2.6.25.

Hi Bruce,

I haven't had as much time to play with de-BKL-ising fs/locks.c as I
would like, so fixing that for 2.6.25 is probably out of the question,
but here are two janitorial patches that hopefully can be applied and
will make the next steps easier.  They make sense all by themselves,
even if I don't get back to this project for a few months.

-- 
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."
-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFD] Incremental fsck

2008-01-14 Thread Ric Wheeler

Pavel Machek wrote:

On Sat 2008-01-12 09:51:40, Theodore Tso wrote:

On Wed, Jan 09, 2008 at 02:52:14PM +0300, Al Boldi wrote:

Ok, but let's look at this a bit more opportunistic / optimistic.

Even after a black-out shutdown, the corruption is pretty minimal, using 
ext3fs at least.



After a unclean shutdown, assuming you have decent hardware that
doesn't lie about when blocks hit iron oxide, you shouldn't have any
corruption at all.  If you have crappy hardware, then all bets are off


What hardware is crappy here. Lets say... internal hdd in thinkpad
x60?

What are ext3 expectations of disk (is there doc somewhere)? For
example... if disk does not lie, but powerfail during write damages
the sector -- is ext3 still going to work properly?

If disk does not lie, but powerfail during write may cause random
numbers to be returned on read -- can fsck handle that?

What abou disk that kills 5 sectors around sector being written during
powerfail; can ext3 survive that?

Pavel



I think that you have to keep in mind the way disk (and other media) 
fail. You can get media failures after a successful write or errors that 
pop up as the media ages.


Not to mention the way most people run with write cache enabled and no 
write barriers enabled - a sure recipe for corruption.


Of course, there are always software errors to introduce corruption even 
when we get everything else right ;-)


From what I see, media errors are the number one cause of corruption in 
file systems. It is critical that fsck (and any other tools) continue 
after an IO error since they are fairly common (just assume that sector 
is lost and do your best as you continue on).


ric

-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 9/9] unprivileged mounts: add "no submounts" flag

2008-01-14 Thread Serge E. Hallyn
Quoting Miklos Szeredi ([EMAIL PROTECTED]):
> From: Miklos Szeredi <[EMAIL PROTECTED]>
> 
> Add a new mount flag "nomnt", which denies submounts for the owner.
> This would be useful, if we want to support traditional /etc/fstab
> based user mounts.
> 
> In this case mount(8) would still have to be suid-root, to check the
> mountpoint against the user/users flag in /etc/fstab, but /etc/mtab
> would no longer be mandatory for storing the actual owner of the
> mount.

Ah, I see, so the floppy drive could be mounted as a MNT_NOMNT but
MNT_USER mount with mnt_owner set.  Makes sense.  I'd ask for a better
name than 'nomnt', but I can't think of one myself.

> Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]>

Acked-by: Serge Hallyn <[EMAIL PROTECTED]>

> ---
> 
> Index: linux/fs/namespace.c
> ===
> --- linux.orig/fs/namespace.c 2008-01-04 13:49:52.0 +0100
> +++ linux/fs/namespace.c  2008-01-04 13:50:28.0 +0100
> @@ -694,6 +694,7 @@ static int show_vfsmnt(struct seq_file *
>   { MNT_NOATIME, ",noatime" },
>   { MNT_NODIRATIME, ",nodiratime" },
>   { MNT_RELATIME, ",relatime" },
> + { MNT_NOMNT, ",nomnt" },
>   { 0, NULL }
>   };
>   struct proc_fs_info *fs_infop;
> @@ -1044,6 +1045,9 @@ static bool permit_mount(struct nameidat
>   if (S_ISLNK(inode->i_mode))
>   return false;
> 
> + if (nd->path.mnt->mnt_flags & MNT_NOMNT)
> + return false;
> +
>   if (!is_mount_owner(nd->path.mnt, current->fsuid))
>   return false;
> 
> @@ -1888,9 +1892,11 @@ long do_mount(char *dev_name, char *dir_
>   mnt_flags |= MNT_RELATIME;
>   if (flags & MS_RDONLY)
>   mnt_flags |= MNT_READONLY;
> + if (flags & MS_NOMNT)
> + mnt_flags |= MNT_NOMNT;
> 
> - flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
> -MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT);
> + flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_NOATIME |
> + MS_NODIRATIME | MS_RELATIME | MS_KERNMOUNT | MS_NOMNT);
> 
>   /* ... and get the mountpoint */
>   retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
> Index: linux/include/linux/fs.h
> ===
> --- linux.orig/include/linux/fs.h 2008-01-04 13:49:12.0 +0100
> +++ linux/include/linux/fs.h  2008-01-04 13:49:58.0 +0100
> @@ -130,6 +130,7 @@ extern int dir_notify_enable;
>  #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */
>  #define MS_I_VERSION (1<<23) /* Update inode I_version field */
>  #define MS_SETUSER   (1<<24) /* set mnt_uid to current user */
> +#define MS_NOMNT (1<<25) /* don't allow unprivileged submounts */
>  #define MS_ACTIVE(1<<30)
>  #define MS_NOUSER(1<<31)
> 
> Index: linux/include/linux/mount.h
> ===
> --- linux.orig/include/linux/mount.h  2008-01-04 13:45:45.0 +0100
> +++ linux/include/linux/mount.h   2008-01-04 13:49:58.0 +0100
> @@ -30,6 +30,7 @@ struct mnt_namespace;
>  #define MNT_NODIRATIME   0x10
>  #define MNT_RELATIME 0x20
>  #define MNT_READONLY 0x40/* does the user want this to be r/o? */
> +#define MNT_NOMNT0x80
> 
>  #define MNT_SHRINKABLE   0x100
>  #define MNT_IMBALANCED_WRITE_COUNT   0x200 /* just for debugging */
> 
> --
-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 7/9] unprivileged mounts: allow unprivileged fuse mounts

2008-01-14 Thread Serge E. Hallyn
Quoting Miklos Szeredi ([EMAIL PROTECTED]):
> From: Miklos Szeredi <[EMAIL PROTECTED]>
> 
> Use FS_SAFE for "fuse" fs type, but not for "fuseblk".
> 
> FUSE was designed from the beginning to be safe for unprivileged users.  This
> has also been verified in practice over many years.  In addition unprivileged
> mounts require the parent mount to be owned by the user, which is more strict
> than the current userspace policy.
> 
> This will enable future installations to remove the suid-root fusermount
> utility.
> 
> Don't require the "user_id=" and "group_id=" options for unprivileged mounts,
> but if they are present, verify them for sanity.
> 
> Disallow the "allow_other" option for unprivileged mounts.
> 
> Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]>

Sounds like a sysctl to enable FS_SAFE for fuse will make this patch
acceptable to everyone?

> ---
> 
> Index: linux/fs/fuse/inode.c
> ===
> --- linux.orig/fs/fuse/inode.c2008-01-03 17:13:13.0 +0100
> +++ linux/fs/fuse/inode.c 2008-01-03 21:28:01.0 +0100
> @@ -357,6 +357,19 @@ static int parse_fuse_opt(char *opt, str
>   d->max_read = ~0;
>   d->blksize = 512;
> 
> + /*
> +  * For unprivileged mounts use current uid/gid.  Still allow
> +  * "user_id" and "group_id" options for compatibility, but
> +  * only if they match these values.
> +  */
> + if (!capable(CAP_SYS_ADMIN)) {
> + d->user_id = current->uid;
> + d->user_id_present = 1;
> + d->group_id = current->gid;
> + d->group_id_present = 1;
> +
> + }
> +
>   while ((p = strsep(&opt, ",")) != NULL) {
>   int token;
>   int value;
> @@ -385,6 +398,8 @@ static int parse_fuse_opt(char *opt, str
>   case OPT_USER_ID:
>   if (match_int(&args[0], &value))
>   return 0;
> + if (d->user_id_present && d->user_id != value)
> + return 0;
>   d->user_id = value;
>   d->user_id_present = 1;
>   break;
> @@ -392,6 +407,8 @@ static int parse_fuse_opt(char *opt, str
>   case OPT_GROUP_ID:
>   if (match_int(&args[0], &value))
>   return 0;
> + if (d->group_id_present && d->group_id != value)
> + return 0;
>   d->group_id = value;
>   d->group_id_present = 1;
>   break;
> @@ -596,6 +613,10 @@ static int fuse_fill_super(struct super_
>   if (!parse_fuse_opt((char *) data, &d, is_bdev))
>   return -EINVAL;
> 
> + /* This is a privileged option */
> + if ((d.flags & FUSE_ALLOW_OTHER) && !capable(CAP_SYS_ADMIN))
> + return -EPERM;
> +
>   if (is_bdev) {
>  #ifdef CONFIG_BLOCK
>   if (!sb_set_blocksize(sb, d.blksize))
> @@ -696,9 +717,9 @@ static int fuse_get_sb(struct file_syste
>  static struct file_system_type fuse_fs_type = {
>   .owner  = THIS_MODULE,
>   .name   = "fuse",
> - .fs_flags   = FS_HAS_SUBTYPE,
>   .get_sb = fuse_get_sb,
>   .kill_sb= kill_anon_super,
> + .fs_flags   = FS_HAS_SUBTYPE | FS_SAFE,
>  };
> 
>  #ifdef CONFIG_BLOCK
> 
> --
-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [UNIONFS] 00/29 Unionfs and related patches pre-merge review (v2)

2008-01-14 Thread Jesse Hathaway
Erez Zadok  cs.sunysb.edu> writes:

> > Huh?  There's still aboslutely not fix to the underlying problems of
> > the whole idea.   I think we made it pretty clear that unionfs is not
> > the way to go, and that we'll get the union mount patches clear once
> > the per-mountpoint r/o and unprivilegued mount patches series are in
> > and stable.
> 
> I'll reiterate what I've said before: unionfs is used today by many users,
> it works, and is stable.  After years of working with unionfs, we've settled
> on a set of features that users actually use.  This functionality can be in
> mainline today.

I think it would be of great benefit to many linux users to have Unionfs 
in mainline. It is a highly valuable and reliable filesystem which 
allows one to develop interesting live distributions, read only root
images and other types of layered systems. Unionfs works well now, and
given that it is a stand alone filesystem I think it would really be a
waste to exclude it from mainline only to insist that one should wait
for union mounts, which may never reach feature parity with Unionfs,
especially given that the  linux kernel has a long running history of
filesystems with overlapping abilities.

 - Jesse Hathaway

> Unioning at the VFS level, will take a long time to reach the same level of
> maturity and support the same set of features.  Based on my years of
> practical experience with it, unioning directories seems like a simple idea,
> but in practice it's quite hard no matter the approach taken to implement
> it.
> 
> Existing users of unioning aren't likely to switch to Union Mounts unless it
> supports the same set of features.  How long will it realistically take to
> get whiteout support in every lower file system that's used by Unionfs
> users?  How will Union Mounts support persistent inode numbers at the VFS
> level?  Those are just a few of the questions.
> 
> I think a better approach would be to start with Unionfs (a standalone file
> system that doesn't touch the rest of the kernel).  And as Linux gradually
> starts supporting more and more features that help unioning/stacking in
> general, to change Unionfs to use those features (e.g., native whiteout
> support).  Eventually there could be basic unioning support at the VFS
> level, and concurrently a file-system which offers the extra features (e.g.,
> persistency).  This can be done w/o affecting user-visible APIs.
> 
> Cheers,
> Erez.
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo  vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 8/9] unprivileged mounts: propagation: inherit owner from parent

2008-01-14 Thread Serge E. Hallyn
Quoting Miklos Szeredi ([EMAIL PROTECTED]):
> From: Miklos Szeredi <[EMAIL PROTECTED]>
> 
> On mount propagation, let the owner of the clone be inherited from the
> parent into which it has been propagated.  Also if the parent has the
> "nosuid" flag, set this flag for the child as well.

What about nodev?

thanks,
-serge

> 
> This makes sense for example, when propagation is set up from the
> initial namespace into a per-user namespace, where some or all of the
> mounts may be owned by the user.
> 
> Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]>
> ---
> 
> Index: linux/fs/namespace.c
> ===
> --- linux.orig/fs/namespace.c 2008-01-04 13:48:14.0 +0100
> +++ linux/fs/namespace.c  2008-01-04 13:49:52.0 +0100
> @@ -500,10 +500,10 @@ static int reserve_user_mount(void)
>   return err;
>  }
> 
> -static void __set_mnt_user(struct vfsmount *mnt)
> +static void __set_mnt_user(struct vfsmount *mnt, uid_t owner)
>  {
>   BUG_ON(mnt->mnt_flags & MNT_USER);
> - mnt->mnt_uid = current->fsuid;
> + mnt->mnt_uid = owner;
>   mnt->mnt_flags |= MNT_USER;
> 
>   if (!capable(CAP_SETUID))
> @@ -514,7 +514,7 @@ static void __set_mnt_user(struct vfsmou
> 
>  static void set_mnt_user(struct vfsmount *mnt)
>  {
> - __set_mnt_user(mnt);
> + __set_mnt_user(mnt, current->fsuid);
>   spin_lock(&vfsmount_lock);
>   nr_user_mounts++;
>   spin_unlock(&vfsmount_lock);
> @@ -530,7 +530,7 @@ static void clear_mnt_user(struct vfsmou
>  }
> 
>  static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
> - int flag)
> + int flag, uid_t owner)
>  {
>   struct super_block *sb = old->mnt_sb;
>   struct vfsmount *mnt;
> @@ -554,7 +554,10 @@ static struct vfsmount *clone_mnt(struct
>   /* don't copy the MNT_USER flag */
>   mnt->mnt_flags &= ~MNT_USER;
>   if (flag & CL_SETUSER)
> - __set_mnt_user(mnt);
> + __set_mnt_user(mnt, owner);
> +
> + if (flag & CL_NOSUID)
> + mnt->mnt_flags |= MNT_NOSUID;
> 
>   if (flag & CL_SLAVE) {
>   list_add(&mnt->mnt_slave, &old->mnt_slave_list);
> @@ -1060,7 +1063,7 @@ static int lives_below_in_same_fs(struct
>  }
> 
>  struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
> - int flag)
> + int flag, uid_t owner)
>  {
>   struct vfsmount *res, *p, *q, *r, *s;
>   struct nameidata nd;
> @@ -1068,7 +1071,7 @@ struct vfsmount *copy_tree(struct vfsmou
>   if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt))
>   return ERR_PTR(-EPERM);
> 
> - res = q = clone_mnt(mnt, dentry, flag);
> + res = q = clone_mnt(mnt, dentry, flag, owner);
>   if (IS_ERR(q))
>   goto error;
>   q->mnt_mountpoint = mnt->mnt_mountpoint;
> @@ -1090,7 +1093,7 @@ struct vfsmount *copy_tree(struct vfsmou
>   p = s;
>   nd.path.mnt = q;
>   nd.path.dentry = p->mnt_mountpoint;
> - q = clone_mnt(p, p->mnt_root, flag);
> + q = clone_mnt(p, p->mnt_root, flag, owner);
>   if (IS_ERR(q))
>   goto error;
>   spin_lock(&vfsmount_lock);
> @@ -1115,7 +1118,7 @@ struct vfsmount *collect_mounts(struct v
>  {
>   struct vfsmount *tree;
>   down_read(&namespace_sem);
> - tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE);
> + tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE, 0);
>   up_read(&namespace_sem);
>   return tree;
>  }
> @@ -1286,7 +1289,8 @@ static int do_change_type(struct nameida
>   */
>  static int do_loopback(struct nameidata *nd, char *old_name, int flags)
>  {
> - int clone_fl;
> + int clone_fl = 0;
> + uid_t owner = 0;
>   struct nameidata old_nd;
>   struct vfsmount *mnt = NULL;
>   int err;
> @@ -1307,11 +1311,17 @@ static int do_loopback(struct nameidata 
>   if (!check_mnt(nd->path.mnt) || !check_mnt(old_nd.path.mnt))
>   goto out;
> 
> - clone_fl = (flags & MS_SETUSER) ? CL_SETUSER : 0;
> + if (flags & MS_SETUSER) {
> + clone_fl |= CL_SETUSER;
> + owner = current->fsuid;
> + }
> +
>   if (flags & MS_REC)
> - mnt = copy_tree(old_nd.path.mnt, old_nd.path.dentry, clone_fl);
> + mnt = copy_tree(old_nd.path.mnt, old_nd.path.dentry, clone_fl,
> + owner);
>   else
> - mnt = clone_mnt(old_nd.path.mnt, old_nd.path.dentry, clone_fl);
> + mnt = clone_mnt(old_nd.path.mnt, old_nd.path.dentry, clone_fl,
> + owner);
> 
>   err = PTR_ERR(mnt);
>   if (IS_ERR(mnt))
> @@ -1535,7 +1545,7 @@ static int do_new_mount(struct nameidata
>   

Re: [patch 6/9] unprivileged mounts: allow unprivileged mounts

2008-01-14 Thread Serge E. Hallyn
Quoting Miklos Szeredi ([EMAIL PROTECTED]):
> From: Miklos Szeredi <[EMAIL PROTECTED]>
> 
> Define a new fs flag FS_SAFE, which denotes, that unprivileged mounting of
> this filesystem may not constitute a security problem.
> 
> Since most filesystems haven't been designed with unprivileged mounting in
> mind, a thorough audit is needed before setting this flag.
> 
> For "safe" filesystems also allow unprivileged forced unmounting.
> 
> Move subtype handling from do_kern_mount() into do_new_mount().  All
> other callers are kernel-internal and do not need subtype support.
> 
> Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]>

This patch itself doesn't assign FS_SAFE to any filesystems, so
presuming that there is such a thing as an fs safe for users to
mount, and/or users sign their systems away through a sysctl,
this patch in itself appears right.

Acked-by: Serge Hallyn <[EMAIL PROTECTED]>

> ---
> 
> Index: linux/fs/namespace.c
> ===
> --- linux.orig/fs/namespace.c 2008-01-03 21:20:11.0 +0100
> +++ linux/fs/namespace.c  2008-01-03 21:21:06.0 +0100
> @@ -960,14 +960,16 @@ static bool is_mount_owner(struct vfsmou
>  /*
>   * umount is permitted for
>   *  - sysadmin
> - *  - mount owner, if not forced umount
> + *  - mount owner
> + *o if not forced umount,
> + *o if forced umount, and filesystem is "safe"
>   */
>  static bool permit_umount(struct vfsmount *mnt, int flags)
>  {
>   if (capable(CAP_SYS_ADMIN))
>   return true;
> 
> - if (flags & MNT_FORCE)
> + if ((flags & MNT_FORCE) && !(mnt->mnt_sb->s_type->fs_flags & FS_SAFE))
>   return false;
> 
>   return is_mount_owner(mnt, current->fsuid);
> @@ -1025,13 +1027,17 @@ asmlinkage long sys_oldumount(char __use
>   * - mountpoint is not a symlink
>   * - mountpoint is in a mount owned by the user
>   */
> -static bool permit_mount(struct nameidata *nd, int *flags)
> +static bool permit_mount(struct nameidata *nd, struct file_system_type *type,
> +  int *flags)
>  {
>   struct inode *inode = nd->path.dentry->d_inode;
> 
>   if (capable(CAP_SYS_ADMIN))
>   return true;
> 
> + if (type && !(type->fs_flags & FS_SAFE))
> + return false;
> +
>   if (S_ISLNK(inode->i_mode))
>   return false;
> 
> @@ -1285,7 +1291,7 @@ static int do_loopback(struct nameidata 
>   struct vfsmount *mnt = NULL;
>   int err;
> 
> - if (!permit_mount(nd, &flags))
> + if (!permit_mount(nd, NULL, &flags))
>   return -EPERM;
>   if (!old_name || !*old_name)
>   return -EINVAL;
> @@ -1466,30 +1472,76 @@ out:
>   return err;
>  }
> 
> +static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char 
> *fstype)
> +{
> + int err;
> + const char *subtype = strchr(fstype, '.');
> + if (subtype) {
> + subtype++;
> + err = -EINVAL;
> + if (!subtype[0])
> + goto err;
> + } else
> + subtype = "";
> +
> + mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
> + err = -ENOMEM;
> + if (!mnt->mnt_sb->s_subtype)
> + goto err;
> + return mnt;
> +
> + err:
> + mntput(mnt);
> + return ERR_PTR(err);
> +}
> +
>  /*
>   * create a new mount for userspace and request it to be added into the
>   * namespace's tree
>   */
> -static int do_new_mount(struct nameidata *nd, char *type, int flags,
> +static int do_new_mount(struct nameidata *nd, char *fstype, int flags,
>   int mnt_flags, char *name, void *data)
>  {
> + int err;
>   struct vfsmount *mnt;
> + struct file_system_type *type;
> 
> - if (!type || !memchr(type, 0, PAGE_SIZE))
> + if (!fstype || !memchr(fstype, 0, PAGE_SIZE))
>   return -EINVAL;
> 
> - /* we need capabilities... */
> - if (!capable(CAP_SYS_ADMIN))
> - return -EPERM;
> -
> - mnt = do_kern_mount(type, flags & ~MS_SETUSER, name, data);
> - if (IS_ERR(mnt))
> + type = get_fs_type(fstype);
> + if (!type)
> + return -ENODEV;
> +
> + err = -EPERM;
> + if (!permit_mount(nd, type, &flags))
> + goto out_put_filesystem;
> +
> + if (flags & MS_SETUSER) {
> + err = reserve_user_mount();
> + if (err)
> + goto out_put_filesystem;
> + }
> +
> + mnt = vfs_kern_mount(type, flags & ~MS_SETUSER, name, data);
> + if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
> + !mnt->mnt_sb->s_subtype)
> + mnt = fs_set_subtype(mnt, fstype);
> + put_filesystem(type);
> + if (IS_ERR(mnt)) {
> + if (flags & MS_SETUSER)
> + dec_nr_user_mounts();
>   return PTR_ERR(mnt);
> + }
> 
>   if (flags & MS_SETUSER)
> - set_mnt_user(mnt);
> + __set_mnt_user(mnt);
> 
> 

Re: [patch 5/9] unprivileged mounts: allow unprivileged bind mounts

2008-01-14 Thread Serge E. Hallyn
Quoting Miklos Szeredi ([EMAIL PROTECTED]):
> From: Miklos Szeredi <[EMAIL PROTECTED]>
> 
> Allow bind mounts to unprivileged users if the following conditions are met:
> 
>   - mountpoint is not a symlink
>   - parent mount is owned by the user
>   - the number of user mounts is below the maximum
> 
> Unprivileged mounts imply MS_SETUSER, and will also have the "nosuid" and
> "nodev" mount flags set.
> 
> In particular, if mounting process doesn't have CAP_SETUID capability,
> then the "nosuid" flag will be added, and if it doesn't have CAP_MKNOD
> capability, then the "nodev" flag will be added.
> 
> Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]>

Acked-by: Serge Hallyn <[EMAIL PROTECTED]>

> ---
> 
> Index: linux/fs/namespace.c
> ===
> --- linux.orig/fs/namespace.c 2008-01-04 13:47:49.0 +0100
> +++ linux/fs/namespace.c  2008-01-04 13:48:01.0 +0100
> @@ -487,11 +487,34 @@ static void dec_nr_user_mounts(void)
>   spin_unlock(&vfsmount_lock);
>  }
> 
> -static void set_mnt_user(struct vfsmount *mnt)
> +static int reserve_user_mount(void)
> +{
> + int err = 0;
> +
> + spin_lock(&vfsmount_lock);
> + if (nr_user_mounts >= max_user_mounts && !capable(CAP_SYS_ADMIN))
> + err = -EPERM;
> + else
> + nr_user_mounts++;
> + spin_unlock(&vfsmount_lock);
> + return err;
> +}
> +
> +static void __set_mnt_user(struct vfsmount *mnt)
>  {
>   BUG_ON(mnt->mnt_flags & MNT_USER);
>   mnt->mnt_uid = current->fsuid;
>   mnt->mnt_flags |= MNT_USER;
> +
> + if (!capable(CAP_SETUID))
> + mnt->mnt_flags |= MNT_NOSUID;
> + if (!capable(CAP_MKNOD))
> + mnt->mnt_flags |= MNT_NODEV;
> +}
> +
> +static void set_mnt_user(struct vfsmount *mnt)
> +{
> + __set_mnt_user(mnt);
>   spin_lock(&vfsmount_lock);
>   nr_user_mounts++;
>   spin_unlock(&vfsmount_lock);
> @@ -510,10 +533,16 @@ static struct vfsmount *clone_mnt(struct
>   int flag)
>  {
>   struct super_block *sb = old->mnt_sb;
> - struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);
> + struct vfsmount *mnt;
> 
> + if (flag & CL_SETUSER) {
> + int err = reserve_user_mount();
> + if (err)
> + return ERR_PTR(err);
> + }
> + mnt = alloc_vfsmnt(old->mnt_devname);
>   if (!mnt)
> - return ERR_PTR(-ENOMEM);
> + goto alloc_failed;
> 
>   mnt->mnt_flags = old->mnt_flags;
>   atomic_inc(&sb->s_active);
> @@ -525,7 +554,7 @@ static struct vfsmount *clone_mnt(struct
>   /* don't copy the MNT_USER flag */
>   mnt->mnt_flags &= ~MNT_USER;
>   if (flag & CL_SETUSER)
> - set_mnt_user(mnt);
> + __set_mnt_user(mnt);
> 
>   if (flag & CL_SLAVE) {
>   list_add(&mnt->mnt_slave, &old->mnt_slave_list);
> @@ -550,6 +579,11 @@ static struct vfsmount *clone_mnt(struct
>   spin_unlock(&vfsmount_lock);
>   }
>   return mnt;
> +
> + alloc_failed:
> + if (flag & CL_SETUSER)
> + dec_nr_user_mounts();
> + return ERR_PTR(-ENOMEM);
>  }
> 
>  static inline void __mntput(struct vfsmount *mnt)
> @@ -986,22 +1020,26 @@ asmlinkage long sys_oldumount(char __use
> 
>  #endif
> 
> -static int mount_is_safe(struct nameidata *nd)
> +/*
> + * Conditions for unprivileged mounts are:
> + * - mountpoint is not a symlink
> + * - mountpoint is in a mount owned by the user
> + */
> +static bool permit_mount(struct nameidata *nd, int *flags)
>  {
> + struct inode *inode = nd->path.dentry->d_inode;
> +
>   if (capable(CAP_SYS_ADMIN))
> - return 0;
> - return -EPERM;
> -#ifdef notyet
> - if (S_ISLNK(nd->path.dentry->d_inode->i_mode))
> - return -EPERM;
> - if (nd->path.dentry->d_inode->i_mode & S_ISVTX) {
> - if (current->uid != nd->path.dentry->d_inode->i_uid)
> - return -EPERM;
> - }
> - if (vfs_permission(nd, MAY_WRITE))
> - return -EPERM;
> - return 0;
> -#endif
> + return true;
> +
> + if (S_ISLNK(inode->i_mode))
> + return false;
> +
> + if (!is_mount_owner(nd->path.mnt, current->fsuid))
> + return false;
> +
> + *flags |= MS_SETUSER;
> + return true;
>  }
> 
>  static int lives_below_in_same_fs(struct dentry *d, struct dentry *dentry)
> @@ -1245,9 +1283,10 @@ static int do_loopback(struct nameidata 
>   int clone_fl;
>   struct nameidata old_nd;
>   struct vfsmount *mnt = NULL;
> - int err = mount_is_safe(nd);
> - if (err)
> - return err;
> + int err;
> +
> + if (!permit_mount(nd, &flags))
> + return -EPERM;
>   if (!old_name || !*old_name)
>   return -EINVAL;
>   err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
> 
> --
-
To unsubscribe from this list: send the line "unsubscribe 

Re: [patch 4/9] unprivileged mounts: propagate error values from clone_mnt

2008-01-14 Thread Serge E. Hallyn
Quoting Miklos Szeredi ([EMAIL PROTECTED]):
> From: Miklos Szeredi <[EMAIL PROTECTED]>
> 
> Allow clone_mnt() to return errors other than ENOMEM.  This will be used for
> returning a different error value when the number of user mounts goes over the
> limit.
> 
> Fix copy_tree() to return EPERM for unbindable mounts.
> 
> Don't propagate further from dup_mnt_ns() as that copy_tree() can only fail
> with -ENOMEM.

I see what you're saying, but it just seems like it's bound to be more
confusing this way.

What's the reason to insist on doing this?  To force people to think
about it as a form of documentation?

Still,

> Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]>

Acked-by: Serge Hallyn <[EMAIL PROTECTED]>

> ---
> 
> Index: linux/fs/namespace.c
> ===
> --- linux.orig/fs/namespace.c 2008-01-04 13:47:09.0 +0100
> +++ linux/fs/namespace.c  2008-01-04 13:47:49.0 +0100
> @@ -512,41 +512,42 @@ static struct vfsmount *clone_mnt(struct
>   struct super_block *sb = old->mnt_sb;
>   struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);
> 
> - if (mnt) {
> - mnt->mnt_flags = old->mnt_flags;
> - atomic_inc(&sb->s_active);
> - mnt->mnt_sb = sb;
> - mnt->mnt_root = dget(root);
> - mnt->mnt_mountpoint = mnt->mnt_root;
> - mnt->mnt_parent = mnt;
> -
> - /* don't copy the MNT_USER flag */
> - mnt->mnt_flags &= ~MNT_USER;
> - if (flag & CL_SETUSER)
> - set_mnt_user(mnt);
> -
> - if (flag & CL_SLAVE) {
> - list_add(&mnt->mnt_slave, &old->mnt_slave_list);
> - mnt->mnt_master = old;
> - CLEAR_MNT_SHARED(mnt);
> - } else if (!(flag & CL_PRIVATE)) {
> - if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old))
> - list_add(&mnt->mnt_share, &old->mnt_share);
> - if (IS_MNT_SLAVE(old))
> - list_add(&mnt->mnt_slave, &old->mnt_slave);
> - mnt->mnt_master = old->mnt_master;
> - }
> - if (flag & CL_MAKE_SHARED)
> - set_mnt_shared(mnt);
> + if (!mnt)
> + return ERR_PTR(-ENOMEM);
> 
> - /* stick the duplicate mount on the same expiry list
> -  * as the original if that was on one */
> - if (flag & CL_EXPIRE) {
> - spin_lock(&vfsmount_lock);
> - if (!list_empty(&old->mnt_expire))
> - list_add(&mnt->mnt_expire, &old->mnt_expire);
> - spin_unlock(&vfsmount_lock);
> - }
> + mnt->mnt_flags = old->mnt_flags;
> + atomic_inc(&sb->s_active);
> + mnt->mnt_sb = sb;
> + mnt->mnt_root = dget(root);
> + mnt->mnt_mountpoint = mnt->mnt_root;
> + mnt->mnt_parent = mnt;
> +
> + /* don't copy the MNT_USER flag */
> + mnt->mnt_flags &= ~MNT_USER;
> + if (flag & CL_SETUSER)
> + set_mnt_user(mnt);
> +
> + if (flag & CL_SLAVE) {
> + list_add(&mnt->mnt_slave, &old->mnt_slave_list);
> + mnt->mnt_master = old;
> + CLEAR_MNT_SHARED(mnt);
> + } else if (!(flag & CL_PRIVATE)) {
> + if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old))
> + list_add(&mnt->mnt_share, &old->mnt_share);
> + if (IS_MNT_SLAVE(old))
> + list_add(&mnt->mnt_slave, &old->mnt_slave);
> + mnt->mnt_master = old->mnt_master;
> + }
> + if (flag & CL_MAKE_SHARED)
> + set_mnt_shared(mnt);
> +
> + /* stick the duplicate mount on the same expiry list
> +  * as the original if that was on one */
> + if (flag & CL_EXPIRE) {
> + spin_lock(&vfsmount_lock);
> + if (!list_empty(&old->mnt_expire))
> + list_add(&mnt->mnt_expire, &old->mnt_expire);
> + spin_unlock(&vfsmount_lock);
>   }
>   return mnt;
>  }
> @@ -1021,11 +1022,11 @@ struct vfsmount *copy_tree(struct vfsmou
>   struct nameidata nd;
> 
>   if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt))
> - return NULL;
> + return ERR_PTR(-EPERM);
> 
>   res = q = clone_mnt(mnt, dentry, flag);
> - if (!q)
> - goto Enomem;
> + if (IS_ERR(q))
> + goto error;
>   q->mnt_mountpoint = mnt->mnt_mountpoint;
> 
>   p = mnt;
> @@ -1046,8 +1047,8 @@ struct vfsmount *copy_tree(struct vfsmou
>   nd.path.mnt = q;
>   nd.path.dentry = p->mnt_mountpoint;
>   q = clone_mnt(p, p->mnt_root, flag);
> - if (!q)
> - goto Enomem;
> + if (IS_ERR(q))
> + goto error;
>   

Re: [linux-cifs-client] review 4, was Re: projected date for mount.cifs to support DFS junction points

2008-01-14 Thread Christoph Hellwig
On Mon, Jan 14, 2008 at 04:15:05PM +0300, Q (Igor Mammedov) wrote:
> > +   dput(nd->dentry);
> > +   nd->dentry = dget(dentry);
> > +   if (d_mountpoint(nd->dentry))
> > +   goto out_follow;
> > 
> > A link should never be a mountpoint.
> 
> why link? after patch 5 are applied DFS junction point becomes directory
> instead of link. That is what has been done in NFS code.

True, this is the ->follow_link on directory hack.

> Actually I copy & pasted it from NFS code ...
> fs/nfs/namespace.c:nfs_follow_mountpoint
> 
> I've tried to do submount/referral machinery in NFS code way.

Okay, and that came from afs.  I really think we should take large
parts of this back into the VFS because it's just to fragile to
be duplicated in various filesystems.  Probably not a blocker for
your patch but I'll keep an eye on consolidating this back into
the core.

-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 3/9] unprivileged mounts: account user mounts

2008-01-14 Thread Serge E. Hallyn
Quoting Miklos Szeredi ([EMAIL PROTECTED]):
> From: Miklos Szeredi <[EMAIL PROTECTED]>
> 
> Add sysctl variables for accounting and limiting the number of user
> mounts.
> 
> The maximum number of user mounts is set to 1024 by default.  This
> won't in itself enable user mounts, setting a mount to be owned by a
> user is first needed
> 
> [akpm]
>  - don't use enumerated sysctls
> 
> Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]>

Seems sane enough, given your responses to Dave.

Acked-by: Serge Hallyn <[EMAIL PROTECTED]>

> ---
> 
> Index: linux/Documentation/filesystems/proc.txt
> ===
> --- linux.orig/Documentation/filesystems/proc.txt 2008-01-03 
> 17:12:58.0 +0100
> +++ linux/Documentation/filesystems/proc.txt  2008-01-03 21:15:35.0 
> +0100
> @@ -1012,6 +1012,15 @@ reaches aio-max-nr then io_setup will fa
>  raising aio-max-nr does not result in the pre-allocation or re-sizing
>  of any kernel data structures.
> 
> +nr_user_mounts and max_user_mounts
> +--
> +
> +These represent the number of "user" mounts and the maximum number of
> +"user" mounts respectively.  User mounts may be created by
> +unprivileged users.  User mounts may also be created with sysadmin
> +privileges on behalf of a user, in which case nr_user_mounts may
> +exceed max_user_mounts.
> +
>  2.2 /proc/sys/fs/binfmt_misc - Miscellaneous binary formats
>  ---
> 
> Index: linux/fs/namespace.c
> ===
> --- linux.orig/fs/namespace.c 2008-01-03 21:14:16.0 +0100
> +++ linux/fs/namespace.c  2008-01-03 21:15:35.0 +0100
> @@ -44,6 +44,9 @@ static struct list_head *mount_hashtable
>  static struct kmem_cache *mnt_cache __read_mostly;
>  static struct rw_semaphore namespace_sem;
> 
> +int nr_user_mounts;
> +int max_user_mounts = 1024;
> +
>  /* /sys/fs */
>  struct kobject *fs_kobj;
>  EXPORT_SYMBOL_GPL(fs_kobj);
> @@ -477,11 +480,30 @@ static struct vfsmount *skip_mnt_tree(st
>   return p;
>  }
> 
> +static void dec_nr_user_mounts(void)
> +{
> + spin_lock(&vfsmount_lock);
> + nr_user_mounts--;
> + spin_unlock(&vfsmount_lock);
> +}
> +
>  static void set_mnt_user(struct vfsmount *mnt)
>  {
>   BUG_ON(mnt->mnt_flags & MNT_USER);
>   mnt->mnt_uid = current->fsuid;
>   mnt->mnt_flags |= MNT_USER;
> + spin_lock(&vfsmount_lock);
> + nr_user_mounts++;
> + spin_unlock(&vfsmount_lock);
> +}
> +
> +static void clear_mnt_user(struct vfsmount *mnt)
> +{
> + if (mnt->mnt_flags & MNT_USER) {
> + mnt->mnt_uid = 0;
> + mnt->mnt_flags &= ~MNT_USER;
> + dec_nr_user_mounts();
> + }
>  }
> 
>  static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
> @@ -542,6 +564,7 @@ static inline void __mntput(struct vfsmo
>*/
>   WARN_ON(atomic_read(&mnt->__mnt_writers));
>   dput(mnt->mnt_root);
> + clear_mnt_user(mnt);
>   free_vfsmnt(mnt);
>   deactivate_super(sb);
>  }
> @@ -1306,6 +1329,7 @@ static int do_remount(struct nameidata *
>   else
>   err = do_remount_sb(sb, flags, data, 0);
>   if (!err) {
> + clear_mnt_user(nd->path.mnt);
>   nd->path.mnt->mnt_flags = mnt_flags;
>   if (flags & MS_SETUSER)
>   set_mnt_user(nd->path.mnt);
> Index: linux/include/linux/fs.h
> ===
> --- linux.orig/include/linux/fs.h 2008-01-03 20:52:38.0 +0100
> +++ linux/include/linux/fs.h  2008-01-03 21:15:35.0 +0100
> @@ -50,6 +50,9 @@ extern struct inodes_stat_t inodes_stat;
> 
>  extern int leases_enable, lease_break_time;
> 
> +extern int nr_user_mounts;
> +extern int max_user_mounts;
> +
>  #ifdef CONFIG_DNOTIFY
>  extern int dir_notify_enable;
>  #endif
> Index: linux/kernel/sysctl.c
> ===
> --- linux.orig/kernel/sysctl.c2008-01-03 17:13:22.0 +0100
> +++ linux/kernel/sysctl.c 2008-01-03 21:15:35.0 +0100
> @@ -1288,6 +1288,22 @@ static struct ctl_table fs_table[] = {
>  #endif   
>  #endif
>   {
> + .ctl_name   = CTL_UNNUMBERED,
> + .procname   = "nr_user_mounts",
> + .data   = &nr_user_mounts,
> + .maxlen = sizeof(int),
> + .mode   = 0444,
> + .proc_handler   = &proc_dointvec,
> + },
> + {
> + .ctl_name   = CTL_UNNUMBERED,
> + .procname   = "max_user_mounts",
> + .data   = &max_user_mounts,
> + .maxlen = sizeof(int),
> + .mode   = 0644,
> + .proc_handler   = &proc_dointvec,
> + },
> + {
>   .ctl_name   = KERN_SET

Re: [patch 2/9] unprivileged mounts: allow unprivileged umount

2008-01-14 Thread Serge E. Hallyn
Quoting Miklos Szeredi ([EMAIL PROTECTED]):
> From: Miklos Szeredi <[EMAIL PROTECTED]>
> 
> The owner doesn't need sysadmin capabilities to call umount().
> 
> Similar behavior as umount(8) on mounts having "user=UID" option in /etc/mtab.
> The difference is that umount also checks /etc/fstab, presumably to exclude
> another mount on the same mountpoint.
> 
> Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]>

Acked-by: Serge Hallyn <[EMAIL PROTECTED]>

> ---
> 
> Index: linux/fs/namespace.c
> ===
> --- linux.orig/fs/namespace.c 2008-01-03 20:52:38.0 +0100
> +++ linux/fs/namespace.c  2008-01-03 21:14:16.0 +0100
> @@ -894,6 +894,27 @@ static int do_umount(struct vfsmount *mn
>   return retval;
>  }
> 
> +static bool is_mount_owner(struct vfsmount *mnt, uid_t uid)
> +{
> + return (mnt->mnt_flags & MNT_USER) && mnt->mnt_uid == uid;
> +}
> +
> +/*
> + * umount is permitted for
> + *  - sysadmin
> + *  - mount owner, if not forced umount
> + */
> +static bool permit_umount(struct vfsmount *mnt, int flags)
> +{
> + if (capable(CAP_SYS_ADMIN))
> + return true;
> +
> + if (flags & MNT_FORCE)
> + return false;
> +
> + return is_mount_owner(mnt, current->fsuid);
> +}
> +
>  /*
>   * Now umount can handle mount points as well as block devices.
>   * This is important for filesystems which use unnamed block devices.
> @@ -917,7 +938,7 @@ asmlinkage long sys_umount(char __user *
>   goto dput_and_out;
> 
>   retval = -EPERM;
> - if (!capable(CAP_SYS_ADMIN))
> + if (!permit_umount(nd.path.mnt, flags))
>   goto dput_and_out;
> 
>   retval = do_umount(nd.path.mnt, flags);
> 
> --
-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 1/9] unprivileged mounts: add user mounts to the kernel

2008-01-14 Thread Serge E. Hallyn
Quoting Miklos Szeredi ([EMAIL PROTECTED]):
> From: Miklos Szeredi <[EMAIL PROTECTED]>
> 
> This patchset adds support for keeping mount ownership information in the
> kernel, and allow unprivileged mount(2) and umount(2) in certain cases.
> 
> The mount owner has the following privileges:
> 
>   - unmount the owned mount
>   - create a submount under the owned mount
> 
> The sysadmin can set the owner explicitly on mount and remount.  When an
> unprivileged user creates a mount, then the owner is automatically set to the
> user.
> 
> The following use cases are envisioned:
> 
> 1) Private namespace, with selected mounts owned by user.  E.g.
>/home/$USER is a good candidate for allowing unpriv mounts and unmounts
>within.
> 
> 2) Private namespace, with all mounts owned by user and having the "nosuid"
>flag.  User can mount and umount anywhere within the namespace, but suid
>programs will not work.
> 
> 3) Global namespace, with a designated directory, which is a mount owned by
>the user.  E.g.  /mnt/users/$USER is set up so that it is bind mounted onto
>itself, and set to be owned by $USER.  The user can add/remove mounts only
>under this directory.
> 
> The following extra security measures are taken for unprivileged mounts:
> 
>  - usermounts are limited by a sysctl tunable
>  - force "nosuid,nodev" mount options on the created mount
> 
> For testing unprivileged mounts (and for other purposes) simple
> mount/umount utilities are available from:
> 
>   http://www.kernel.org/pub/linux/kernel/people/mszeredi/mmount/
> 
> After this series I'll be posting a preliminary patch for util-linux-ng,
> to add the same functionality to mount(8) and umount(8).
> 
> This patch:
> 
> A new mount flag, MS_SETUSER is used to make a mount owned by a user.  If this
> flag is specified, then the owner will be set to the current fsuid and the
> mount will be marked with the MNT_USER flag.  On remount don't preserve
> previous owner, and treat MS_SETUSER as for a new mount.  The MS_SETUSER flag
> is ignored on mount move.
> 
> The MNT_USER flag is not copied on any kind of mount cloning: namespace
> creation, binding or propagation.  For bind mounts the cloned mount(s) are set
> to MNT_USER depending on the MS_SETUSER mount flag.  In all the other cases
> MNT_USER is always cleared.
> 
> For MNT_USER mounts a "user=UID" option is added to /proc/PID/mounts.  This is
> compatible with how mount ownership is stored in /etc/mtab.
> 
> The rationale for using MS_SETUSER and MNT_USER, to distinguish "user"
> mounts from "non-user" or "legacy" mounts are follows:
> 
>   a) Mount(2) and umount(2) on legacy mounts always need CAP_SYS_ADMIN
>  capability.  As opposed to user mounts, which will only require,
>  that the mount owner matches the current fsuid.  So a process
>  with fsuid=0 should not be able to mount/umount legacy mounts
>  without the CAP_SYS_ADMIN capability.
> 
>   b) Legacy userspace programs may set fsuid to nonzero before calling
>  mount(2).  In such an unlikely case, this patchset would cause
>  an unintended side effect of making the mount owned by the fsuid.
> 
>   c) For legacy mounts, no "user=UID" option should be shown in
>  /proc/mounts for backwards compatibility.
> 
> Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]>

This looks good to me.

Acked-by: Serge Hallyn <[EMAIL PROTECTED]>

thanks,
-serge

> ---
> 
> Index: linux/fs/namespace.c
> ===
> --- linux.orig/fs/namespace.c 2008-01-03 22:10:10.0 +0100
> +++ linux/fs/namespace.c  2008-01-04 13:46:33.0 +0100
> @@ -477,6 +477,13 @@ static struct vfsmount *skip_mnt_tree(st
>   return p;
>  }
> 
> +static void set_mnt_user(struct vfsmount *mnt)
> +{
> + BUG_ON(mnt->mnt_flags & MNT_USER);
> + mnt->mnt_uid = current->fsuid;
> + mnt->mnt_flags |= MNT_USER;
> +}
> +
>  static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
>   int flag)
>  {
> @@ -491,6 +498,11 @@ static struct vfsmount *clone_mnt(struct
>   mnt->mnt_mountpoint = mnt->mnt_root;
>   mnt->mnt_parent = mnt;
> 
> + /* don't copy the MNT_USER flag */
> + mnt->mnt_flags &= ~MNT_USER;
> + if (flag & CL_SETUSER)
> + set_mnt_user(mnt);
> +
>   if (flag & CL_SLAVE) {
>   list_add(&mnt->mnt_slave, &old->mnt_slave_list);
>   mnt->mnt_master = old;
> @@ -644,6 +656,8 @@ static int show_vfsmnt(struct seq_file *
>   if (mnt->mnt_flags & fs_infop->flag)
>   seq_puts(m, fs_infop->str);
>   }
> + if (mnt->mnt_flags & MNT_USER)
> + seq_printf(m, ",user=%i", mnt->mnt_uid);
>   if (mnt->mnt_sb->s_op->show_options)
>   err = mnt->mnt_sb->s_op->show_options(m, mnt);
>   seq_puts(m, " 0 0\n");
> @@ -1181,8 

Re: Leak in nlmsvc_testlock for async GETFL case

2008-01-14 Thread J. Bruce Fields
On Fri, Jan 11, 2008 at 09:57:35PM -0500, Oleg Drokin wrote:
> Hello!
>
> On Nov 29, 2007, at 2:08 PM, J. Bruce Fields wrote:
>
>> On Thu, Nov 29, 2007 at 01:46:04PM -0500, Oleg Drokin wrote:
>>> Hello!
>>>
>>>   Per our discussion, I am resending this patch that fixes a leak in
>>>   nlmsvc_testlock.  It is addition to another leak fixing patch you
>>>   already have.  Without the patch, there is a leakage of nlmblock
>>>   structure refcount that holds a reference nlmfile structure, that
>>>   holds a reference to struct file, when async GETFL is used
>>>   (-EINPROGRESS return from file_ops->lock()), and also in some error
>>>   cases
>> Thanks for the fix!  Looks right to me.  Yes, somehow I missed this  
>> one
>> when you sent it privately.  Applied and pushed out to
>>  git://linux-nfs.org/~bfields/linux.git nfs-server-stable
>> and I'll submit it for 2.6.25.
>
> After playing around that code a bit more, I figured out the leak was  
> not
> completely fixed by that first patch, the case where there is  
> conflicting
> lock passed in by callback still leaks block reference.
> This simple incremental fix (against your current tree) takes care of  
> that.
>
> Signed-off-by: Oleg Drokin <[EMAIL PROTECTED]>

Thanks!  I've queued it up for 2.6.25.

--b.
-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: lockdep warning with LTP dio test (v2.6.24-rc6-125-g5356f66)

2008-01-14 Thread Chris Mason
On Mon, 14 Jan 2008 18:06:09 +0100
Jan Kara <[EMAIL PROTECTED]> wrote:

> On Wed 02-01-08 12:42:19, Zach Brown wrote:
> > Erez Zadok wrote:
> > > Setting: ltp-full-20071031, dio01 test on ext3 with Linus's
> > > latest tree. Kernel w/ SMP, preemption, and lockdep configured.
> > 
> > This is a real lock ordering problem.  Thanks for reporting it.
> > 
> > The updating of atime inside sys_mmap() orders the mmap_sem in the
> > vfs outside of the journal handle in ext3's inode dirtying:
> > 

[ lock inversion traces ]
 
> > Two fixes come to mind:
> > 
> > 1) use something like Peter's ->mmap_prepare() to update atime
> > before acquiring the mmap_sem.
> > ( http://lkml.org/lkml/2007/11/11/97 ).  I don't know if this would
> > leave more paths which do a journal_start() while holding the
> > mmap_sem.
> > 
> > 2) rework ext3's dio to only hold the jbd handle in
> > ext3_get_block(). Chris has a patch for this kicking around
> > somewhere but I'm told it has problems exposing old blocks in
> > ordered data mode.
> > 
> > Does anyone have preferences?  I could go either way.  I certainly
> > don't like the idea of journal handles being held across the
> > entirety of fs/direct-io.c.  It's yet another case of O_DIRECT
> > differing wildly from the buffered path :(.
>   I've looked more into it and I think that 2) is the only way to go
> since transaction start ranks below page lock (standard buffered
> write path) and page lock ranks below mmap_sem. So we have at least
> one more dependency mmap_sem must go before transaction start...

Just to clarify a little bit:

If ext3's DIO code only touches transactions in get_block, then it can
violate data=ordered rules.  Basically the transaction that allocates
the blocks might commit before the DIO code gets around to writing them.

A crash in the wrong place will expose stale data on disk.

-chris
-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH][RFC] fast file mapping for loop

2008-01-14 Thread Jens Axboe
On Mon, Jan 14 2008, Chris Mason wrote:
> Hello everyone,
> 
> Here is a modified version of Jens' patch.  The basic idea is to push
> the mapping maintenance out of loop and down into the filesystem (ext2
> in this case).
> 
> Two new address_space operations are added, one to map
> extents and the other to provide call backs into the FS as io is
> completed.
> 
> Still TODO for this patch:
> 
> * Add exclusion between filling holes and readers.  This is partly
> implemented, when a hole is filled by the FS, the extent is flagged as
> having a hole.  The idea is to check this flag before trying to read
> the blocks and just send back all zeros.
> 
> The flag would be cleared when the blocks filling the hole have been
> written.
> 
> * Exclude page cache readers and writers
> 
> * Add a way for the FS to request a commit before telling the higher
> layers the IO is complete.  This way we can make sure metadata related
> to holes is on disk before claiming the IO is really done.  COW based
> filesystems will also needed it.
> 
> * Change loop to use fast mapping only when the new address_space op is
> provided (whoops, forgot about this until just now)
> 
> * A few other bits for COW, not really relevant until there
> is...something COW using it.

Looks pretty good. Essentially the loop side is 100% the same, it just
offloads the extent ownership to the fs (where it belongs). So I like
it. Attaching a small cleanup/fixup patch for loop, don't think it needs
further explanations.

One suggestion - free_extent_map(), I would call that put_extent_map()
instead.

diff -u b/drivers/block/loop.c b/drivers/block/loop.c
--- b/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -677,13 +677,14 @@
if (IS_ERR(lfe))
return -EIO;
 
-   while(!lfe) {
+   while (!lfe) {
loop_schedule_extent_mapping(lo, bio->bi_sector,
 bio->bi_size, 1);
lfe = loop_lookup_extent(lo, start, GFP_ATOMIC);
if (IS_ERR(lfe))
return -EIO;
}
+
/*
 * handle sparse io
 */
@@ -802,13 +803,13 @@
 {
struct bio *orig_bio = bio->bi_private;
struct inode *inode = bio->bi_bdev->bd_inode;
+   struct address_space *mapping = inode->i_mapping;
u64 start = orig_bio->bi_sector << 9;
u64 len = bio->bi_size;
 
-   if (inode->i_mapping->a_ops->extent_io_complete) {
-   inode->i_mapping->a_ops->extent_io_complete(inode->i_mapping,
-   start, len);
-   }
+   if (mapping->a_ops->extent_io_complete)
+   mapping->a_ops->extent_io_complete(mapping, start, len);
+
bio_put(bio);
bio_endio(orig_bio, err);
 }
@@ -829,6 +830,7 @@
err = -ENOMEM;
goto out;
}
+
/*
 * change the sector so we can find the correct file offset in our
 * endio
@@ -847,7 +849,6 @@
goto out;;
}
 
-
disk_block = em->block_start;
extent_off = start - em->start;
new_bio->bi_sector = (disk_block + extent_off) >> 9;
@@ -924,11 +925,8 @@
spin_unlock_irq(&lo->lo_lock);
 
BUG_ON(!bio);
-   if (lo_act_bio(bio))
-   bio_act = 1;
-   else
-   bio_act = 0;
 
+   bio_act = lo_act_bio(bio);
loop_handle_bio(lo, bio);
 
spin_lock_irq(&lo->lo_lock);
@@ -1103,11 +1101,9 @@
return -EINVAL;
 
/*
-* Need a working bmap. TODO: use the same optimization that
-* direct-io.c does for get_block() mapping more than one block
-* at the time.
+* Need a working extent_map
 */
-   if (inode->i_mapping->a_ops->bmap == NULL)
+   if (inode->i_mapping->a_ops->map_extent == NULL)
return -EINVAL;
/*
 * invalidate all page cache belonging to this file, it could become
diff -u b/include/linux/loop.h b/include/linux/loop.h
--- b/include/linux/loop.h
+++ b/include/linux/loop.h
@@ -18,7 +18,6 @@
 #include 
 #include 
 #include 
-#include 
 
 /* Possible states of device */
 enum {
@@ -72,9 +71,6 @@
struct gendisk  *lo_disk;
struct list_headlo_list;
 
-   struct prio_tree_root   prio_root;
-   struct prio_tree_node   *last_insert;
-   struct prio_tree_node   *last_lookup;
unsigned intblkbits;
 };
 

-- 
Jens Axboe

-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch] rewrite rd

2008-01-14 Thread Jens Axboe
On Mon, Jan 14 2008, Matthew Wilcox wrote:
> On Tue, Dec 04, 2007 at 05:26:28AM +0100, Nick Piggin wrote:
> > +static void copy_to_brd(struct brd_device *brd, const void *src,
> > +   sector_t sector, size_t n)
> > +{
> > +   struct page *page;
> > +   void *dst;
> > +   unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
> > +   size_t copy;
> > +
> > +   copy = min((unsigned long)n, PAGE_SIZE - offset);
> > +   page = brd_lookup_page(brd, sector);
> > +   BUG_ON(!page);
> > +
> > +   dst = kmap_atomic(page, KM_USER1);
> > +   memcpy(dst + offset, src, copy);
> > +   kunmap_atomic(dst, KM_USER1);
> 
> You're using kmap_atomic, but I see no reason you can't be preempted.
> Don't you need to at least disable preemption while you have stuff
> atomically kmapped?

kmap_atomic() disables preemption through pagefault_disable().

-- 
Jens Axboe

-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH][RFC] fast file mapping for loop

2008-01-14 Thread Chris Mason
Hello everyone,

Here is a modified version of Jens' patch.  The basic idea is to push
the mapping maintenance out of loop and down into the filesystem (ext2
in this case).

Two new address_space operations are added, one to map
extents and the other to provide call backs into the FS as io is
completed.

Still TODO for this patch:

* Add exclusion between filling holes and readers.  This is partly
implemented, when a hole is filled by the FS, the extent is flagged as
having a hole.  The idea is to check this flag before trying to read
the blocks and just send back all zeros.

The flag would be cleared when the blocks filling the hole have been
written.

* Exclude page cache readers and writers

* Add a way for the FS to request a commit before telling the higher
layers the IO is complete.  This way we can make sure metadata related
to holes is on disk before claiming the IO is really done.  COW based
filesystems will also needed it.

* Change loop to use fast mapping only when the new address_space op is
provided (whoops, forgot about this until just now)

* A few other bits for COW, not really relevant until there
is...something COW using it.

-chris

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -76,6 +76,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -481,16 +482,51 @@ static int do_bio_filebacked(struct loop
return ret;
 }
 
+#define __lo_throttle(wq, lock, condition) \
+do {   \
+   DEFINE_WAIT(__wait);\
+   for (;;) {  \
+   prepare_to_wait((wq), &__wait, TASK_UNINTERRUPTIBLE);   \
+   if (condition)  \
+   break;  \
+   spin_unlock_irq((lock));\
+   io_schedule();  \
+   spin_lock_irq((lock));  \
+   }   \
+   finish_wait((wq), &__wait); \
+} while (0)\
+
+#define lo_act_bio(bio)((bio)->bi_bdev)
+#define LO_BIO_THROTTLE128
+
 /*
- * Add bio to back of pending list
+ * A normal block device will throttle on request allocation. Do the same
+ * for loop to prevent millions of bio's queued internally.
+ */
+static void loop_bio_throttle(struct loop_device *lo, struct bio *bio)
+{
+   if (lo_act_bio(bio))
+   __lo_throttle(&lo->lo_bio_wait, &lo->lo_lock,
+   lo->lo_bio_cnt < LO_BIO_THROTTLE);
+}
+
+/*
+ * Add bio to back of pending list and wakeup thread
  */
 static void loop_add_bio(struct loop_device *lo, struct bio *bio)
 {
+   loop_bio_throttle(lo, bio);
+
if (lo->lo_biotail) {
lo->lo_biotail->bi_next = bio;
lo->lo_biotail = bio;
} else
lo->lo_bio = lo->lo_biotail = bio;
+
+   if (lo_act_bio(bio))
+   lo->lo_bio_cnt++;
+
+   wake_up(&lo->lo_event);
 }
 
 /*
@@ -510,6 +546,178 @@ static struct bio *loop_get_bio(struct l
return bio;
 }
 
+static void loop_exit_fastfs(struct loop_device *lo)
+{
+   /*
+* drop what page cache we instantiated filling holes
+*/
+   invalidate_inode_pages2(lo->lo_backing_file->f_mapping);
+
+   blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_NONE, NULL);
+}
+
+static inline u64 lo_bio_offset(struct loop_device *lo, struct bio *bio)
+{
+   return (u64)lo->lo_offset + ((u64)bio->bi_sector << 9);
+}
+
+/*
+ * Find extent mapping this lo device block to the file block on the real
+ * device
+ */
+static struct extent_map *loop_lookup_extent(struct loop_device *lo,
+u64 offset, gfp_t gfp_mask)
+{
+   struct address_space *mapping;
+   struct extent_map *em;
+   u64 len = 1 << lo->blkbits;
+
+   mapping = lo->lo_backing_file->f_mapping;
+   em = mapping->a_ops->map_extent(mapping, NULL, 0,
+   offset, len, 0, gfp_mask);
+   return em;
+}
+
+/*
+ * Alloc a hint bio to tell the loop thread to read file blocks for a given
+ * range
+ */
+static void loop_schedule_extent_mapping(struct loop_device *lo,
+sector_t sector,
+unsigned long len, int wait)
+{
+   struct bio *bio, stackbio;
+
+   /*
+* it's ok if we occasionally fail. if called with blocking set,
+* then use an on-stack bio since that must not fail.
+*/
+   if (wait) {
+   bio = &stackbio;
+   bi

Re: lockdep warning with LTP dio test (v2.6.24-rc6-125-g5356f66)

2008-01-14 Thread Jan Kara
On Wed 02-01-08 12:42:19, Zach Brown wrote:
> Erez Zadok wrote:
> > Setting: ltp-full-20071031, dio01 test on ext3 with Linus's latest tree.
> > Kernel w/ SMP, preemption, and lockdep configured.
> 
> This is a real lock ordering problem.  Thanks for reporting it.
> 
> The updating of atime inside sys_mmap() orders the mmap_sem in the vfs
> outside of the journal handle in ext3's inode dirtying:
> 
> > -> #1 (jbd_handle){--..}:
> >[] __lock_acquire+0x9cc/0xb95
> >[] lock_acquire+0x5f/0x78
> >[] journal_start+0xee/0xf8
> >[] ext3_journal_start_sb+0x48/0x4a
> >[] ext3_dirty_inode+0x27/0x6c
> >[] __mark_inode_dirty+0x29/0x144
> >[] touch_atime+0xb7/0xbc
> >[] generic_file_mmap+0x2d/0x42
> >[] mmap_region+0x1e6/0x3b4
> >[] do_mmap_pgoff+0x1fb/0x253
> >[] sys_mmap2+0x9b/0xb5
> >[] syscall_call+0x7/0xb
> >[] 0x
> 
> ext3_direct_IO() orders the journal handle outside of the mmap_sem that
> dio_get_page() acquires to pin pages with get_user_pages():
> 
> > -> #0 (&mm->mmap_sem){}:
> >[] __lock_acquire+0x8bc/0xb95
> >[] lock_acquire+0x5f/0x78
> >[] down_read+0x3a/0x4c
> >[] dio_get_page+0x4e/0x15d
> >[] __blockdev_direct_IO+0x431/0xa81
> >[] ext3_direct_IO+0x10c/0x1a1
> >[] generic_file_direct_IO+0x124/0x139
> >[] generic_file_direct_write+0x56/0x11c
> >[] __generic_file_aio_write_nolock+0x33d/0x489
> >[] generic_file_aio_write+0x58/0xb6
> >[] ext3_file_write+0x27/0x99
> >[] do_sync_write+0xc5/0x102
> >[] vfs_write+0x90/0x119
> >[] sys_write+0x3d/0x61
> >[] sysenter_past_esp+0x5f/0xa5
> >[] 0x
> 
> Two fixes come to mind:
> 
> 1) use something like Peter's ->mmap_prepare() to update atime before
> acquiring the mmap_sem.  ( http://lkml.org/lkml/2007/11/11/97 ).  I
> don't know if this would leave more paths which do a journal_start()
> while holding the mmap_sem.
> 
> 2) rework ext3's dio to only hold the jbd handle in ext3_get_block().
> Chris has a patch for this kicking around somewhere but I'm told it has
> problems exposing old blocks in ordered data mode.
> 
> Does anyone have preferences?  I could go either way.  I certainly don't
> like the idea of journal handles being held across the entirety of
> fs/direct-io.c.  It's yet another case of O_DIRECT differing wildly from
> the buffered path :(.
  I've looked more into it and I think that 2) is the only way to go since
transaction start ranks below page lock (standard buffered write path) and
page lock ranks below mmap_sem. So we have at least one more dependency
mmap_sem must go before transaction start...

Honza
-- 
Jan Kara <[EMAIL PROTECTED]>
SUSE Labs, CR
-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch] rewrite rd

2008-01-14 Thread Matthew Wilcox
On Tue, Dec 04, 2007 at 05:26:28AM +0100, Nick Piggin wrote:
> +static void copy_to_brd(struct brd_device *brd, const void *src,
> + sector_t sector, size_t n)
> +{
> + struct page *page;
> + void *dst;
> + unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
> + size_t copy;
> +
> + copy = min((unsigned long)n, PAGE_SIZE - offset);
> + page = brd_lookup_page(brd, sector);
> + BUG_ON(!page);
> +
> + dst = kmap_atomic(page, KM_USER1);
> + memcpy(dst + offset, src, copy);
> + kunmap_atomic(dst, KM_USER1);

You're using kmap_atomic, but I see no reason you can't be preempted.
Don't you need to at least disable preemption while you have stuff
atomically kmapped?

-- 
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."
-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [linux-cifs-client] review 4, was Re: projected date for mount.cifs to support DFS junction points

2008-01-14 Thread Q (Igor Mammedov)
Christoph Hellwig wrote:
> [David, any chance you could look at the suggestion below to refactor
>  the automount from ->follow_link code into a common helper now that
>  we've grown a second copy from it]
> 
> 
> +cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
> +{
> + struct dfs_info3_param *referrals = NULL;
> + unsigned int num_referrals = 0;
> + struct cifs_sb_info *cifs_sb;
> + struct cifsSesInfo *ses;
> + char *full_path = NULL;
> + int xid, i;
> + int rc = 0;
> + struct vfsmount *mnt = ERR_PTR(-ENOENT);
> +
> + cFYI(1, ("in %s", __FUNCTION__));
> + BUG_ON(IS_ROOT(dentry));
> +
> + xid = GetXid();
> +
> + dput(nd->dentry);
> + nd->dentry = dget(dentry);
> + if (d_mountpoint(nd->dentry))
> + goto out_follow;
> 
> A link should never be a mountpoint.

why link? after patch 5 are applied DFS junction point becomes directory
instead of link. That is what has been done in NFS code.

> + if (IS_ERR(mnt))
> + goto out_err;
> +
> + mntget(mnt);
> + rc = do_add_mount(mnt, nd, nd->mnt->mnt_flags,
> + &cifs_dfs_automount_list);
> + if (rc < 0) {
> + mntput(mnt);
> + if (rc == -EBUSY)
> + goto out_follow;
> + goto out_err;
> + }
> + mntput(nd->mnt);
> + dput(nd->dentry);
> + nd->mnt = mnt;
> + nd->dentry = dget(mnt->mnt_root);
> 
> the version of the code in afs that you copy & pasted from in afs
> with the switch statement looked more readable.  In fact it would
> probably be useful if most of this could be split into a common
> helper.

Actually I copy & pasted it from NFS code ...
fs/nfs/namespace.c:nfs_follow_mountpoint

I've tried to do submount/referral machinery in NFS code way.


-- 

Best regards,

-
Igor Mammedov,
niallain "at" gmail.com




-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html