from:"\"Ian Kent\""

[PATCH 7/8] autofs - use path_has_submounts() to fix unreliable have_submount() checks

2016-10-10 Thread Ian Kent

From: Ian Kent 

If an automount mount is clone(2)ed into a file system that is propagation
private, when it later expires in the originating namespace, subsequent
calls to autofs ->d_automount() for that dentry in the original namespace
will return ELOOP until the mount is umounted in the cloned namespace.

Now that a struct path is available where needed use path_has_submounts()
instead of have_submounts() so we don't get false positives when checking
if a dentry is a mount point or contains mounts in the current namespace.

Signed-off-by: Ian Kent 
Cc: Al Viro 
Cc: Eric W. Biederman 
Cc: Omar Sandoval 
---
 fs/autofs4/dev-ioctl.c |2 +-
 fs/autofs4/root.c  |   14 +++---
 fs/autofs4/waitq.c |   10 +++---
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 40c69f9..afacdaa 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -575,7 +575,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 
devid = new_encode_dev(dev);
 
-   err = have_submounts(path.dentry);
+   err = path_has_submounts(&path);
 
if (follow_down_one(&path))
magic = path.dentry->d_sb->s_magic;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index d7e48fe..c4df881 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -387,16 +387,16 @@ static struct vfsmount *autofs4_d_automount(struct path 
*path)
/*
 * It's possible that user space hasn't removed directories
 * after umounting a rootless multi-mount, although it
-* should. For v5 have_submounts() is sufficient to handle
-* this because the leaves of the directory tree under the
-* mount never trigger mounts themselves (they have an autofs
-* trigger mount mounted on them). But v4 pseudo direct mounts
-* do need the leaves to trigger mounts. In this case we
-* have no choice but to use the list_empty() check and
+* should. For v5 path_has_submounts() is sufficient to
+* handle this because the leaves of the directory tree under
+* the mount never trigger mounts themselves (they have an
+* autofs trigger mount mounted on them). But v4 pseudo direct
+* mounts do need the leaves to trigger mounts. In this case
+* we have no choice but to use the list_empty() check and
 * require user space behave.
 */
if (sbi->version > 4) {
-   if (have_submounts(dentry)) {
+   if (path_has_submounts(path)) {
spin_unlock(&sbi->fs_lock);
goto done;
}
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index f757f87..ed05cae 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -250,8 +250,9 @@ autofs4_find_wait(struct autofs_sb_info *sbi, const struct 
qstr *qstr)
 static int validate_request(struct autofs_wait_queue **wait,
struct autofs_sb_info *sbi,
const struct qstr *qstr,
-   struct dentry *dentry, enum autofs_notify notify)
+   struct path *path, enum autofs_notify notify)
 {
+   struct dentry *dentry = path->dentry;
struct autofs_wait_queue *wq;
struct autofs_info *ino;
 
@@ -314,6 +315,7 @@ static int validate_request(struct autofs_wait_queue **wait,
 */
if (notify == NFY_MOUNT) {
struct dentry *new = NULL;
+   struct path this;
int valid = 1;
 
/*
@@ -333,7 +335,9 @@ static int validate_request(struct autofs_wait_queue **wait,
dentry = new;
}
}
-   if (have_submounts(dentry))
+   this.mnt = path->mnt;
+   this.dentry = dentry;
+   if (path_has_submounts(&this))
valid = 0;
 
if (new)
@@ -406,7 +410,7 @@ int autofs4_wait(struct autofs_sb_info *sbi,
return -EINTR;
}
 
-   ret = validate_request(&wq, sbi, &qstr, dentry, notify);
+   ret = validate_request(&wq, sbi, &qstr, path, notify);
if (ret <= 0) {
if (ret != -EINTR)
mutex_unlock(&sbi->wq_mutex);

[PATCH 1/8] vfs - change d_manage() to take a struct path

2016-10-10 Thread Ian Kent

For the autofs module to be able to reliably check if a dentry is a
mountpoint in a multiple namespace environment the ->d_manage() dentry
operation will need to take a path argument instead of a dentry.

Signed-off-by: Ian Kent 
Cc: Al Viro 
Cc: Eric W. Biederman 
Cc: Omar Sandoval 
---
 Documentation/filesystems/Locking |2 +-
 Documentation/filesystems/vfs.txt |2 +-
 fs/autofs4/root.c |5 +++--
 fs/namei.c|   13 ++---
 include/linux/dcache.h|2 +-
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/Documentation/filesystems/Locking 
b/Documentation/filesystems/Locking
index fe15682..1949ac4 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -20,7 +20,7 @@ prototypes:
void (*d_iput)(struct dentry *, struct inode *);
char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen);
struct vfsmount *(*d_automount)(struct path *path);
-   int (*d_manage)(struct dentry *, bool);
+   int (*d_manage)(struct path *, bool);
struct dentry *(*d_real)(struct dentry *, const struct inode *,
 unsigned int);
 
diff --git a/Documentation/filesystems/vfs.txt 
b/Documentation/filesystems/vfs.txt
index bc3b8e0..db70d71 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -934,7 +934,7 @@ struct dentry_operations {
void (*d_iput)(struct dentry *, struct inode *);
char *(*d_dname)(struct dentry *, char *, int);
struct vfsmount *(*d_automount)(struct path *);
-   int (*d_manage)(struct dentry *, bool);
+   int (*d_manage)(struct path *, bool);
struct dentry *(*d_real)(struct dentry *, const struct inode *,
 unsigned int);
 };
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index a11f731..5cbd4e1 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -32,7 +32,7 @@ static int autofs4_dir_open(struct inode *inode, struct file 
*file);
 static struct dentry *autofs4_lookup(struct inode *,
 struct dentry *, unsigned int);
 static struct vfsmount *autofs4_d_automount(struct path *);
-static int autofs4_d_manage(struct dentry *, bool);
+static int autofs4_d_manage(struct path *, bool);
 static void autofs4_dentry_release(struct dentry *);
 
 const struct file_operations autofs4_root_operations = {
@@ -421,8 +421,9 @@ static struct vfsmount *autofs4_d_automount(struct path 
*path)
return NULL;
 }
 
-static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
+static int autofs4_d_manage(struct path *path, bool rcu_walk)
 {
+   struct dentry *dentry = path->dentry;
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
diff --git a/fs/namei.c b/fs/namei.c
index a7f601c..704766a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1200,7 +1200,7 @@ static int follow_managed(struct path *path, struct 
nameidata *nd)
if (managed & DCACHE_MANAGE_TRANSIT) {
BUG_ON(!path->dentry->d_op);
BUG_ON(!path->dentry->d_op->d_manage);
-   ret = path->dentry->d_op->d_manage(path->dentry, false);
+   ret = path->dentry->d_op->d_manage(path, false);
if (ret < 0)
break;
}
@@ -1263,10 +1263,10 @@ int follow_down_one(struct path *path)
 }
 EXPORT_SYMBOL(follow_down_one);
 
-static inline int managed_dentry_rcu(struct dentry *dentry)
+static inline int managed_dentry_rcu(struct path *path)
 {
-   return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
-   dentry->d_op->d_manage(dentry, true) : 0;
+   return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
+   path->dentry->d_op->d_manage(path, true) : 0;
 }
 
 /*
@@ -1282,7 +1282,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, 
struct path *path,
 * Don't forget we might have a non-mountpoint managed dentry
 * that wants to block transit.
 */
-   switch (managed_dentry_rcu(path->dentry)) {
+   switch (managed_dentry_rcu(path)) {
case -ECHILD:
default:
return false;
@@ -1392,8 +1392,7 @@ int follow_down(struct path *path)
if (managed & DCACHE_MANAGE_TRANSIT) {
BUG_ON(!path->dentry->d_op);
BUG_ON(!path->dentry->d_op->d_manage);
-   ret = path->dentry->d_op->d_manage(
-   path->dentry, false);
+   ret = path->dentry->d_op->d_manage(path, false);

[RFC PATCH 2/8] vfs - add path_is_mountpoint() helper

2016-10-02 Thread Ian Kent

From: Ian Kent 

d_mountpoint() can only be used reliably to establish if a dentry is
not mounted in any namespace. It isn't aware of the possibility there
may be multiple mounts using a given dentry that may be in a different
namespace.

Add helper functions, path_is_mountpoint() and an rcu version , that
checks if a struct path is a mountpoint for this case.

Signed-off-by: Ian Kent 
Cc: Al Viro 
Cc: Eric W. Biederman 
Cc: Omar Sandoval 
---
 fs/namespace.c |   43 +++
 include/linux/fs.h |2 ++
 2 files changed, 45 insertions(+)

diff --git a/fs/namespace.c b/fs/namespace.c
index 7bb2cda..ca1faaa 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1153,6 +1153,49 @@ struct vfsmount *mntget(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL(mntget);
 
+static bool __path_is_mountpoint(struct path *path)
+{
+   struct mount *mount;
+   struct vfsmount *mnt;
+   unsigned seq;
+
+   do {
+   seq = read_seqbegin(&mount_lock);
+   mount = __lookup_mnt(path->mnt, path->dentry);
+   mnt = mount ? &mount->mnt : NULL;
+   } while (mnt &&
+!(mnt->mnt_flags & MNT_SYNC_UMOUNT) &&
+read_seqretry(&mount_lock, seq));
+
+   return mnt != NULL;
+}
+
+/* Check if path is a mount in current namespace */
+bool path_is_mountpoint(struct path *path)
+{
+   bool res;
+
+   if (!d_mountpoint(path->dentry))
+   return 0;
+
+   rcu_read_lock();
+   res = __path_is_mountpoint(path);
+   rcu_read_unlock();
+
+   return res;
+}
+EXPORT_SYMBOL(path_is_mountpoint);
+
+/* Check if path is a mount in current namespace */
+bool path_is_mountpoint_rcu(struct path *path)
+{
+   if (!d_mountpoint(path->dentry))
+   return 0;
+
+   return __path_is_mountpoint(path);
+}
+EXPORT_SYMBOL(path_is_mountpoint_rcu);
+
 struct vfsmount *mnt_clone_internal(struct path *path)
 {
struct mount *p;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 901e25d..d588b26 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2117,6 +2117,8 @@ extern int vfs_ustat(dev_t, struct kstatfs *);
 extern int freeze_super(struct super_block *super);
 extern int thaw_super(struct super_block *super);
 extern bool our_mnt(struct vfsmount *mnt);
+extern bool path_is_mountpoint(struct path *);
+extern bool path_is_mountpoint_rcu(struct path *);
 
 extern int current_umask(void);

[RFC PATCH 4/8] autofs - change autofs4_expire_wait() to take struct path

2016-10-02 Thread Ian Kent

From: Ian Kent 

In order to use the functions path_is_mountpoint() (or it's rcu-walk
variant) and path_has_submounts() autofs needs to pass a struct path
in several places.

Start by changing autofs4_expire_wait() to take a struct path instead
of a struct dentry.

Signed-off-by: Ian Kent 
Cc: Al Viro 
Cc: Eric W. Biederman 
Cc: Omar Sandoval 
---
 fs/autofs4/autofs_i.h  |2 +-
 fs/autofs4/dev-ioctl.c |2 +-
 fs/autofs4/expire.c|3 ++-
 fs/autofs4/root.c  |   12 +++-
 4 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index a439548..b548ee6 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -148,7 +148,7 @@ void autofs4_free_ino(struct autofs_info *);
 
 /* Expiration */
 int is_autofs4_dentry(struct dentry *);
-int autofs4_expire_wait(struct dentry *dentry, int rcu_walk);
+int autofs4_expire_wait(struct path *path, int rcu_walk);
 int autofs4_expire_run(struct super_block *, struct vfsmount *,
   struct autofs_sb_info *,
   struct autofs_packet_expire __user *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index c7fcc74..3cd96e6 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -457,7 +457,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
ino = autofs4_dentry_ino(path.dentry);
if (ino) {
err = 0;
-   autofs4_expire_wait(path.dentry, 0);
+   autofs4_expire_wait(&path, 0);
spin_lock(&sbi->fs_lock);
param->requester.uid =
from_kuid_munged(current_user_ns(), ino->uid);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index d8e6d42..7eac498 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -495,8 +495,9 @@ found:
return expired;
 }
 
-int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
+int autofs4_expire_wait(struct path *path, int rcu_walk)
 {
+   struct dentry *dentry = path->dentry;
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 0705daa..e5ed061 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -286,22 +286,24 @@ static int autofs4_mount_wait(struct dentry *dentry, bool 
rcu_walk)
return status;
 }
 
-static int do_expire_wait(struct dentry *dentry, bool rcu_walk)
+static int do_expire_wait(struct path *path, bool rcu_walk)
 {
+   struct dentry *dentry = path->dentry;
struct dentry *expiring;
 
expiring = autofs4_lookup_expiring(dentry, rcu_walk);
if (IS_ERR(expiring))
return PTR_ERR(expiring);
if (!expiring)
-   return autofs4_expire_wait(dentry, rcu_walk);
+   return autofs4_expire_wait(path, rcu_walk);
else {
+   struct path this = { .mnt = path->mnt, .dentry = expiring };
/*
 * If we are racing with expire the request might not
 * be quite complete, but the directory has been removed
 * so it must have been successful, just wait for it.
 */
-   autofs4_expire_wait(expiring, 0);
+   autofs4_expire_wait(&this, 0);
autofs4_del_expiring(expiring);
dput(expiring);
}
@@ -354,7 +356,7 @@ static struct vfsmount *autofs4_d_automount(struct path 
*path)
 * and the directory was removed, so just go ahead and try
 * the mount.
 */
-   status = do_expire_wait(dentry, 0);
+   status = do_expire_wait(path, 0);
if (status && status != -EAGAIN)
return NULL;
 
@@ -438,7 +440,7 @@ static int autofs4_d_manage(struct path *path, bool 
rcu_walk)
}
 
/* Wait for pending expires */
-   if (do_expire_wait(dentry, rcu_walk) == -ECHILD)
+   if (do_expire_wait(path, rcu_walk) == -ECHILD)
return -ECHILD;
 
/*

[RFC PATCH 5/8] autofs - change autofs4_wait() to take struct path

2016-10-02 Thread Ian Kent

From: Ian Kent 

In order to use the functions path_is_mountpoint() (or its rcu-walk
variant) and path_has_submounts() autofs needs to pass a struct path
in several places.

Now change autofs4_wait() to take a struct path instead of a struct
dentry.

Signed-off-by: Ian Kent 
Cc: Al Viro 
Cc: Eric W. Biederman 
Cc: Omar Sandoval 
---
 fs/autofs4/autofs_i.h |2 +-
 fs/autofs4/expire.c   |5 +++--
 fs/autofs4/root.c |   16 
 fs/autofs4/waitq.c|3 ++-
 4 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index b548ee6..1b6ed78 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -220,7 +220,7 @@ static inline int autofs_prepare_pipe(struct file *pipe)
 
 /* Queue management functions */
 
-int autofs4_wait(struct autofs_sb_info *, struct dentry *, enum autofs_notify);
+int autofs4_wait(struct autofs_sb_info *, struct path *, enum autofs_notify);
 int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int);
 void autofs4_catatonic_mode(struct autofs_sb_info *);
 
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 7eac498..a37ba40 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -526,7 +526,7 @@ retry:
 
pr_debug("waiting for expire %p name=%pd\n", dentry, dentry);
 
-   status = autofs4_wait(sbi, dentry, NFY_NONE);
+   status = autofs4_wait(sbi, path, NFY_NONE);
wait_for_completion(&ino->expire_complete);
 
pr_debug("expire done status=%d\n", status);
@@ -593,11 +593,12 @@ int autofs4_do_expire_multi(struct super_block *sb, 
struct vfsmount *mnt,
 
if (dentry) {
struct autofs_info *ino = autofs4_dentry_ino(dentry);
+   struct path path = { .mnt = mnt, .dentry = dentry };
 
/* This is synchronous because it makes the daemon a
 * little easier
 */
-   ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
+   ret = autofs4_wait(sbi, &path, NFY_EXPIRE);
 
spin_lock(&sbi->fs_lock);
/* avoid rapid-fire expire attempts if expiry fails */
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e5ed061..a12c248 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -269,17 +269,17 @@ next:
return NULL;
 }
 
-static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk)
+static int autofs4_mount_wait(struct path *path, bool rcu_walk)
 {
-   struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-   struct autofs_info *ino = autofs4_dentry_ino(dentry);
+   struct autofs_sb_info *sbi = autofs4_sbi(path->dentry->d_sb);
+   struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
int status = 0;
 
if (ino->flags & AUTOFS_INF_PENDING) {
if (rcu_walk)
return -ECHILD;
-   pr_debug("waiting for mount name=%pd\n", dentry);
-   status = autofs4_wait(sbi, dentry, NFY_MOUNT);
+   pr_debug("waiting for mount name=%pd\n", path->dentry);
+   status = autofs4_wait(sbi, path, NFY_MOUNT);
pr_debug("mount wait done status=%d\n", status);
}
ino->last_used = jiffies;
@@ -364,7 +364,7 @@ static struct vfsmount *autofs4_d_automount(struct path 
*path)
spin_lock(&sbi->fs_lock);
if (ino->flags & AUTOFS_INF_PENDING) {
spin_unlock(&sbi->fs_lock);
-   status = autofs4_mount_wait(dentry, 0);
+   status = autofs4_mount_wait(path, 0);
if (status)
return ERR_PTR(status);
goto done;
@@ -405,7 +405,7 @@ static struct vfsmount *autofs4_d_automount(struct path 
*path)
}
ino->flags |= AUTOFS_INF_PENDING;
spin_unlock(&sbi->fs_lock);
-   status = autofs4_mount_wait(dentry, 0);
+   status = autofs4_mount_wait(path, 0);
spin_lock(&sbi->fs_lock);
ino->flags &= ~AUTOFS_INF_PENDING;
if (status) {
@@ -447,7 +447,7 @@ static int autofs4_d_manage(struct path *path, bool 
rcu_walk)
 * This dentry may be under construction so wait on mount
 * completion.
 */
-   status = autofs4_mount_wait(dentry, rcu_walk);
+   status = autofs4_mount_wait(path, rcu_walk);
if (status)
return status;
 
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 431fd7e..f757f87 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -345,8 +345,9 @@ static int validate_request(struct autofs_wait_queue **wait,
 }
 
 int autofs4_wait(struct autofs_sb_info *sbi,
-struct dentry *dentry, enum autofs_notify notify)
+struct path *path,

[RFC PATCH 8/8] vfs - remove unused have_submounts() function

2016-10-02 Thread Ian Kent

Now that path_has_submounts() has been added have_submounts() is no
longer used so remove it.

Signed-off-by: Ian Kent 
Cc: Al Viro 
Cc: Eric W. Biederman 
Cc: Omar Sandoval 
---
 fs/dcache.c|   33 -
 include/linux/dcache.h |1 -
 2 files changed, 34 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 872f04e..719d8b4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1273,39 +1273,6 @@ rename_retry:
goto again;
 }
 
-/*
- * Search for at least 1 mount point in the dentry's subdirs.
- * We descend to the next level whenever the d_subdirs
- * list is non-empty and continue searching.
- */
-
-static enum d_walk_ret check_mount(void *data, struct dentry *dentry)
-{
-   int *ret = data;
-   if (d_mountpoint(dentry)) {
-   *ret = 1;
-   return D_WALK_QUIT;
-   }
-   return D_WALK_CONTINUE;
-}
-
-/**
- * have_submounts - check for mounts over a dentry
- * @parent: dentry to check.
- *
- * Return true if the parent or its subdirectories contain
- * a mount point
- */
-int have_submounts(struct dentry *parent)
-{
-   int ret = 0;
-
-   d_walk(parent, &ret, check_mount, NULL);
-
-   return ret;
-}
-EXPORT_SYMBOL(have_submounts);
-
 struct check_mount {
struct vfsmount *mnt;
unsigned int mounted;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 5c5d197..384c87c 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -254,7 +254,6 @@ extern struct dentry *d_find_alias(struct inode *);
 extern void d_prune_aliases(struct inode *);
 
 /* test whether we have any submounts in a subdir tree */
-extern int have_submounts(struct dentry *);
 extern int path_has_submounts(struct path *);
 
 /*

[RFC PATCH 6/8] autofs - use path_is_mountpoint() to fix unreliable d_mountpoint() checks

2016-10-02 Thread Ian Kent

From: Ian Kent 

If an automount mount is clone(2)ed into a file system that is propagation
private, when it later expires in the originating namespace, subsequent
calls to autofs ->d_automount() for that dentry in the original namespace
will return ELOOP until the mount is umounted in the cloned namespace.

Now that a struct path is available where needed use path_is_mountpoint()
instead of d_mountpoint() so we don't get false positives when checking
if a dentry is a mount point in the current namespace.

Signed-off-by: Ian Kent 
Cc: Al Viro 
Cc: Eric W. Biederman 
Cc: Omar Sandoval 
---
 fs/autofs4/root.c |   24 
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index a12c248..0f5d264 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -107,12 +107,15 @@ static int autofs4_dir_open(struct inode *inode, struct 
file *file)
 {
struct dentry *dentry = file->f_path.dentry;
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+   struct path path;
 
pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry);
 
if (autofs4_oz_mode(sbi))
goto out;
 
+   path = file->f_path;
+
/*
 * An empty directory in an autofs file system is always a
 * mount point. The daemon must have failed to mount this
@@ -123,7 +126,7 @@ static int autofs4_dir_open(struct inode *inode, struct 
file *file)
 * it.
 */
spin_lock(&sbi->lookup_lock);
-   if (!d_mountpoint(dentry) && simple_empty(dentry)) {
+   if (!path_is_mountpoint(&path) && simple_empty(dentry)) {
spin_unlock(&sbi->lookup_lock);
return -ENOENT;
}
@@ -372,15 +375,15 @@ static struct vfsmount *autofs4_d_automount(struct path 
*path)
 
/*
 * If the dentry is a symlink it's equivalent to a directory
-* having d_mountpoint() true, so there's no need to call back
-* to the daemon.
+* having path_is_mountpoint() true, so there's no need to call
+* back to the daemon.
 */
if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
spin_unlock(&sbi->fs_lock);
goto done;
}
 
-   if (!d_mountpoint(dentry)) {
+   if (!path_is_mountpoint(path)) {
/*
 * It's possible that user space hasn't removed directories
 * after umounting a rootless multi-mount, although it
@@ -434,8 +437,13 @@ static int autofs4_d_manage(struct path *path, bool 
rcu_walk)
 
/* The daemon never waits. */
if (autofs4_oz_mode(sbi)) {
-   if (!d_mountpoint(dentry))
-   return -EISDIR;
+   if (rcu_walk) {
+   if (!path_is_mountpoint_rcu(path))
+   return -EISDIR;
+   } else {
+   if (!path_is_mountpoint(path))
+   return -EISDIR;
+   }
return 0;
}
 
@@ -463,7 +471,7 @@ static int autofs4_d_manage(struct path *path, bool 
rcu_walk)
 
if (ino->flags & AUTOFS_INF_WANT_EXPIRE)
return 0;
-   if (d_mountpoint(dentry))
+   if (path_is_mountpoint_rcu(path))
return 0;
inode = d_inode_rcu(dentry);
if (inode && S_ISLNK(inode->i_mode))
@@ -490,7 +498,7 @@ static int autofs4_d_manage(struct path *path, bool 
rcu_walk)
 * we can avoid needless calls ->d_automount() and avoid
 * an incorrect ELOOP error return.
 */
-   if ((!d_mountpoint(dentry) && !simple_empty(dentry)) ||
+   if ((!path_is_mountpoint(path) && !simple_empty(dentry)) ||
(d_really_is_positive(dentry) && d_is_symlink(dentry)))
status = -EISDIR;
}

[RFC PATCH 7/8] autofs - use path_has_submounts() to fix unreliable have_submount() checks

2016-10-02 Thread Ian Kent

From: Ian Kent 

If an automount mount is clone(2)ed into a file system that is propagation
private, when it later expires in the originating namespace, subsequent
calls to autofs ->d_automount() for that dentry in the original namespace
will return ELOOP until the mount is umounted in the cloned namespace.

Now that a struct path is available where needed use path_has_submounts()
instead of have_submounts() so we don't get false positives when checking
if a dentry is a mount point or contains mounts in the current namespace.

Signed-off-by: Ian Kent 
Cc: Al Viro 
Cc: Eric W. Biederman 
Cc: Omar Sandoval 
---
 fs/autofs4/dev-ioctl.c |2 +-
 fs/autofs4/root.c  |   14 +++---
 fs/autofs4/waitq.c |   10 +++---
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 3cd96e6..953e790 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -564,7 +564,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 
devid = new_encode_dev(dev);
 
-   err = have_submounts(path.dentry);
+   err = path_has_submounts(&path);
 
if (follow_down_one(&path))
magic = path.dentry->d_sb->s_magic;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 0f5d264..cf90d37 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -387,16 +387,16 @@ static struct vfsmount *autofs4_d_automount(struct path 
*path)
/*
 * It's possible that user space hasn't removed directories
 * after umounting a rootless multi-mount, although it
-* should. For v5 have_submounts() is sufficient to handle
-* this because the leaves of the directory tree under the
-* mount never trigger mounts themselves (they have an autofs
-* trigger mount mounted on them). But v4 pseudo direct mounts
-* do need the leaves to trigger mounts. In this case we
-* have no choice but to use the list_empty() check and
+* should. For v5 path_has_submounts() is sufficient to
+* handle this because the leaves of the directory tree under
+* the mount never trigger mounts themselves (they have an
+* autofs trigger mount mounted on them). But v4 pseudo direct
+* mounts do need the leaves to trigger mounts. In this case
+* we have no choice but to use the list_empty() check and
 * require user space behave.
 */
if (sbi->version > 4) {
-   if (have_submounts(dentry)) {
+   if (path_has_submounts(path)) {
spin_unlock(&sbi->fs_lock);
goto done;
}
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index f757f87..ed05cae 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -250,8 +250,9 @@ autofs4_find_wait(struct autofs_sb_info *sbi, const struct 
qstr *qstr)
 static int validate_request(struct autofs_wait_queue **wait,
struct autofs_sb_info *sbi,
const struct qstr *qstr,
-   struct dentry *dentry, enum autofs_notify notify)
+   struct path *path, enum autofs_notify notify)
 {
+   struct dentry *dentry = path->dentry;
struct autofs_wait_queue *wq;
struct autofs_info *ino;
 
@@ -314,6 +315,7 @@ static int validate_request(struct autofs_wait_queue **wait,
 */
if (notify == NFY_MOUNT) {
struct dentry *new = NULL;
+   struct path this;
int valid = 1;
 
/*
@@ -333,7 +335,9 @@ static int validate_request(struct autofs_wait_queue **wait,
dentry = new;
}
}
-   if (have_submounts(dentry))
+   this.mnt = path->mnt;
+   this.dentry = dentry;
+   if (path_has_submounts(&this))
valid = 0;
 
if (new)
@@ -406,7 +410,7 @@ int autofs4_wait(struct autofs_sb_info *sbi,
return -EINTR;
}
 
-   ret = validate_request(&wq, sbi, &qstr, dentry, notify);
+   ret = validate_request(&wq, sbi, &qstr, path, notify);
if (ret <= 0) {
if (ret != -EINTR)
mutex_unlock(&sbi->wq_mutex);

[RFC PATCH 3/8] vfs - add path_has_submounts()

2016-10-02 Thread Ian Kent

From: Ian Kent 

d_mountpoint() can only be used reliably to establish if a dentry is
not mounted in any namespace. It isn't aware of the possibility there
may be multiple mounts using the given dentry, possibly in a different
namespace.

Add function, path_has_submounts(), that checks is a struct path contains
mounts (or is a mountpoint itself) to handle this case.

Signed-off-by: Ian Kent 
Cc: Al Viro 
Cc: Eric W. Biederman 
Cc: Omar Sandoval 
---
 fs/dcache.c|   35 +++
 include/linux/dcache.h |1 +
 2 files changed, 36 insertions(+)

diff --git a/fs/dcache.c b/fs/dcache.c
index 5c7cc95..872f04e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1306,6 +1306,41 @@ int have_submounts(struct dentry *parent)
 }
 EXPORT_SYMBOL(have_submounts);
 
+struct check_mount {
+   struct vfsmount *mnt;
+   unsigned int mounted;
+};
+
+static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry)
+{
+   struct check_mount *info = data;
+   struct path path = { .mnt = info->mnt, .dentry = dentry };
+
+   if (path_is_mountpoint(&path)) {
+   info->mounted = 1;
+   return D_WALK_QUIT;
+   }
+   return D_WALK_CONTINUE;
+}
+
+/**
+ * path_has_submounts - check for mounts over a dentry in the
+ *  current namespace.
+ * @parent: path to check.
+ *
+ * Return true if the parent or its subdirectories contain
+ * a mount point in the current namespace.
+ */
+int path_has_submounts(struct path *parent)
+{
+   struct check_mount data = { .mnt = parent->mnt, .mounted = 0 };
+
+   d_walk(parent->dentry, &data, path_check_mount, NULL);
+
+   return data.mounted;
+}
+EXPORT_SYMBOL(path_has_submounts);
+
 /*
  * Called by mount code to set a mountpoint and check if the mountpoint is
  * reachable (e.g. NFS can unhash a directory dentry and then the complete
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index ad2df92..5c5d197 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -255,6 +255,7 @@ extern void d_prune_aliases(struct inode *);
 
 /* test whether we have any submounts in a subdir tree */
 extern int have_submounts(struct dentry *);
+extern int path_has_submounts(struct path *);
 
 /*
  * This adds the entry to the hash queues.

[RFC PATCH 1/8] vfs - change d_manage() to take a struct path

2016-10-02 Thread Ian Kent

For the autofs module to be able to reliably check if a dentry is a
mountpoint in a multiple namespace environment the ->d_manage() dentry
operation will need to take a path argument instead of a dentry.

Signed-off-by: Ian Kent 
Cc: Al Viro 
Cc: Eric W. Biederman 
Cc: Omar Sandoval 
---
 Documentation/filesystems/Locking |2 +-
 Documentation/filesystems/vfs.txt |2 +-
 fs/autofs4/root.c |5 +++--
 fs/namei.c|   13 ++---
 include/linux/dcache.h|2 +-
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/Documentation/filesystems/Locking 
b/Documentation/filesystems/Locking
index d30fb2c..189ea98 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -20,7 +20,7 @@ prototypes:
void (*d_iput)(struct dentry *, struct inode *);
char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen);
struct vfsmount *(*d_automount)(struct path *path);
-   int (*d_manage)(struct dentry *, bool);
+   int (*d_manage)(struct path *, bool);
struct dentry *(*d_real)(struct dentry *, const struct inode *,
 unsigned int);
 
diff --git a/Documentation/filesystems/vfs.txt 
b/Documentation/filesystems/vfs.txt
index 9ace359..0035ac8 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -939,7 +939,7 @@ struct dentry_operations {
void (*d_iput)(struct dentry *, struct inode *);
char *(*d_dname)(struct dentry *, char *, int);
struct vfsmount *(*d_automount)(struct path *);
-   int (*d_manage)(struct dentry *, bool);
+   int (*d_manage)(struct path *, bool);
struct dentry *(*d_real)(struct dentry *, const struct inode *,
 unsigned int);
 };
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index fa84bb8..0705daa 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -32,7 +32,7 @@ static int autofs4_dir_open(struct inode *inode, struct file 
*file);
 static struct dentry *autofs4_lookup(struct inode *,
 struct dentry *, unsigned int);
 static struct vfsmount *autofs4_d_automount(struct path *);
-static int autofs4_d_manage(struct dentry *, bool);
+static int autofs4_d_manage(struct path *, bool);
 static void autofs4_dentry_release(struct dentry *);
 
 const struct file_operations autofs4_root_operations = {
@@ -421,8 +421,9 @@ done:
return NULL;
 }
 
-static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
+static int autofs4_d_manage(struct path *path, bool rcu_walk)
 {
+   struct dentry *dentry = path->dentry;
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
diff --git a/fs/namei.c b/fs/namei.c
index adb0414..e86b9d0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1200,7 +1200,7 @@ static int follow_managed(struct path *path, struct 
nameidata *nd)
if (managed & DCACHE_MANAGE_TRANSIT) {
BUG_ON(!path->dentry->d_op);
BUG_ON(!path->dentry->d_op->d_manage);
-   ret = path->dentry->d_op->d_manage(path->dentry, false);
+   ret = path->dentry->d_op->d_manage(path, false);
if (ret < 0)
break;
}
@@ -1263,10 +1263,10 @@ int follow_down_one(struct path *path)
 }
 EXPORT_SYMBOL(follow_down_one);
 
-static inline int managed_dentry_rcu(struct dentry *dentry)
+static inline int managed_dentry_rcu(struct path *path)
 {
-   return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
-   dentry->d_op->d_manage(dentry, true) : 0;
+   return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
+   path->dentry->d_op->d_manage(path, true) : 0;
 }
 
 /*
@@ -1282,7 +1282,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, 
struct path *path,
 * Don't forget we might have a non-mountpoint managed dentry
 * that wants to block transit.
 */
-   switch (managed_dentry_rcu(path->dentry)) {
+   switch (managed_dentry_rcu(path)) {
case -ECHILD:
default:
return false;
@@ -1392,8 +1392,7 @@ int follow_down(struct path *path)
if (managed & DCACHE_MANAGE_TRANSIT) {
BUG_ON(!path->dentry->d_op);
BUG_ON(!path->dentry->d_op->d_manage);
-   ret = path->dentry->d_op->d_manage(
-   path->dentry, false);
+   ret = path->dentry->d_op->d_manage(path, false);
if (ret < 0)
return

[RFC PATCH 0/8] Patch series to fix autofs unreliable usage of d_mountpoint()

2016-10-02 Thread Ian Kent

This series fixes the potential problem of autofs returning ELOOP when
a mount exists in a propogation private mount namespace other than the
namespace in which the mount is to be performed.

I'm posting the series as an RFC in the hope of catching stupid mistakes
that I may have made before submitting to mmotm. Please note the series
here is against the current Linus tree and may be slightly different when
posted for inclusion in mmotm.

Please review and post any comments.
---

Ian Kent (8):
  vfs - change d_manage() to take a struct path
  vfs - add path_is_mountpoint() helper
  vfs - add path_has_submounts()
  autofs - change autofs4_expire_wait() to take struct path
  autofs - change autofs4_wait() to take struct path
  autofs - use path_is_mountpoint() to fix unreliable d_mountpoint() checks
  autofs - use path_has_submounts() to fix unreliable have_submount() checks
  vfs - remove unused have_submounts() function


 Documentation/filesystems/Locking |2 +
 Documentation/filesystems/vfs.txt |2 +
 fs/autofs4/autofs_i.h |4 +-
 fs/autofs4/dev-ioctl.c|4 +-
 fs/autofs4/expire.c   |8 +++-
 fs/autofs4/root.c |   71 +
 fs/autofs4/waitq.c|   13 +--
 fs/dcache.c   |   36 ++-
 fs/namei.c|   13 +++
 fs/namespace.c|   43 ++
 include/linux/dcache.h|4 +-
 include/linux/fs.h|2 +
 12 files changed, 133 insertions(+), 69 deletions(-)

--
Ian

Re: linux-next: manual merge of the akpm-current tree with the userns tree

2016-09-30 Thread Ian Kent

On Fri, 2016-09-30 at 17:42 +1000, Stephen Rothwell wrote:
> Hi Andrew,

Hi Stephen,

> 
> Today's linux-next merge of the akpm-current tree got a conflict in:
> 
>   include/linux/mount.h
> 
> between commit:
> 
>   312ddcb332c3 ("mnt: Add a per mount namespace limit on the number of
> mounts")
> 
> from the userns tree and commit:
> 
>   a0461d15d75c ("vfs: make is_local_mountpoint() usable by others")
> 
> from the akpm-current tree.

Yes, this is a problem.

There is a fundamental flaw in the series surrounding commit a0461d15d75c.

In discussion with Eric it was decided a different approach was needed and I'm
holding back on posting an updated series because I was worried something like
this might happen and didn't want to make matters worse.

I definitely don't want this series to go to the Linus tree and it would be
great if you could drop it from the next tree. Eric's patch should then apply
without change.

I had asked Andrew to drop the series but he must have missed my request.

And I thought they had already been dropped but I must have been looking at an
incorrect branch. I'll need to look at the akpm repo. again.

In the meantime all I can offer is the patch names corresponding to the
descriptions.

They are:
fs-make-is_local_mountpoint-usable-by-others.patch
fs-add-have_local_submounts.patch
autofs-make-mountpoint-checks-namespace-aware.patch
fs-remove-unused-have_submounts-function.patch

Sorry for the inconvenience.
Ian

> 
> I fixed it up (see below) and can carry the fix as necessary. This
> is now fixed as far as linux-next is concerned, but any non trivial
> conflicts should be mentioned to your upstream maintainer when your tree
> is submitted for merging.  You may also want to consider cooperating
> with the maintainer of the conflicting tree to minimise any particularly
> complex conflicts.
>

Re: [PATCH 3/4] autofs - make mountpoint checks namespace aware

2016-09-28 Thread Ian Kent

On Fri, 2016-09-23 at 20:00 +0800, Ian Kent wrote:
> On Fri, 2016-09-23 at 12:26 +0800, Ian Kent wrote:
> > On Thu, 2016-09-22 at 20:37 -0500, Eric W. Biederman wrote:
> > > Ian Kent  writes:
> > > 
> > > > On Thu, 2016-09-22 at 10:43 -0500, Eric W. Biederman wrote:
> > > > > Ian Kent  writes:
> > > > > 
> > > > > > Eric, Mateusz, I appreciate your spending time on this and
> > > > > > particularly
> > > > > > pointing
> > > > > > out my embarrassingly stupid is_local_mountpoint() usage mistake.
> > > > > > 
> > > > > > Please accept my apology for the inconvenience.
> > > > > > 
> > > > > > If all goes well (in testing) I'll have follow up patches to correct
> > > > > > this
> > > > > > fairly
> > > > > > soon.
> > > > > 
> > > > > Related question.  Do you happen to know how many mounts per mount
> > > > > namespace tend to be used?  It looks like it is going to be wise to
> > > > > put
> > > > > a configurable limit on that number.  And I would like the default to
> > > > > be
> > > > > something high enough most people don't care.  I believe autofs is
> > > > > likely where people tend to use the most mounts.
> > 
> > Yes, I agree, I did want to try and avoid changing the parameters to
> > ->d_mamange() but passing a struct path pointer might be better in the long
> > run
> > anyway.
> > 
> 
> Andrew, could you please drop patches for this series.
> 
> I believe they are:
> fs-make-is_local_mountpoint-usable-by-others.patch
> fs-add-have_local_submounts.patch
> autofs-make-mountpoint-checks-namespace-aware.patch
> fs-remove-unused-have_submounts-function.patch
> 
> I'm going to have a go at what Eric and I discussed above rather than update
> this series.

There are a couple of problems I see preventing me from posting an updated
series.

It looks like this series was dropped from the mmotm tree but is still present
in the linux-next tree.

Hopefully this won't be pushed to the Linux tree, it's not likely to be what's
needed.

Also there are two commits in the Linus tree that will conflict with an updated
series which is worth a heads up.

They are:

commit 8ac790f312 from Al Viro
title: qstr: constify instances in autofs4

commit e698b8a436 from Miklos Szeredi
title: vfs: document ->d_real()

Is there anything I should do to help with this?
Ian

Re: [PATCH 3/4] autofs - make mountpoint checks namespace aware

2016-09-27 Thread Ian Kent

On Tue, 2016-09-27 at 08:14 -0500, Eric W. Biederman wrote:
> Ian Kent  writes:
> 
> > On Mon, 2016-09-26 at 11:05 -0500, Eric W. Biederman wrote:
> > > Ian Kent  writes:
> > > 
> > > > On Fri, 2016-09-23 at 14:15 -0500, Eric W. Biederman wrote:
> > > > > Ian Kent  writes:
> > > > > 
> > > > > 2> On Thu, 2016-09-22 at 20:37 -0500, Eric W. Biederman wrote:
> > > > > > > Ian Kent  writes:
> > > > > > > 
> > > > > > > > On Thu, 2016-09-22 at 10:43 -0500, Eric W. Biederman wrote:
> > > > > > > > > Ian Kent  writes:
> > > > > > > > > 
> > > > > > > > > > Eric, Mateusz, I appreciate your spending time on this and
> > > > > > > > > > particularly
> > > > > > > > > > pointing
> > > > > > > > > > out my embarrassingly stupid is_local_mountpoint() usage
> > > > > > > > > > mistake.
> > > > > > > > > > 
> > > > > > > > > > Please accept my apology for the inconvenience.
> > > > > > > > > > 
> > > > > > > > > > If all goes well (in testing) I'll have follow up patches to
> > > > > > > > > > correct
> > > > > > > > > > this
> > > > > > > > > > fairly
> > > > > > > > > > soon.
> > > > > > > > > 
> > > > > > > > > Related question.  Do you happen to know how many mounts per
> > > > > > > > > mount
> > > > > > > > > namespace tend to be used?  It looks like it is going to be
> > > > > > > > > wise
> > > > > > > > > to
> > > > > > > > > put
> > > > > > > > > a configurable limit on that number.  And I would like the
> > > > > > > > > default
> > > > > > > > > to
> > > > > > > > > be
> > > > > > > > > something high enough most people don't care.  I believe
> > > > > > > > > autofs is
> > > > > > > > > likely where people tend to use the most mounts.
> > > > > > 
> > > > > > Yes, I agree, I did want to try and avoid changing the parameters to
> > > > > > ->d_mamange() but passing a struct path pointer might be better in
> > > > > > the
> > > > > > long
> > > > > > run
> > > > > > anyway.
> > > > > 
> > > > > Given that there is exactly one implementation of d_manage in the tree
> > > > > I
> > > > > don't imagine it will be disruptive to change that.
> > > > 
> > > > Yes, but it could be used by external modules.
> > > > 
> > > > And there's also have_submounts().
> > > 
> > > Good point about have_submounts.
> > > 
> > > > I can update that using the existing d_walk() infrastructure or take it
> > > > (mostly)
> > > > into the autofs module and get rid of have_submounts().
> > > > 
> > > > I'll go with the former to start with and see what people think.
> > > 
> > > That will be interesting to so.  It is not clear to me that if d_walk
> > > needs to be updated, and if d_walk doesn't need to be updated I would
> > > be surprised to see it take into autofs.  But I am happy to look at the
> > > end result and see what you come up with.
> > 
> > I didn't mean d_walk() itself, just the have_submounts() function that's
> > used
> > only by autofs these days. That's all I'll be changing.
> > 
> > To take this functionality into the autofs module shouldn't be a big deal as
> > it
> > amounts to a directory traversal with a check at each node.
> > 
> > But I vaguely remember talk of wanting to get rid of have_submounts() and
> > autofs
> > being the only remaining user.
> > 
> > So I mentioned it to try and elicit a comment, ;)
> 
> From my perspective the key detail is that d_walk is private to dcache.c
> 
> That said you want to look at may_umount_tree, or may_umount that
> are already exported from fs/namespace.c, and already used by autofs.
> One of those may already do the job you are trying to do.

Right, I'm aware of them, autofs uses may_umount() when asking if an autofs
mount can be umounted, and it uses may_umount_tree() to check busyness in its
expire.

may_umount_tree() (and may_umount()) won't tell you if there are any mounts
within the directory tree, only that there is an elevated reference count, for
example an open file or working directory etc., ie. busy.

OTOH, have_submounts() will return true as soon as it encounters a
d_mountpoint() dentry in the directory tree which is what's needed where it's
used and why it can return a false positive for the problem case.

I get that a function, say, path_has_submounts() probably shouldn't be placed in
dcache.c and that's another reason to take the traversal into autofs. 

But if there's no word from Al on that I'd prefer to use d_walk() and put it
there.

Ian

Re: [PATCH 3/4] autofs - make mountpoint checks namespace aware

2016-09-26 Thread Ian Kent

On Mon, 2016-09-26 at 11:05 -0500, Eric W. Biederman wrote:
> Ian Kent  writes:
> 
> > On Fri, 2016-09-23 at 14:15 -0500, Eric W. Biederman wrote:
> > > Ian Kent  writes:
> > > 
> > > 2> On Thu, 2016-09-22 at 20:37 -0500, Eric W. Biederman wrote:
> > > > > Ian Kent  writes:
> > > > > 
> > > > > > On Thu, 2016-09-22 at 10:43 -0500, Eric W. Biederman wrote:
> > > > > > > Ian Kent  writes:
> > > > > > > 
> > > > > > > > Eric, Mateusz, I appreciate your spending time on this and
> > > > > > > > particularly
> > > > > > > > pointing
> > > > > > > > out my embarrassingly stupid is_local_mountpoint() usage
> > > > > > > > mistake.
> > > > > > > > 
> > > > > > > > Please accept my apology for the inconvenience.
> > > > > > > > 
> > > > > > > > If all goes well (in testing) I'll have follow up patches to
> > > > > > > > correct
> > > > > > > > this
> > > > > > > > fairly
> > > > > > > > soon.
> > > > > > > 
> > > > > > > Related question.  Do you happen to know how many mounts per mount
> > > > > > > namespace tend to be used?  It looks like it is going to be wise
> > > > > > > to
> > > > > > > put
> > > > > > > a configurable limit on that number.  And I would like the default
> > > > > > > to
> > > > > > > be
> > > > > > > something high enough most people don't care.  I believe autofs is
> > > > > > > likely where people tend to use the most mounts.
> > > > 
> > > > Yes, I agree, I did want to try and avoid changing the parameters to
> > > > ->d_mamange() but passing a struct path pointer might be better in the
> > > > long
> > > > run
> > > > anyway.
> > > 
> > > Given that there is exactly one implementation of d_manage in the tree I
> > > don't imagine it will be disruptive to change that.
> > 
> > Yes, but it could be used by external modules.
> > 
> > And there's also have_submounts().
> 
> Good point about have_submounts.
> 
> > I can update that using the existing d_walk() infrastructure or take it
> > (mostly)
> > into the autofs module and get rid of have_submounts().
> > 
> > I'll go with the former to start with and see what people think.
> 
> That will be interesting to so.  It is not clear to me that if d_walk
> needs to be updated, and if d_walk doesn't need to be updated I would
> be surprised to see it take into autofs.  But I am happy to look at the
> end result and see what you come up with.

I didn't mean d_walk() itself, just the have_submounts() function that's used
only by autofs these days. That's all I'll be changing.

To take this functionality into the autofs module shouldn't be a big deal as it
amounts to a directory traversal with a check at each node.

But I vaguely remember talk of wanting to get rid of have_submounts() and autofs
being the only remaining user.

So I mentioned it to try and elicit a comment, ;)

> 
> Eric

Re: [PATCH 3/4] autofs - make mountpoint checks namespace aware

2016-09-23 Thread Ian Kent

On Fri, 2016-09-23 at 14:15 -0500, Eric W. Biederman wrote:
> Ian Kent  writes:
> 
> 2> On Thu, 2016-09-22 at 20:37 -0500, Eric W. Biederman wrote:
> > > Ian Kent  writes:
> > > 
> > > > On Thu, 2016-09-22 at 10:43 -0500, Eric W. Biederman wrote:
> > > > > Ian Kent  writes:
> > > > > 
> > > > > > Eric, Mateusz, I appreciate your spending time on this and
> > > > > > particularly
> > > > > > pointing
> > > > > > out my embarrassingly stupid is_local_mountpoint() usage mistake.
> > > > > > 
> > > > > > Please accept my apology for the inconvenience.
> > > > > > 
> > > > > > If all goes well (in testing) I'll have follow up patches to correct
> > > > > > this
> > > > > > fairly
> > > > > > soon.
> > > > > 
> > > > > Related question.  Do you happen to know how many mounts per mount
> > > > > namespace tend to be used?  It looks like it is going to be wise to
> > > > > put
> > > > > a configurable limit on that number.  And I would like the default to
> > > > > be
> > > > > something high enough most people don't care.  I believe autofs is
> > > > > likely where people tend to use the most mounts.
> > 
> > Yes, I agree, I did want to try and avoid changing the parameters to
> > ->d_mamange() but passing a struct path pointer might be better in the long
> > run
> > anyway.
> 
> Given that there is exactly one implementation of d_manage in the tree I
> don't imagine it will be disruptive to change that.

Yes, but it could be used by external modules.

And there's also have_submounts().

I can update that using the existing d_walk() infrastructure or take it (mostly)
into the autofs module and get rid of have_submounts().

I'll go with the former to start with and see what people think.

> 
> Eric

Re: [PATCH 3/4] autofs - make mountpoint checks namespace aware

2016-09-23 Thread Ian Kent

On Fri, 2016-09-23 at 12:26 +0800, Ian Kent wrote:
> On Thu, 2016-09-22 at 20:37 -0500, Eric W. Biederman wrote:
> > Ian Kent  writes:
> > 
> > > On Thu, 2016-09-22 at 10:43 -0500, Eric W. Biederman wrote:
> > > > Ian Kent  writes:
> > > > 
> > > > > Eric, Mateusz, I appreciate your spending time on this and
> > > > > particularly
> > > > > pointing
> > > > > out my embarrassingly stupid is_local_mountpoint() usage mistake.
> > > > > 
> > > > > Please accept my apology for the inconvenience.
> > > > > 
> > > > > If all goes well (in testing) I'll have follow up patches to correct
> > > > > this
> > > > > fairly
> > > > > soon.
> > > > 
> > > > Related question.  Do you happen to know how many mounts per mount
> > > > namespace tend to be used?  It looks like it is going to be wise to put
> > > > a configurable limit on that number.  And I would like the default to be
> > > > something high enough most people don't care.  I believe autofs is
> > > > likely where people tend to use the most mounts.
> 
> Yes, I agree, I did want to try and avoid changing the parameters to
> ->d_mamange() but passing a struct path pointer might be better in the long
> run
> anyway.
> 

Andrew, could you please drop patches for this series.

I believe they are:
fs-make-is_local_mountpoint-usable-by-others.patch
fs-add-have_local_submounts.patch
autofs-make-mountpoint-checks-namespace-aware.patch
fs-remove-unused-have_submounts-function.patch

I'm going to have a go at what Eric and I discussed above rather than update
this series.

Ian

Re: [PATCH 3/4] autofs - make mountpoint checks namespace aware

2016-09-22 Thread Ian Kent

On Thu, 2016-09-22 at 20:37 -0500, Eric W. Biederman wrote:
> Ian Kent  writes:
> 
> > On Thu, 2016-09-22 at 10:43 -0500, Eric W. Biederman wrote:
> > > Ian Kent  writes:
> > > 
> > > > Eric, Mateusz, I appreciate your spending time on this and particularly
> > > > pointing
> > > > out my embarrassingly stupid is_local_mountpoint() usage mistake.
> > > > 
> > > > Please accept my apology for the inconvenience.
> > > > 
> > > > If all goes well (in testing) I'll have follow up patches to correct
> > > > this
> > > > fairly
> > > > soon.
> > > 
> > > Related question.  Do you happen to know how many mounts per mount
> > > namespace tend to be used?  It looks like it is going to be wise to put
> > > a configurable limit on that number.  And I would like the default to be
> > > something high enough most people don't care.  I believe autofs is
> > > likely where people tend to use the most mounts.

Yes, I agree, I did want to try and avoid changing the parameters to
->d_mamange() but passing a struct path pointer might be better in the long run
anyway.


> > That's a good question.
> > 
> > I've been thinking that maybe I should have used a lookup_mnt() type check
> > as I
> > originally started out to, for this reason, as the mnt_namespace list looks
> > to
> > be a linear list.
> > 
> > But there can be a lot of mounts, and not only due to autofs, so maybe that
> > should be considered anyway.
> 
> There are two reasons for is_local_mountpoint being the way it is.
> 
> a) For the cases where you don't have the parent mount.
> b) For the cases where you want to stop things if something is mounted
>on a dentry in the local mount namespace even if it isn't mounted
>on that dentry at your current mount parent. (Something that was
>important to not change the semantics of the single mount namespace case).
> 
> Both of those cases to apply to unlink, rmdir, and rename.  I don't think
> either of those cases apply to what autofs is trying to do.  Certainly
> not the second.
> 
> So if you have the parent mount I really think you want to be using some
> variation of lookup_mnt().  The fact it is rcu safe may help with some
> of those weird corner cases as well.
> 
> > The number of mounts for direct mount maps is usually not very large because
> > of
> > the way they are implemented, large direct mount maps can have performance
> > problems. There can be anywhere from a few (likely case a few hundred) to
> > less
> > than 1, plus mounts that have been triggered and not yet expired.
> > 
> > Indirect mounts have one autofs mount at the root plus the number of mounts
> > that
> > have been triggered and not yet expired.
> > 
> > The number of autofs indirect map entries can range from a few to the common
> > case of several thousand and in rare cases up to between 3 and 5.
> > I've
> > not heard of people with maps larger than 5 entries.
> > 
> > The larger the number of map entries the greater the possibility for a large
> > number of active mounts so it's not hard to expect cases of a 1000 or
> > somewhat
> > more active mounts.
> 
> Fair.  So at least 1000.  And there can be enough mounts that a limit of
> 100,000 might be necessary to give head room for the existings configurations.
> 
> Thank you.  Now I just need to wrap my head around fs/namespace.c again
> and see how bad a count like that will be to maintain.
> 
> Eric
>

Re: [PATCH 3/4] autofs - make mountpoint checks namespace aware

2016-09-22 Thread Ian Kent

On Thu, 2016-09-22 at 10:43 -0500, Eric W. Biederman wrote:
> Ian Kent  writes:
> 
> > Eric, Mateusz, I appreciate your spending time on this and particularly
> > pointing
> > out my embarrassingly stupid is_local_mountpoint() usage mistake.
> > 
> > Please accept my apology for the inconvenience.
> > 
> > If all goes well (in testing) I'll have follow up patches to correct this
> > fairly
> > soon.
> 
> Related question.  Do you happen to know how many mounts per mount
> namespace tend to be used?  It looks like it is going to be wise to put
> a configurable limit on that number.  And I would like the default to be
> something high enough most people don't care.  I believe autofs is
> likely where people tend to use the most mounts.

That's a good question.

I've been thinking that maybe I should have used a lookup_mnt() type check as I
originally started out to, for this reason, as the mnt_namespace list looks to
be a linear list.

But there can be a lot of mounts, and not only due to autofs, so maybe that
should be considered anyway.

The number of mounts for direct mount maps is usually not very large because of
the way they are implemented, large direct mount maps can have performance
problems. There can be anywhere from a few (likely case a few hundred) to less
than 1, plus mounts that have been triggered and not yet expired.

Indirect mounts have one autofs mount at the root plus the number of mounts that
have been triggered and not yet expired.

The number of autofs indirect map entries can range from a few to the common
case of several thousand and in rare cases up to between 3 and 5. I've
not heard of people with maps larger than 5 entries.

The larger the number of map entries the greater the possibility for a large
number of active mounts so it's not hard to expect cases of a 1000 or somewhat
more active mounts.

Ian

Re: [PATCH 3/4] autofs - make mountpoint checks namespace aware

2016-09-21 Thread Ian Kent

On Wed, 2016-09-21 at 07:00 +0800, Ian Kent wrote:
> On Wed, 2016-09-21 at 06:44 +0800, Ian Kent wrote:
> > On Tue, 2016-09-20 at 11:50 -0500, Eric W. Biederman wrote:
> > > Ian Kent  writes:
> > > 
> > > > On Sat, 2016-09-17 at 22:10 +0200, Mateusz Guzik wrote:
> > > > > On Wed, Sep 14, 2016 at 02:14:45PM +0800, Ian Kent wrote:
> > > > > > If an automount mount is clone(2)ed into a file system that is
> > > > > > propagation private, when it later expires in the originating
> > > > > > namespace subsequent calls to autofs ->d_automount() for that
> > > > > > dentry in the original namespace will return ELOOP until the
> > > > > > mount is manually umounted in the cloned namespace.
> > > > > > 
> > > > > > In the same way, if an autofs mount is triggered by automount(8)
> > > > > > running within a container the dentry will be seen as mounted in
> > > > > > the root init namespace and calls to ->d_automount() in that
> > > > > > namespace
> > > > > > will return ELOOP until the mount is umounted within the container.
> > > > > > 
> > > > > > Also, have_submounts() can return an incorect result when a mount
> > > > > > exists in a namespace other than the one being checked.
> > > > > > 
> > > > > > @@ -460,7 +460,7 @@ static int autofs4_d_manage(struct dentry
> > > > > > *dentry,
> > > > > > bool
> > > > > > rcu_walk)
> > > > > >  
> > > > > > if (ino->flags & AUTOFS_INF_WANT_EXPIRE)
> > > > > > return 0;
> > > > > > -   if (d_mountpoint(dentry))
> > > > > > +   if (is_local_mountpoint(dentry))
> > > > > > return 0;
> > > > > > inode = d_inode_rcu(dentry);
> > > > > > if (inode && S_ISLNK(inode->i_mode))
> > > > > 
> > > > > This change is within RCU lookup.
> > > > > 
> > > > > is_local_mountpoint may end up calling __is_local_mountpoint, which
> > > > > will
> > > > > optionally take the namespace_sem lock, resulting in a splat:
> > > > 
> > > > Yes, that's a serious problem I missed.
> > > > 
> > > > snip ...
> > > > 
> > > > > I don't know this code. Perhaps it will be perfectly fine performance
> > > > > wise
> > > > > to
> > > > > just drop out of RCU lookup in this case.
> > > > 
> > > > It's a bit worse than that.
> > > > 
> > > > I think being able to continue the rcu-walk for an already mounted
> > > > dentry
> > > > that
> > > > is not being expired is an important part of the performance improvement
> > > > given
> > > > by the series that added this.
> > > > 
> > > > Can you confirm that Neil?
> > > > 
> > > > But for the case here the existing test can allow rcu-walk to continue
> > > > for
> > > > a
> > > > dentry that would attempt to trigger an automount so it's also a bug in
> > > > the
> > > > existing code.
> > > 
> > > I don't think the existing code is buggy.  As I read __follow_mount_rcu
> > > if DCACHE_NEED_AUTOMOUNT is set on the dentry after return from d_manage
> > > the code will break out of the rcu walk.
> > 
> > That's right.
> > 
> > I'm working on follow up patches now.
> > 
> > > 
> > > > Any thoughts on how we can handle this Neil, I'm having a bit of trouble
> > > > working
> > > > out how to resolve this one.
> > > 
> > > I believe in this case d_mountpoint is enough for the rcu walk case.  If
> > > the mountpoint turns out not to be local __follow_mount_rcu will kick
> > > out of the rcu walk as it will return false.  Because:
> > >   return !mounted && !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
> > > 
> > > I am not quite certain about the non-rcu case.  That can't be
> > > is_local_mountpoint as that is inside a spinlock and is_local_mountpoint
> > > can sleep.  Perhaps d_mountpoint is simply correct for autofs4_d_manage.
> > 
> > That's right, I think I have other mistakes there too.
> > 
> > I'm checking those too.
> 
> It may be that is_local_mountpoint() isn't the right thing to use for this.
> 
> When I originally set out to do this I used a lookup_mnt() type check which
> should be sufficient in this case so I might need to go back to that.

Eric, Mateusz, I appreciate your spending time on this and particularly pointing
out my embarrassingly stupid is_local_mountpoint() usage mistake.

Please accept my apology for the inconvenience.

If all goes well (in testing) I'll have follow up patches to correct this fairly
soon.

Ian

Re: [PATCH 3/4] autofs - make mountpoint checks namespace aware

2016-09-20 Thread Ian Kent

On Wed, 2016-09-21 at 06:44 +0800, Ian Kent wrote:
> On Tue, 2016-09-20 at 11:50 -0500, Eric W. Biederman wrote:
> > Ian Kent  writes:
> > 
> > > On Sat, 2016-09-17 at 22:10 +0200, Mateusz Guzik wrote:
> > > > On Wed, Sep 14, 2016 at 02:14:45PM +0800, Ian Kent wrote:
> > > > > If an automount mount is clone(2)ed into a file system that is
> > > > > propagation private, when it later expires in the originating
> > > > > namespace subsequent calls to autofs ->d_automount() for that
> > > > > dentry in the original namespace will return ELOOP until the
> > > > > mount is manually umounted in the cloned namespace.
> > > > > 
> > > > > In the same way, if an autofs mount is triggered by automount(8)
> > > > > running within a container the dentry will be seen as mounted in
> > > > > the root init namespace and calls to ->d_automount() in that namespace
> > > > > will return ELOOP until the mount is umounted within the container.
> > > > > 
> > > > > Also, have_submounts() can return an incorect result when a mount
> > > > > exists in a namespace other than the one being checked.
> > > > > 
> > > > > @@ -460,7 +460,7 @@ static int autofs4_d_manage(struct dentry *dentry,
> > > > > bool
> > > > > rcu_walk)
> > > > >  
> > > > >   if (ino->flags & AUTOFS_INF_WANT_EXPIRE)
> > > > >   return 0;
> > > > > - if (d_mountpoint(dentry))
> > > > > + if (is_local_mountpoint(dentry))
> > > > >   return 0;
> > > > >   inode = d_inode_rcu(dentry);
> > > > >   if (inode && S_ISLNK(inode->i_mode))
> > > > 
> > > > This change is within RCU lookup.
> > > > 
> > > > is_local_mountpoint may end up calling __is_local_mountpoint, which will
> > > > optionally take the namespace_sem lock, resulting in a splat:
> > > 
> > > Yes, that's a serious problem I missed.
> > > 
> > > snip ...
> > > 
> > > > I don't know this code. Perhaps it will be perfectly fine performance
> > > > wise
> > > > to
> > > > just drop out of RCU lookup in this case.
> > > 
> > > It's a bit worse than that.
> > > 
> > > I think being able to continue the rcu-walk for an already mounted dentry
> > > that
> > > is not being expired is an important part of the performance improvement
> > > given
> > > by the series that added this.
> > > 
> > > Can you confirm that Neil?
> > > 
> > > But for the case here the existing test can allow rcu-walk to continue for
> > > a
> > > dentry that would attempt to trigger an automount so it's also a bug in
> > > the
> > > existing code.
> > 
> > I don't think the existing code is buggy.  As I read __follow_mount_rcu
> > if DCACHE_NEED_AUTOMOUNT is set on the dentry after return from d_manage
> > the code will break out of the rcu walk.
> 
> That's right.
> 
> I'm working on follow up patches now.
> 
> > 
> > > Any thoughts on how we can handle this Neil, I'm having a bit of trouble
> > > working
> > > out how to resolve this one.
> > 
> > I believe in this case d_mountpoint is enough for the rcu walk case.  If
> > the mountpoint turns out not to be local __follow_mount_rcu will kick
> > out of the rcu walk as it will return false.  Because:
> > return !mounted && !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
> > 
> > I am not quite certain about the non-rcu case.  That can't be
> > is_local_mountpoint as that is inside a spinlock and is_local_mountpoint
> > can sleep.  Perhaps d_mountpoint is simply correct for autofs4_d_manage.
> 
> That's right, I think I have other mistakes there too.
> 
> I'm checking those too.

It may be that is_local_mountpoint() isn't the right thing to use for this.

When I originally set out to do this I used a lookup_mnt() type check which
should be sufficient in this case so I might need to go back to that.

> 
> > 
> > Eric

Re: [PATCH 3/4] autofs - make mountpoint checks namespace aware

2016-09-20 Thread Ian Kent

On Tue, 2016-09-20 at 11:50 -0500, Eric W. Biederman wrote:
> Ian Kent  writes:
> 
> > On Sat, 2016-09-17 at 22:10 +0200, Mateusz Guzik wrote:
> > > On Wed, Sep 14, 2016 at 02:14:45PM +0800, Ian Kent wrote:
> > > > If an automount mount is clone(2)ed into a file system that is
> > > > propagation private, when it later expires in the originating
> > > > namespace subsequent calls to autofs ->d_automount() for that
> > > > dentry in the original namespace will return ELOOP until the
> > > > mount is manually umounted in the cloned namespace.
> > > > 
> > > > In the same way, if an autofs mount is triggered by automount(8)
> > > > running within a container the dentry will be seen as mounted in
> > > > the root init namespace and calls to ->d_automount() in that namespace
> > > > will return ELOOP until the mount is umounted within the container.
> > > > 
> > > > Also, have_submounts() can return an incorect result when a mount
> > > > exists in a namespace other than the one being checked.
> > > > 
> > > > @@ -460,7 +460,7 @@ static int autofs4_d_manage(struct dentry *dentry,
> > > > bool
> > > > rcu_walk)
> > > >  
> > > > if (ino->flags & AUTOFS_INF_WANT_EXPIRE)
> > > > return 0;
> > > > -   if (d_mountpoint(dentry))
> > > > +   if (is_local_mountpoint(dentry))
> > > > return 0;
> > > > inode = d_inode_rcu(dentry);
> > > > if (inode && S_ISLNK(inode->i_mode))
> > > 
> > > This change is within RCU lookup.
> > > 
> > > is_local_mountpoint may end up calling __is_local_mountpoint, which will
> > > optionally take the namespace_sem lock, resulting in a splat:
> > 
> > Yes, that's a serious problem I missed.
> > 
> > snip ...
> > 
> > > I don't know this code. Perhaps it will be perfectly fine performance wise
> > > to
> > > just drop out of RCU lookup in this case.
> > 
> > It's a bit worse than that.
> > 
> > I think being able to continue the rcu-walk for an already mounted dentry
> > that
> > is not being expired is an important part of the performance improvement
> > given
> > by the series that added this.
> > 
> > Can you confirm that Neil?
> > 
> > But for the case here the existing test can allow rcu-walk to continue for a
> > dentry that would attempt to trigger an automount so it's also a bug in the
> > existing code.
> 
> I don't think the existing code is buggy.  As I read __follow_mount_rcu
> if DCACHE_NEED_AUTOMOUNT is set on the dentry after return from d_manage
> the code will break out of the rcu walk.

That's right.

I'm working on follow up patches now.

> 
> > Any thoughts on how we can handle this Neil, I'm having a bit of trouble
> > working
> > out how to resolve this one.
> 
> I believe in this case d_mountpoint is enough for the rcu walk case.  If
> the mountpoint turns out not to be local __follow_mount_rcu will kick
> out of the rcu walk as it will return false.  Because:
>   return !mounted && !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
> 
> I am not quite certain about the non-rcu case.  That can't be
> is_local_mountpoint as that is inside a spinlock and is_local_mountpoint
> can sleep.  Perhaps d_mountpoint is simply correct for autofs4_d_manage.

That's right, I think I have other mistakes there too.

I'm checking those too.

> 
> Eric
>

Re: [PATCH 3/4] autofs - make mountpoint checks namespace aware

2016-09-18 Thread Ian Kent

On Sat, 2016-09-17 at 22:10 +0200, Mateusz Guzik wrote:
> On Wed, Sep 14, 2016 at 02:14:45PM +0800, Ian Kent wrote:
> > If an automount mount is clone(2)ed into a file system that is
> > propagation private, when it later expires in the originating
> > namespace subsequent calls to autofs ->d_automount() for that
> > dentry in the original namespace will return ELOOP until the
> > mount is manually umounted in the cloned namespace.
> > 
> > In the same way, if an autofs mount is triggered by automount(8)
> > running within a container the dentry will be seen as mounted in
> > the root init namespace and calls to ->d_automount() in that namespace
> > will return ELOOP until the mount is umounted within the container.
> > 
> > Also, have_submounts() can return an incorect result when a mount
> > exists in a namespace other than the one being checked.
> > 
> > @@ -460,7 +460,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool
> > rcu_walk)
> >  
> > if (ino->flags & AUTOFS_INF_WANT_EXPIRE)
> > return 0;
> > -   if (d_mountpoint(dentry))
> > +   if (is_local_mountpoint(dentry))
> > return 0;
> > inode = d_inode_rcu(dentry);
> > if (inode && S_ISLNK(inode->i_mode))
> 
> This change is within RCU lookup.
> 
> is_local_mountpoint may end up calling __is_local_mountpoint, which will
> optionally take the namespace_sem lock, resulting in a splat:

Yes, that's a serious problem I missed.

snip ...

> I don't know this code. Perhaps it will be perfectly fine performance wise to
> just drop out of RCU lookup in this case.

It's a bit worse than that.

I think being able to continue the rcu-walk for an already mounted dentry that
is not being expired is an important part of the performance improvement given
by the series that added this.

Can you confirm that Neil?

But for the case here the existing test can allow rcu-walk to continue for a
dentry that would attempt to trigger an automount so it's also a bug in the
existing code.

Any thoughts on how we can handle this Neil, I'm having a bit of trouble working
out how to resolve this one.

Ian

Re: [PATCH 3/4] autofs - make mountpoint checks namespace aware

2016-09-18 Thread Ian Kent

On Fri, 2016-09-16 at 10:58 +0800, Ian Kent wrote:
> On Thu, 2016-09-15 at 19:47 -0500, Eric W. Biederman wrote:
> > Ian Kent  writes:
> > 
> > > On Wed, 2016-09-14 at 21:08 -0500, Eric W. Biederman wrote:
> > > > Ian Kent  writes:
> > > > 
> > > > > On Wed, 2016-09-14 at 12:28 -0500, Eric W. Biederman wrote:
> > > > > > Ian Kent  writes:
> > > > > > 
> > > > > > > If an automount mount is clone(2)ed into a file system that is
> > > > > > > propagation private, when it later expires in the originating
> > > > > > > namespace subsequent calls to autofs ->d_automount() for that
> > > > > > > dentry in the original namespace will return ELOOP until the
> > > > > > > mount is manually umounted in the cloned namespace.
> > > > > > > 
> > > > > > > In the same way, if an autofs mount is triggered by automount(8)
> > > > > > > running within a container the dentry will be seen as mounted in
> > > > > > > the root init namespace and calls to ->d_automount() in that
> > > > > > > namespace
> > > > > > > will return ELOOP until the mount is umounted within the
> > > > > > > container.
> > > > > > > 
> > > > > > > Also, have_submounts() can return an incorect result when a mount
> > > > > > > exists in a namespace other than the one being checked.
> > > > > > 
> > > > > > Overall this appears to be a fairly reasonable set of changes.  It
> > > > > > does
> > > > > > increase the expense when an actual mount point is encountered, but
> > > > > > if
> > > > > > these are the desired some increase in cost when a dentry is a
> > > > > > mountpoint is unavoidable.
> > > > > > 
> > > > > > May I ask the motiviation for this set of changes?  Reading through
> > > > > > the
> > > > > > changes I don't grasp why we want to change the behavior of autofs.
> > > > > > What problem is being solved?  What are the benefits?
> > > > > 
> > > > > LOL, it's all too easy for me to give a patch description that I think
> > > > > explains
> > > > > a problem I need to solve without realizing it isn't clear to others
> > > > > what
> > > > > the
> > > > > problem is, sorry about that.
> > > > > 
> > > > > For quite a while now, and not that frequently but consistently, I've
> > > > > been
> > > > > getting reports of people using autofs getting ELOOP errors and not
> > > > > being
> > > > > able
> > > > > to mount automounts.
> > > > > 
> > > > > This has been due to the cloning of autofs file systems (that have
> > > > > active
> > > > > automounts at the time of the clone) by other systems.
> > > > > 
> > > > > An unshare, as one example, can easily result in the cloning of an
> > > > > autofs
> > > > > file
> > > > > system that has active mounts which shows this problem.
> > > > > 
> > > > > Once an active mount that has been cloned is expired in the namespace
> > > > > that
> > > > > performed the unshare it can't be (auto)mounted again in the the
> > > > > originating
> > > > > namespace because the mounted check in the autofs module will think it
> > > > > is
> > > > > already mounted.
> > > > > 
> > > > > I'm not sure this is a clear description either, hopefully it is
> > > > > enough
> > > > > to
> > > > > demonstrate the type of problem I'm typing to solve.
> > > > 
> > > > So to rephrase the problem is that an autofs instance can stop working
> > > > properly from the perspective of the mount namespace it is mounted in
> > > > if the autofs instance is shared between multiple mount namespaces.  The
> > > > problem is that mounts and unmounts do not always propogate between
> > > > mount namespaces.  This lack of symmetric mount/unmount behavior
> > > > leads to mountpoints that become unusable.
> > > 
> > > That's right.
&g

Re: [PATCH 3/4] autofs - make mountpoint checks namespace aware

2016-09-15 Thread Ian Kent

On Thu, 2016-09-15 at 19:47 -0500, Eric W. Biederman wrote:
> Ian Kent  writes:
> 
> > On Wed, 2016-09-14 at 21:08 -0500, Eric W. Biederman wrote:
> > > Ian Kent  writes:
> > > 
> > > > On Wed, 2016-09-14 at 12:28 -0500, Eric W. Biederman wrote:
> > > > > Ian Kent  writes:
> > > > > 
> > > > > > If an automount mount is clone(2)ed into a file system that is
> > > > > > propagation private, when it later expires in the originating
> > > > > > namespace subsequent calls to autofs ->d_automount() for that
> > > > > > dentry in the original namespace will return ELOOP until the
> > > > > > mount is manually umounted in the cloned namespace.
> > > > > > 
> > > > > > In the same way, if an autofs mount is triggered by automount(8)
> > > > > > running within a container the dentry will be seen as mounted in
> > > > > > the root init namespace and calls to ->d_automount() in that
> > > > > > namespace
> > > > > > will return ELOOP until the mount is umounted within the container.
> > > > > > 
> > > > > > Also, have_submounts() can return an incorect result when a mount
> > > > > > exists in a namespace other than the one being checked.
> > > > > 
> > > > > Overall this appears to be a fairly reasonable set of changes.  It
> > > > > does
> > > > > increase the expense when an actual mount point is encountered, but if
> > > > > these are the desired some increase in cost when a dentry is a
> > > > > mountpoint is unavoidable.
> > > > > 
> > > > > May I ask the motiviation for this set of changes?  Reading through
> > > > > the
> > > > > changes I don't grasp why we want to change the behavior of autofs.
> > > > > What problem is being solved?  What are the benefits?
> > > > 
> > > > LOL, it's all too easy for me to give a patch description that I think
> > > > explains
> > > > a problem I need to solve without realizing it isn't clear to others
> > > > what
> > > > the
> > > > problem is, sorry about that.
> > > > 
> > > > For quite a while now, and not that frequently but consistently, I've
> > > > been
> > > > getting reports of people using autofs getting ELOOP errors and not
> > > > being
> > > > able
> > > > to mount automounts.
> > > > 
> > > > This has been due to the cloning of autofs file systems (that have
> > > > active
> > > > automounts at the time of the clone) by other systems.
> > > > 
> > > > An unshare, as one example, can easily result in the cloning of an
> > > > autofs
> > > > file
> > > > system that has active mounts which shows this problem.
> > > > 
> > > > Once an active mount that has been cloned is expired in the namespace
> > > > that
> > > > performed the unshare it can't be (auto)mounted again in the the
> > > > originating
> > > > namespace because the mounted check in the autofs module will think it
> > > > is
> > > > already mounted.
> > > > 
> > > > I'm not sure this is a clear description either, hopefully it is enough
> > > > to
> > > > demonstrate the type of problem I'm typing to solve.
> > > 
> > > So to rephrase the problem is that an autofs instance can stop working
> > > properly from the perspective of the mount namespace it is mounted in
> > > if the autofs instance is shared between multiple mount namespaces.  The
> > > problem is that mounts and unmounts do not always propogate between
> > > mount namespaces.  This lack of symmetric mount/unmount behavior
> > > leads to mountpoints that become unusable.
> > 
> > That's right.
> > 
> > It's also worth considering that symmetric mount propagation is usually not
> > the
> > behaviour needed either and things like LXC and Docker are set propagation
> > slave
> > because of problems caused by propagation back to the parent namespace.
> > 
> > So a mount can be triggered within a container, mounted by the automount
> > daemon
> > in the parent namespace, and propagated to the child and similarly for
> > expires,
> > which is the common u

Re: [PATCH 3/4] autofs - make mountpoint checks namespace aware

2016-09-15 Thread Ian Kent

On Thu, 2016-09-15 at 12:12 +0800, Ian Kent wrote:
> On Wed, 2016-09-14 at 21:08 -0500, Eric W. Biederman wrote:
> > Ian Kent  writes:
> > 
> > > On Wed, 2016-09-14 at 12:28 -0500, Eric W. Biederman wrote:
> > > > Ian Kent  writes:
> > > > 
> > > > > If an automount mount is clone(2)ed into a file system that is
> > > > > propagation private, when it later expires in the originating
> > > > > namespace subsequent calls to autofs ->d_automount() for that
> > > > > dentry in the original namespace will return ELOOP until the
> > > > > mount is manually umounted in the cloned namespace.
> > > > > 
> > > > > In the same way, if an autofs mount is triggered by automount(8)
> > > > > running within a container the dentry will be seen as mounted in
> > > > > the root init namespace and calls to ->d_automount() in that namespace
> > > > > will return ELOOP until the mount is umounted within the container.
> > > > > 
> > > > > Also, have_submounts() can return an incorect result when a mount
> > > > > exists in a namespace other than the one being checked.
> > > > 
> > > > Overall this appears to be a fairly reasonable set of changes.  It does
> > > > increase the expense when an actual mount point is encountered, but if
> > > > these are the desired some increase in cost when a dentry is a
> > > > mountpoint is unavoidable.
> > > > 
> > > > May I ask the motiviation for this set of changes?  Reading through the
> > > > changes I don't grasp why we want to change the behavior of autofs.
> > > > What problem is being solved?  What are the benefits?
> > > 
> > > LOL, it's all too easy for me to give a patch description that I think
> > > explains
> > > a problem I need to solve without realizing it isn't clear to others what
> > > the
> > > problem is, sorry about that.
> > > 
> > > For quite a while now, and not that frequently but consistently, I've been
> > > getting reports of people using autofs getting ELOOP errors and not being
> > > able
> > > to mount automounts.
> > > 
> > > This has been due to the cloning of autofs file systems (that have active
> > > automounts at the time of the clone) by other systems.
> > > 
> > > An unshare, as one example, can easily result in the cloning of an autofs
> > > file
> > > system that has active mounts which shows this problem.
> > > 
> > > Once an active mount that has been cloned is expired in the namespace that
> > > performed the unshare it can't be (auto)mounted again in the the
> > > originating
> > > namespace because the mounted check in the autofs module will think it is
> > > already mounted.
> > > 
> > > I'm not sure this is a clear description either, hopefully it is enough to
> > > demonstrate the type of problem I'm typing to solve.
> > 
> > So to rephrase the problem is that an autofs instance can stop working
> > properly from the perspective of the mount namespace it is mounted in
> > if the autofs instance is shared between multiple mount namespaces.  The
> > problem is that mounts and unmounts do not always propogate between
> > mount namespaces.  This lack of symmetric mount/unmount behavior
> > leads to mountpoints that become unusable.
> 
> That's right.
> 
> It's also worth considering that symmetric mount propagation is usually not
> the
> behaviour needed either and things like LXC and Docker are set propagation
> slave
> because of problems caused by propagation back to the parent namespace.
> 
> So a mount can be triggered within a container, mounted by the automount
> daemon
> in the parent namespace, and propagated to the child and similarly for
> expires,
> which is the common use case now.
> 
> > 
> > Which leads to the question what is the expected new behavior with your
> > patchset applied.  New mounts can be added in the parent mount namespace
> > (because the test is local).  Does your change also allow the
> > autofs mountpoints to be used in the other mount namespaces that share
> > the autofs instance if everything becomes unmounted?
> 
> The problem occurs when the subordinate namespace doesn't deal with these
> propagated mounts properly, although they can obviously be used by the
> subordinate namespace.
> 
> > 
> > Or is it expected tha

Re: [PATCH 3/4] autofs - make mountpoint checks namespace aware

2016-09-14 Thread Ian Kent

On Wed, 2016-09-14 at 21:08 -0500, Eric W. Biederman wrote:
> Ian Kent  writes:
> 
> > On Wed, 2016-09-14 at 12:28 -0500, Eric W. Biederman wrote:
> > > Ian Kent  writes:
> > > 
> > > > If an automount mount is clone(2)ed into a file system that is
> > > > propagation private, when it later expires in the originating
> > > > namespace subsequent calls to autofs ->d_automount() for that
> > > > dentry in the original namespace will return ELOOP until the
> > > > mount is manually umounted in the cloned namespace.
> > > > 
> > > > In the same way, if an autofs mount is triggered by automount(8)
> > > > running within a container the dentry will be seen as mounted in
> > > > the root init namespace and calls to ->d_automount() in that namespace
> > > > will return ELOOP until the mount is umounted within the container.
> > > > 
> > > > Also, have_submounts() can return an incorect result when a mount
> > > > exists in a namespace other than the one being checked.
> > > 
> > > Overall this appears to be a fairly reasonable set of changes.  It does
> > > increase the expense when an actual mount point is encountered, but if
> > > these are the desired some increase in cost when a dentry is a
> > > mountpoint is unavoidable.
> > > 
> > > May I ask the motiviation for this set of changes?  Reading through the
> > > changes I don't grasp why we want to change the behavior of autofs.
> > > What problem is being solved?  What are the benefits?
> > 
> > LOL, it's all too easy for me to give a patch description that I think
> > explains
> > a problem I need to solve without realizing it isn't clear to others what
> > the
> > problem is, sorry about that.
> > 
> > For quite a while now, and not that frequently but consistently, I've been
> > getting reports of people using autofs getting ELOOP errors and not being
> > able
> > to mount automounts.
> > 
> > This has been due to the cloning of autofs file systems (that have active
> > automounts at the time of the clone) by other systems.
> > 
> > An unshare, as one example, can easily result in the cloning of an autofs
> > file
> > system that has active mounts which shows this problem.
> > 
> > Once an active mount that has been cloned is expired in the namespace that
> > performed the unshare it can't be (auto)mounted again in the the originating
> > namespace because the mounted check in the autofs module will think it is
> > already mounted.
> > 
> > I'm not sure this is a clear description either, hopefully it is enough to
> > demonstrate the type of problem I'm typing to solve.
> 
> So to rephrase the problem is that an autofs instance can stop working
> properly from the perspective of the mount namespace it is mounted in
> if the autofs instance is shared between multiple mount namespaces.  The
> problem is that mounts and unmounts do not always propogate between
> mount namespaces.  This lack of symmetric mount/unmount behavior
> leads to mountpoints that become unusable.

That's right.

It's also worth considering that symmetric mount propagation is usually not the
behaviour needed either and things like LXC and Docker are set propagation slave
because of problems caused by propagation back to the parent namespace.

So a mount can be triggered within a container, mounted by the automount daemon
in the parent namespace, and propagated to the child and similarly for expires,
which is the common use case now.

> 
> Which leads to the question what is the expected new behavior with your
> patchset applied.  New mounts can be added in the parent mount namespace
> (because the test is local).  Does your change also allow the
> autofs mountpoints to be used in the other mount namespaces that share
> the autofs instance if everything becomes unmounted?

The problem occurs when the subordinate namespace doesn't deal with these
propagated mounts properly, although they can obviously be used by the
subordinate namespace.

> 
> Or is it expected that other mount namespaces that share an autofs
> instance will get changes in their mounts via mount propagation and if
> mount propagation is insufficient they are on their own.

Namespaces that receive updates via mount propagation from a parent will
continue to function as they do now.

Mounts that don't get updates via mount propagation will retain the mount to use
if they need to, as they would without this change, but the originating
namespace will also continue to function as expected.

Th

Re: [PATCH 3/4] autofs - make mountpoint checks namespace aware

2016-09-14 Thread Ian Kent

On Wed, 2016-09-14 at 12:28 -0500, Eric W. Biederman wrote:
> Ian Kent  writes:
> 
> > If an automount mount is clone(2)ed into a file system that is
> > propagation private, when it later expires in the originating
> > namespace subsequent calls to autofs ->d_automount() for that
> > dentry in the original namespace will return ELOOP until the
> > mount is manually umounted in the cloned namespace.
> > 
> > In the same way, if an autofs mount is triggered by automount(8)
> > running within a container the dentry will be seen as mounted in
> > the root init namespace and calls to ->d_automount() in that namespace
> > will return ELOOP until the mount is umounted within the container.
> > 
> > Also, have_submounts() can return an incorect result when a mount
> > exists in a namespace other than the one being checked.
> 
> Overall this appears to be a fairly reasonable set of changes.  It does
> increase the expense when an actual mount point is encountered, but if
> these are the desired some increase in cost when a dentry is a
> mountpoint is unavoidable.

The possibility of a significant increase in overhead with this change for
autofs is one reason I've held back on posting the change for a long time.

If there are many instances of a mount (ie. thousands) I think the mnt_namespace
mount list could become large enough to be a problem. So that list might
eventually need to be a hashed list instead of a linear list.

But this would likely also be a problem in areas other than just autofs.

> 
> May I ask the motiviation for this set of changes?  Reading through the
> changes I don't grasp why we want to change the behavior of autofs.
> What problem is being solved?  What are the benefits?
> 
> Eric
> 
> > Signed-off-by: Ian Kent 
> > Cc: Al Viro 
> > Cc: Eric W. Biederman 
> > Cc: Omar Sandoval 
> > ---
> >  fs/autofs4/dev-ioctl.c |2 +-
> >  fs/autofs4/expire.c|4 ++--
> >  fs/autofs4/root.c  |   30 +++---
> >  fs/autofs4/waitq.c |2 +-
> >  4 files changed, 19 insertions(+), 19 deletions(-)
> > 
> > diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
> > index c7fcc74..0024e25 100644
> > --- a/fs/autofs4/dev-ioctl.c
> > +++ b/fs/autofs4/dev-ioctl.c
> > @@ -564,7 +564,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file
> > *fp,
> >  
> > devid = new_encode_dev(dev);
> >  
> > -   err = have_submounts(path.dentry);
> > +   err = have_local_submounts(path.dentry);
> >  
> > if (follow_down_one(&path))
> > magic = path.dentry->d_sb->s_magic;
> > diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
> > index d8e6d42..7cc34ef 100644
> > --- a/fs/autofs4/expire.c
> > +++ b/fs/autofs4/expire.c
> > @@ -236,7 +236,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
> >  * count for the autofs dentry.
> >  * If the fs is busy update the expiry counter.
> >  */
> > -   if (d_mountpoint(p)) {
> > +   if (is_local_mountpoint(p)) {
> > if (autofs4_mount_busy(mnt, p)) {
> > top_ino->last_used = jiffies;
> > dput(p);
> > @@ -280,7 +280,7 @@ static struct dentry *autofs4_check_leaves(struct
> > vfsmount *mnt,
> > while ((p = get_next_positive_dentry(p, parent))) {
> > pr_debug("dentry %p %pd\n", p, p);
> >  
> > -   if (d_mountpoint(p)) {
> > +   if (is_local_mountpoint(p)) {
> > /* Can we umount this guy */
> > if (autofs4_mount_busy(mnt, p))
> > continue;
> > diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
> > index fa84bb8..4150ad6 100644
> > --- a/fs/autofs4/root.c
> > +++ b/fs/autofs4/root.c
> > @@ -123,7 +123,7 @@ static int autofs4_dir_open(struct inode *inode, struct
> > file *file)
> >  * it.
> >  */
> > spin_lock(&sbi->lookup_lock);
> > -   if (!d_mountpoint(dentry) && simple_empty(dentry)) {
> > +   if (!is_local_mountpoint(dentry) && simple_empty(dentry)) {
> > spin_unlock(&sbi->lookup_lock);
> > return -ENOENT;
> > }
> > @@ -370,28 +370,28 @@ static struct vfsmount *autofs4_d_automount(struct
> > path *path)
> >  
> > /*
> >  * If the dentry is a symlink it's equivalent to a directory
> > -* h

Re: [PATCH 3/4] autofs - make mountpoint checks namespace aware

2016-09-14 Thread Ian Kent

On Wed, 2016-09-14 at 12:28 -0500, Eric W. Biederman wrote:
> Ian Kent  writes:
> 
> > If an automount mount is clone(2)ed into a file system that is
> > propagation private, when it later expires in the originating
> > namespace subsequent calls to autofs ->d_automount() for that
> > dentry in the original namespace will return ELOOP until the
> > mount is manually umounted in the cloned namespace.
> > 
> > In the same way, if an autofs mount is triggered by automount(8)
> > running within a container the dentry will be seen as mounted in
> > the root init namespace and calls to ->d_automount() in that namespace
> > will return ELOOP until the mount is umounted within the container.
> > 
> > Also, have_submounts() can return an incorect result when a mount
> > exists in a namespace other than the one being checked.
> 
> Overall this appears to be a fairly reasonable set of changes.  It does
> increase the expense when an actual mount point is encountered, but if
> these are the desired some increase in cost when a dentry is a
> mountpoint is unavoidable.
> 
> May I ask the motiviation for this set of changes?  Reading through the
> changes I don't grasp why we want to change the behavior of autofs.
> What problem is being solved?  What are the benefits?

LOL, it's all too easy for me to give a patch description that I think explains
a problem I need to solve without realizing it isn't clear to others what the
problem is, sorry about that.

For quite a while now, and not that frequently but consistently, I've been
getting reports of people using autofs getting ELOOP errors and not being able
to mount automounts.

This has been due to the cloning of autofs file systems (that have active
automounts at the time of the clone) by other systems.

An unshare, as one example, can easily result in the cloning of an autofs file
system that has active mounts which shows this problem.

Once an active mount that has been cloned is expired in the namespace that
performed the unshare it can't be (auto)mounted again in the the originating
namespace because the mounted check in the autofs module will think it is
already mounted.

I'm not sure this is a clear description either, hopefully it is enough to
demonstrate the type of problem I'm typing to solve.

> Eric
> 
> > Signed-off-by: Ian Kent 
> > Cc: Al Viro 
> > Cc: Eric W. Biederman 
> > Cc: Omar Sandoval 
> > ---
> >  fs/autofs4/dev-ioctl.c |2 +-
> >  fs/autofs4/expire.c|4 ++--
> >  fs/autofs4/root.c  |   30 +++---
> >  fs/autofs4/waitq.c |2 +-
> >  4 files changed, 19 insertions(+), 19 deletions(-)
> > 
> > diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
> > index c7fcc74..0024e25 100644
> > --- a/fs/autofs4/dev-ioctl.c
> > +++ b/fs/autofs4/dev-ioctl.c
> > @@ -564,7 +564,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file
> > *fp,
> >  
> > devid = new_encode_dev(dev);
> >  
> > -   err = have_submounts(path.dentry);
> > +   err = have_local_submounts(path.dentry);
> >  
> > if (follow_down_one(&path))
> > magic = path.dentry->d_sb->s_magic;
> > diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
> > index d8e6d42..7cc34ef 100644
> > --- a/fs/autofs4/expire.c
> > +++ b/fs/autofs4/expire.c
> > @@ -236,7 +236,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
> >  * count for the autofs dentry.
> >  * If the fs is busy update the expiry counter.
> >  */
> > -   if (d_mountpoint(p)) {
> > +   if (is_local_mountpoint(p)) {
> > if (autofs4_mount_busy(mnt, p)) {
> > top_ino->last_used = jiffies;
> > dput(p);
> > @@ -280,7 +280,7 @@ static struct dentry *autofs4_check_leaves(struct
> > vfsmount *mnt,
> > while ((p = get_next_positive_dentry(p, parent))) {
> > pr_debug("dentry %p %pd\n", p, p);
> >  
> > -   if (d_mountpoint(p)) {
> > +   if (is_local_mountpoint(p)) {
> > /* Can we umount this guy */
> > if (autofs4_mount_busy(mnt, p))
> > continue;
> > diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
> > index fa84bb8..4150ad6 100644
> > --- a/fs/autofs4/root.c
> > +++ b/fs/autofs4/root.c
> > @@ -123,7 +123,7 @@ static int autofs4_dir_open(struct inode *inode, struct
> > file *file)
> >  * it.
> &g

[PATCH 4/4] fs - remove unused have_submounts() function

2016-09-13 Thread Ian Kent

Having added the have_local_submounts() function there are no
remaining users of have_submounts() so remove it.

Signed-off-by: Ian Kent 
Cc: Al Viro 
Cc: Eric W. Biederman 
Cc: Omar Sandoval 
---
 fs/dcache.c|   33 -
 include/linux/dcache.h |1 -
 2 files changed, 34 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 218166b..ad36f23 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1279,39 +1279,6 @@ rename_retry:
  * list is non-empty and continue searching.
  */
 
-static enum d_walk_ret check_mount(void *data, struct dentry *dentry)
-{
-   int *ret = data;
-   if (d_mountpoint(dentry)) {
-   *ret = 1;
-   return D_WALK_QUIT;
-   }
-   return D_WALK_CONTINUE;
-}
-
-/**
- * have_submounts - check for mounts over a dentry
- * @parent: dentry to check.
- *
- * Return true if the parent or its subdirectories contain
- * a mount point
- */
-int have_submounts(struct dentry *parent)
-{
-   int ret = 0;
-
-   d_walk(parent, &ret, check_mount, NULL);
-
-   return ret;
-}
-EXPORT_SYMBOL(have_submounts);
-
-/*
- * Search for at least 1 mount point in the dentry's subdirs.
- * We descend to the next level whenever the d_subdirs
- * list is non-empty and continue searching.
- */
-
 static enum d_walk_ret check_local_mount(void *data, struct dentry *dentry)
 {
int *ret = data;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 796b358..7220683 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -254,7 +254,6 @@ extern struct dentry *d_find_alias(struct inode *);
 extern void d_prune_aliases(struct inode *);
 
 /* test whether we have any submounts in a subdir tree */
-extern int have_submounts(struct dentry *);
 extern int have_local_submounts(struct dentry *);
 
 /*

[PATCH 3/4] autofs - make mountpoint checks namespace aware

2016-09-13 Thread Ian Kent

If an automount mount is clone(2)ed into a file system that is
propagation private, when it later expires in the originating
namespace subsequent calls to autofs ->d_automount() for that
dentry in the original namespace will return ELOOP until the
mount is manually umounted in the cloned namespace.

In the same way, if an autofs mount is triggered by automount(8)
running within a container the dentry will be seen as mounted in
the root init namespace and calls to ->d_automount() in that namespace
will return ELOOP until the mount is umounted within the container.

Also, have_submounts() can return an incorect result when a mount
exists in a namespace other than the one being checked.

Signed-off-by: Ian Kent 
Cc: Al Viro 
Cc: Eric W. Biederman 
Cc: Omar Sandoval 
---
 fs/autofs4/dev-ioctl.c |2 +-
 fs/autofs4/expire.c|4 ++--
 fs/autofs4/root.c  |   30 +++---
 fs/autofs4/waitq.c |2 +-
 4 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index c7fcc74..0024e25 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -564,7 +564,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 
devid = new_encode_dev(dev);
 
-   err = have_submounts(path.dentry);
+   err = have_local_submounts(path.dentry);
 
if (follow_down_one(&path))
magic = path.dentry->d_sb->s_magic;
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index d8e6d42..7cc34ef 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -236,7 +236,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
 * count for the autofs dentry.
 * If the fs is busy update the expiry counter.
 */
-   if (d_mountpoint(p)) {
+   if (is_local_mountpoint(p)) {
if (autofs4_mount_busy(mnt, p)) {
top_ino->last_used = jiffies;
dput(p);
@@ -280,7 +280,7 @@ static struct dentry *autofs4_check_leaves(struct vfsmount 
*mnt,
while ((p = get_next_positive_dentry(p, parent))) {
pr_debug("dentry %p %pd\n", p, p);
 
-   if (d_mountpoint(p)) {
+   if (is_local_mountpoint(p)) {
/* Can we umount this guy */
if (autofs4_mount_busy(mnt, p))
continue;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index fa84bb8..4150ad6 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -123,7 +123,7 @@ static int autofs4_dir_open(struct inode *inode, struct 
file *file)
 * it.
 */
spin_lock(&sbi->lookup_lock);
-   if (!d_mountpoint(dentry) && simple_empty(dentry)) {
+   if (!is_local_mountpoint(dentry) && simple_empty(dentry)) {
spin_unlock(&sbi->lookup_lock);
return -ENOENT;
}
@@ -370,28 +370,28 @@ static struct vfsmount *autofs4_d_automount(struct path 
*path)
 
/*
 * If the dentry is a symlink it's equivalent to a directory
-* having d_mountpoint() true, so there's no need to call back
-* to the daemon.
+* having is_local_mountpoint() true, so there's no need to
+* call back to the daemon.
 */
if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
spin_unlock(&sbi->fs_lock);
goto done;
}
 
-   if (!d_mountpoint(dentry)) {
+   if (!is_local_mountpoint(dentry)) {
/*
 * It's possible that user space hasn't removed directories
 * after umounting a rootless multi-mount, although it
-* should. For v5 have_submounts() is sufficient to handle
-* this because the leaves of the directory tree under the
-* mount never trigger mounts themselves (they have an autofs
-* trigger mount mounted on them). But v4 pseudo direct mounts
-* do need the leaves to trigger mounts. In this case we
-* have no choice but to use the list_empty() check and
-* require user space behave.
+* should. For v5 have_local_submounts() is sufficient to
+* handle this because the leaves of the directory tree under
+* the mount never trigger mounts themselves (they have an
+* autofs trigger mount mounted on them). But v4 pseudo
+* direct mounts do need the leaves to trigger mounts. In
+* this case we have no choice but to use the list_empty()
+* check and require user space behave.
 */
if (sbi->version > 4) {
-

[PATCH 2/4] fs - add have_local_submounts()

2016-09-13 Thread Ian Kent

The have_submounts() function checks if a dentry is a mountpoint
in any namespace but autofs needs to know if the dentry is a
mountpoint within the current namespace.

Add have_local_submounts() to to do this.

Signed-off-by: Ian Kent 
Cc: Al Viro 
Cc: Eric W. Biederman 
Cc: Omar Sandoval 
---
 fs/dcache.c|   34 ++
 include/linux/dcache.h |1 +
 2 files changed, 35 insertions(+)

diff --git a/fs/dcache.c b/fs/dcache.c
index 5c7cc95..218166b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1307,6 +1307,40 @@ int have_submounts(struct dentry *parent)
 EXPORT_SYMBOL(have_submounts);
 
 /*
+ * Search for at least 1 mount point in the dentry's subdirs.
+ * We descend to the next level whenever the d_subdirs
+ * list is non-empty and continue searching.
+ */
+
+static enum d_walk_ret check_local_mount(void *data, struct dentry *dentry)
+{
+   int *ret = data;
+   if (is_local_mountpoint(dentry)) {
+   *ret = 1;
+   return D_WALK_QUIT;
+   }
+   return D_WALK_CONTINUE;
+}
+
+/**
+ * have_local_submounts - check for mounts over a dentry
+ *   in the current namespace
+ * @parent: dentry to check.
+ *
+ * Return true if the parent or its subdirectories contain
+ * a mount point
+ */
+int have_local_submounts(struct dentry *parent)
+{
+   int ret = 0;
+
+   d_walk(parent, &ret, check_local_mount, NULL);
+
+   return ret;
+}
+EXPORT_SYMBOL(have_local_submounts);
+
+/*
  * Called by mount code to set a mountpoint and check if the mountpoint is
  * reachable (e.g. NFS can unhash a directory dentry and then the complete
  * subtree can become unreachable).
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 5ff3e9a..796b358 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -255,6 +255,7 @@ extern void d_prune_aliases(struct inode *);
 
 /* test whether we have any submounts in a subdir tree */
 extern int have_submounts(struct dentry *);
+extern int have_local_submounts(struct dentry *);
 
 /*
  * This adds the entry to the hash queues.

[PATCH 1/4] fs - make is_local_mountpoint() usable by others

2016-09-13 Thread Ian Kent

The is_local_mountpoint() function will be needed for autofs
to check if a dentry is a mountpoint in the current namespace.

Signed-off-by: Ian Kent 
Cc: Al Viro 
Cc: Eric W. Biederman 
Cc: Omar Sandoval 
---
 fs/mount.h|9 -
 fs/namespace.c|1 +
 include/linux/mount.h |9 +
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index 14db05d..c25e6e8 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -127,12 +127,3 @@ struct proc_mounts {
 };
 
 extern const struct seq_operations mounts_op;
-
-extern bool __is_local_mountpoint(struct dentry *dentry);
-static inline bool is_local_mountpoint(struct dentry *dentry)
-{
-   if (!d_mountpoint(dentry))
-   return false;
-
-   return __is_local_mountpoint(dentry);
-}
diff --git a/fs/namespace.c b/fs/namespace.c
index 7bb2cda..bea1507 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -725,6 +725,7 @@ bool __is_local_mountpoint(struct dentry *dentry)
 out:
return is_covered;
 }
+EXPORT_SYMBOL(__is_local_mountpoint);
 
 static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
 {
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 54a594d..575b745 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct super_block;
 struct vfsmount;
@@ -96,4 +97,12 @@ extern void mark_mounts_for_expiry(struct list_head *mounts);
 
 extern dev_t name_to_dev_t(const char *name);
 
+extern bool __is_local_mountpoint(struct dentry *dentry);
+static inline bool is_local_mountpoint(struct dentry *dentry)
+{
+   if (!d_mountpoint(dentry))
+   return false;
+
+   return __is_local_mountpoint(dentry);
+}
 #endif /* _LINUX_MOUNT_H */

[PATCH] autofs - use dentry flags to block walks during expire

2016-09-11 Thread Ian Kent

Somewhere along the way the autofs expire operation has changed to
hold a spin lock over expired dentry selection. The autofs indirect
mount expired dentry selection is complicated and quite lengthy so
it isn't appropriate to hold a spin lock over the operation.

Commit 47be6184 added a might_sleep() to dput() causing a BUG()
about this usage to be issued.

But the spin lock doesn't need to be held over this check, the
autofs dentry info. flags are enough to block walks into dentrys
during the expire.

I've left the direct mount expire as it is (for now) becuase it
is much simpler and quicker than the indirect mount expire and
adding spin lock release and re-aquires would do nothing more
than add overhead.

Fixes: 47be61845c77 ("fs/dcache.c: avoid soft-lockup in dput()")
Signed-off-by: Ian Kent 
Cc: Takashi Iwai 
Cc: Andrew Morton 
Cc: NeilBrown 
---
 fs/autofs4/expire.c |   55 +++
 1 file changed, 42 insertions(+), 13 deletions(-)

diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index b493909..d8e6d42 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -417,6 +417,7 @@ static struct dentry *should_expire(struct dentry *dentry,
}
return NULL;
 }
+
 /*
  * Find an eligible tree to time-out
  * A tree is eligible if :-
@@ -432,6 +433,7 @@ struct dentry *autofs4_expire_indirect(struct super_block 
*sb,
struct dentry *root = sb->s_root;
struct dentry *dentry;
struct dentry *expired;
+   struct dentry *found;
struct autofs_info *ino;
 
if (!root)
@@ -442,31 +444,46 @@ struct dentry *autofs4_expire_indirect(struct super_block 
*sb,
 
dentry = NULL;
while ((dentry = get_next_positive_subdir(dentry, root))) {
+   int flags = how;
+
spin_lock(&sbi->fs_lock);
ino = autofs4_dentry_ino(dentry);
-   if (ino->flags & AUTOFS_INF_WANT_EXPIRE)
-   expired = NULL;
-   else
-   expired = should_expire(dentry, mnt, timeout, how);
-   if (!expired) {
+   if (ino->flags & AUTOFS_INF_WANT_EXPIRE) {
spin_unlock(&sbi->fs_lock);
continue;
}
+   spin_unlock(&sbi->fs_lock);
+
+   expired = should_expire(dentry, mnt, timeout, flags);
+   if (!expired)
+   continue;
+
+   spin_lock(&sbi->fs_lock);
ino = autofs4_dentry_ino(expired);
ino->flags |= AUTOFS_INF_WANT_EXPIRE;
spin_unlock(&sbi->fs_lock);
synchronize_rcu();
-   spin_lock(&sbi->fs_lock);
-   if (should_expire(expired, mnt, timeout, how)) {
-   if (expired != dentry)
-   dput(dentry);
-   goto found;
-   }
 
+   /* Make sure a reference is not taken on found if
+* things have changed.
+*/
+   flags &= ~AUTOFS_EXP_LEAVES;
+   found = should_expire(expired, mnt, timeout, how);
+   if (!found || found != expired)
+   /* Something has changed, continue */
+   goto next;
+
+   if (expired != dentry)
+   dput(dentry);
+
+   spin_lock(&sbi->fs_lock);
+   goto found;
+next:
+   spin_lock(&sbi->fs_lock);
ino->flags &= ~AUTOFS_INF_WANT_EXPIRE;
+   spin_unlock(&sbi->fs_lock);
if (expired != dentry)
dput(expired);
-   spin_unlock(&sbi->fs_lock);
}
return NULL;
 
@@ -483,6 +500,7 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
+   int state;
 
/* Block on any pending expire */
if (!(ino->flags & AUTOFS_INF_WANT_EXPIRE))
@@ -490,8 +508,19 @@ int autofs4_expire_wait(struct dentry *dentry, int 
rcu_walk)
if (rcu_walk)
return -ECHILD;
 
+retry:
spin_lock(&sbi->fs_lock);
-   if (ino->flags & AUTOFS_INF_EXPIRING) {
+   state = ino->flags & (AUTOFS_INF_WANT_EXPIRE | AUTOFS_INF_EXPIRING);
+   if (state == AUTOFS_INF_WANT_EXPIRE) {
+   spin_unlock(&sbi->fs_lock);
+   /*
+* Possibly being selected for expire, wait until
+* it's selected or not.
+*/
+   schedule_timeout_uninterruptible(HZ/10);
+   goto retry;
+   }
+   if (state & AUTOFS_INF_EXPIRING) {
spin_unlock(&sbi->fs_lock);
 
pr_debug("waiting for expire %p name=%pd\n", dentry, dentry);

Re: [PATCH] autofs - use dentry flags to block walks during expire

2016-09-08 Thread Ian Kent

On Fri, 2016-09-09 at 11:39 +1000, NeilBrown wrote:
> On Thu, Sep 01 2016, Ian Kent wrote:
> 
> > Somewhere along the way the autofs expire operation has changed to
> > hold a spin lock over expired dentry selection. The autofs indirect
> > mount expired dentry selection is complicated and quite lengthy so
> > it isn't appropriate to hold a spin lock over the operation.
> > 
> > Commit 47be6184 added a might_sleep() to dput() causing a BUG()
> > about this usage to be issued.
> > 
> > But the spin lock doesn't need to be held over this check, the
> > autofs dentry info. flags are enough to block walks into dentrys
> > during the expire.
> > 
> > I've left the direct mount expire as it is (for now) becuase it
> > is much simpler and quicker than the indirect mount expire and
> > adding spin lock release and re-aquires would do nothing more
> > than add overhead.
> > 
> > Signed-off-by: Ian Kent 
> > ---
> >  fs/autofs4/expire.c |   55 +++-
> > ---
> >  1 file changed, 42 insertions(+), 13 deletions(-)
> > 
> > diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
> > index b493909..2d8e176 100644
> > --- a/fs/autofs4/expire.c
> > +++ b/fs/autofs4/expire.c
> > @@ -417,6 +417,7 @@ static struct dentry *should_expire(struct dentry
> > *dentry,
> > }
> > return NULL;
> >  }
> > +
> >  /*
> >   * Find an eligible tree to time-out
> >   * A tree is eligible if :-
> > @@ -432,6 +433,7 @@ struct dentry *autofs4_expire_indirect(struct
> > super_block *sb,
> > struct dentry *root = sb->s_root;
> > struct dentry *dentry;
> > struct dentry *expired;
> > +   struct dentry *found;
> > struct autofs_info *ino;
> >  
> > if (!root)
> > @@ -442,31 +444,46 @@ struct dentry *autofs4_expire_indirect(struct
> > super_block *sb,
> >  
> > dentry = NULL;
> > while ((dentry = get_next_positive_subdir(dentry, root))) {
> > +   int flags = how;
> > +
> > spin_lock(&sbi->fs_lock);
> > ino = autofs4_dentry_ino(dentry);
> > -   if (ino->flags & AUTOFS_INF_WANT_EXPIRE)
> > -   expired = NULL;
> > -   else
> > -   expired = should_expire(dentry, mnt, timeout, how);
> > -   if (!expired) {
> > +   if (ino->flags & AUTOFS_INF_WANT_EXPIRE) {
> > spin_unlock(&sbi->fs_lock);
> > continue;
> > }
> > +   spin_unlock(&sbi->fs_lock);
> > +
> > +   expired = should_expire(dentry, mnt, timeout, flags);
> > +   if (!expired)
> > +   continue;
> > +
> > +   spin_lock(&sbi->fs_lock);
> > ino = autofs4_dentry_ino(expired);
> > ino->flags |= AUTOFS_INF_WANT_EXPIRE;
> > spin_unlock(&sbi->fs_lock);
> > synchronize_rcu();
> > -   spin_lock(&sbi->fs_lock);
> > -   if (should_expire(expired, mnt, timeout, how)) {
> > -   if (expired != dentry)
> > -   dput(dentry);
> > -   goto found;
> > -   }
> >  
> > +   /* Make sure a reference is not taken on found if
> > +* things have changed.
> > +*/
> > +   flags &= ~AUTOFS_EXP_LEAVES;
> > +   found = should_expire(expired, mnt, timeout, how);
> > +   if (!found || found != expired)
> > +   /* Something has changed, continue */
> > +   goto next;
> > +
> > +   if (expired != dentry)
> > +   dput(dentry);
> > +
> > +   spin_lock(&sbi->fs_lock);
> > +   goto found;
> > +next:
> > +   spin_lock(&sbi->fs_lock);
> > ino->flags &= ~AUTOFS_INF_WANT_EXPIRE;
> > +   spin_unlock(&sbi->fs_lock);
> > if (expired != dentry)
> > dput(expired);
> > -   spin_unlock(&sbi->fs_lock);
> > }
> > return NULL;
> >  
> > @@ -483,6 +500,7 @@ int autofs4_expire_wait(struct dentry *dentry, int
> > rcu_walk)
> > struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
> > struct autofs_info *ino = autofs4_dentry_ino(dentry);
> > int status;
> > +   in

[PATCH] autofs - use dentry flags to block walks during expire

2016-08-31 Thread Ian Kent

Somewhere along the way the autofs expire operation has changed to
hold a spin lock over expired dentry selection. The autofs indirect
mount expired dentry selection is complicated and quite lengthy so
it isn't appropriate to hold a spin lock over the operation.

Commit 47be6184 added a might_sleep() to dput() causing a BUG()
about this usage to be issued.

But the spin lock doesn't need to be held over this check, the
autofs dentry info. flags are enough to block walks into dentrys
during the expire.

I've left the direct mount expire as it is (for now) becuase it
is much simpler and quicker than the indirect mount expire and
adding spin lock release and re-aquires would do nothing more
than add overhead.

Signed-off-by: Ian Kent 
---
 fs/autofs4/expire.c |   55 +++
 1 file changed, 42 insertions(+), 13 deletions(-)

diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index b493909..2d8e176 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -417,6 +417,7 @@ static struct dentry *should_expire(struct dentry *dentry,
}
return NULL;
 }
+
 /*
  * Find an eligible tree to time-out
  * A tree is eligible if :-
@@ -432,6 +433,7 @@ struct dentry *autofs4_expire_indirect(struct super_block 
*sb,
struct dentry *root = sb->s_root;
struct dentry *dentry;
struct dentry *expired;
+   struct dentry *found;
struct autofs_info *ino;
 
if (!root)
@@ -442,31 +444,46 @@ struct dentry *autofs4_expire_indirect(struct super_block 
*sb,
 
dentry = NULL;
while ((dentry = get_next_positive_subdir(dentry, root))) {
+   int flags = how;
+
spin_lock(&sbi->fs_lock);
ino = autofs4_dentry_ino(dentry);
-   if (ino->flags & AUTOFS_INF_WANT_EXPIRE)
-   expired = NULL;
-   else
-   expired = should_expire(dentry, mnt, timeout, how);
-   if (!expired) {
+   if (ino->flags & AUTOFS_INF_WANT_EXPIRE) {
spin_unlock(&sbi->fs_lock);
continue;
}
+   spin_unlock(&sbi->fs_lock);
+
+   expired = should_expire(dentry, mnt, timeout, flags);
+   if (!expired)
+   continue;
+
+   spin_lock(&sbi->fs_lock);
ino = autofs4_dentry_ino(expired);
ino->flags |= AUTOFS_INF_WANT_EXPIRE;
spin_unlock(&sbi->fs_lock);
synchronize_rcu();
-   spin_lock(&sbi->fs_lock);
-   if (should_expire(expired, mnt, timeout, how)) {
-   if (expired != dentry)
-   dput(dentry);
-   goto found;
-   }
 
+   /* Make sure a reference is not taken on found if
+* things have changed.
+*/
+   flags &= ~AUTOFS_EXP_LEAVES;
+   found = should_expire(expired, mnt, timeout, how);
+   if (!found || found != expired)
+   /* Something has changed, continue */
+   goto next;
+
+   if (expired != dentry)
+   dput(dentry);
+
+   spin_lock(&sbi->fs_lock);
+   goto found;
+next:
+   spin_lock(&sbi->fs_lock);
ino->flags &= ~AUTOFS_INF_WANT_EXPIRE;
+   spin_unlock(&sbi->fs_lock);
if (expired != dentry)
dput(expired);
-   spin_unlock(&sbi->fs_lock);
}
return NULL;
 
@@ -483,6 +500,7 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
+   int state;
 
/* Block on any pending expire */
if (!(ino->flags & AUTOFS_INF_WANT_EXPIRE))
@@ -490,8 +508,19 @@ int autofs4_expire_wait(struct dentry *dentry, int 
rcu_walk)
if (rcu_walk)
return -ECHILD;
 
+retry:
spin_lock(&sbi->fs_lock);
-   if (ino->flags & AUTOFS_INF_EXPIRING) {
+   state = ino->flags & (AUTOFS_INF_WANT_EXPIRE | AUTOFS_INF_EXPIRING);
+   if (state == AUTOFS_INF_WANT_EXPIRE) {
+   spin_unlock(&sbi->fs_lock);
+   /*
+* Possibly being selected for expire, wait until
+* it's selected or not.
+*/
+   schedule_timeout(HZ/10);
+   goto retry;
+   }
+   if (state & AUTOFS_INF_EXPIRING) {
spin_unlock(&sbi->fs_lock);
 
pr_debug("waiting for expire %p name=%pd\n", dentry, dentry);

[PATCH 1/3] autofs: remove possibly misleading /* #define DEBUG */

2016-08-30 Thread Ian Kent

From: Tomohiro Kusumi 

Having this in autofs_i.h gives illusion that uncommenting this
enables pr_debug(), but it doesn't enable all the pr_debug() in
autofs because inclusion order matters.

XFS has the same DEBUG macro in its core header fs/xfs/xfs.h,
however XFS seems to have a rule to include this prior to other
XFS headers as well as kernel headers. This is not the case with
autofs, and DEBUG could be enabled via Makefile, so autofs should
just get rid of this comment to make the code less confusing.
It's a comment, so there is literally no functional difference.

Signed-off-by: Tomohiro Kusumi 
Signed-off-by: Ian Kent 
---
 fs/autofs4/autofs_i.h |2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index dd71bd4..4404a22 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -34,8 +34,6 @@
 #include 
 #include 
 
-/* #define DEBUG */
-
 #ifdef pr_fmt
 #undef pr_fmt
 #endif

[PATCH 3/3] autofs: fix "fix dev ioctl number range check"

2016-08-30 Thread Ian Kent

From: Tomohiro Kusumi 

102a340f had a typo that made the count macro negative (-13).
The acutal check used by ioctl is ((cmd - cmd_first) > COUNT),
so it needs to be positive (13).

* 102a340f is a commit in linux-next which hasn't been merged
to mainline upstream.

Signed-off-by: Tomohiro Kusumi 
Signed-off-by: Ian Kent 
---
 fs/autofs4/autofs_i.h |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 4404a22..a1fba42 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -21,7 +21,7 @@
 
 #define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION)
 #define AUTOFS_DEV_IOCTL_IOC_COUNT \
-   (AUTOFS_DEV_IOCTL_VERSION_CMD - AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD)
+   (AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD - AUTOFS_DEV_IOCTL_VERSION_CMD)
 
 #include 
 #include

[PATCH 2/3] autofs: refactor ioctl fn vector in iookup_dev_ioctl()

2016-08-30 Thread Ian Kent

From: Tomohiro Kusumi 

cmd part of this struct is the same as an index of itself within
_ioctls[]. In fact this cmd is unused, so we can drop this part.

Signed-off-by: Tomohiro Kusumi 
Signed-off-by: Ian Kent 
---
 fs/autofs4/dev-ioctl.c |   49 
 1 file changed, 16 insertions(+), 33 deletions(-)

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index e89d6bb..fc09eb7 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -597,42 +597,25 @@ out:
 
 static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
 {
-   static struct {
-   int cmd;
-   ioctl_fn fn;
-   } _ioctls[] = {
-   {cmd_idx(AUTOFS_DEV_IOCTL_VERSION_CMD),
-autofs_dev_ioctl_version},
-   {cmd_idx(AUTOFS_DEV_IOCTL_PROTOVER_CMD),
-autofs_dev_ioctl_protover},
-   {cmd_idx(AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD),
-autofs_dev_ioctl_protosubver},
-   {cmd_idx(AUTOFS_DEV_IOCTL_OPENMOUNT_CMD),
-autofs_dev_ioctl_openmount},
-   {cmd_idx(AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD),
-autofs_dev_ioctl_closemount},
-   {cmd_idx(AUTOFS_DEV_IOCTL_READY_CMD),
-autofs_dev_ioctl_ready},
-   {cmd_idx(AUTOFS_DEV_IOCTL_FAIL_CMD),
-autofs_dev_ioctl_fail},
-   {cmd_idx(AUTOFS_DEV_IOCTL_SETPIPEFD_CMD),
-autofs_dev_ioctl_setpipefd},
-   {cmd_idx(AUTOFS_DEV_IOCTL_CATATONIC_CMD),
-autofs_dev_ioctl_catatonic},
-   {cmd_idx(AUTOFS_DEV_IOCTL_TIMEOUT_CMD),
-autofs_dev_ioctl_timeout},
-   {cmd_idx(AUTOFS_DEV_IOCTL_REQUESTER_CMD),
-autofs_dev_ioctl_requester},
-   {cmd_idx(AUTOFS_DEV_IOCTL_EXPIRE_CMD),
-autofs_dev_ioctl_expire},
-   {cmd_idx(AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD),
-autofs_dev_ioctl_askumount},
-   {cmd_idx(AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD),
-autofs_dev_ioctl_ismountpoint}
+   static ioctl_fn _ioctls[] = {
+   autofs_dev_ioctl_version,
+   autofs_dev_ioctl_protover,
+   autofs_dev_ioctl_protosubver,
+   autofs_dev_ioctl_openmount,
+   autofs_dev_ioctl_closemount,
+   autofs_dev_ioctl_ready,
+   autofs_dev_ioctl_fail,
+   autofs_dev_ioctl_setpipefd,
+   autofs_dev_ioctl_catatonic,
+   autofs_dev_ioctl_timeout,
+   autofs_dev_ioctl_requester,
+   autofs_dev_ioctl_expire,
+   autofs_dev_ioctl_askumount,
+   autofs_dev_ioctl_ismountpoint,
};
unsigned int idx = cmd_idx(cmd);
 
-   return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx].fn;
+   return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx];
 }
 
 /* ioctl dispatcher */

Re: 4.8-rc4 spews "BUG: sleeping function called from invalid context at fs/dcache.c:757"

2016-08-29 Thread Ian Kent

On Tue, 2016-08-30 at 09:37 +0800, Ian Kent wrote:
> On Mon, 2016-08-29 at 16:18 +0100, Al Viro wrote:
> > On Mon, Aug 29, 2016 at 04:35:46PM +0200, Takashi Iwai wrote:
> > 
> > >   [] dput+0x46/0x400
> > ... which should not be called in atomic contexts
> > >   [] follow_down_one+0x27/0x60
> > ... and neither should this
> > >   [] autofs4_mount_busy+0x32/0x110
> > ... nor that (for fsck sake, there's full-blown path_put() in it!)
> > >   [] should_expire+0x51/0x3d0
> > ... so that would better not be called in atomic either (incidentally,
> > it also calls dput() directly)
> > >   [] autofs4_expire_indirect+0x190/0x2d0
> > ... while here it is called under sbi->fs_lock.
> > 
> > > I don't remember of a similar stack trace in the past, so if any, it
> > > can be a regression in 4.8 kernel.  But I cannot say it in 100%, as
> > > this looks spontaneous, nor I would be able to reproduce it at the
> > > next boot...
> > 
> > It's old; the race is narrow, but it's been there for quite a while, by
> > the look of it.
> 
> Right, I missed that when the rcu-walk concurrency changes went in, mmm 

Umm ... no, the problem has been there much longer than that ...

Re: 4.8-rc4 spews "BUG: sleeping function called from invalid context at fs/dcache.c:757"

2016-08-29 Thread Ian Kent

On Mon, 2016-08-29 at 16:18 +0100, Al Viro wrote:
> On Mon, Aug 29, 2016 at 04:35:46PM +0200, Takashi Iwai wrote:
> 
> >   [] dput+0x46/0x400
>   ... which should not be called in atomic contexts
> >   [] follow_down_one+0x27/0x60
>   ... and neither should this
> >   [] autofs4_mount_busy+0x32/0x110
>   ... nor that (for fsck sake, there's full-blown path_put() in it!)
> >   [] should_expire+0x51/0x3d0
>   ... so that would better not be called in atomic either (incidentally,
> it also calls dput() directly)
> >   [] autofs4_expire_indirect+0x190/0x2d0
>   ... while here it is called under sbi->fs_lock.
> 
> > I don't remember of a similar stack trace in the past, so if any, it
> > can be a regression in 4.8 kernel.  But I cannot say it in 100%, as
> > this looks spontaneous, nor I would be able to reproduce it at the
> > next boot...
> 
> It's old; the race is narrow, but it's been there for quite a while, by
> the look of it.

Right, I missed that when the rcu-walk concurrency changes went in, mmm

Re: [PATCH v05 37/72] include/uapi/linux/auto_fs.h: include linux/limits.h

2016-08-22 Thread Ian Kent

On Mon, 2016-08-22 at 20:32 +0200, Mikko Rapeli wrote:
> Fixes userspace compilation error:
> 
> error: ‘NAME_MAX’ undeclared here (not in a function)

Thanks for your effort but a patch for this, from from Tomohiro Kusumi, is
currently included in the mm tree.

> 
> Signed-off-by: Mikko Rapeli 
> ---
>  include/uapi/linux/auto_fs.h | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h
> index 9175a1b..1bfc3ed 100644
> --- a/include/uapi/linux/auto_fs.h
> +++ b/include/uapi/linux/auto_fs.h
> @@ -12,6 +12,7 @@
>  #define _UAPI_LINUX_AUTO_FS_H
>  
>  #include 
> +#include 
>  #ifndef __KERNEL__
>  #include 
>  #endif /* __KERNEL__ */

[PATCH 02/18] autofs: Drop unnecessary extern in autofs_i.h

2016-08-11 Thread Ian Kent

From: Tomohiro Kusumi 

autofs4_kill_sb() doesn't need to be declared as extern,
and no other functions in .h are explicitly declared as extern.

Signed-off-by: Tomohiro Kusumi 
Signed-off-by: Ian Kent 
---
 fs/autofs4/autofs_i.h |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index a439548..eb87055 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -271,4 +271,4 @@ static inline void autofs4_del_expiring(struct dentry 
*dentry)
}
 }
 
-extern void autofs4_kill_sb(struct super_block *);
+void autofs4_kill_sb(struct super_block *);

[PATCH 12/18] autofs: Update struct autofs_dev_ioctl in Documentation

2016-08-11 Thread Ian Kent

From: Tomohiro Kusumi 

Sync with changes made by 730c9eec which introduced an union for
various ioctl commands instead of having statically named arg1,2.

This commit simply replaces arg1,2 with the corresponding fields
without changing semantics.

Signed-off-by: Tomohiro Kusumi 
Signed-off-by: Ian Kent 
---
 .../filesystems/autofs4-mount-control.txt  |   70 
 1 file changed, 42 insertions(+), 28 deletions(-)

diff --git a/Documentation/filesystems/autofs4-mount-control.txt 
b/Documentation/filesystems/autofs4-mount-control.txt
index 540d9a7..50a3e01 100644
--- a/Documentation/filesystems/autofs4-mount-control.txt
+++ b/Documentation/filesystems/autofs4-mount-control.txt
@@ -179,8 +179,19 @@ struct autofs_dev_ioctl {
 * including this struct */
__s32 ioctlfd;  /* automount command fd */
 
-   __u32 arg1; /* Command parameters */
-   __u32 arg2;
+   union {
+   struct args_protoverprotover;
+   struct args_protosubver protosubver;
+   struct args_openmount   openmount;
+   struct args_ready   ready;
+   struct args_failfail;
+   struct args_setpipefd   setpipefd;
+   struct args_timeout timeout;
+   struct args_requester   requester;
+   struct args_expire  expire;
+   struct args_askumount   askumount;
+   struct args_ismountpointismountpoint;
+   };
 
char path[0];
 };
@@ -192,8 +203,8 @@ optionally be used to check a specific mount corresponding 
to a given
 mount point file descriptor, and when requesting the uid and gid of the
 last successful mount on a directory within the autofs file system.
 
-The fields arg1 and arg2 are used to communicate parameters and results of
-calls made as described below.
+The union is used to communicate parameters and results of calls made
+as described below.
 
 The path field is used to pass a path where it is needed and the size field
 is used account for the increased structure length when translating the
@@ -245,9 +256,9 @@ AUTOFS_DEV_IOCTL_PROTOVER_CMD and 
AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD
 Get the major and minor version of the autofs4 protocol version understood
 by loaded module. This call requires an initialized struct autofs_dev_ioctl
 with the ioctlfd field set to a valid autofs mount point descriptor
-and sets the requested version number in structure field arg1. These
-commands return 0 on success or one of the negative error codes if
-validation fails.
+and sets the requested version number in version field of struct args_protover
+or sub_version field of struct args_protosubver. These commands return
+0 on success or one of the negative error codes if validation fails.
 
 
 AUTOFS_DEV_IOCTL_OPENMOUNT and AUTOFS_DEV_IOCTL_CLOSEMOUNT
@@ -256,9 +267,9 @@ AUTOFS_DEV_IOCTL_OPENMOUNT and AUTOFS_DEV_IOCTL_CLOSEMOUNT
 Obtain and release a file descriptor for an autofs managed mount point
 path. The open call requires an initialized struct autofs_dev_ioctl with
 the path field set and the size field adjusted appropriately as well
-as the arg1 field set to the device number of the autofs mount. The
-device number can be obtained from the mount options shown in
-/proc/mounts. The close call requires an initialized struct
+as the devid field of struct args_openmount set to the device number of
+the autofs mount. The device number can be obtained from the mount options
+shown in /proc/mounts. The close call requires an initialized struct
 autofs_dev_ioct with the ioctlfd field set to the descriptor obtained
 from the open call. The release of the file descriptor can also be done
 with close(2) so any open descriptors will also be closed at process exit.
@@ -272,10 +283,10 @@ AUTOFS_DEV_IOCTL_READY_CMD and AUTOFS_DEV_IOCTL_FAIL_CMD
 Return mount and expire result status from user space to the kernel.
 Both of these calls require an initialized struct autofs_dev_ioctl
 with the ioctlfd field set to the descriptor obtained from the open
-call and the arg1 field set to the wait queue token number, received
-by user space in the foregoing mount or expire request. The arg2 field
-is set to the status to be returned. For the ready call this is always
-0 and for the fail call it is set to the errno of the operation.
+call and the token field of struct args_ready or struct args_fail set
+to the wait queue token number, received by user space in the foregoing
+mount or expire request. The status field of struct args_fail is set to
+the errno of the operation. It is set to 0 on success.
 
 
 AUTOFS_DEV_IOCTL_SETPIPEFD_CMD
@@ -290,9 +301,10 @@ mount be catatonic (see next call).
 
 The call requires an initialized struct autofs_dev_ioctl with the
 ioctlfd field set to the descriptor obtained from the open call and
-the arg1

[PATCH 09/18] autofs: Don't fail to free_dev_ioctl(param)

2016-08-11 Thread Ian Kent

From: Tomohiro Kusumi 

Returning -ENOTTY here fails to free dynamically allocated param.

Signed-off-by: Tomohiro Kusumi 
Signed-off-by: Ian Kent 
---
 fs/autofs4/dev-ioctl.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index c7fcc74..d47b35a 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -662,7 +662,8 @@ static int _autofs_dev_ioctl(unsigned int command,
fn = lookup_dev_ioctl(cmd);
if (!fn) {
pr_warn("unknown command 0x%08x\n", command);
-   return -ENOTTY;
+   err = -ENOTTY;
+   goto out;
}
 
fp = NULL;

[PATCH 11/18] autofs: Fix Documentation regarding devid on ioctl

2016-08-11 Thread Ian Kent

From: Tomohiro Kusumi 

The explanation on how ioctl handles devid seems incorrect.
Userspace who calls this ioctl has no input regarding devid,
and ioctl implementation retrieves devid via superblock.

Signed-off-by: Tomohiro Kusumi 
Signed-off-by: Ian Kent 
---
 .../filesystems/autofs4-mount-control.txt  |5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/Documentation/filesystems/autofs4-mount-control.txt 
b/Documentation/filesystems/autofs4-mount-control.txt
index aff2211..540d9a7 100644
--- a/Documentation/filesystems/autofs4-mount-control.txt
+++ b/Documentation/filesystems/autofs4-mount-control.txt
@@ -323,9 +323,8 @@ mount on the given path dentry.
 
 The call requires an initialized struct autofs_dev_ioctl with the path
 field set to the mount point in question and the size field adjusted
-appropriately as well as the arg1 field set to the device number of the
-containing autofs mount. Upon return the struct field arg1 contains the
-uid and arg2 the gid.
+appropriately. Upon return the struct field arg1 contains the uid and
+arg2 the gid.
 
 When reconstructing an autofs mount tree with active mounts we need to
 re-connect to mounts that may have used the original process uid and

[PATCH 15/18] autofs: Add autofs_dev_ioctl_version() for AUTOFS_DEV_IOCTL_VERSION_CMD

2016-08-11 Thread Ian Kent

From: Tomohiro Kusumi 

No functional changes, based on the following justification.

1. Make the code more consistent using the ioctl vector _ioctls[],
   rather than assigning NULL only for this ioctl command.
2. Remove goto done; for better maintainability in the long run.
3. The existing code is based on the fact that validate_dev_ioctl()
   sets ioctl version for any command, but AUTOFS_DEV_IOCTL_VERSION_CMD
   should explicitly set it regardless of the default behavior.

Signed-off-by: Tomohiro Kusumi 
Signed-off-by: Ian Kent 
---
 fs/autofs4/dev-ioctl.c |   25 +
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 13e7517..7289216 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -172,6 +172,17 @@ static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct 
file *f)
return sbi;
 }
 
+/* Return autofs dev ioctl version */
+static int autofs_dev_ioctl_version(struct file *fp,
+   struct autofs_sb_info *sbi,
+   struct autofs_dev_ioctl *param)
+{
+   /* This should have already been set. */
+   param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
+   param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
+   return 0;
+}
+
 /* Return autofs module protocol version */
 static int autofs_dev_ioctl_protover(struct file *fp,
 struct autofs_sb_info *sbi,
@@ -590,7 +601,8 @@ static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
int cmd;
ioctl_fn fn;
} _ioctls[] = {
-   {cmd_idx(AUTOFS_DEV_IOCTL_VERSION_CMD), NULL},
+   {cmd_idx(AUTOFS_DEV_IOCTL_VERSION_CMD),
+autofs_dev_ioctl_version},
{cmd_idx(AUTOFS_DEV_IOCTL_PROTOVER_CMD),
 autofs_dev_ioctl_protover},
{cmd_idx(AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD),
@@ -655,10 +667,6 @@ static int _autofs_dev_ioctl(unsigned int command,
if (err)
goto out;
 
-   /* The validate routine above always sets the version */
-   if (cmd == AUTOFS_DEV_IOCTL_VERSION_CMD)
-   goto done;
-
fn = lookup_dev_ioctl(cmd);
if (!fn) {
pr_warn("unknown command 0x%08x\n", command);
@@ -672,9 +680,11 @@ static int _autofs_dev_ioctl(unsigned int command,
/*
 * For obvious reasons the openmount can't have a file
 * descriptor yet. We don't take a reference to the
-* file during close to allow for immediate release.
+* file during close to allow for immediate release,
+* and the same for retrieving ioctl version.
 */
-   if (cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD &&
+   if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD &&
+   cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD &&
cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) {
fp = fget(param->ioctlfd);
if (!fp) {
@@ -707,7 +717,6 @@ cont:
 
if (fp)
fput(fp);
-done:
if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE))
err = -EFAULT;
 out:

[PATCH 10/18] autofs: Remove AUTOFS_DEVID_LEN

2016-08-11 Thread Ian Kent

From: Tomohiro Kusumi 

This macro was never used by neither kernel nor userspace,
and also doesn't represent "devid length" in bytes.
(unless it was added to mean something else).

Signed-off-by: Tomohiro Kusumi 
Signed-off-by: Ian Kent 
---
 include/linux/auto_dev-ioctl.h |2 --
 1 file changed, 2 deletions(-)

diff --git a/include/linux/auto_dev-ioctl.h b/include/linux/auto_dev-ioctl.h
index 7caaf29..bf82e3a 100644
--- a/include/linux/auto_dev-ioctl.h
+++ b/include/linux/auto_dev-ioctl.h
@@ -18,8 +18,6 @@
 #define AUTOFS_DEV_IOCTL_VERSION_MAJOR 1
 #define AUTOFS_DEV_IOCTL_VERSION_MINOR 0
 
-#define AUTOFS_DEVID_LEN   16
-
 #define AUTOFS_DEV_IOCTL_SIZE  sizeof(struct autofs_dev_ioctl)
 
 /*

[PATCH 13/18] autofs: Fix pr_debug() message

2016-08-11 Thread Ian Kent

From: Tomohiro Kusumi 

This isn't a return value, so change the message to indicate
the status is the result of may_umount().

(or locate pr_debug() after put_user() with the same message)

Signed-off-by: Tomohiro Kusumi 
Signed-off-by: Ian Kent 
---
 fs/autofs4/root.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 1b0495a..d25c55f 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -840,7 +840,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, 
int __user *p)
if (may_umount(mnt))
status = 1;
 
-   pr_debug("returning %d\n", status);
+   pr_debug("may umount %d\n", status);
 
status = put_user(status, p);

[PATCH 18/18] autofs4 - move linux/auto_dev-ioctl.h to uapi/linux

2016-08-11 Thread Ian Kent

Since linux/auto_dev-ioctl.h wasn't included in include/linux/Kbuild
it wasn't moved to uapi/linux as part of the uapi series.

Signed-off-by: Ian Kent 
---
 include/linux/auto_dev-ioctl.h  |  209 -
 include/uapi/linux/auto_dev-ioctl.h |  221 +++
 2 files changed, 222 insertions(+), 208 deletions(-)
 create mode 100644 include/uapi/linux/auto_dev-ioctl.h

diff --git a/include/linux/auto_dev-ioctl.h b/include/linux/auto_dev-ioctl.h
index bf82e3a..28c1505 100644
--- a/include/linux/auto_dev-ioctl.h
+++ b/include/linux/auto_dev-ioctl.h
@@ -10,212 +10,5 @@
 #ifndef _LINUX_AUTO_DEV_IOCTL_H
 #define _LINUX_AUTO_DEV_IOCTL_H
 
-#include 
-#include 
-
-#define AUTOFS_DEVICE_NAME "autofs"
-
-#define AUTOFS_DEV_IOCTL_VERSION_MAJOR 1
-#define AUTOFS_DEV_IOCTL_VERSION_MINOR 0
-
-#define AUTOFS_DEV_IOCTL_SIZE  sizeof(struct autofs_dev_ioctl)
-
-/*
- * An ioctl interface for autofs mount point control.
- */
-
-struct args_protover {
-   __u32   version;
-};
-
-struct args_protosubver {
-   __u32   sub_version;
-};
-
-struct args_openmount {
-   __u32   devid;
-};
-
-struct args_ready {
-   __u32   token;
-};
-
-struct args_fail {
-   __u32   token;
-   __s32   status;
-};
-
-struct args_setpipefd {
-   __s32   pipefd;
-};
-
-struct args_timeout {
-   __u64   timeout;
-};
-
-struct args_requester {
-   __u32   uid;
-   __u32   gid;
-};
-
-struct args_expire {
-   __u32   how;
-};
-
-struct args_askumount {
-   __u32   may_umount;
-};
-
-struct args_ismountpoint {
-   union {
-   struct args_in {
-   __u32   type;
-   } in;
-   struct args_out {
-   __u32   devid;
-   __u32   magic;
-   } out;
-   };
-};
-
-/*
- * All the ioctls use this structure.
- * When sending a path size must account for the total length
- * of the chunk of memory otherwise is is the size of the
- * structure.
- */
-
-struct autofs_dev_ioctl {
-   __u32 ver_major;
-   __u32 ver_minor;
-   __u32 size; /* total size of data passed in
-* including this struct */
-   __s32 ioctlfd;  /* automount command fd */
-
-   /* Command parameters */
-
-   union {
-   struct args_protoverprotover;
-   struct args_protosubver protosubver;
-   struct args_openmount   openmount;
-   struct args_ready   ready;
-   struct args_failfail;
-   struct args_setpipefd   setpipefd;
-   struct args_timeout timeout;
-   struct args_requester   requester;
-   struct args_expire  expire;
-   struct args_askumount   askumount;
-   struct args_ismountpointismountpoint;
-   };
-
-   char path[0];
-};
-
-static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in)
-{
-   memset(in, 0, sizeof(struct autofs_dev_ioctl));
-   in->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
-   in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
-   in->size = sizeof(struct autofs_dev_ioctl);
-   in->ioctlfd = -1;
-}
-
-/*
- * If you change this make sure you make the corresponding change
- * to autofs-dev-ioctl.c:lookup_ioctl()
- */
-enum {
-   /* Get various version info */
-   AUTOFS_DEV_IOCTL_VERSION_CMD = 0x71,
-   AUTOFS_DEV_IOCTL_PROTOVER_CMD,
-   AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD,
-
-   /* Open mount ioctl fd */
-   AUTOFS_DEV_IOCTL_OPENMOUNT_CMD,
-
-   /* Close mount ioctl fd */
-   AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD,
-
-   /* Mount/expire status returns */
-   AUTOFS_DEV_IOCTL_READY_CMD,
-   AUTOFS_DEV_IOCTL_FAIL_CMD,
-
-   /* Activate/deactivate autofs mount */
-   AUTOFS_DEV_IOCTL_SETPIPEFD_CMD,
-   AUTOFS_DEV_IOCTL_CATATONIC_CMD,
-
-   /* Expiry timeout */
-   AUTOFS_DEV_IOCTL_TIMEOUT_CMD,
-
-   /* Get mount last requesting uid and gid */
-   AUTOFS_DEV_IOCTL_REQUESTER_CMD,
-
-   /* Check for eligible expire candidates */
-   AUTOFS_DEV_IOCTL_EXPIRE_CMD,
-
-   /* Request busy status */
-   AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD,
-
-   /* Check if path is a mountpoint */
-   AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD,
-};
-
-#define AUTOFS_IOCTL 0x93
-
-#define AUTOFS_DEV_IOCTL_VERSION \
-   _IOWR(AUTOFS_IOCTL, \
- AUTOFS_DEV_IOCTL_VERSION_CMD, struct autofs_dev_ioctl)
-
-#define AUTOFS_DEV_IOCTL_PROTOVER \
-   _IOWR(AUTOFS_IOCTL, \
- AUTOFS_DEV_IOCTL_PROTOVER_CMD, struct autofs_dev_ioctl)
-
-#define AUTOFS_DEV_IOCTL_PROTOSUBVER \
-   _IOWR(AUTOFS_IOCTL, \
- AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, struct autofs_dev_ioctl)
-
-#define AUTOFS_DEV

[PATCH 17/18] autofs: Move inclusion of linux/limits.h to uapi

2016-08-11 Thread Ian Kent

From: Tomohiro Kusumi 

linux/limits.h should be included by uapi instead of linux/auto_fs.h
so as not to cause compile error in userspace.

 # cat << EOF > ./test1.c
 > #include 
 > #include 
 > int main(void) {
 > return 0;
 > }
 > EOF
 # gcc -Wall -g ./test1.c
 In file included from ./test1.c:2:0:
 /usr/include/linux/auto_fs.h:54:12: error: 'NAME_MAX' undeclared here (not in 
a function)
   char name[NAME_MAX+1];
 ^

Signed-off-by: Tomohiro Kusumi 
Signed-off-by: Ian Kent 
---
 include/linux/auto_fs.h  |1 -
 include/uapi/linux/auto_fs.h |1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/auto_fs.h b/include/linux/auto_fs.h
index b4066bb..b8f814c 100644
--- a/include/linux/auto_fs.h
+++ b/include/linux/auto_fs.h
@@ -10,7 +10,6 @@
 #define _LINUX_AUTO_FS_H
 
 #include 
-#include 
 #include 
 #include 
 #endif /* _LINUX_AUTO_FS_H */
diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h
index 9175a1b..1bfc3ed 100644
--- a/include/uapi/linux/auto_fs.h
+++ b/include/uapi/linux/auto_fs.h
@@ -12,6 +12,7 @@
 #define _UAPI_LINUX_AUTO_FS_H
 
 #include 
+#include 
 #ifndef __KERNEL__
 #include 
 #endif /* __KERNEL__ */

[PATCH 16/18] autofs: Fix print format for ioctl warning message

2016-08-11 Thread Ian Kent

From: Tomohiro Kusumi 

All other warnings use "cmd(0x%08x)" and this is the only one with "cmd(%d)".
(below comes from my userspace debug program, but not automount daemon)

[ 1139.905676] autofs4:pid:1640:check_dev_ioctl_version: ioctl control 
interface version mismatch: kernel(1.0), user(0.0), cmd(-1072131215)

Signed-off-by: Tomohiro Kusumi 
Signed-off-by: Ian Kent 
---
 fs/autofs4/dev-ioctl.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 7289216..e89d6bb 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -75,7 +75,7 @@ static int check_dev_ioctl_version(int cmd, struct 
autofs_dev_ioctl *param)
if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) ||
(param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) {
pr_warn("ioctl control interface version mismatch: "
-   "kernel(%u.%u), user(%u.%u), cmd(%d)\n",
+   "kernel(%u.%u), user(%u.%u), cmd(0x%08x)\n",
AUTOFS_DEV_IOCTL_VERSION_MAJOR,
AUTOFS_DEV_IOCTL_VERSION_MINOR,
param->ver_major, param->ver_minor, cmd);

[PATCH 14/18] autofs - fix dev ioctl number range check

2016-08-11 Thread Ian Kent

The count of miscellaneous device ioctls in fs/autofs4/autofs_i.h is wrong.

The number of ioctls is the difference between AUTOFS_DEV_IOCTL_VERSION_CMD
and AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD (14) not the difference between
AUTOFS_IOC_COUNT and 11 (21).

Signed-off-by: Ian Kent 
Cc: Tomohiro Kusumi 
---
 fs/autofs4/autofs_i.h  |3 ++-
 fs/autofs4/dev-ioctl.c |2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 73e3d38..dd71bd4 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -20,7 +20,8 @@
 #define AUTOFS_IOC_COUNT 32
 
 #define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION)
-#define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11)
+#define AUTOFS_DEV_IOCTL_IOC_COUNT \
+   (AUTOFS_DEV_IOCTL_VERSION_CMD - AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD)
 
 #include 
 #include 
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index d47b35a..13e7517 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -642,7 +642,7 @@ static int _autofs_dev_ioctl(unsigned int command,
cmd = _IOC_NR(command);
 
if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) ||
-   cmd - cmd_first >= AUTOFS_DEV_IOCTL_IOC_COUNT) {
+   cmd - cmd_first > AUTOFS_DEV_IOCTL_IOC_COUNT) {
return -ENOTTY;
}

[PATCH 01/18] autofs: Fix typos in Documentation/filesystems/autofs4.txt

2016-08-11 Thread Ian Kent

From: Tomohiro Kusumi 

plus minor whitespace fixes.

Signed-off-by: Tomohiro Kusumi 
Signed-off-by: Ian Kent 
---
 Documentation/filesystems/autofs4.txt |8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Documentation/filesystems/autofs4.txt 
b/Documentation/filesystems/autofs4.txt
index 39d02e1..8fac3fe 100644
--- a/Documentation/filesystems/autofs4.txt
+++ b/Documentation/filesystems/autofs4.txt
@@ -203,9 +203,9 @@ initiated or is being considered, otherwise it returns 0.
 Mountpoint expiry
 -
 
-The VFS has a mechansim for automatically expiring unused mounts,
+The VFS has a mechanism for automatically expiring unused mounts,
 much as it can expire any unused dentry information from the dcache.
-This is guided by the MNT_SHRINKABLE flag.  This  only applies to
+This is guided by the MNT_SHRINKABLE flag.  This only applies to
 mounts that were created by `d_automount()` returning a filesystem to be
 mounted.  As autofs doesn't return such a filesystem but leaves the
 mounting to the automount daemon, it must involve the automount daemon
@@ -298,7 +298,7 @@ remove directories and symlinks using normal filesystem 
operations.
 autofs knows whether a process requesting some operation is the daemon
 or not based on its process-group id number (see getpgid(1)).
 
-When an autofs filesystem it mounted the pgid of the mounting
+When an autofs filesystem is mounted the pgid of the mounting
 processes is recorded unless the "pgrp=" option is given, in which
 case that number is recorded instead.  Any request arriving from a
 process in that process group is considered to come from the daemon.
@@ -450,7 +450,7 @@ Commands are:
 numbers for existing filesystems can be found in
 `/proc/self/mountinfo`.
 - **AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD**: same as `close(ioctlfd)`.
-- **AUTOFS_DEV_IOCTL_SETPIPEFD_CMD**: if the  filesystem is in
+- **AUTOFS_DEV_IOCTL_SETPIPEFD_CMD**: if the filesystem is in
 catatonic mode, this can provide the write end of a new pipe
 in `arg1` to re-establish communication with a daemon.  The
 process group of the calling process is used to identify the

[PATCH 06/18] autofs - remove ino free in autofs4_dir_symlink()

2016-08-11 Thread Ian Kent

The inode allocation failure case in autofs4_dir_symlink() frees
the autofs dentry info of the dentry without setting ->d_fsdata to
NULL.

That could lead to a double free so just get rid of the free and
leave it to ->d_release().

Signed-off-by: Ian Kent 
Cc: Tomohiro Kusumi 
---
 fs/autofs4/root.c |2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index fa84bb8..1b0495a 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -577,8 +577,6 @@ static int autofs4_dir_symlink(struct inode *dir,
inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555);
if (!inode) {
kfree(cp);
-   if (!dentry->d_fsdata)
-   kfree(ino);
return -ENOMEM;
}
inode->i_private = cp;

[PATCH 04/18] autofs - fix autofs4_fill_super() error exit handling

2016-08-11 Thread Ian Kent

Somewhere along the line the error handling gotos have become incorrect.

Signed-off-by: Ian Kent 
Cc: Tomohiro Kusumi 
---
 fs/autofs4/inode.c |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 8357544..64d721f 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -313,7 +313,7 @@ int autofs4_fill_super(struct super_block *s, void *data, 
int silent)
 
if (!pipe) {
pr_err("could not open pipe file descriptor\n");
-   goto fail_dput;
+   goto fail_put_pid;
}
ret = autofs_prepare_pipe(pipe);
if (ret < 0)
@@ -334,14 +334,14 @@ int autofs4_fill_super(struct super_block *s, void *data, 
int silent)
 fail_fput:
pr_err("pipe file descriptor does not contain proper ops\n");
fput(pipe);
-   /* fall through */
+fail_put_pid:
+   put_pid(sbi->oz_pgrp);
 fail_dput:
dput(root);
goto fail_free;
 fail_ino:
kfree(ino);
 fail_free:
-   put_pid(sbi->oz_pgrp);
kfree(sbi);
s->s_fs_info = NULL;
return ret;

[PATCH 07/18] autofs: Use autofs4_free_ino() to kfree dentry data

2016-08-11 Thread Ian Kent

From: Tomohiro Kusumi 

kfree dentry data allocated by autofs4_new_ino() with
autofs4_free_ino() instead of raw kfree.
(since we have the interface to free autofs_info*)

This patch was modified to remove the need to set the dentry
info field to NULL dew to a change in the previous patch.

Signed-off-by: Tomohiro Kusumi 
Signed-off-by: Ian Kent 
---
 fs/autofs4/inode.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 0dd4db9..f35ead7 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -340,7 +340,7 @@ fail_dput:
dput(root);
goto fail_free;
 fail_ino:
-   kfree(ino);
+   autofs4_free_ino(ino);
 fail_free:
kfree(sbi);
s->s_fs_info = NULL;

[PATCH 03/18] autofs: Test autofs versions first on sb initialization

2016-08-11 Thread Ian Kent

From: Tomohiro Kusumi 

This patch does what the below comment says.
It could be and it's considered better to do this first
before various functions get called during initialization.

/* Couldn't this be tested earlier? */

Signed-off-by: Tomohiro Kusumi 
Signed-off-by: Ian Kent 
---
 fs/autofs4/inode.c |   34 +-
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 61b2105..8357544 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -274,6 +274,23 @@ int autofs4_fill_super(struct super_block *s, void *data, 
int silent)
goto fail_dput;
}
 
+   /* Test versions first */
+   if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
+   sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) {
+   pr_err("kernel does not match daemon version "
+  "daemon (%d, %d) kernel (%d, %d)\n",
+  sbi->min_proto, sbi->max_proto,
+  AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
+   goto fail_dput;
+   }
+
+   /* Establish highest kernel protocol version */
+   if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION)
+   sbi->version = AUTOFS_MAX_PROTO_VERSION;
+   else
+   sbi->version = sbi->max_proto;
+   sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
+
if (pgrp_set) {
sbi->oz_pgrp = find_get_pid(pgrp);
if (!sbi->oz_pgrp) {
@@ -291,23 +308,6 @@ int autofs4_fill_super(struct super_block *s, void *data, 
int silent)
root_inode->i_fop = &autofs4_root_operations;
root_inode->i_op = &autofs4_dir_inode_operations;
 
-   /* Couldn't this be tested earlier? */
-   if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
-   sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) {
-   pr_err("kernel does not match daemon version "
-  "daemon (%d, %d) kernel (%d, %d)\n",
-  sbi->min_proto, sbi->max_proto,
-  AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
-   goto fail_dput;
-   }
-
-   /* Establish highest kernel protocol version */
-   if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION)
-   sbi->version = AUTOFS_MAX_PROTO_VERSION;
-   else
-   sbi->version = sbi->max_proto;
-   sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
-
pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp));
pipe = fget(pipefd);

[PATCH 05/18] autofs: Add WARN_ON(1) for non dir/link inode case

2016-08-11 Thread Ian Kent

From: Tomohiro Kusumi 

It's invalid if the given mode is neither dir nor link,
so warn on else case.

Signed-off-by: Tomohiro Kusumi 
Signed-off-by: Ian Kent 
---
 fs/autofs4/inode.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 64d721f..0dd4db9 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -368,7 +368,8 @@ struct inode *autofs4_get_inode(struct super_block *sb, 
umode_t mode)
inode->i_fop = &autofs4_dir_operations;
} else if (S_ISLNK(mode)) {
inode->i_op = &autofs4_symlink_inode_operations;
-   }
+   } else
+   WARN_ON(1);
 
return inode;
 }

[PATCH 08/18] autofs: Remove obsolete sb fields

2016-08-11 Thread Ian Kent

From: Tomohiro Kusumi 

These two were left from aa55ddf3 which removed unused ioctls.

Signed-off-by: Tomohiro Kusumi 
Signed-off-by: Ian Kent 
---
 fs/autofs4/autofs_i.h |2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index eb87055..73e3d38 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -111,8 +111,6 @@ struct autofs_sb_info {
int max_proto;
unsigned long exp_timeout;
unsigned int type;
-   int reghost_enabled;
-   int needs_reghost;
struct super_block *sb;
struct mutex wq_mutex;
struct mutex pipe_mutex;

[PATCH] autofs: don't stuck in a loop if vfs_write returns an error

2016-06-16 Thread Ian Kent

From: Andrey Vagin 

__vfs_write() returns a negative value in a error case.

Signed-off-by: Andrey Vagin 
Signed-off-by: Ian Kent 
---
 fs/autofs4/waitq.c |7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 0146d91..631f155 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -66,11 +66,12 @@ static int autofs4_write(struct autofs_sb_info *sbi,
set_fs(KERNEL_DS);
 
mutex_lock(&sbi->pipe_mutex);
-   wr = __vfs_write(file, data, bytes, &file->f_pos);
-   while (bytes && wr) {
+   while (bytes) {
+   wr = __vfs_write(file, data, bytes, &file->f_pos);
+   if (wr <= 0)
+   break;
data += wr;
bytes -= wr;
-   wr = __vfs_write(file, data, bytes, &file->f_pos);
}
mutex_unlock(&sbi->pipe_mutex);

Re: [PATCH] autofs: don't stuck in a loop if vfs_write returns an error

2016-06-16 Thread Ian Kent

On Fri, 2016-06-10 at 12:07 -0700, Andrew Morton wrote:
> On Thu, 09 Jun 2016 09:23:26 +0800 Ian Kent  wrote:
> 
> > > I was getting ready to send these over to Andrew and found that opendir(3)
> > > is
> > > failing on a number of tests (51 of 230, 9 fails are expected) with 4.6.0.
> > > 
> > > It's not the patches, yours or mine and it doesn't happen with 4.4.x
> > > kernels.
> > > 
> > > Looks like I'm going to have to bisect to work out what's going on and
> > > that
> > > will
> > > take a while.
> > 
> > The regression has been fixed now.
> > 
> > Al Viro sent a patch for it to Linus yesterday, it's commit e6ec03a25f1 in
> > the
> > Linux tree.
> > 
> > I can send my patches to Andrew (after re-testing) but any autofs related
> > testing of linux.next will need the above commit.
> > 
> > Andrew, surely this isn't the first time this type of problem has happened,
> > how 
> > is it usually handled, what do I need to do to make this go smoothly?
> 
> e6ec03a25f1 is in Linus's tree now so everything should be good.

Everything was good.

There's another autofs patch in Al's for-linus tree now and I don't know when it
will go to Linus.

That one will definitely cause merge conflicts.

After thinking about it, it's probably best to defer the module rename series
until after the next merge window, after all it's not urgent (apart from being
needed for so long).

I'll send over Andrei's patch alone soonish.

Thanks all for your patience,
Ian

[ANNOUNCE] autofs 5.1.2 release

2016-06-14 Thread Ian Kent

Hi all,

An update is overdue so here it is.

It's mostly a bug fix update.

autofs
==

The package can be found at:

ftp://ftp.kernel.org/pub/linux/daemons/autofs/v5

It is autofs-5.1.2.tar.[gz|xz]

No source rpm is there as it can be produced by using:

rpmbuild -ts autofs-5.1.2.tar.gz

and the binary rpm by using:

rpmbuild -tb autofs-5.1.2.tar.gz

See the README.amd-maps file for information about using amd format
maps.

Here are the entries from the CHANGELOG which outline the updates:

15/06/2015 autofs-5.1.2
===
- update libtirpc workaround for new soname.
- revert fix libtirpc name clash.
- fix left mount count return from umount_multi_triggers().
- fix return handling in sss lookup module.
- move query dn calculation from do_bind() to do_connect().
- make do_connect() return a status.
- make connect_to_server() return a status.
- make find_dc_server() return a status.
- make find_server() return a status.
- fix return handling of do_reconnect() in ldap module.
- fix rwlock unlock crash.
- fix config old name lookup.
- fix error handling on ldap bind fail.
- fix direct mount stale instance flag reset.
- fix direct map expire not set for initail empty map.
- fix missing source sss in multi map lookup.
- fix update_hosts_mounts() return.
- change lookup to use reinit instead of reopen.
- update map_hash_table_size description.
- add configuration option to use fqdn in mounts.
- fix out of order call in program map lookup.
- fix error handling of is_mounted().
- Add a mode option for master map entries.
- define monotonic clock helper functions.
- use monotonic clock for alarm thread condition wait.
- define pending condition init helper function.
- use monotonic clock for direct mount condition.
- use monotonic clock for indirect mount condition.
- change remaining gettimeofday() to use clock_gettime().
- change time() to use monotonic_clock().
- remove unused function elapsed().
- fix unbind sasl external mech.
- fix sasl connection concurrancy problem.
- fix memory leak in nisplus lookup_reinit().
- fix memory leak in ldap do_init().
- fix use after free in sun parser parse_init().
- fix use after free in open_lookup().
- fix typo in autofs_sasl_bind().
- fix memory leak in get_network_proximity().
- fix use after free in match_my_name().
- improve scalability of direct mount path component.
- always set direct mounts catatonic at exit.
- fix use-after-free in st_queue_handler().
- log pipe read errors.
- fix handle_mounts() termination condition check.
- fix Makefile linking dependencies.
- fix modules make clean target.
- fix autofs(5) description of supported map sources.
- add autofs(5) note of IPv6 libtirpc requirement.
- add remote-fs.target systemd dependency.
- fix typo in autofs.conf.
- fix yp map age not updated during map lookup.
- add config option to supress not found log message.
- fix possible memory leak in nfs mount.

Ian

Re: [PATCH] autofs4: Fix endless loop in autofs4_write

2016-06-10 Thread Ian Kent

On Sat, 2016-06-11 at 09:09 +0800, Ian Kent wrote:
> On Fri, 2016-06-10 at 19:07 +0200, Laurent Dufour wrote:
> > The 'commit e9a7c2f1a548 ("autofs4: coding style fixes")' removed the
> > check done on the __vfs_write()'s returned value in autofs4_write().
> > This may lead to a spinning process which can't catch any signal.
> 
> Yeah, sorry my bad.

Actually Andrei Vagin has already sent this patch to me some time ago and I'm
working to send it to Andrew Morton (along with several others).

There have been a couple of hold ups on this, sorry about that.

> 
> > 
> > Call stack showed in xmon could be :
> > [c003a76c7500] c030df74 __vfs_write+0x134/0x1c0
> > (unreliable)
> > [c003a76c75a0] d52a35d4 autofs4_notify_daemon+0x174/0x3f0
> > [autofs4]
> > [c003a76c7780] d52a3fa0 autofs4_wait+0x750/0xa10 [autofs4]
> > [c003a76c78b0] d52a24d8 autofs4_mount_wait+0x78/0x140
> > [autofs4]
> > [c003a76c7930] d52a2f48 autofs4_d_automount+0x1d8/0x370
> > [autofs4]
> > [c003a76c79c0] c03221e4 follow_managed+0x204/0x3a0
> > [c003a76c7a20] c0322c10 lookup_fast+0x220/0x420
> > [c003a76c7a90] c032324c walk_component+0x5c/0x3e0
> > [c003a76c7b00] c0323794 link_path_walk+0x1c4/0x5f0
> > [c003a76c7b90] c0324b00 path_openat+0xf0/0x1620
> > [c003a76c7c90] c0327f6c do_filp_open+0xfc/0x170
> > [c003a76c7dc0] c0000030d06c do_sys_open+0x1bc/0x2e0
> > [c003a76c7e30] c0009260 system_call+0x38/0x108
> > --- Exception: c01 (System Call) at 3fffa38a0988
> > 
> > Cc: Ian Kent 
> > Cc: aut...@vger.kernel.org
> > Cc: linux-kernel@vger.kernel.org
> > Cc: sta...@vger.kernel.org
> > Reviewed-by: Greg Kurz 
> > Signed-off-by: Laurent Dufour 
> > Fixes: e9a7c2f1a548 ("autofs4: coding style fixes")
> > ---
> >  fs/autofs4/waitq.c | 7 ---
> >  1 file changed, 4 insertions(+), 3 deletions(-)
> > 
> > diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
> > index 0146d911f468..106d94139281 100644
> > --- a/fs/autofs4/waitq.c
> > +++ b/fs/autofs4/waitq.c
> > @@ -66,11 +66,12 @@ static int autofs4_write(struct autofs_sb_info *sbi,
> > set_fs(KERNEL_DS);
> >  
> > mutex_lock(&sbi->pipe_mutex);
> > -   wr = __vfs_write(file, data, bytes, &file->f_pos);
> > -   while (bytes && wr) {
> 
> Right but why not just wr >= 0 here.
> 
> I guess this patch probably saves a few bytes.
> 
> I'll add it to the series.
>  
> > +   while (bytes) {
> > +   wr = __vfs_write(file, data, bytes, &file->f_pos);
> > +   if (wr < 0)
> > +   break;
> > data += wr;
> > bytes -= wr;
> > -   wr = __vfs_write(file, data, bytes, &file->f_pos);
> > }
> > mutex_unlock(&sbi->pipe_mutex);
> >  
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe autofs" in

Re: [PATCH] autofs4: Fix endless loop in autofs4_write

2016-06-10 Thread Ian Kent

On Fri, 2016-06-10 at 19:07 +0200, Laurent Dufour wrote:
> The 'commit e9a7c2f1a548 ("autofs4: coding style fixes")' removed the
> check done on the __vfs_write()'s returned value in autofs4_write().
> This may lead to a spinning process which can't catch any signal.

Yeah, sorry my bad.

> 
> Call stack showed in xmon could be :
> [c003a76c7500] c030df74 __vfs_write+0x134/0x1c0
> (unreliable)
> [c003a76c75a0] d52a35d4 autofs4_notify_daemon+0x174/0x3f0
> [autofs4]
> [c003a76c7780] d52a3fa0 autofs4_wait+0x750/0xa10 [autofs4]
> [c003a76c78b0] d52a24d8 autofs4_mount_wait+0x78/0x140
> [autofs4]
> [c003a76c7930] d52a2f48 autofs4_d_automount+0x1d8/0x370
> [autofs4]
> [c003a76c79c0] c03221e4 follow_managed+0x204/0x3a0
> [c003a76c7a20] c0322c10 lookup_fast+0x220/0x420
> [c003a76c7a90] c032324c walk_component+0x5c/0x3e0
> [c003a76c7b00] c0323794 link_path_walk+0x1c4/0x5f0
> [c003a76c7b90] c0324b00 path_openat+0xf0/0x1620
> [c003a76c7c90] c0327f6c do_filp_open+0xfc/0x170
> [c003a76c7dc0] c030d06c do_sys_open+0x1bc/0x2e0
> [c003a76c7e30] c00000009260 system_call+0x38/0x108
> --- Exception: c01 (System Call) at 3fffa38a0988
> 
> Cc: Ian Kent 
> Cc: aut...@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Cc: sta...@vger.kernel.org
> Reviewed-by: Greg Kurz 
> Signed-off-by: Laurent Dufour 
> Fixes: e9a7c2f1a548 ("autofs4: coding style fixes")
> ---
>  fs/autofs4/waitq.c | 7 ---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
> index 0146d911f468..106d94139281 100644
> --- a/fs/autofs4/waitq.c
> +++ b/fs/autofs4/waitq.c
> @@ -66,11 +66,12 @@ static int autofs4_write(struct autofs_sb_info *sbi,
>   set_fs(KERNEL_DS);
>  
>   mutex_lock(&sbi->pipe_mutex);
> - wr = __vfs_write(file, data, bytes, &file->f_pos);
> - while (bytes && wr) {

Right but why not just wr >= 0 here.

I guess this patch probably saves a few bytes.

I'll add it to the series.
 
> + while (bytes) {
> + wr = __vfs_write(file, data, bytes, &file->f_pos);
> + if (wr < 0)
> + break;
>   data += wr;
>   bytes -= wr;
> - wr = __vfs_write(file, data, bytes, &file->f_pos);
>   }
>   mutex_unlock(&sbi->pipe_mutex);
>

Re: [PATCH] autofs: don't stuck in a loop if vfs_write returns an error

2016-06-10 Thread Ian Kent

On Thu, 2016-06-09 at 10:27 -0700, Andrei Vagin wrote:
> On Wed, Jun 8, 2016 at 6:23 PM, Ian Kent  wrote:
> > On Mon, 2016-05-30 at 13:52 +0800, Ian Kent wrote:
> > > On Tue, 2016-05-24 at 09:34 +0800, Ian Kent wrote:
> > > > On Mon, 2016-05-23 at 14:50 -0700, Andrei Vagin wrote:
> > > > > Hi Ian,
> > > > > 
> > > > > When are you going to apply this patch? We can't test linux-next
> > > > > without
> > > > > it.
> > > > 
> > > > I though I sent this with the last series but I can't see that I have.
> > > > 
> > > > I have the rest of that series to send over to Andrew which I was
> > > > planning
> > > > to
> > > > do
> > > > after the current merge window closes (which is about now I guess).
> > > > 
> > > > I'll include it in that series.
> > > > Sorry for the hold up, ;)
> > > 
> > > Some bad news I'm afraid.
> > > 
> > > I was getting ready to send these over to Andrew and found that opendir(3)
> > > is
> > > failing on a number of tests (51 of 230, 9 fails are expected) with 4.6.0.
> > > 
> > > It's not the patches, yours or mine and it doesn't happen with 4.4.x
> > > kernels.
> > > 
> > > Looks like I'm going to have to bisect to work out what's going on and
> > > that
> > > will
> > > take a while.
> > 
> > The regression has been fixed now.
> > 
> > Al Viro sent a patch for it to Linus yesterday, it's commit e6ec03a25f1 in
> > the
> > Linux tree.
> 
> It's good!
> 
> > 
> > I can send my patches to Andrew (after re-testing) but any autofs related
> > testing of linux.next will need the above commit.
> > 
> > Andrew, surely this isn't the first time this type of problem has happened,
> > how
> > is it usually handled, what do I need to do to make this go smoothly?
> 
> In linux-next we catch two sorts of bugs.
> 
> 1. If bugs is triggered very often, we report it, when it isn't in
> Linus' tree. And such bugs are fixed very fast.
> 2. If bugs is triggered rarely. In this case we may detect this bug
> too late, when it's in Linus' tree. In this case we may need to
> workaraound it.
> 
> Here the problem belongs to the second type. It is triggered only when
> one or more tests failed and we try to kill all test processes.
> Actually it doesn't affect regular runs of tests, but it's annoying
> when we are investigating something.
> 
> Ian, I don't think that you need to do anything special. Thank you for
> handling this patch!

LOL, it's good that it was my standard (relatively simple) testing that exposed
it.

I'll wait a little longer in case Andrew has anything to add then send the patch
series to him.

> 
> Thanks,
> Andrew
> 
> > 
> > > 
> > > > 
> > > > Ian
> > > > 
> > > > > 
> > > > > Thanks,
> > > > > Andrew
> > > > > 
> > > > > 
> > > > > On Fri, Apr 1, 2016 at 12:37 AM, Ian Kent  wrote:
> > > > > > On Thu, 2016-03-31 at 22:12 -0700, Andrey Vagin wrote:
> > > > > > > From: Andrey Vagin 
> > > > > > > 
> > > > > > > __vfs_write() returns a negative value in a error case.
> > > > > > 
> > > > > > Ha, right, I'll send this along to Andrew with my next series which
> > > > > > should be soon.
> > > > > > 
> > > > > > > 
> > > > > > > Cc: Ian Kent 
> > > > > > > Signed-off-by: Andrey Vagin 
> > > > > > > ---
> > > > > > >  fs/autofs4/waitq.c | 7 ---
> > > > > > >  1 file changed, 4 insertions(+), 3 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
> > > > > > > index 0146d91..631f155 100644
> > > > > > > --- a/fs/autofs4/waitq.c
> > > > > > > +++ b/fs/autofs4/waitq.c
> > > > > > > @@ -66,11 +66,12 @@ static int autofs4_write(struct autofs_sb_info
> > > > > > > *sbi,
> > > > > > >   set_fs(KERNEL_DS);
> > > > > > > 
> > > > > > >   mutex_lock(&sbi->pipe_mutex);
> > > > > > > - wr = __vfs_write(file, data, bytes, &file->f_pos);
> > > > > > > - while (bytes && wr) {
> > > > > > > + while (bytes) {
> > > > > > > + wr = __vfs_write(file, data, bytes, &file->f_pos);
> > > > > > > + if (wr <= 0)
> > > > > > > + break;
> > > > > > >   data += wr;
> > > > > > >   bytes -= wr;
> > > > > > > - wr = __vfs_write(file, data, bytes, &file->f_pos);
> > > > > > >   }
> > > > > > >   mutex_unlock(&sbi->pipe_mutex);
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe autofs" in

Re: [PATCH] autofs: don't stuck in a loop if vfs_write returns an error

2016-06-08 Thread Ian Kent

On Mon, 2016-05-30 at 13:52 +0800, Ian Kent wrote:
> On Tue, 2016-05-24 at 09:34 +0800, Ian Kent wrote:
> > On Mon, 2016-05-23 at 14:50 -0700, Andrei Vagin wrote:
> > > Hi Ian,
> > > 
> > > When are you going to apply this patch? We can't test linux-next without
> > > it.
> > 
> > I though I sent this with the last series but I can't see that I have.
> > 
> > I have the rest of that series to send over to Andrew which I was planning
> > to
> > do
> > after the current merge window closes (which is about now I guess).
> > 
> > I'll include it in that series.
> > Sorry for the hold up, ;)
> 
> Some bad news I'm afraid.
> 
> I was getting ready to send these over to Andrew and found that opendir(3) is
> failing on a number of tests (51 of 230, 9 fails are expected) with 4.6.0.
> 
> It's not the patches, yours or mine and it doesn't happen with 4.4.x kernels.
> 
> Looks like I'm going to have to bisect to work out what's going on and that
> will
> take a while.

The regression has been fixed now.

Al Viro sent a patch for it to Linus yesterday, it's commit e6ec03a25f1 in the
Linux tree.

I can send my patches to Andrew (after re-testing) but any autofs related
testing of linux.next will need the above commit.

Andrew, surely this isn't the first time this type of problem has happened, how 
is it usually handled, what do I need to do to make this go smoothly?

> 
> > 
> > Ian
> > 
> > > 
> > > Thanks,
> > > Andrew
> > > 
> > > 
> > > On Fri, Apr 1, 2016 at 12:37 AM, Ian Kent  wrote:
> > > > On Thu, 2016-03-31 at 22:12 -0700, Andrey Vagin wrote:
> > > > > From: Andrey Vagin 
> > > > > 
> > > > > __vfs_write() returns a negative value in a error case.
> > > > 
> > > > Ha, right, I'll send this along to Andrew with my next series which
> > > > should be soon.
> > > > 
> > > > > 
> > > > > Cc: Ian Kent 
> > > > > Signed-off-by: Andrey Vagin 
> > > > > ---
> > > > >  fs/autofs4/waitq.c | 7 ---
> > > > >  1 file changed, 4 insertions(+), 3 deletions(-)
> > > > > 
> > > > > diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
> > > > > index 0146d91..631f155 100644
> > > > > --- a/fs/autofs4/waitq.c
> > > > > +++ b/fs/autofs4/waitq.c
> > > > > @@ -66,11 +66,12 @@ static int autofs4_write(struct autofs_sb_info
> > > > > *sbi,
> > > > >   set_fs(KERNEL_DS);
> > > > > 
> > > > >   mutex_lock(&sbi->pipe_mutex);
> > > > > - wr = __vfs_write(file, data, bytes, &file->f_pos);
> > > > > - while (bytes && wr) {
> > > > > + while (bytes) {
> > > > > + wr = __vfs_write(file, data, bytes, &file->f_pos);
> > > > > + if (wr <= 0)
> > > > > + break;
> > > > >   data += wr;
> > > > >   bytes -= wr;
> > > > > - wr = __vfs_write(file, data, bytes, &file->f_pos);
> > > > >   }
> > > > >   mutex_unlock(&sbi->pipe_mutex);
> --
> To unsubscribe from this list: send the line "unsubscribe autofs" in

Re: [PATCH] autofs: don't stuck in a loop if vfs_write returns an error

2016-05-29 Thread Ian Kent

On Tue, 2016-05-24 at 09:34 +0800, Ian Kent wrote:
> On Mon, 2016-05-23 at 14:50 -0700, Andrei Vagin wrote:
> > Hi Ian,
> > 
> > When are you going to apply this patch? We can't test linux-next without it.
> 
> I though I sent this with the last series but I can't see that I have.
> 
> I have the rest of that series to send over to Andrew which I was planning to
> do
> after the current merge window closes (which is about now I guess).
> 
> I'll include it in that series.
> Sorry for the hold up, ;)

Some bad news I'm afraid.

I was getting ready to send these over to Andrew and found that opendir(3) is
failing on a number of tests (51 of 230, 9 fails are expected) with 4.6.0.

It's not the patches, yours or mine and it doesn't happen with 4.4.x kernels.

Looks like I'm going to have to bisect to work out what's going on and that will
take a while.

> 
> Ian
> 
> > 
> > Thanks,
> > Andrew
> > 
> > 
> > On Fri, Apr 1, 2016 at 12:37 AM, Ian Kent  wrote:
> > > On Thu, 2016-03-31 at 22:12 -0700, Andrey Vagin wrote:
> > > > From: Andrey Vagin 
> > > > 
> > > > __vfs_write() returns a negative value in a error case.
> > > 
> > > Ha, right, I'll send this along to Andrew with my next series which
> > > should be soon.
> > > 
> > > > 
> > > > Cc: Ian Kent 
> > > > Signed-off-by: Andrey Vagin 
> > > > ---
> > > >  fs/autofs4/waitq.c | 7 ---
> > > >  1 file changed, 4 insertions(+), 3 deletions(-)
> > > > 
> > > > diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
> > > > index 0146d91..631f155 100644
> > > > --- a/fs/autofs4/waitq.c
> > > > +++ b/fs/autofs4/waitq.c
> > > > @@ -66,11 +66,12 @@ static int autofs4_write(struct autofs_sb_info
> > > > *sbi,
> > > >   set_fs(KERNEL_DS);
> > > > 
> > > >   mutex_lock(&sbi->pipe_mutex);
> > > > - wr = __vfs_write(file, data, bytes, &file->f_pos);
> > > > - while (bytes && wr) {
> > > > + while (bytes) {
> > > > + wr = __vfs_write(file, data, bytes, &file->f_pos);
> > > > + if (wr <= 0)
> > > > + break;
> > > >   data += wr;
> > > >   bytes -= wr;
> > > > - wr = __vfs_write(file, data, bytes, &file->f_pos);
> > > >   }
> > > >   mutex_unlock(&sbi->pipe_mutex);

Re: [PATCH] autofs: don't stuck in a loop if vfs_write returns an error

2016-05-23 Thread Ian Kent

On Mon, 2016-05-23 at 14:50 -0700, Andrei Vagin wrote:
> Hi Ian,
> 
> When are you going to apply this patch? We can't test linux-next without it.

I though I sent this with the last series but I can't see that I have.

I have the rest of that series to send over to Andrew which I was planning to do
after the current merge window closes (which is about now I guess).

I'll include it in that series.
Sorry for the hold up, ;)

Ian

> 
> Thanks,
> Andrew
> 
> 
> On Fri, Apr 1, 2016 at 12:37 AM, Ian Kent  wrote:
> > On Thu, 2016-03-31 at 22:12 -0700, Andrey Vagin wrote:
> > > From: Andrey Vagin 
> > > 
> > > __vfs_write() returns a negative value in a error case.
> > 
> > Ha, right, I'll send this along to Andrew with my next series which
> > should be soon.
> > 
> > > 
> > > Cc: Ian Kent 
> > > Signed-off-by: Andrey Vagin 
> > > ---
> > >  fs/autofs4/waitq.c | 7 ---
> > >  1 file changed, 4 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
> > > index 0146d91..631f155 100644
> > > --- a/fs/autofs4/waitq.c
> > > +++ b/fs/autofs4/waitq.c
> > > @@ -66,11 +66,12 @@ static int autofs4_write(struct autofs_sb_info
> > > *sbi,
> > >   set_fs(KERNEL_DS);
> > > 
> > >   mutex_lock(&sbi->pipe_mutex);
> > > - wr = __vfs_write(file, data, bytes, &file->f_pos);
> > > - while (bytes && wr) {
> > > + while (bytes) {
> > > + wr = __vfs_write(file, data, bytes, &file->f_pos);
> > > + if (wr <= 0)
> > > + break;
> > >   data += wr;
> > >   bytes -= wr;
> > > - wr = __vfs_write(file, data, bytes, &file->f_pos);
> > >   }
> > >   mutex_unlock(&sbi->pipe_mutex);
> > >

Re: [PATCH] autofs: don't stuck in a loop if vfs_write returns an error

2016-04-01 Thread Ian Kent

On Thu, 2016-03-31 at 22:12 -0700, Andrey Vagin wrote:
> From: Andrey Vagin 
> 
> __vfs_write() returns a negative value in a error case.

Ha, right, I'll send this along to Andrew with my next series which
should be soon.

> 
> Cc: Ian Kent 
> Signed-off-by: Andrey Vagin 
> ---
>  fs/autofs4/waitq.c | 7 ---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
> index 0146d91..631f155 100644
> --- a/fs/autofs4/waitq.c
> +++ b/fs/autofs4/waitq.c
> @@ -66,11 +66,12 @@ static int autofs4_write(struct autofs_sb_info
> *sbi,
>   set_fs(KERNEL_DS);
>  
>   mutex_lock(&sbi->pipe_mutex);
> - wr = __vfs_write(file, data, bytes, &file->f_pos);
> - while (bytes && wr) {
> + while (bytes) {
> + wr = __vfs_write(file, data, bytes, &file->f_pos);
> + if (wr <= 0)
> + break;
>   data += wr;
>   bytes -= wr;
> - wr = __vfs_write(file, data, bytes, &file->f_pos);
>   }
>   mutex_unlock(&sbi->pipe_mutex);
>

Re: call_usermodehelper in containers

2016-03-25 Thread Ian Kent

On Fri, 2016-03-25 at 02:28 +0100, Oleg Nesterov wrote:
> Hi Ian,
> 
> I can't really recall this old discussion, so I can be easily wrong...
> 
> On 03/24, Ian Kent wrote:
> > 
> > On Mon, 2013-11-18 at 18:28 +0100, Oleg Nesterov wrote:
> > > 
> > > IOW. Please the the "patch" below. It is obviously incomplete and
> > > wrong,
> > > and it can be more clear/clean. And probably we need another API.
> > > Just
> > > to explain what I mean.
> 
> I hope you didn't miss this part ;)

Not at all.

> 
> In particular, we want to turn task_work_add(..., bool notify) into
> task_work_add(..., how_to_notify mask) and this "mask" should allow
> to force TIF_SIGPENDING.

The point of posting the reply was to try and get some advice as my
understanding of the signalling subsystem is fairly poor.

LOL, I'll have another look at the task_work_add() code and see if I can
understand what your trying to tell me.

> 
> > > With this patch call_usermodehelper(..., UMH_IN_MY_NS) should do
> > > exec
> > > from the caller's namespace.
> > 
> > Umm ... I don't think this can work.
> > 
> > I don't think it can be assumed that the init process of a container
> > will behave like an init process.
> > 
> > If you try and do this with a Docker container that has /bin/bash as
> > the
> > init process signals never arrive and work doesn't start until some
> > other signal arrives
> 
> only if it blocks/ignores SIGCHLD? But this doesn't matter, see above
> and
> note the "until we have task_work_add_interruptibel()" in the pseudo
> -code
> I showed.

It seems, and this is not the only case I've encountered, that the init
process in docker containers can be a problem when you want to capture
and handle signals.

I've seen this with /bin/bash and supervisord so far.
I don't know if it is the docker container creation doing this or
something else  certainly I can catch signals within subordinate
processes.

The other thing that occurs to me is that just about anything in a
container could be subverted so the definition of a privileged process
which can be used as a template form execution is essentially undefined.

Mmm ... maybe I've got that wrong too, ;)

Ian

Re: call_usermodehelper in containers

2016-03-24 Thread Ian Kent

On Mon, 2013-11-18 at 18:28 +0100, Oleg Nesterov wrote:
> On 11/15, Eric W. Biederman wrote:
> > 
> > I don't understand that one.  Having a preforked thread with the
> > proper
> > environment that can act like kthreadd in terms of spawning user
> > mode
> > helpers works and is simple.
> 
> Can't we ask ->child_reaper to create the non-daemonized kernel thread
> with the "right" ->nsproxy, ->fs, etc?
> 
> IOW. Please the the "patch" below. It is obviously incomplete and
> wrong,
> and it can be more clear/clean. And probably we need another API. Just
> to explain what I mean.
> 
> With this patch call_usermodehelper(..., UMH_IN_MY_NS) should do exec
> from the caller's namespace.

Umm ... I don't think this can work.

I don't think it can be assumed that the init process of a container
will behave like an init process.

If you try and do this with a Docker container that has /bin/bash as the
init process signals never arrive and work doesn't start until some
other signal arrives at which time it fails to create the kernel thread
returning an error ERESTARTNOINTER (IIRC).

In fact a number of other things relating to signalling processes to
cleanly shutdown in a container suffer the same problem.

I probably don't understand what's actually going on, this is just my
impression of what I'm seeing.

> 
> Oleg.
> ---
> 
> --- a/include/linux/kmod.h
> +++ b/include/linux/kmod.h
> @@ -24,6 +24,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  #define KMOD_PATH_LEN 256
> @@ -53,8 +54,14 @@ struct file;
>  #define UMH_WAIT_PROC2   /* wait for the process to
> complete */
>  #define UMH_KILLABLE 4   /* wait for EXEC/PROC killable
> */
>  
> +// FIXME: IMH_* is not actually a mask
> +#define UMH_IN_MY_NS 8
> +
>  struct subprocess_info {
> - struct work_struct work;
> + union {
> + struct work_struct work;
> + struct callback_head twork;
> + };
>   struct completion *complete;
>   char *path;
>   char **argv;
> --- a/kernel/kmod.c
> +++ b/kernel/kmod.c
> @@ -541,7 +541,6 @@ struct subprocess_info
> *call_usermodehelper_setup(char *path, char **argv,
>   if (!sub_info)
>   goto out;
>  
> - INIT_WORK(&sub_info->work, __call_usermodehelper);
>   sub_info->path = path;
>   sub_info->argv = argv;
>   sub_info->envp = envp;
> @@ -554,6 +553,24 @@ struct subprocess_info
> *call_usermodehelper_setup(char *path, char **argv,
>  }
>  EXPORT_SYMBOL(call_usermodehelper_setup);
>  
> +static int call_call_usermodehelper(void *twork)
> +{
> + struct subprocess_info *sub_info =
> + container_of(twork, struct subprocess_info, twork);
> +
> + __call_usermodehelper(&sub_info->work);
> + do_exit(0);
> +
> +}
> +
> +static void fork_umh_helper(struct callback_head *twork)
> +{
> + if (current->flags & PF_EXITING)
> + return; // WRONG, FIXME
> +
> + kernel_thread(call_call_usermodehelper, twork, SIGCHLD);
> +}
> +
>  /**
>   * call_usermodehelper_exec - start a usermode application
>   * @sub_info: information about the subprocessa
> @@ -570,6 +587,10 @@ int call_usermodehelper_exec(struct
> subprocess_info *sub_info, int wait)
>  {
>   DECLARE_COMPLETION_ONSTACK(done);
>   int retval = 0;
> + bool in_my_ns;
> +
> + in_my_ns = wait & UMH_IN_MY_NS;
> + wait &= ~UMH_IN_MY_NS;
>  
>   if (!sub_info->path) {
>   call_usermodehelper_freeinfo(sub_info);
> @@ -594,7 +615,21 @@ int call_usermodehelper_exec(struct
> subprocess_info *sub_info, int wait)
>   sub_info->complete = &done;
>   sub_info->wait = wait;
>  
> - queue_work(khelper_wq, &sub_info->work);
> + if (likely(!in_my_ns)) {
> + INIT_WORK(&sub_info->work, __call_usermodehelper);
> + queue_work(khelper_wq, &sub_info->work);
> + } else {
> + // RACY, WRONG, ETC
> + struct task_struct *my_init =
> task_active_pid_ns(current)->child_reaper;
> +
> + init_task_work(&sub_info->twork, fork_umh_helper);
> + task_work_add(my_init, &sub_info->twork, false);
> +
> + // until we have task_work_add_interruptibel()
> + do_send_sig_info(SIGCHLD, SEND_SIG_FORCED, my_init,
> false);
> +
> + }
> +
>   if (wait == UMH_NO_WAIT)/* task has freed sub_info */
>   goto unlock;
>  
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux
> -fsdevel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: fs: NULL deref in atime_needs_update

2016-02-23 Thread Ian Kent

On Wed, 2016-02-17 at 00:40 +0100, Mickaël Salaün wrote:
> Hi,
> 
> Actually I found the same bug (without fuzzing) and I can reproduce it
> in a deterministic way (e.g. by creating a LSM that return 1 for the
> security_file_open hook). At least, from v4.2.8 I can easily trigger
> traces like this :

Reading through this thread I wonder if this is a new problem.

Is there a previous kernel it can't be reproduced on?
Perhaps a bisect will shed some light on what's happening.

> 
> BUG: unable to handle kernel NULL pointer dereference at
> 0050
> IP: [] atime_needs_update+0x11/0xc0
> PGD 127b17067 PUD 12ab2e067 PMD 0 
> Oops:  [#45] SMP 
> [...]
> RIP: 0010:[]  []
> atime_needs_update+0x11/0xc0
> RSP: 0018:880127853c18  EFLAGS: 00010246
> RAX: 88012ad0c080 RBX: 88012ad0c1d8 RCX: 88012ad0c080
> RDX:  RSI: 88012ad0c1d8 RDI: 880127853d98
> RBP: 880127853c28 R08: 8800cc0a2540 R09: 8800cfbfc320
> R10: 8800cc0a2540 R11: 0001 R12: 8800cb5d6300
> R13:  R14: 88012ad0c080 R15: 880127853e7c
> FS:  7f1054aae700() GS:88012fc4()
> knlGS:
> CS:  0010 DS:  ES:  CR0: 80050033
> CR2: 0050 CR3: 000127977000 CR4: 000406e0
> Stack:
> 88012ad0c1d8 8800cb5d6300 880127853c60 8117094e
> 8800c9ade3c0  a670294f 880127853d70
> 880127853d98 880127853c98 8116071c 8800cb4ada80
> Call Trace:
> [] ? touch_atime+0x2e/0xd0
> [] ? trailing_symlink+0xec/0x280
> [] ? path_openat+0x468/0x1240
> [] ? pagevec_lru_move_fn+0xed/0x110
> [] ? __activate_page+0x130/0x130
> [] ? do_filp_open+0x8c/0x100
> [] ? filename_lookup+0xec/0x180
> [] ? do_open_execat+0x74/0x170
> [] ? do_execveat_common.isra.42+0x1a7/0x6a0
> [] ? SyS_execve+0x30/0x40
> [] ? stub_execve+0x5/0x5
> [] ? entry_SYSCALL_64_fastpath+0x16/0x6a
> Code: 89 c7 e8 63 eb ff ff 48 89 d8 5b c3 0f 1f 40 00 66 2e 0f 1f 84
> 00 00 00 00 00 55 48 89 e5 41 54 53 f6 46 0c 02 75 72 48 8b 56 28 <48>
> 8b 42 50 a9 01 04 00 00 75 63 f6 c4 08 75 65 4c 8b 27 41 8b 
> RIP  [] atime_needs_update+0x11/0xc0
> RSP 
> CR2: 0050
> ---[ end trace 97dc4f4bb0214bd8 ]---
> 
> 
> Regards,
>  Mickaël
> 
> 
> On 05/02/2016 22:11, Dmitry Vyukov wrote:
> > Hello,
> > 
> > I've hit the following GPF while running syzkaller fuzzer:
> > 
> > general protection fault:  [#1] SMP DEBUG_PAGEALLOC KASAN
> > Modules linked in:
> > CPU: 1 PID: 5178 Comm: syz-executor Not tainted 4.5.0-rc2+ #65
> > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs
> > 01/01/2011
> > task: 880064768000 ti: 8800622c task.ti:
> > 8800622c
> > RIP: 0010:[]  []
> > atime_needs_update+0x2d/0x460
> > RSP: 0018:8800622c7a30  EFLAGS: 00010203
> > RAX: dc00 RBX:  RCX: dc00
> > RDX: 0001 RSI:  RDI: 000c
> > RBP: 8800622c7a58 R08: 0001 R09: 
> > R10:  R11: 0001 R12: 8800622c7c08
> > R13: 8800622c7c08 R14: 8800301ca322 R15: 8800622c7bb0
> > FS:  7fd1c9f8b700() GS:88003ed0()
> > knlGS:
> > CS:  0010 DS:  ES:  CR0: 8005003b
> > CR2: 20f31000 CR3: 62274000 CR4: 06e0
> > Stack:
> >  8800622c7bf4  8800622c7c08 8800301ca322
> >  8800622c7bb0 8800622c7b38 817ecd91 880030bf5200
> >  8800622c7bb8 11000c458f56 8800622c7c00 8800622c7be0
> > Call Trace:
> >  [< inline >] get_link fs/namei.c:1006
> >  [] link_path_walk+0xaf1/0x1030 fs/namei.c:1968
> >  [] path_parentat+0x41/0x150 fs/namei.c:2176
> >  [] filename_parentat+0x17c/0x3c0 fs/namei.c:2198
> >  [< inline >] user_path_parent fs/namei.c:2412
> >  [< inline >] SYSC_renameat2 fs/namei.c:4411
> >  [< inline >] SyS_renameat2 fs/namei.c:4375
> >  [< inline >] SYSC_renameat fs/namei.c:4521
> >  [] SyS_renameat+0x192/0x820 fs/namei.c:4518
> >  [] entry_SYSCALL_64_fastpath+0x16/0x7a
> > arch/x86/entry/entry_64.S:185
> > Code: 89 e5 41 57 41 56 41 55 41 54 49 89 fc 53 48 89 f3 e8 08 25 d5
> > ff 48 8d 7b 0c 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03
> > <0f>
> > b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85
> > RIP  [] atime_needs_update+0x2d/0x460
> > fs/inode.c:1611
> >  RSP 
> > ---[ end trace 1a4c9bda4680ce46 ]---
> > 
> > On commit df48ab3c2f5ffca88b7803ffbadd074bd5a0a2ef.
> > 
> > Objdump shows that inode is NULL in atime_needs_update.
> > 
> > Unfortunately reproduction of this crash is very hard. The program
> > executes something along the lines of:
> > 
> > mmap(0x2000, 15945728, PROT_READ|PROT_WRITE,
> > MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x2000
> > mkdir("./bus", 0662515705056234013740)  = 0
> > openat(AT_FDCWD, "./bus", O_RDONLY|O

Re: call_usermodehelper in containers

2016-02-23 Thread Ian Kent

On Tue, 2016-02-23 at 09:36 -0500, J. Bruce Fields wrote:
> On Tue, Feb 23, 2016 at 10:55:30AM +0800, Ian Kent wrote:
> > You know, wrt. the mechanism Oleg suggested, I've been wondering if
> > it's
> > even necessary to capture process template information for
> > execution.
> > 
> > Isn't the main issue the execution of unknown arbitrary objects
> > getting
> > access to a privileged context?
> > 
> > Then perhaps it is sufficient to require registration of an SHA hash
> > (of
> > some sort) for these objects by a suitably privileged process and
> > only
> > allow helper execution of valid objects.
> 
> That executable probably also depends on libraries, services, and tons
> of other miscellaneous stuff in its environment.  The NFSv4 client
> idmapper, for example, may be doing ldap calls.  Unless the helper is
> created with incredible care, I don't think that it's enough just to
> verify that you're executing the correct helper.

Yeah, I was thinking the logistics of keeping something like this up to
date would be hard but calculating this for every call would be too much
overhead I think.

> 
> --b.
> 
> > 
> > If that is sufficient then helper execution from within a container
> > or
> > user namespace could just use the callers environment itself.
> > 
> > What else do we need to be wary of, any thoughts Eric?
> > 
> > Ian

Re: call_usermodehelper in containers

2016-02-22 Thread Ian Kent

On Fri, 2016-02-19 at 13:14 +0800, Ian Kent wrote:
> On Thu, 2016-02-18 at 14:45 -0600, Eric W. Biederman wrote:
> > Ian Kent  writes:
> > 
> > > On Thu, 2016-02-18 at 14:36 +0800, Ian Kent wrote:
> > > > On Thu, 2016-02-18 at 12:43 +0900, Kamezawa Hiroyuki wrote:
> > > > > On 2016/02/18 11:57, Eric W. Biederman wrote:
> > > > > > 
> > > > > > Ccing The containers list because a related discussion is
> > > > > > happening
> > > > > > there
> > > > > > and somehow this thread has never made it there.
> > > > > > 
> > > > > > Ian Kent  writes:
> > > > > > 
> > > > > > > On Mon, 2013-11-18 at 18:28 +0100, Oleg Nesterov wrote:
> > > > > > > > On 11/15, Eric W. Biederman wrote:
> > > > > > > > > 
> > > > > > > > > I don't understand that one.  Having a preforked
> > > > > > > > > thread
> > > > > > > > > with
> > > > > > > > > the
> > > > > > > > > proper
> > > > > > > > > environment that can act like kthreadd in terms of
> > > > > > > > > spawning
> > > > > > > > > user
> > > > > > > > > mode
> > > > > > > > > helpers works and is simple.
> > > > > > > 
> > > > > > > Forgive me replying to such an old thread but ...
> > > > > > > 
> > > > > > > After realizing workqueues can't be used to pre-create
> > > > > > > threads
> > > > > > > to
> > > > > > > run
> > > > > > > usermode helpers I've returned to look at this.
> > > > > > 
> > > > > > If someone can wind up with a good implementation I will be
> > > > > > happy.
> > > > > > 
> > > > > > > > Can't we ask ->child_reaper to create the non-daemonized
> > > > > > > > kernel
> > > > > > > > thread
> > > > > > > > with the "right" ->nsproxy, ->fs, etc?
> > > > > > > 
> > > > > > > Eric, do you think this approach would be sufficient too?
> > > > > > > 
> > > > > > > Probably wouldn't be quite right for user namespaces but
> > > > > > > should
> > > > > > > provide
> > > > > > > what's needed for other cases?
> > > > > > > 
> > > > > > > It certainly has the advantage of not having to maintain a
> > > > > > > plague
> > > > > > > of
> > > > > > > processes waiting around to execute helpers.
> > > > > > 
> > > > > > That certainly sounds attractive.  Especially for the case
> > > > > > of
> > > > > > everyone
> > > > > > who wants to set a core pattern in a container.
> > > > > > 
> > > > > > I am fuzzy on all of the details right now, but what I do
> > > > > > remember
> > > > > > is
> > > > > > that in the kernel the user mode helper concepts when they
> > > > > > attempted
> > > > > > to
> > > > > > scrub a processes environment were quite error prone until
> > > > > > we
> > > > > > managed to
> > > > > > get kthreadd(pid 2) on the scene which always had a clean
> > > > > > environment.
> > > > > > 
> > > > > > If we are going to tie this kind of thing to the pid
> > > > > > namespace
> > > > > > I
> > > > > > recommend simplying denying it if you are in a user
> > > > > > namespace
> > > > > > without
> > > > > > an approrpriate pid namespace.  AKA simply not allowing
> > > > > > thigns
> > > > > > to
> > > > > > be
> > > > > > setup
> > > > > > if current->pid_ns->user_ns != current->user_ns.
> > > > > > 
> > > > > Can't be handled by simple capability like
> > > > > CAP_SYS_USERMODEHELPER ?
> > 
> &

Re: call_usermodehelper in containers

2016-02-19 Thread Ian Kent

On Fri, 2016-02-19 at 18:30 +0900, Kamezawa Hiroyuki wrote:
> On 2016/02/19 14:37, Ian Kent wrote:
> > On Fri, 2016-02-19 at 12:08 +0900, Kamezawa Hiroyuki wrote:
> > > On 2016/02/19 5:45, Eric W. Biederman wrote:
> > > > Personally I am a fan of the don't be clever and capture a
> > > > kernel
> > > > thread
> > > > approach as it is very easy to see you what if any exploitation
> > > > opportunities there are.  The justifications for something more
> > > > clever
> > > > is trickier.  Of course we do something that from this
> > > > perspective
> > > > would
> > > > be considered ``clever'' today with kthreadd and user mode
> > > > helpers.
> > > > 
> > > 
> > > I read old discussionlet me allow clarification  to create a
> > > helper kernel thread
> > > to run usermodehelper with using kthreadd.
> > > 
> > > 0) define a trigger to create an independent usermodehelper
> > > environment for a container.
> > > Option A) at creating some namespace (pid, uid, etc...)
> > > Option B) at creating a new nsproxy
> > > Option C).at a new systemcall is called or some sysctl,
> > > make_private_usermode_helper() or some,
> > > 
> > >It's expected this should be triggered by init process of a
> > > container with some capability.
> > >And scope of the effect should be defined. pid namespace ?
> > > nsporxy ?
> > > or new namespace ?
> > > 
> > > 1) create a helper thread.
> > > task = kthread_create(kthread_work_fn, ?, ?, "usermodehelper")
> > > switch task's nsproxy to current.(swtich_task_namespaces())
> > > switch task's cgroups to current (cgroup_attach_task_all())
> > > switch task's cred to current.
> > > copy task's capability from current
> > > (and any other ?)
> > > wake_up_process()
> > > 
> > > And create a link between kthread_wq and container.
> > 
> > Not sure I quite understand this but I thought the difficulty with
> > this
> > approach previously (even though the approach was very much
> > incomplete)
> > was knowing that all the "moving parts" would not allow
> > vulnerabilities.
> > 
> Ok, that was discussed.
> 
> > And it looks like this would require a kernel thread for each
> > instance.
> > So for a thousand containers that each mount an NFS mount that
> > means, at
> > least, 1000 additional kernel threads. Might be able to sell that,
> > if we
> > were lucky, but from an system administration POV it's horrible.
> > 
> I agree.
> 
> > There's also the question of existence (aka. lifetime) to deal with
> > since the thread above needs to be created at a time other than the
> > usermode helper callback.
> > 
> > What happens for SIGKILL on a container?
> > 

First understand that the fork and workqueue code is not something I've
needed to look at in the past so it's still quite new to me even now.

> It depends on how the helper kthread is tied to a container related
> object.
> If kthread is linked with some namespace, we can kill it when a
> namespace
> goes away.

I don't know how to do that so without knowing any better I assume it
could be difficult and complicated but, of course, I don't know.

> 
> So, with your opinion,
>   - a helper thread should be spawned on demand
>   - the lifetime of it should be clear. It will be good to have as
> same life time as the container.

This was always what I believed to be the best way to do it but ...

Not sure you've seen the other threads on this by me so let me provide
some history.

I started out posting a series (totally untested, an RFC only) in the
hope of finding a way to do this.

After a few iterations that lead to the conclusion that a kernel thread
would need to be created to provide context for subsequent helper
execution (for every distinct context), much the same as we have here,
and that the init process of the required context would probably be
sufficient for this, required as the environment of the thread
requesting helper execution itself could be used subvert execution.

I ended up accepting that even if I could work out what needed to be
captured and work out what needed to be done to switch to the
namspace(s) and other bits that would be high maintenance as it would be
fairly complicated and subsystems may be added or changed over time.

Also I had assumed a singlethread workqueue would create a sing

Re: call_usermodehelper in containers

2016-02-18 Thread Ian Kent

On Fri, 2016-02-19 at 12:08 +0900, Kamezawa Hiroyuki wrote:
> On 2016/02/19 5:45, Eric W. Biederman wrote: 
> > Personally I am a fan of the don't be clever and capture a kernel
> > thread
> > approach as it is very easy to see you what if any exploitation
> > opportunities there are.  The justifications for something more
> > clever
> > is trickier.  Of course we do something that from this perspective
> > would
> > be considered ``clever'' today with kthreadd and user mode helpers.
> > 
> 
> I read old discussionlet me allow clarification  to create a
> helper kernel thread 
> to run usermodehelper with using kthreadd.
> 
> 0) define a trigger to create an independent usermodehelper
> environment for a container.
>Option A) at creating some namespace (pid, uid, etc...)
>Option B) at creating a new nsproxy
>Option C).at a new systemcall is called or some sysctl,
> make_private_usermode_helper() or some,
>   
>   It's expected this should be triggered by init process of a
> container with some capability.
>   And scope of the effect should be defined. pid namespace ? nsporxy ?
> or new namespace ?
> 
> 1) create a helper thread.
>task = kthread_create(kthread_work_fn, ?, ?, "usermodehelper")
>switch task's nsproxy to current.(swtich_task_namespaces())
>switch task's cgroups to current (cgroup_attach_task_all())
>switch task's cred to current.
>copy task's capability from current
>(and any other ?)
>wake_up_process()
>
>And create a link between kthread_wq and container.

Not sure I quite understand this but I thought the difficulty with this
approach previously (even though the approach was very much incomplete)
was knowing that all the "moving parts" would not allow vulnerabilities.

And it looks like this would require a kernel thread for each instance.
So for a thousand containers that each mount an NFS mount that means, at
least, 1000 additional kernel threads. Might be able to sell that, if we
were lucky, but from an system administration POV it's horrible.

There's also the question of existence (aka. lifetime) to deal with
since the thread above needs to be created at a time other than the
usermode helper callback.

What happens for SIGKILL on a container?

> 2) modify call_usermodehelper() to use kthread_worker
> 
> 
> It seems the problem is which object container private user mode
> helper should be tied to.
> 
> Regards,
> -Kame

Re: call_usermodehelper in containers

2016-02-18 Thread Ian Kent

On Thu, 2016-02-18 at 14:45 -0600, Eric W. Biederman wrote:
> Ian Kent  writes:
> 
> > On Thu, 2016-02-18 at 14:36 +0800, Ian Kent wrote:
> > > On Thu, 2016-02-18 at 12:43 +0900, Kamezawa Hiroyuki wrote:
> > > > On 2016/02/18 11:57, Eric W. Biederman wrote:
> > > > > 
> > > > > Ccing The containers list because a related discussion is
> > > > > happening
> > > > > there
> > > > > and somehow this thread has never made it there.
> > > > > 
> > > > > Ian Kent  writes:
> > > > > 
> > > > > > On Mon, 2013-11-18 at 18:28 +0100, Oleg Nesterov wrote:
> > > > > > > On 11/15, Eric W. Biederman wrote:
> > > > > > > > 
> > > > > > > > I don't understand that one.  Having a preforked thread
> > > > > > > > with
> > > > > > > > the
> > > > > > > > proper
> > > > > > > > environment that can act like kthreadd in terms of
> > > > > > > > spawning
> > > > > > > > user
> > > > > > > > mode
> > > > > > > > helpers works and is simple.
> > > > > > 
> > > > > > Forgive me replying to such an old thread but ...
> > > > > > 
> > > > > > After realizing workqueues can't be used to pre-create
> > > > > > threads
> > > > > > to
> > > > > > run
> > > > > > usermode helpers I've returned to look at this.
> > > > > 
> > > > > If someone can wind up with a good implementation I will be
> > > > > happy.
> > > > > 
> > > > > > > Can't we ask ->child_reaper to create the non-daemonized
> > > > > > > kernel
> > > > > > > thread
> > > > > > > with the "right" ->nsproxy, ->fs, etc?
> > > > > > 
> > > > > > Eric, do you think this approach would be sufficient too?
> > > > > > 
> > > > > > Probably wouldn't be quite right for user namespaces but
> > > > > > should
> > > > > > provide
> > > > > > what's needed for other cases?
> > > > > > 
> > > > > > It certainly has the advantage of not having to maintain a
> > > > > > plague
> > > > > > of
> > > > > > processes waiting around to execute helpers.
> > > > > 
> > > > > That certainly sounds attractive.  Especially for the case of
> > > > > everyone
> > > > > who wants to set a core pattern in a container.
> > > > > 
> > > > > I am fuzzy on all of the details right now, but what I do
> > > > > remember
> > > > > is
> > > > > that in the kernel the user mode helper concepts when they
> > > > > attempted
> > > > > to
> > > > > scrub a processes environment were quite error prone until we
> > > > > managed to
> > > > > get kthreadd(pid 2) on the scene which always had a clean
> > > > > environment.
> > > > > 
> > > > > If we are going to tie this kind of thing to the pid namespace
> > > > > I
> > > > > recommend simplying denying it if you are in a user namespace
> > > > > without
> > > > > an approrpriate pid namespace.  AKA simply not allowing thigns
> > > > > to
> > > > > be
> > > > > setup
> > > > > if current->pid_ns->user_ns != current->user_ns.
> > > > > 
> > > > Can't be handled by simple capability like
> > > > CAP_SYS_USERMODEHELPER ?
> 
> I wasn't talking about a capability I was talking about how to
> identify
> where the user mode helper lives.
> 
> > > > User_ns check seems not to allow core-dump-cather in host will
> > > > not
> > > > work if user_ns is used.
> 
> The bottom line is all of this approaches non-sense if user namespaces
> are not used.  If you just have a pid namespace or a mount namespace
> (or
> perhaps both) and your fire off a new fangled user mode helper you get
> a
> deep problem.  The user space process started to handle your core dump
> or
> your nfs callback will have a full set of capabil

Re: call_usermodehelper in containers

2016-02-17 Thread Ian Kent

On Thu, 2016-02-18 at 14:36 +0800, Ian Kent wrote:
> On Thu, 2016-02-18 at 12:43 +0900, Kamezawa Hiroyuki wrote:
> > On 2016/02/18 11:57, Eric W. Biederman wrote:
> > > 
> > > Ccing The containers list because a related discussion is
> > > happening
> > > there
> > > and somehow this thread has never made it there.
> > > 
> > > Ian Kent  writes:
> > > 
> > > > On Mon, 2013-11-18 at 18:28 +0100, Oleg Nesterov wrote:
> > > > > On 11/15, Eric W. Biederman wrote:
> > > > > > 
> > > > > > I don't understand that one.  Having a preforked thread with
> > > > > > the
> > > > > > proper
> > > > > > environment that can act like kthreadd in terms of spawning
> > > > > > user
> > > > > > mode
> > > > > > helpers works and is simple.
> > > > 
> > > > Forgive me replying to such an old thread but ...
> > > > 
> > > > After realizing workqueues can't be used to pre-create threads
> > > > to
> > > > run
> > > > usermode helpers I've returned to look at this.
> > > 
> > > If someone can wind up with a good implementation I will be happy.
> > > 
> > > > > Can't we ask ->child_reaper to create the non-daemonized
> > > > > kernel
> > > > > thread
> > > > > with the "right" ->nsproxy, ->fs, etc?
> > > > 
> > > > Eric, do you think this approach would be sufficient too?
> > > > 
> > > > Probably wouldn't be quite right for user namespaces but should
> > > > provide
> > > > what's needed for other cases?
> > > > 
> > > > It certainly has the advantage of not having to maintain a
> > > > plague
> > > > of
> > > > processes waiting around to execute helpers.
> > > 
> > > That certainly sounds attractive.  Especially for the case of
> > > everyone
> > > who wants to set a core pattern in a container.
> > > 
> > > I am fuzzy on all of the details right now, but what I do remember
> > > is
> > > that in the kernel the user mode helper concepts when they
> > > attempted
> > > to
> > > scrub a processes environment were quite error prone until we
> > > managed to
> > > get kthreadd(pid 2) on the scene which always had a clean
> > > environment.
> > > 
> > > If we are going to tie this kind of thing to the pid namespace I
> > > recommend simplying denying it if you are in a user namespace
> > > without
> > > an approrpriate pid namespace.  AKA simply not allowing thigns to
> > > be
> > > setup
> > > if current->pid_ns->user_ns != current->user_ns.
> > > 
> > Can't be handled by simple capability like CAP_SYS_USERMODEHELPER ?
> > 
> > User_ns check seems not to allow core-dump-cather in host will not
> > work if user_ns is used.
> 
> I don't think so but I'm not sure.
> 
> The approach I was talking about assumes the init process of the
> caller
> (say within a container, corresponding to ->child_reaper) is an
> appropriate template for umh thread execution.
> 
> But I don't think that covers the case where unshare has created
> different namespaces, like a mount namespace for example.
> 
> The current workqueue sub system can't be used to pre-create a thread
> to
> be used for umh execution so, either is needs changes or yet another
> mechanism needs to be implemented.
> 
> For uses other than core dumping capturing a reference to the struct
> pid
> of the environment init process and using that as an execution
> template
> should be sufficient and takes care of environment existence problems
> with some extra checks, not to mention eliminating the need for a
> potentially huge number of kernel threads needing to be created to
> provide execution templates.
> 
> Where to store this and how to access it when needed is another
> problem.
> 
> Not sure a usermode helper capability is the right thing either as I
> thought one important use of user namespaces was to allow unprivileged
> users to perform operations they otherwise can't.
> 
> Maybe a CAP_SYS_USERNSCOREDUMP or similar would be sensible 
> 
> Still an appropriate execution template would be needed and IIUC we
> can't trust getting that from within a user created namespace
> environment.

Perhaps, if a struct cred could be captured at some appropriate time
that could be used to cater for user namespaces.

Eric, do you think that would be possible to do without allowing users
to circumvent security?

> 
> > 
> > Thanks,
> > -Kame

Re: call_usermodehelper in containers

2016-02-17 Thread Ian Kent

On Thu, 2016-02-18 at 12:43 +0900, Kamezawa Hiroyuki wrote:
> On 2016/02/18 11:57, Eric W. Biederman wrote:
> > 
> > Ccing The containers list because a related discussion is happening
> > there
> > and somehow this thread has never made it there.
> > 
> > Ian Kent  writes:
> > 
> > > On Mon, 2013-11-18 at 18:28 +0100, Oleg Nesterov wrote:
> > > > On 11/15, Eric W. Biederman wrote:
> > > > > 
> > > > > I don't understand that one.  Having a preforked thread with
> > > > > the
> > > > > proper
> > > > > environment that can act like kthreadd in terms of spawning
> > > > > user
> > > > > mode
> > > > > helpers works and is simple.
> > > 
> > > Forgive me replying to such an old thread but ...
> > > 
> > > After realizing workqueues can't be used to pre-create threads to
> > > run
> > > usermode helpers I've returned to look at this.
> > 
> > If someone can wind up with a good implementation I will be happy.
> > 
> > > > Can't we ask ->child_reaper to create the non-daemonized kernel
> > > > thread
> > > > with the "right" ->nsproxy, ->fs, etc?
> > > 
> > > Eric, do you think this approach would be sufficient too?
> > > 
> > > Probably wouldn't be quite right for user namespaces but should
> > > provide
> > > what's needed for other cases?
> > > 
> > > It certainly has the advantage of not having to maintain a plague
> > > of
> > > processes waiting around to execute helpers.
> > 
> > That certainly sounds attractive.  Especially for the case of
> > everyone
> > who wants to set a core pattern in a container.
> > 
> > I am fuzzy on all of the details right now, but what I do remember
> > is
> > that in the kernel the user mode helper concepts when they attempted
> > to
> > scrub a processes environment were quite error prone until we
> > managed to
> > get kthreadd(pid 2) on the scene which always had a clean
> > environment.
> > 
> > If we are going to tie this kind of thing to the pid namespace I
> > recommend simplying denying it if you are in a user namespace
> > without
> > an approrpriate pid namespace.  AKA simply not allowing thigns to be
> > setup
> > if current->pid_ns->user_ns != current->user_ns.
> > 
> Can't be handled by simple capability like CAP_SYS_USERMODEHELPER ?
> 
> User_ns check seems not to allow core-dump-cather in host will not
> work if user_ns is used.

I don't think so but I'm not sure.

The approach I was talking about assumes the init process of the caller
(say within a container, corresponding to ->child_reaper) is an
appropriate template for umh thread execution.

But I don't think that covers the case where unshare has created
different namespaces, like a mount namespace for example.

The current workqueue sub system can't be used to pre-create a thread to
be used for umh execution so, either is needs changes or yet another
mechanism needs to be implemented.

For uses other than core dumping capturing a reference to the struct pid
of the environment init process and using that as an execution template
should be sufficient and takes care of environment existence problems
with some extra checks, not to mention eliminating the need for a
potentially huge number of kernel threads needing to be created to
provide execution templates.

Where to store this and how to access it when needed is another problem.

Not sure a usermode helper capability is the right thing either as I
thought one important use of user namespaces was to allow unprivileged
users to perform operations they otherwise can't.

Maybe a CAP_SYS_USERNSCOREDUMP or similar would be sensible 

Still an appropriate execution template would be needed and IIUC we
can't trust getting that from within a user created namespace
environment.

> 
> Thanks,
> -Kame
>

Re: call_usermodehelper in containers

2016-02-14 Thread Ian Kent

On Sat, 2016-02-13 at 17:08 +0100, Stanislav Kinsburskiy wrote:
> 
> 13.02.2016 00:39, Ian Kent пишет:
> > On Fri, 2013-11-15 at 15:54 +0400, Stanislav Kinsbursky wrote:
> > > 15.11.2013 15:03, Eric W. Biederman пишет:
> > > > Stanislav Kinsbursky  writes:
> > > > 
> > > > > 12.11.2013 17:30, Jeff Layton пишет:
> > > > > > On Tue, 12 Nov 2013 17:02:36 +0400
> > > > > > Stanislav Kinsbursky  wrote:
> > > > > > 
> > > > > > > 12.11.2013 15:12, Jeff Layton пишет:
> > > > > > > > On Mon, 11 Nov 2013 16:47:03 -0800
> > > > > > > > Greg KH  wrote:
> > > > > > > > 
> > > > > > > > > On Mon, Nov 11, 2013 at 07:18:25AM -0500, Jeff Layton
> > > > > > > > > wrote:
> > > > > > > > > > We have a bit of a problem wrt to upcalls that use
> > > > > > > > > > call_usermodehelper
> > > > > > > > > > with containers and I'd like to bring this to some
> > > > > > > > > > sort
> > > > > > > > > > of resolution...
> > > > > > > > > > 
> > > > > > > > > > A particularly problematic case (though there are
> > > > > > > > > > others) is the
> > > > > > > > > > nfsdcltrack upcall. It basically uses
> > > > > > > > > > call_usermodehelper to run a
> > > > > > > > > > program in userland to track some information on
> > > > > > > > > > stable
> > > > > > > > > > storage for
> > > > > > > > > > nfsd.
> > > > > > > > > I thought the discussion at the kernel summit about
> > > > > > > > > this
> > > > > > > > > issue was:
> > > > > > > > >   - don't do this.
> > > > > > > > >   - don't do it.
> > > > > > > > >   - if you really need to do this, fix nfsd
> > > > > > > > > 
> > > > > > > > Sorry, I couldn't make the kernel summit so I missed
> > > > > > > > that
> > > > > > > > discussion. I
> > > > > > > > guess LWN didn't cover it?
> > > > > > > > 
> > > > > > > > In any case, I guess then that we'll either have to come
> > > > > > > > up
> > > > > > > > with some
> > > > > > > > way to fix nfsd here, or simply ensure that nfsd can
> > > > > > > > never
> > > > > > > > be started
> > > > > > > > unless root in the container has a full set of a full
> > > > > > > > set of
> > > > > > > > capabilities.
> > > > > > > > 
> > > > > > > > One sort of Rube Goldberg possibility to fix nfsd is:
> > > > > > > > 
> > > > > > > > - when we start nfsd in a container, fork off an extra
> > > > > > > > kernel thread
> > > > > > > >   that just sits idle. That thread would need to be
> > > > > > > > a
> > > > > > > > descendant of the
> > > > > > > >   userland process that started nfsd, so we'd need
> > > > > > > > to
> > > > > > > > create it with
> > > > > > > >   kernel_thread().
> > > > > > > > 
> > > > > > > > - Have the kernel just start up the UMH program in the
> > > > > > > > init_ns mount
> > > > > > > >   namespace as it currently does, but also pass the
> > > > > > > > pid
> > > > > > > > of the idle
> > > > > > > >   kernel thread to the UMH upcall.
> > > > > > > > 
> > > > > > > > - The program will then use /proc//root and
> > > > > > > > /proc//ns/* to set
> > > > > > > >   itself up for doing things properly.
> > > > > > > > 
> > > > > > > > Note that with this mechanism we can't actua

Re: call_usermodehelper in containers

2016-02-12 Thread Ian Kent

On Fri, 2013-11-15 at 15:54 +0400, Stanislav Kinsbursky wrote:
> 15.11.2013 15:03, Eric W. Biederman пишет:
> > Stanislav Kinsbursky  writes:
> > 
> > > 12.11.2013 17:30, Jeff Layton пишет:
> > > > On Tue, 12 Nov 2013 17:02:36 +0400
> > > > Stanislav Kinsbursky  wrote:
> > > > 
> > > > > 12.11.2013 15:12, Jeff Layton пишет:
> > > > > > On Mon, 11 Nov 2013 16:47:03 -0800
> > > > > > Greg KH  wrote:
> > > > > > 
> > > > > > > On Mon, Nov 11, 2013 at 07:18:25AM -0500, Jeff Layton
> > > > > > > wrote:
> > > > > > > > We have a bit of a problem wrt to upcalls that use
> > > > > > > > call_usermodehelper
> > > > > > > > with containers and I'd like to bring this to some sort
> > > > > > > > of resolution...
> > > > > > > > 
> > > > > > > > A particularly problematic case (though there are
> > > > > > > > others) is the
> > > > > > > > nfsdcltrack upcall. It basically uses
> > > > > > > > call_usermodehelper to run a
> > > > > > > > program in userland to track some information on stable
> > > > > > > > storage for
> > > > > > > > nfsd.
> > > > > > > 
> > > > > > > I thought the discussion at the kernel summit about this
> > > > > > > issue was:
> > > > > > >   - don't do this.
> > > > > > >   - don't do it.
> > > > > > >   - if you really need to do this, fix nfsd
> > > > > > > 
> > > > > > 
> > > > > > Sorry, I couldn't make the kernel summit so I missed that
> > > > > > discussion. I
> > > > > > guess LWN didn't cover it?
> > > > > > 
> > > > > > In any case, I guess then that we'll either have to come up
> > > > > > with some
> > > > > > way to fix nfsd here, or simply ensure that nfsd can never
> > > > > > be started
> > > > > > unless root in the container has a full set of a full set of
> > > > > > capabilities.
> > > > > > 
> > > > > > One sort of Rube Goldberg possibility to fix nfsd is:
> > > > > > 
> > > > > > - when we start nfsd in a container, fork off an extra
> > > > > > kernel thread
> > > > > >  that just sits idle. That thread would need to be a
> > > > > > descendant of the
> > > > > >  userland process that started nfsd, so we'd need to
> > > > > > create it with
> > > > > >  kernel_thread().
> > > > > > 
> > > > > > - Have the kernel just start up the UMH program in the
> > > > > > init_ns mount
> > > > > >  namespace as it currently does, but also pass the pid
> > > > > > of the idle
> > > > > >  kernel thread to the UMH upcall.
> > > > > > 
> > > > > > - The program will then use /proc//root and
> > > > > > /proc//ns/* to set
> > > > > >  itself up for doing things properly.
> > > > > > 
> > > > > > Note that with this mechanism we can't actually run a
> > > > > > different binary
> > > > > > per container, but that's probably fine for most purposes.
> > > > > > 
> > > > > 
> > > > > Hmmm... Why we can't? We can go a bit further with userspace
> > > > > idea.
> > > > > 
> > > > > We use UMH some very limited number of user programs. For 2,
> > > > > actually:
> > > > > 1) /sbin/nfs_cache_getent
> > > > > 2) /sbin/nfsdcltrack
> > > > > 
> > > > 
> > > > No, the kernel uses them for a lot more than that. Pretty much
> > > > all of
> > > > the keys API upcalls use it. See all of the callers of
> > > > call_usermodehelper. All of them are running user binaries out
> > > > of the
> > > > kernel, and almost all of them are certainly broken wrt
> > > > containers.
> > > > 
> > > > > If we convert them into proxies, which use /proc//root
> > > > > and /proc//ns/*, this will allow us to lookup the right
> > > > > binary.
> > > > > The only limitation here is presence of this "proxy" binaries
> > > > > on "host".
> > > > > 
> > > > 
> > > > Suppose I spawn my own container as a user, using all of this
> > > > spiffy
> > > > new user namespace stuff. Then I make the kernel use
> > > > call_usermodehelper to call the upcall in the init_ns, and then
> > > > trick
> > > > it into running my new "escape_from_namespace" program with
> > > > "real" root
> > > > privileges.
> > > > 
> > > > I don't think we can reasonably assume that having the kernel
> > > > exec an
> > > > arbitrary binary inside of a container is safe. Doing so inside
> > > > of the
> > > > init_ns is marginally more safe, but only marginally so...
> > > > 
> > > > > And we don't need any significant changes in kernel.
> > > > > 
> > > > > BTW, Jeff, could you remind me, please, why exactly we need to
> > > > > use UMH to run the binary?
> > > > > What are this capabilities, which force us to do so?
> > > > > 
> > > > 
> > > > Nothing _forces_ us to do so, but upcalls are very difficult to
> > > > handle,
> > > > and UMH has a lot of advantages over a long-running daemon
> > > > launched by
> > > > userland.
> > > > 
> > > > Originally, I created the nfsdcltrack upcall as a running daemon
> > > > called
> > > > nfsdcld, and the kernel used rpc_pipefs to communicate with it.
> > > > 
> > > > Everyone hated it because no one likes to have to run daemons
> > > > for
> > > > infrequently used upcalls. It's a pain for

Re: call_usermodehelper in containers

2016-02-10 Thread Ian Kent

On Mon, 2013-11-18 at 18:28 +0100, Oleg Nesterov wrote:
> On 11/15, Eric W. Biederman wrote:
> > 
> > I don't understand that one.  Having a preforked thread with the
> > proper
> > environment that can act like kthreadd in terms of spawning user
> > mode
> > helpers works and is simple.

Forgive me replying to such an old thread but ...

After realizing workqueues can't be used to pre-create threads to run
usermode helpers I've returned to look at this.

> 
> Can't we ask ->child_reaper to create the non-daemonized kernel thread
> with the "right" ->nsproxy, ->fs, etc?

Eric, do you think this approach would be sufficient too?

Probably wouldn't be quite right for user namespaces but should provide
what's needed for other cases?

It certainly has the advantage of not having to maintain a plague of
processes waiting around to execute helpers.

> 
> IOW. Please the the "patch" below. It is obviously incomplete and
> wrong,
> and it can be more clear/clean. And probably we need another API. Just
> to explain what I mean.

> 
> With this patch call_usermodehelper(..., UMH_IN_MY_NS) should do exec
> from the caller's namespace.
> 
> Oleg.
> ---
> 
> --- a/include/linux/kmod.h
> +++ b/include/linux/kmod.h
> @@ -24,6 +24,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  #define KMOD_PATH_LEN 256
> @@ -53,8 +54,14 @@ struct file;
>  #define UMH_WAIT_PROC2   /* wait for the process to
> complete */
>  #define UMH_KILLABLE 4   /* wait for EXEC/PROC killable
> */
>  
> +// FIXME: IMH_* is not actually a mask
> +#define UMH_IN_MY_NS 8
> +
>  struct subprocess_info {
> - struct work_struct work;
> + union {
> + struct work_struct work;
> + struct callback_head twork;
> + };
>   struct completion *complete;
>   char *path;
>   char **argv;
> --- a/kernel/kmod.c
> +++ b/kernel/kmod.c
> @@ -541,7 +541,6 @@ struct subprocess_info
> *call_usermodehelper_setup(char *path, char **argv,
>   if (!sub_info)
>   goto out;
>  
> - INIT_WORK(&sub_info->work, __call_usermodehelper);
>   sub_info->path = path;
>   sub_info->argv = argv;
>   sub_info->envp = envp;
> @@ -554,6 +553,24 @@ struct subprocess_info
> *call_usermodehelper_setup(char *path, char **argv,
>  }
>  EXPORT_SYMBOL(call_usermodehelper_setup);
>  
> +static int call_call_usermodehelper(void *twork)
> +{
> + struct subprocess_info *sub_info =
> + container_of(twork, struct subprocess_info, twork);
> +
> + __call_usermodehelper(&sub_info->work);
> + do_exit(0);
> +
> +}
> +
> +static void fork_umh_helper(struct callback_head *twork)
> +{
> + if (current->flags & PF_EXITING)
> + return; // WRONG, FIXME
> +
> + kernel_thread(call_call_usermodehelper, twork, SIGCHLD);
> +}
> +
>  /**
>   * call_usermodehelper_exec - start a usermode application
>   * @sub_info: information about the subprocessa
> @@ -570,6 +587,10 @@ int call_usermodehelper_exec(struct
> subprocess_info *sub_info, int wait)
>  {
>   DECLARE_COMPLETION_ONSTACK(done);
>   int retval = 0;
> + bool in_my_ns;
> +
> + in_my_ns = wait & UMH_IN_MY_NS;
> + wait &= ~UMH_IN_MY_NS;
>  
>   if (!sub_info->path) {
>   call_usermodehelper_freeinfo(sub_info);
> @@ -594,7 +615,21 @@ int call_usermodehelper_exec(struct
> subprocess_info *sub_info, int wait)
>   sub_info->complete = &done;
>   sub_info->wait = wait;
>  
> - queue_work(khelper_wq, &sub_info->work);
> + if (likely(!in_my_ns)) {
> + INIT_WORK(&sub_info->work, __call_usermodehelper);
> + queue_work(khelper_wq, &sub_info->work);
> + } else {
> + // RACY, WRONG, ETC
> + struct task_struct *my_init =
> task_active_pid_ns(current)->child_reaper;
> +
> + init_task_work(&sub_info->twork, fork_umh_helper);
> + task_work_add(my_init, &sub_info->twork, false);
> +
> + // until we have task_work_add_interruptibel()
> + do_send_sig_info(SIGCHLD, SEND_SIG_FORCED, my_init,
> false);
> +
> + }
> +
>   if (wait == UMH_NO_WAIT)/* task has freed sub_info */
>   goto unlock;
>  
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux
> -fsdevel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] autofs: show pipe inode in mount options

2016-02-01 Thread Ian Kent

On Tue, 2016-01-26 at 11:55 +0800, Ian Kent wrote:
> On Mon, 2016-01-25 at 15:48 -0800, Andrew Morton wrote:
> > On Tue, 26 Jan 2016 10:19:07 +1100 Stephen Rothwell <
> > s...@canb.auug.org.au> wrote:
> > 
> > > Hi Ian,
> > > 
> > > On Sat, 23 Jan 2016 08:30:17 +0800 Ian Kent 
> > > wrote:
> > > > 
> > > > I haven't had anything significant enough for autofs to warrant
> > > > maintaining a tree and sending push requests so I'll need to ask
> > > > Stephen what I need to do (perhaps you could offer some advise on
> > > > that
> > > > now Stephen, please).
> > > 
> > > I guess if its just a few patches every now and then, then Andrew
> > > Morton may be the best person to shepherd them upstream.
> > 
> > yup, send 'em along.
> > 
> > I actually was handling the autofs4 stuff back in 2014 for a bit.
> 
> Thanks Andrew.
> 
> Last time I tried to send the module rename series we got confused some
> how, patches not seen leading to conflicts in applying later patches
> IIRC, which lead to the recommendation I send them to linux-next.
> 
> The series has grown a bit too now but I'm thinking I should send them
> in smaller groups, such as coding style fixes and white space fixes,
> change to use pr* logging, etc.
> 
> Hopefully that will make the process much more straight forward.
> 
> The thing is the patches are mostly not urgent which is why I keep
> postponing sending them when higher priority things come up.
> 
> As for the patch from Stanislav, I'll put that at the top of my patch
> queue, have a quick look at it and send it over so that, hopefully, it
> can get merged.
> 
> I'll probably send a couple of others too to get things going on with
> (what I'm calling) the module rename series.

Hi Andrew,

I've sent the first bunch of patches to get this change started, including
 Stanislav's patch.

Thanks
Ian

Re: [PATCH] autofs: show pipe inode in mount options

2016-01-25 Thread Ian Kent

On Mon, 2016-01-25 at 15:48 -0800, Andrew Morton wrote:
> On Tue, 26 Jan 2016 10:19:07 +1100 Stephen Rothwell <
> s...@canb.auug.org.au> wrote:
> 
> > Hi Ian,
> > 
> > On Sat, 23 Jan 2016 08:30:17 +0800 Ian Kent 
> > wrote:
> > > 
> > > I haven't had anything significant enough for autofs to warrant
> > > maintaining a tree and sending push requests so I'll need to ask
> > > Stephen what I need to do (perhaps you could offer some advise on
> > > that
> > > now Stephen, please).
> > 
> > I guess if its just a few patches every now and then, then Andrew
> > Morton may be the best person to shepherd them upstream.
> 
> yup, send 'em along.
> 
> I actually was handling the autofs4 stuff back in 2014 for a bit.

Thanks Andrew.

Last time I tried to send the module rename series we got confused some
how, patches not seen leading to conflicts in applying later patches
IIRC, which lead to the recommendation I send them to linux-next.

The series has grown a bit too now but I'm thinking I should send them
in smaller groups, such as coding style fixes and white space fixes,
change to use pr* logging, etc.

Hopefully that will make the process much more straight forward.

The thing is the patches are mostly not urgent which is why I keep
postponing sending them when higher priority things come up.

As for the patch from Stanislav, I'll put that at the top of my patch
queue, have a quick look at it and send it over so that, hopefully, it
can get merged.

I'll probably send a couple of others too to get things going on with
(what I'm calling) the module rename series.

Ian

Re: [PATCH] autofs: show pipe inode in mount options

2016-01-22 Thread Ian Kent

On Sat, 2016-01-23 at 08:30 +0800, Ian Kent wrote:
> On Fri, 2016-01-22 at 12:34 +0100, Stanislav Kinsburskiy wrote:
> > Hi again,
> > 
> > I would like to ask about any progress with this patch?
> > Any other requirements to make it able to merge?
> 
> Sorry for the delay.
> 
> Since there haven't been any comments from Al or Stephen I'm think I
> should include it in the series I plan on sending to linux-next to
> rename autofs4 to autofs (among other things).
> 
> I haven't had anything significant enough for autofs to warrant
> maintaining a tree and sending push requests so I'll need to ask
> Stephen what I need to do (perhaps you could offer some advise on
> that
> now Stephen, please).

Apologies, there appears to be a parse error in my grammar above,
sorry, and clearly "push" should be "pull" in the paragraph above.

> 
> I'm also struggling to get back to this and carry out the needed
> testing and I'll need to re-base the series too now but I'm getting
> there.
> 
> I didn't see a follow up patch with an updated description, did I
> miss
> it?
> 
> Ian
> --
> To unsubscribe from this list: send the line "unsubscribe autofs" in

Re: [PATCH] autofs: show pipe inode in mount options

2016-01-22 Thread Ian Kent

On Fri, 2016-01-22 at 12:34 +0100, Stanislav Kinsburskiy wrote:
> Hi again,
> 
> I would like to ask about any progress with this patch?
> Any other requirements to make it able to merge?

Sorry for the delay.

Since there haven't been any comments from Al or Stephen I'm think I
should include it in the series I plan on sending to linux-next to
rename autofs4 to autofs (among other things).

I haven't had anything significant enough for autofs to warrant
maintaining a tree and sending push requests so I'll need to ask
Stephen what I need to do (perhaps you could offer some advise on that
now Stephen, please).

I'm also struggling to get back to this and carry out the needed
testing and I'll need to re-base the series too now but I'm getting
there.

I didn't see a follow up patch with an updated description, did I miss
it?

Ian

Re: [RFC] A couple of questions about the paged I/O sub system

2015-10-23 Thread Ian Kent

On Thu, 2015-10-22 at 18:54 -0700, Hugh Dickins wrote:
> On Thu, 22 Oct 2015, Ian Kent wrote:
> > On Wed, 2015-10-21 at 12:56 -0700, Hugh Dickins wrote:
> > > On Wed, 21 Oct 2015, Ian Kent wrote:
> > 
> > Thanks for taking the time to reply Hugh.
> > 
> > > 
> > > > Hi all,
> > > > 
> > > > I've been looking through some of the page reclaim code and at
> > > > truncate_inode_pages().
> > > > 
> > > > I'm not familiar with the code and I'm struggling to understand
> it.
> > > > 
> > > > One thing that is puzzling me right now is, if a file has pages
> > > that
> > > > have been modified and are swapped out when
> > > pagevec_lookup_entries() is
> > > > called will they be found?
> > > 
> > > truncate_inode_pages() is a library function which a filesystem
> calls
> > > at some stage in its inode truncation processing, to take all the
> > > incore
> > > pages out of pagecache (out of its radix_tree), and free them up
> > > (usually: some might be otherwise pinned in memory at the time).
> > > 
> > > A filesystem will have other work to do, very particular to that
> > > filesystem, to free up the actual disk blocks: that's definitely
> > > not part of truncate_inode_pages()'s job.
> > > 
> > > It's also called when evicting an inode no longer needed in
> memory,
> > > to free the associated pagecache, when not deleting the blocks on
> > > disk.
> > > 
> > > I think I don't understand your "swapped out": modifications
> occur to
> > > a page while it is in pagecache, and those modifications need to
> be
> > > written back to disk before that page can be reclaimed for other
> use.
> > 
> > Indeed, now I think about it, "swapped out" is a bad choice of
> words
> > when talking about a paged IO system.
> > 
> > What I'm trying to say is if pages allocated to a mapping are
> modified,
> > then under memory pressure, are they ever reclaimed by writing them
> to
> > swap storage or are they always reclaimed by writing them back to
> disk?
> > 
> > Now I think about what you've said here and looking at the code I
> > suspect the answer is they are always reclaimed by writing them to
> > disk.
> 
> Yes.
> 
> > 
> > > 
> > > > 
> > > > If not then how does truncate_inode_pages(_range)() handle
> waiting
> > > for
> > > > these pages to be swapped back in to perform the writeback and
> > > > truncation?
> > > 
> > > Pages are never "swapped back in to perform the writeback":
> > > if writeback is needed, it's done before the page can be freed
> from
> > > pagecache; and if that data is needed again after the page was
> freed,
> > > it's read back in from disk to fresh page.
> > 
> > That makes sense, using swap would be unnecessary double handling.
> > 
> > > 
> > > You may be worrying about what happens when a page is modified or
> > > under writeback when it is truncated: I think that's something
> each
> > > filesystem has to be careful of, and may deal with in different
> ways.
> > 
> > I'm wondering how a mapping nrpages can be non-zero (read greater
> than
> > one) after calling truncate_inode_pages().
> > 
> > But I'm looking at a much older kernel so it's quite different to
> > current upstream and this seemed like a question relevant to both
> > kernels to get some idea of how page reclaim works.
> > 
> > I guess what I'm really looking to work out is if it's possible,
> with
> > the current upstream kernel, for a mapping to have nrpages greater
> than
> > 1 after calling truncate_inode_pages() and hopefully get some
> > explanation of why if that's not so.
> 
> I assume you're worrying about a truncate_inode_pages(mapping, 0). 
> If
> it's truncate_inode_pages(mapping, 1), or lstart anything greater
> than 0,
> then it will leave behind the incompletely truncated pages at the
> start:
> no mystery in that.

I am, sorry I didn't make that clear to start with.

> 
> > 
> > It's certainly possible with the older kernel I'm looking at but I
> need
> > some info. before I consider looking for possible changes to back
> port.
> 
> Probably what you're looking for is Jan Kara's v3.0 commit
> 08

Re: [RFC] A couple of questions about the paged I/O sub system

2015-10-21 Thread Ian Kent

On Wed, 2015-10-21 at 12:56 -0700, Hugh Dickins wrote:
> On Wed, 21 Oct 2015, Ian Kent wrote:

Thanks for taking the time to reply Hugh.

> 
> > Hi all,
> > 
> > I've been looking through some of the page reclaim code and at
> > truncate_inode_pages().
> > 
> > I'm not familiar with the code and I'm struggling to understand it.
> > 
> > One thing that is puzzling me right now is, if a file has pages
> that
> > have been modified and are swapped out when
> pagevec_lookup_entries() is
> > called will they be found?
> 
> truncate_inode_pages() is a library function which a filesystem calls
> at some stage in its inode truncation processing, to take all the
> incore
> pages out of pagecache (out of its radix_tree), and free them up
> (usually: some might be otherwise pinned in memory at the time).
> 
> A filesystem will have other work to do, very particular to that
> filesystem, to free up the actual disk blocks: that's definitely
> not part of truncate_inode_pages()'s job.
> 
> It's also called when evicting an inode no longer needed in memory,
> to free the associated pagecache, when not deleting the blocks on
> disk.
> 
> I think I don't understand your "swapped out": modifications occur to
> a page while it is in pagecache, and those modifications need to be
> written back to disk before that page can be reclaimed for other use.

Indeed, now I think about it, "swapped out" is a bad choice of words
when talking about a paged IO system.

What I'm trying to say is if pages allocated to a mapping are modified,
then under memory pressure, are they ever reclaimed by writing them to
swap storage or are they always reclaimed by writing them back to disk?

Now I think about what you've said here and looking at the code I
suspect the answer is they are always reclaimed by writing them to
disk.

> 
> > 
> > If not then how does truncate_inode_pages(_range)() handle waiting
> for
> > these pages to be swapped back in to perform the writeback and
> > truncation?
> 
> Pages are never "swapped back in to perform the writeback":
> if writeback is needed, it's done before the page can be freed from
> pagecache; and if that data is needed again after the page was freed,
> it's read back in from disk to fresh page.

That makes sense, using swap would be unnecessary double handling.

> 
> You may be worrying about what happens when a page is modified or
> under writeback when it is truncated: I think that's something each
> filesystem has to be careful of, and may deal with in different ways.

I'm wondering how a mapping nrpages can be non-zero (read greater than
one) after calling truncate_inode_pages().

But I'm looking at a much older kernel so it's quite different to
current upstream and this seemed like a question relevant to both
kernels to get some idea of how page reclaim works.

I guess what I'm really looking to work out is if it's possible, with
the current upstream kernel, for a mapping to have nrpages greater than
1 after calling truncate_inode_pages() and hopefully get some
explanation of why if that's not so.

It's certainly possible with the older kernel I'm looking at but I need
some info. before I consider looking for possible changes to back port.

> 
> I'm not sure how much to read in to your use of the word "swap".
> It's true that shmem/tmpfs uses swap (of the swapon/swapoff variety)
> as backing for its pages when under pressure (and uses its own
> variant
> shmem_undo_range() to manage that, instead of
> truncate_inode_pages()),
> but most filesystems don't use "swap" at all.
> 
> I just noticed your subject "paged I/O sub system": I hope you
> realize
> that mm/page_io.c is solely concerned with swap (of the
> swapon/swapoff
> variety), and has next to nothing to do with filesystems.  (Just as,
> conversely, mm/swap.c has next to nothing to do with swap.)

LOL, right, I'm looking at the page reclaim code which, so far, hasn't
lead me to either of those source files.

> 
> > 
> > Anyone, please?
> 
> I hope something I've said there has helped, but warn you that
> I'm a terrible person to engage in an extended conversation with!
> Expect long silences, pray for someone else to jump in.

As well as pointing out that swap storage shouldn't be used in this
case you've reminded me of the difference between swapping and demand
paging, so that's a good start.

Perhaps folks at linux-mm will have more to say.

> > Ian
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[RFC] A couple of questions about the paged I/O sub system

2015-10-20 Thread Ian Kent

Hi all,

I've been looking through some of the page reclaim code and at
truncate_inode_pages().

I'm not familiar with the code and I'm struggling to understand it.

One thing that is puzzling me right now is, if a file has pages that
have been modified and are swapped out when pagevec_lookup_entries() is
called will they be found?

If not then how does truncate_inode_pages(_range)() handle waiting for
these pages to be swapped back in to perform the writeback and
truncation?

Anyone, please?
Ian

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC] freeing unlinked file indefinitely delayed

2015-07-12 Thread Ian Kent

On Sun, 2015-07-12 at 16:17 +0100, Al Viro wrote:
> On Thu, Jul 09, 2015 at 07:26:44PM +0800, Ian Kent wrote:
> > > But the dentrys that will most likely face summary execution will be
> > > hashed, such as was the case on that 2.6.32 kernel at dput().
> > > 
> > > Doesn't that mean that something dropped the dentry after the dput(),
> > > that will now also free the dentry, that took the refcount to 0?
> > 
> > Oh wait, think I get it now ... perhaps it's prune_one_dentry() doing
> > it ...
> 
> What, unhashing?  Yes, it does.

Yep, that was what I was thinking at the time.

> 
> A bit of context - the breakage that had first pointed in direction of
> this bug had been a deadlock with dcache shrinker run on frozen fs was
> stumbling across a hashed dentry with zero refcount *and* zero link count
> of its inode, triggering its eviction, final iput(), inode freeing and
> deadlock on attempt to do sb_start_intwrite() there; figuring out how could
> such a dentry appear in the first place had uncovered this fun.  Which
>   a) is a bug in its own right and
>   b) happens in mainline as well.

I get all of that, and it sure does look like these things should be
treated as unhashed.

My puzzle is the life cycle of DCACHE_DISCONNECTED dentrys, which is
mostly unrelated.

Not to worry, this isn't the first time I've been defeated trying to
work it out.

The only way I can see disconnected dentrys created (possibly unhashed,
and maybe not materialized) is via nfs and nfsd, beside the usage
mentioned here of course.

There must be some indirection I'm missing wrt. export_operations
usage 

Ian

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC] freeing unliked file indefinitely delayed

2015-07-09 Thread Ian Kent

On Thu, 2015-07-09 at 19:17 +0800, Ian Kent wrote:
> On Wed, 2015-07-08 at 02:42 +0100, Al Viro wrote:
> > Normally opening a file, unlinking it and then closing will have
> > the inode freed upon close() (provided that it's not otherwise busy and
> > has no remaining links, of course).  However, there's one case where that
> > does *not* happen.  Namely, if you open it by fhandle with cold dcache,
> > then unlink() and close().
> > 
> > In normal case you get d_delete() in unlink(2) notice that dentry
> > is busy and unhash it; on the final dput() it will be forcibly evicted from
> > dcache, triggering iput() and inode removal.  In this case, though, we end
> > up with *two* dentries - disconnected (created by open-by-fhandle) and
> > regular one (used by unlink()).  The latter will have its reference to inode
> > dropped just fine, but the former will not - it's considered hashed (it
> > is on the ->s_anon list), so it will stay around until the memory pressure
> > will finally do it in.  As the result, we have the final iput() delayed
> > indefinitely.  It's trivial to reproduce -
> > 
> > #define _GNU_SOURCE
> > #include 
> > #include 
> > #include 
> > 
> > void flush_dcache(void)
> > {
> > system("mount -o remount,rw /");
> > }
> > 
> > static char buf[20 * 1024 * 1024];
> > 
> > main()
> > {
> > int fd;
> > union { 
> > struct file_handle f;
> > char buf[MAX_HANDLE_SZ];
> > } x;
> > int m;
> > 
> > x.f.handle_bytes = sizeof(x);
> > chdir("/root");
> > mkdir("foo", 0700);
> > fd = open("foo/bar", O_CREAT | O_RDWR, 0600);
> > close(fd);
> > name_to_handle_at(AT_FDCWD, "foo/bar", &x.f, &m, 0);
> > flush_dcache();
> > fd = open_by_handle_at(AT_FDCWD, &x.f, O_RDWR);
> > unlink("foo/bar");
> > write(fd, buf, sizeof(buf));
> > system("df ."); /* 20Mb eaten */
> > close(fd);
> > system("df ."); /* should've freed those 20Mb */
> > flush_dcache();
> > system("df ."); /* should be the same as #2 */
> > }
> > 
> > will spit out something like
> > Filesystem 1K-blocks   Used Available Use% Mounted on
> > /dev/root 322023 303843  1131 100% /
> > Filesystem 1K-blocks   Used Available Use% Mounted on
> > /dev/root 322023 303843  1131 100% /
> > Filesystem 1K-blocks   Used Available Use% Mounted on
> > /dev/root 322023 283282 21692  93% /
> > - inode gets freed only when dentry is finally evicted (here we trigger
> > than by remount; normally it would've happened in response to memory
> > pressure hell knows when).
> > 
> > IMO it's a bug.  Between the close() and final flush_dcache() the file has
> > no surviving links, is *not* busy, won't show up in fuser/lsof/whatnot
> > output, and yet it's still not freed.  I'm not saying that it's 
> > realistically
> > exploitable (albeit with nfsd it might be), but it's a very unpleasant
> > self-LART.
> > 
> > FWIW, my prefered fix would be simply to have the final dput() treat
> > disconnected dentries as "kill on sight"; checking for i_nlink of the
> > inode, as Bruce suggested several years ago, will *not* work, simply
> > because having another link to that file and unlinking it after close
> > will reproduce the situation - disconnected dentry sticks around in
> > dcache past its final dput() and past the last unlink() of our file.
> > Theoretically it might cause an overhead for nfsd (no_subtree_check v3
> > export might see more d_alloc()/d_free(); icache retention will still
> > prevent constant rereading the inode from disk).  _IF_ that proves to
> > be noticable, we might need to come up with something more elaborate
> > (e.g. have unlink() and rename() kick disconnected aliases out if the link
> > count has reached 0), but it's more complex and needs careful ananlysis
> > of correctness - we need to prove that there's no way to miss the link
> > count reaching 0.  I would prefer to treat all disconnected as unhashed
> > for dcache retention purposes - it's simpler and less brittle.  Comments?
> > I mean something like this:
> 
> A

Re: [RFC] freeing unliked file indefinitely delayed

2015-07-09 Thread Ian Kent

On Wed, 2015-07-08 at 02:42 +0100, Al Viro wrote:
>   Normally opening a file, unlinking it and then closing will have
> the inode freed upon close() (provided that it's not otherwise busy and
> has no remaining links, of course).  However, there's one case where that
> does *not* happen.  Namely, if you open it by fhandle with cold dcache,
> then unlink() and close().
> 
>   In normal case you get d_delete() in unlink(2) notice that dentry
> is busy and unhash it; on the final dput() it will be forcibly evicted from
> dcache, triggering iput() and inode removal.  In this case, though, we end
> up with *two* dentries - disconnected (created by open-by-fhandle) and
> regular one (used by unlink()).  The latter will have its reference to inode
> dropped just fine, but the former will not - it's considered hashed (it
> is on the ->s_anon list), so it will stay around until the memory pressure
> will finally do it in.  As the result, we have the final iput() delayed
> indefinitely.  It's trivial to reproduce -
> 
> #define _GNU_SOURCE
> #include 
> #include 
> #include 
> 
> void flush_dcache(void)
> {
> system("mount -o remount,rw /");
> }
> 
> static char buf[20 * 1024 * 1024];
> 
> main()
> {
> int fd;
> union { 
> struct file_handle f;
> char buf[MAX_HANDLE_SZ];
> } x;
> int m;
> 
> x.f.handle_bytes = sizeof(x);
> chdir("/root");
> mkdir("foo", 0700);
> fd = open("foo/bar", O_CREAT | O_RDWR, 0600);
> close(fd);
> name_to_handle_at(AT_FDCWD, "foo/bar", &x.f, &m, 0);
> flush_dcache();
> fd = open_by_handle_at(AT_FDCWD, &x.f, O_RDWR);
> unlink("foo/bar");
> write(fd, buf, sizeof(buf));
> system("df .");   /* 20Mb eaten */
> close(fd);
> system("df .");   /* should've freed those 20Mb */
> flush_dcache();
> system("df .");   /* should be the same as #2 */
> }
> 
> will spit out something like
> Filesystem 1K-blocks   Used Available Use% Mounted on
> /dev/root 322023 303843  1131 100% /
> Filesystem 1K-blocks   Used Available Use% Mounted on
> /dev/root 322023 303843  1131 100% /
> Filesystem 1K-blocks   Used Available Use% Mounted on
> /dev/root 322023 283282 21692  93% /
> - inode gets freed only when dentry is finally evicted (here we trigger
> than by remount; normally it would've happened in response to memory
> pressure hell knows when).
> 
> IMO it's a bug.  Between the close() and final flush_dcache() the file has
> no surviving links, is *not* busy, won't show up in fuser/lsof/whatnot
> output, and yet it's still not freed.  I'm not saying that it's realistically
> exploitable (albeit with nfsd it might be), but it's a very unpleasant
> self-LART.
> 
> FWIW, my prefered fix would be simply to have the final dput() treat
> disconnected dentries as "kill on sight"; checking for i_nlink of the
> inode, as Bruce suggested several years ago, will *not* work, simply
> because having another link to that file and unlinking it after close
> will reproduce the situation - disconnected dentry sticks around in
> dcache past its final dput() and past the last unlink() of our file.
> Theoretically it might cause an overhead for nfsd (no_subtree_check v3
> export might see more d_alloc()/d_free(); icache retention will still
> prevent constant rereading the inode from disk).  _IF_ that proves to
> be noticable, we might need to come up with something more elaborate
> (e.g. have unlink() and rename() kick disconnected aliases out if the link
> count has reached 0), but it's more complex and needs careful ananlysis
> of correctness - we need to prove that there's no way to miss the link
> count reaching 0.  I would prefer to treat all disconnected as unhashed
> for dcache retention purposes - it's simpler and less brittle.  Comments?
> I mean something like this:

Al, help me out here, I'm struggling to understand where these dentrys
come from (apart from your reproducer).

For example, on the heavily patched 2.6.32 kernel where this was first
seen the problem dentry is annoymous, refcount 0, and unhashed.

But the dentrys that will most likely face summary execution will be
hashed, such as was the case on that 2.6.32 kernel at dput().

Doesn't that mean that something dropped the dentry after the dput(),
that will now also free the dentry, that took the refcount to 0?

Ian

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3.2 059/221] autofs4: check dev ioctl size before allocating

2015-05-04 Thread Ian Kent

On Tue, 2015-05-05 at 02:16 +0100, Ben Hutchings wrote:
> 3.2.69-rc1 review patch.  If anyone has any objections, please let me know.

Perhaps you should also consider including commit 0a280962 along with
this one.

> 
> --
> 
> From: Sasha Levin 
> 
> commit e53d77eb8bb616e903e34cc7a918401bee3b5149 upstream.
> 
> There wasn't any check of the size passed from userspace before trying
> to allocate the memory required.
> 
> This meant that userspace might request more space than allowed,
> triggering an OOM.
> 
> Signed-off-by: Sasha Levin 
> Signed-off-by: Ian Kent 
> Signed-off-by: Andrew Morton 
> Signed-off-by: Linus Torvalds 
> Signed-off-by: Ben Hutchings 
> ---
>  fs/autofs4/dev-ioctl.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> --- a/fs/autofs4/dev-ioctl.c
> +++ b/fs/autofs4/dev-ioctl.c
> @@ -103,6 +103,9 @@ static struct autofs_dev_ioctl *copy_dev
>   if (tmp.size < sizeof(tmp))
>   return ERR_PTR(-EINVAL);
>  
> + if (tmp.size > (PATH_MAX + sizeof(tmp)))
> + return ERR_PTR(-ENAMETOOLONG);
> +
>   return memdup_user(in, tmp.size);
>  }
>  
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[ANNOUNCE] autofs 5.0.10 release

2015-04-21 Thread Ian Kent

Hi all,

The thing to watch out for in this release is a change made to program
map execution environments. The standard environment added at program
map execution introduced a security problem when interpreted languages
like python were used. By default, a prefix is now added to these names
to avoid the problem and, for those that wish to force the use of
standard
names, a configuration option has been added to do that.

autofs
==

The package can be found at:

ftp://ftp.kernel.org/pub/linux/daemons/autofs/v5

It is autofs-5.0.10.tar.[gz|xz]

No source rpm is there as it can be produced by using:

rpmbuild -ts autofs-5.0.10.tar.gz

and the binary rpm by using:

rpmbuild -tb autofs-5.0.10.tar.gz

See the README.amd-maps file for information about using amd format
maps.

Here are the entries from the CHANGELOG which outline the updates:

21/04/2015 autofs-5.0.10
===
- fix mistake in assignment.
- check for non existent negative entries in lookup_ghost().
- fix reset flex scan buffer on init.
- wait for master map available at start.
- add master read wait option.
- add serialization to sasl init.
- dont allocate dev_ctl_ops too early.
- fix race accessing qdn in get_query_dn().
- fix incorrect round robin host detection.
- fix memory leak in create_client().
- fix memory leak in get_exports().
- fix typo in flagdir configure option.
- clarify multiple mounts description.
- gaurd against incorrect umount return.
- update man page autofs(8) for systemd.
- dont pass sloppy option for other than nfs mounts.
- make service want network-online.
- fix fix master map type check.
- init qdn before use in get_query_dn().
- fix typo in update_hosts_mounts().
- make negative cache update consistent for all lookup modules.
- ensure negative cache isn't updated on remount.
- dont add wildcard to negative cache.
- add a prefix to program map stdvars.
- add config option to force use of program map stdvars.
- fix incorrect check in parse_mount().
- handle duplicates in multi mounts.
- revert special case cifs escapes.
- fix map option parsing for 'strictatime'.
- fix showmount search in auto.net.
- remove obsolete comment in auto.net.
- fix macro usage in lookup_program.c.
- fix gcc5 complaints.
- remove unused offset handling code.
- fix mount as you go offset selection.
- link daemon with pthread library (Debian patch).
- manpage corrections (Debian patch).
- fix manpages hyphenation (Debian patch).

Ian




signature.asc
Description: This is a digitally signed message part

[ANNOUNCE] autofs 5.1.1 release

2015-04-21 Thread Ian Kent

Hi all,

The thing to watch out for in this release is a change made to program
map execution environments. The standard environment added at program
map execution introduced a security problem when interpreted languages
like python were used. By default, a prefix is now added to these names
to avoid the problem and, for those that wish to force the use of
standard
names, a configuration option has been added to do that.

autofs
==

The package can be found at:

ftp://ftp.kernel.org/pub/linux/daemons/autofs/v5

It is autofs-5.1.1.tar.[gz|xz]

No source rpm is there as it can be produced by using:

rpmbuild -ts autofs-5.1.1.tar.gz

and the binary rpm by using:

rpmbuild -tb autofs-5.1.1.tar.gz

See the README.amd-maps file for information about using amd format
maps.

Here are the entries from the CHANGELOG which outline the updates:

21/04/2015 autofs-5.1.1
===
- fix compile error in defaults.c.
- add serialization to sasl init.
- dont allocate dev_ctl_ops too early.
- fix incorrect round robin host detection.
- fix race accessing qdn in get_query_dn().
- fix leak in cache_push_mapent().
- fix config entry read buffer not checked.
- fix FILE pointer check in defaults_read_config().
- fix memory leak in conf_amd_get_log_options().
- fix signed comparison in inet_fill_net().
- fix buffer size checks in get_network_proximity().
- fix leak in get_network_proximity().
- fix buffer size checks in merge_options().
- check amd lex buffer len before copy.
- add return check in ldap check_map_indirect().
- check host macro is set before use.
- check options length before use in parse_amd.c.
- fix some out of order evaluations in parse_amd.c.
- fix copy and paste error in dup_defaults_entry().
- fix leak in parse_mount().
- add mutex call return check in defaults.c.
- force disable browse mode for amd format maps.
- fix hosts map options check in lookup_amd_instance().
- fix memory leak in create_client().
- fix memory leak in get_exports().
- fix memory leak in get_defaults_entry().
- fix out of order clearing of options buffer.
- fix reset amd lexer scan buffer.
- ignore multiple commas in options strings.
- fix typo in flagdir configure option.
- clarify multiple mounts description.
- gaurd against incorrect umount return.
- update man page autofs(8) for systemd.
- dont pass sloppy option for other than nfs mounts.
- make service want network-online.
- fix fix master map type check.
- init qdn before use in get_query_dn().
- fix typo in update_hosts_mounts().
- fix hosts map update on reload.
- make negative cache update consistent for all lookup modules.
- ensure negative cache isn't updated on remount.
- dont add wildcard to negative cache.
- add a prefix to program map stdvars.
- add config option to force use of program map stdvars.
- fix incorrect check in parse_mount().
- handle duplicates in multi mounts.
- revert special case cifs escapes.
- fix map option parsing for 'strictatime'.
- fix showmount search in auto.net.
- remove obsolete comment in auto.net.
- fix macro usage in lookup_program.c.
- fix gcc5 complaints.
- remove unused offset handling code.
- fix mount as you go offset selection.
- link daemon with pthread library (Debian patch).
- manpage corrections (Debian patch).
- fix manpages hyphenation (Debian patch).

Ian


signature.asc
Description: This is a digitally signed message part

Re: [RFC PATCH 5 7/7] KEYS: exec request key within service thread of key creator

2015-04-06 Thread Ian Kent

On Thu, 2015-04-02 at 13:58 +0100, David Howells wrote:
> Ian Kent  wrote:
> 
> > +
> > +   /* Namespace token */
> > +   int umh_token;
> 
> If you could put it after data_len so that all the smaller-than-wordsize
> fields are together for better packing.

OK.

> 
> > +   umh_wq_put_token(key->umh_token);
> 
> Does gc.c need an extra #include for this?

Umm ... you'd think so, wonder how it compiled without kmod.h 

> 
> > +   /* If running within a container use the container namespace */
> > +   if (current->nsproxy->net_ns != &init_net)
> > +   key->umh_token = umh_wq_get_token(0, "keys");
> 
> So keys live in the networking namespace?

Perhaps checking the pid namespace would make more sense?

> 
> > -   ret = call_usermodehelper_keys(argv[0], argv, envp, keyring,
> > -  UMH_WAIT_PROC);
> > +   /* If running within a container use the container namespace */
> > +   if (key->umh_token)
> > +   ret = call_usermodehelper_keys_service(argv[0], argv, envp,
> > +  keyring, key->umh_token,
> > +  UMH_WAIT_PROC);
> > +   else
> > +   ret = call_usermodehelper_keys(argv[0], argv, envp,
> > +  keyring, UMH_WAIT_PROC);
> 
> call_usermodehelper_keys_service() would appear to be superfluous.  If
> key->umh_token is 0, you call call_usermodehelper_keys() which then calls
> call_usermodehelper_keys_service() with a 0 token...

Yeah, not really worth the additional function. IIRC there are no other
callers of call_usermodehelper_keys().

> 
> David


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC PATCH 5 1/7] kmod - add workqueue service thread store

2015-04-06 Thread Ian Kent

On Thu, 2015-04-02 at 13:43 +0100, David Howells wrote:
> Ian Kent  wrote:
> 
> > +static struct umh_wq_entry *umh_wq_find_entry(int token)
> > +{
> > +   struct umh_wq_entry *this, *entry;
> > +   struct hlist_head *bucket;
> > +   unsigned int hash;
> > +
> > +   hash = hash_32((unsigned long) token, UMH_WQ_HASH_SHIFT);
> > +   bucket = &umh_wq_hash[hash];
> > +
> > +   entry = ERR_PTR(-ENOENT);
> > +   if (hlist_empty(bucket))
> > +   goto out;
> > +
> > +   hlist_for_each_entry(this, bucket, umh_wq_hlist) {
> > +   if (this->token == token) {
> > +   entry = this;
> > +   break;
> > +   }
> > +   }
> > +out:
> > +   return entry;
> > +}
> 
> Can "struct umh_wq_entry *" be used as the token?

Probably not, for example.

Couldn't a user set a different workqueue_struct and have it used for
execution. Not sure what that would get the user but it sounds like the
original reason we couldn't allow execution directly within the caller
environment.

> 
> David


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC PATCH 5 3/7] nfsd - use service thread if not executing in init namespace

2015-03-31 Thread Ian Kent

On Tue, 2015-03-31 at 09:14 -0400, J. Bruce Fields wrote:
> On Tue, Mar 31, 2015 at 11:14:58AM +0800, Ian Kent wrote:
> > From: Ian Kent 
> > 
> > If nfsd is running within a container the client tracking operations
> > should run within their originating container also. To do that get a
> > token to a service thread created within the container environment
> > for usermode helper calls.
> > 
> > Signed-off-by: Ian Kent 
> > Cc: Benjamin Coddington 
> > Cc: Al Viro 
> > Cc: J. Bruce Fields 
> > Cc: David Howells 
> > Cc: Trond Myklebust 
> > Cc: Oleg Nesterov 
> > Cc: Eric W. Biederman 
> > Cc: Jeff Layton 
> > ---
> >  fs/nfsd/netns.h   |3 +++
> >  fs/nfsd/nfs4recover.c |   48 
> > +++-
> >  fs/nfsd/nfsctl.c  |6 ++
> >  3 files changed, 40 insertions(+), 17 deletions(-)
> > 
> > diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
> > index ea6749a..099a3c5 100644
> > --- a/fs/nfsd/netns.h
> > +++ b/fs/nfsd/netns.h
> > @@ -112,6 +112,9 @@ struct nfsd_net {
> > u32 clientid_counter;
> >  
> > struct svc_serv *nfsd_serv;
> > +
> > +   /* Namespace token */
> > +   int umh_token;
> >  };
> >  
> >  /* Simple check to find out if a given net was properly initialized */
> > diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> > index 1c307f0..2547edb 100644
> > --- a/fs/nfsd/nfs4recover.c
> > +++ b/fs/nfsd/nfs4recover.c
> > @@ -1184,7 +1184,8 @@ nfsd4_cltrack_grace_start(time_t grace_start)
> >  }
> >  
> >  static int
> > -nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1)
> > +nfsd4_umh_cltrack_upcall(char *cmd, char *arg,
> > +char *env0, char *env1, int token)
> >  {
> > char *envp[3];
> > char *argv[4];
> > @@ -1209,7 +1210,11 @@ nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char 
> > *env0, char *env1)
> > argv[2] = arg;
> > argv[3] = NULL;
> >  
> > -   ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
> > +   if (token > 0)
> > +   ret = call_usermodehelper_service(argv[0], argv, envp,
> > + token, UMH_WAIT_PROC);
> > +   else
> > +   ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
> 
> Do we really need to handle the init_net case specially, or could we
> just let it create a workqueue in that case as well?

That's pretty much up to you but there's still the need to get a token
to create the work queue (and put it to terminate it) or just pass 0 to
call_usermodehelper_service().

> 
> --b.
> 
> > /*
> >  * Disable the upcall mechanism if we're getting an ENOENT or EACCES
> >  * error. The admin can re-enable it on the fly by using sysfs
> > @@ -1252,14 +1257,8 @@ nfsd4_umh_cltrack_init(struct net *net)
> > struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > char *grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
> >  
> > -   /* XXX: The usermode helper s not working in container yet. */
> > -   if (net != &init_net) {
> > -   WARN(1, KERN_ERR "NFSD: attempt to initialize umh client "
> > -   "tracking in a container!\n");
> > -   return -EINVAL;
> > -   }
> > -
> > -   ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL);
> > +   ret = nfsd4_umh_cltrack_upcall("init", NULL,
> > +   grace_start, NULL, nn->umh_token);
> > kfree(grace_start);
> > return ret;
> >  }
> > @@ -1285,6 +1284,7 @@ nfsd4_umh_cltrack_create(struct nfs4_client *clp)
> >  {
> > char *hexid, *has_session, *grace_start;
> > struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> > +   int ret;
> >  
> > /*
> >  * With v4.0 clients, there's little difference in outcome between a
> > @@ -1312,7 +1312,10 @@ nfsd4_umh_cltrack_create(struct nfs4_client *clp)
> > grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
> >  
> > nfsd4_cltrack_upcall_lock(clp);
> > -   if (!nfsd4_umh_cltrack_upcall("create", hexid, has_session, 
> > grace_start))
> > +   ret = nfsd4_umh_cltrack_upcall("create",
> > +  hexid, has_session, grace_start,
> > +  nn->umh_token);
> > +   if (!ret)
> > set_b

Re: [RFC PATCH 5 1/7] kmod - add workqueue service thread store

2015-03-31 Thread Ian Kent

On Tue, 2015-03-31 at 07:21 -0400, Jeff Layton wrote:
> On Tue, 31 Mar 2015 11:14:42 +0800
> Ian Kent  wrote:
> 
> > From: Ian Kent 
> > 
> > Persistent use of process information is needed for contained
> > execution in a namespace other than the root init namespace.
> > 
> > Use a simple random token as a key to create and store thread
> > information in a hashed list for use by the usermode helper
> > thread runner.
> > 
> > Signed-off-by: Ian Kent 
> > Cc: Benjamin Coddington 
> > Cc: Al Viro 
> > Cc: J. Bruce Fields 
> > Cc: David Howells 
> > Cc: Trond Myklebust 
> > Cc: Oleg Nesterov 
> > Cc: Eric W. Biederman 
> > Cc: Jeff Layton 
> > ---
> >  include/linux/kmod.h |3 +
> >  kernel/kmod.c|  179 
> > ++
> >  2 files changed, 181 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/linux/kmod.h b/include/linux/kmod.h
> > index 0555cc6..fa46722 100644
> > --- a/include/linux/kmod.h
> > +++ b/include/linux/kmod.h
> > @@ -66,6 +66,9 @@ struct subprocess_info {
> > void *data;
> >  };
> >  
> > +extern int umh_wq_get_token(int token, const char *service);
> > +extern void umh_wq_put_token(int token);
> > +
> >  extern int
> >  call_usermodehelper(char *path, char **argv, char **envp, int wait);
> >  
> > diff --git a/kernel/kmod.c b/kernel/kmod.c
> > index 2777f40..55d20ce 100644
> > --- a/kernel/kmod.c
> > +++ b/kernel/kmod.c
> > @@ -40,13 +40,30 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> > +#include 
> > +#include 
> >  
> >  #include 
> >  
> >  extern int max_threads;
> >  
> > +#define KHELPER"khelper"
> >  static struct workqueue_struct *khelper_wq;
> >  
> > +#define UMH_WQ_HASH_SHIFT  6
> > +#define UMH_WQ_HASH_SIZE   1 << UMH_WQ_HASH_SHIFT
> > +
> > +struct umh_wq_entry {
> > +   int token;
> > +   unsigned int count;
> > +   struct workqueue_struct *wq;
> > +   struct hlist_node umh_wq_hlist;
> > +};
> > +
> > +static DEFINE_SPINLOCK(umh_wq_hash_lock);
> > +static struct hlist_head umh_wq_hash[UMH_WQ_HASH_SIZE];
> > +
> >  #define CAP_BSET   (void *)1
> >  #define CAP_PI (void *)2
> >  
> > @@ -475,6 +492,165 @@ static void helper_unlock(void)
> > wake_up(&running_helpers_waitq);
> >  }
> >  
> > +static void umh_wq_hash_init(void)
> > +{
> > +   int i;
> > +
> > +   for (i = 0; i < UMH_WQ_HASH_SIZE; i++)
> > +   INIT_HLIST_HEAD(&umh_wq_hash[i]);
> > +}
> > +
> > +static struct umh_wq_entry *umh_wq_find_entry(int token)
> > +{
> > +   struct umh_wq_entry *this, *entry;
> > +   struct hlist_head *bucket;
> > +   unsigned int hash;
> > +
> > +   hash = hash_32((unsigned long) token, UMH_WQ_HASH_SHIFT);
> > +   bucket = &umh_wq_hash[hash];
> > +
> > +   entry = ERR_PTR(-ENOENT);
> > +   if (hlist_empty(bucket))
> > +   goto out;
> > +
> > +   hlist_for_each_entry(this, bucket, umh_wq_hlist) {
> > +   if (this->token == token) {
> > +   entry = this;
> > +   break;
> > +   }
> > +   }
> > +out:
> > +   return entry;
> > +}
> > +
> > +static struct workqueue_struct *umh_find_wq(int token, unsigned int nowait)
> 
> nit: there's no caller of this in this patch, but one is added in patch #2.

I could change that.
The patch is meant to setup the infrastructure for the subsequent patch.
I can re-organize them if this ends up being worth continuing with.

> 
> > +{
> > +   struct umh_wq_entry *entry;
> > +   unsigned long flags;
> > +
> > +   if (!token)
> > +   return khelper_wq;
> > +
> > +   if (nowait)
> > +   spin_lock_irqsave(&umh_wq_hash_lock, flags);
> > +   else
> > +   spin_lock(&umh_wq_hash_lock);
> > +   entry = umh_wq_find_entry(token);
> > +   if (nowait)
> > +   spin_unlock_irqrestore(&umh_wq_hash_lock, flags);
> > +   else
> > +   spin_unlock(&umh_wq_hash_lock);
> > +
> > +   return entry->wq;
> > +}
> > +
> > +/**
> > + * umh_wq_get_token - create service thread and return an identifying token
> > + * @token: token of an existing service thread or 0 to create a new
> > + *servic

< 1 2 3 4 5 6 7 8 >

301 - 400 of 724 matches

Mail list logo