[RFC 10/26] VFS white-out handling

2007-07-30 Thread Jan Blunck
Introduce white-out handling in the VFS.

Signed-off-by: Jan Blunck [EMAIL PROTECTED]
---
 fs/inode.c |   22 ++
 fs/namei.c |  417 +++--
 fs/readdir.c   |6 
 include/linux/fs.h |7 
 4 files changed, 441 insertions(+), 11 deletions(-)

--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1410,6 +1410,26 @@ void __init inode_init(unsigned long mem
INIT_HLIST_HEAD(inode_hashtable[loop]);
 }
 
+/*
+ * Dummy default file-operations:
+ * Never open a whiteout. This is always a bug.
+ */
+static int whiteout_no_open(struct inode *irrelevant, struct file *dontcare)
+{
+   printk(WARNING: at %s:%d %s(): Attempted to open a whiteout!\n,
+  __FILE__, __LINE__, __FUNCTION__);
+   /*
+* Nobody should ever be able to open a whiteout. On the other hand
+* this isn't fatal so let's just print a warning message.
+*/
+   WARN_ON(1);
+   return -ENXIO;
+}
+
+static struct file_operations def_wht_fops = {
+   .open   = whiteout_no_open,
+};
+
 void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
 {
inode-i_mode = mode;
@@ -1423,6 +1443,8 @@ void init_special_inode(struct inode *in
inode-i_fop = def_fifo_fops;
else if (S_ISSOCK(mode))
inode-i_fop = bad_sock_fops;
+   else if (S_ISWHT(mode))
+   inode-i_fop = def_wht_fops;
else
printk(KERN_DEBUG init_special_inode: bogus i_mode (%o)\n,
   mode);
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -887,7 +887,7 @@ static fastcall int __link_path_walk(con
 
err = -ENOENT;
inode = next.dentry-d_inode;
-   if (!inode)
+   if (!inode || S_ISWHT(inode-i_mode))
goto out_dput;
err = -ENOTDIR; 
if (!inode-i_op)
@@ -951,6 +951,8 @@ last_component:
err = -ENOENT;
if (!inode)
break;
+   if (S_ISWHT(inode-i_mode))
+   break;
if (lookup_flags  LOOKUP_DIRECTORY) {
err = -ENOTDIR; 
if (!inode-i_op || !inode-i_op-lookup)
@@ -1434,13 +1436,10 @@ static inline int check_sticky(struct in
  * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
  * nfs_async_unlink().
  */
-static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
+static int __may_delete(struct inode *dir, struct dentry *victim, int isdir)
 {
int error;
 
-   if (!victim-d_inode)
-   return -ENOENT;
-
BUG_ON(victim-d_parent-d_inode != dir);
audit_inode_child(victim-d_name.name, victim-d_inode, dir);
 
@@ -1466,6 +1465,14 @@ static int may_delete(struct inode *dir,
return 0;
 }
 
+static int may_delete(struct inode *dir, struct dentry *victim, int isdir)
+{
+   if (!victim-d_inode || S_ISWHT(victim-d_inode-i_mode))
+   return -ENOENT;
+
+   return __may_delete(dir, victim, isdir);
+}
+
 /* Check whether we can create an object with dentry child in directory
  *  dir.
  *  1. We can't do it if child already exists (open has special treatment for
@@ -1477,7 +1484,7 @@ static int may_delete(struct inode *dir,
 static inline int may_create(struct inode *dir, struct dentry *child,
 struct nameidata *nd)
 {
-   if (child-d_inode)
+   if (child-d_inode  !S_ISWHT(child-d_inode-i_mode))
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
@@ -1559,6 +1566,13 @@ int vfs_create(struct inode *dir, struct
error = security_inode_create(dir, dentry, mode);
if (error)
return error;
+
+   if (dentry-d_inode  S_ISWHT(dentry-d_inode-i_mode)) {
+   error = vfs_unlink_whiteout(dir, dentry);
+   if (error)
+   return error;
+   }
+
DQUOT_INIT(dir);
error = dir-i_op-create(dir, dentry, mode, nd);
if (!error)
@@ -1741,7 +1755,7 @@ do_last:
}
 
/* Negative dentry, just create the file */
-   if (!path.dentry-d_inode) {
+   if (!path.dentry-d_inode || S_ISWHT(path.dentry-d_inode-i_mode)) {
error = open_namei_create(nd, path, flag, mode);
if (error)
goto exit;
@@ -1903,6 +1917,12 @@ int vfs_mknod(struct inode *dir, struct 
if (error)
return error;
 
+   if (dentry-d_inode  S_ISWHT(dentry-d_inode-i_mode)) {
+   error = vfs_unlink_whiteout(dir, dentry);
+   if (error)
+   return error;
+   }
+
DQUOT_INIT(dir);
error = dir-i_op-mknod(dir, dentry, mode, dev);
if (!error)
@@ -1969,6 +1989,7 @@ asmlinkage long sys_mknod(const char __u
 int vfs_mkdir(struct inode *dir, struct dentry *dentry, int 

[RFC 17/26] union-mount: Drive the union cache via dcache

2007-07-30 Thread Jan Blunck
If a dentry is removed from dentry cache because its usage count drops to
zero, the references to the underlying layer of the unions the dentry is in
are dropped too. Therefore the union cache is driven by the dentry cache.

Signed-off-by: Jan Blunck [EMAIL PROTECTED]
---
 fs/dcache.c|8 +
 fs/union.c |   72 +
 include/linux/dcache.h |8 +
 include/linux/union.h  |6 
 4 files changed, 94 insertions(+)

--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -18,6 +18,7 @@
 #include linux/string.h
 #include linux/mm.h
 #include linux/fs.h
+#include linux/union.h
 #include linux/fsnotify.h
 #include linux/slab.h
 #include linux/init.h
@@ -142,11 +143,14 @@ static struct dentry *__d_kill(struct de
list_add(dentry-d_lru, list);
spin_unlock(dentry-d_lock);
spin_unlock(dcache_lock);
+   __shrink_d_unions(dentry, list);
return NULL;
}
 
/* drops the locks, at that point nobody can reach this dentry */
dentry_iput(dentry);
+   /* If the dentry was in an union delete them */
+   shrink_d_unions(dentry);
parent = dentry-d_parent;
d_free(dentry);
return dentry == parent ? NULL : parent;
@@ -721,6 +725,7 @@ static void shrink_dcache_for_umount_sub
iput(inode);
}
 
+   shrink_d_unions(dentry);
d_free(dentry);
 
/* finished when we fall off the top of the tree,
@@ -1464,7 +1469,9 @@ void d_delete(struct dentry * dentry)
spin_lock(dentry-d_lock);
isdir = S_ISDIR(dentry-d_inode-i_mode);
if (atomic_read(dentry-d_count) == 1) {
+   __d_drop_unions(dentry);
dentry_iput(dentry);
+   shrink_d_unions(dentry);
fsnotify_nameremove(dentry, isdir);
 
/* remove this and other inotify debug checks after 2.6.18 */
@@ -1478,6 +1485,7 @@ void d_delete(struct dentry * dentry)
spin_unlock(dentry-d_lock);
spin_unlock(dcache_lock);
 
+   shrink_d_unions(dentry);
fsnotify_nameremove(dentry, isdir);
 }
 
--- a/fs/union.c
+++ b/fs/union.c
@@ -258,6 +258,8 @@ int append_to_union(struct vfsmount *mnt
union_put(this);
return 0;
}
+   list_add(this-u_unions, dentry-d_unions);
+   dest_dentry-d_unionized++;
__union_hash(this);
spin_unlock(union_lock);
return 0;
@@ -333,3 +335,73 @@ int follow_union_mount(struct vfsmount *
 
return res;
 }
+
+/*
+ * This must be called when unhashing a dentry. This is called with dcache_lock
+ * and unhashes all unions this dentry is in.
+ */
+void __d_drop_unions(struct dentry *dentry)
+{
+   struct union_mount *this, *next;
+
+   spin_lock(union_lock);
+   list_for_each_entry_safe(this, next, dentry-d_unions, u_unions)
+   __union_unhash(this);
+   spin_unlock(union_lock);
+}
+
+/*
+ * This must be called after __d_drop_unions() without holding any locks.
+ * Note: The dentry might still be reachable via a lookup but at that time it
+ * is already a negative dentry. Otherwise it would be unhashed. The union_mount
+ * structure itself is still reachable through mnt-mnt_unions (which we
+ * protect against with union_lock).
+ */
+void shrink_d_unions(struct dentry *dentry)
+{
+   struct union_mount *this, *next;
+
+repeat:
+   spin_lock(union_lock);
+   list_for_each_entry_safe(this, next, dentry-d_unions, u_unions) {
+   BUG_ON(!hlist_unhashed(this-u_hash));
+   BUG_ON(!hlist_unhashed(this-u_rhash));
+   list_del(this-u_unions);
+   this-u_next.dentry-d_unionized--;
+   spin_unlock(union_lock);
+   union_put(this);
+   goto repeat;
+   }
+   spin_unlock(union_lock);
+}
+
+extern void __dput(struct dentry *, struct list_head *);
+
+/*
+ * This is the special variant for use in dput() only.
+ */
+void __shrink_d_unions(struct dentry *dentry, struct list_head *list)
+{
+   struct union_mount *this, *next;
+
+   BUG_ON(!d_unhashed(dentry));
+
+repeat:
+   spin_lock(union_lock);
+   list_for_each_entry_safe(this, next, dentry-d_unions, u_unions) {
+   struct dentry *n_dentry = this-u_next.dentry;
+   struct vfsmount *n_mnt = this-u_next.mnt;
+
+   BUG_ON(!hlist_unhashed(this-u_hash));
+   BUG_ON(!hlist_unhashed(this-u_rhash));
+   list_del(this-u_unions);
+   this-u_next.dentry-d_unionized--;
+   spin_unlock(union_lock);
+   if (__union_put(this)) {
+   __dput(n_dentry, list);
+   mntput(n_mnt);
+   }
+   goto repeat;
+   }
+   spin_unlock(union_lock);
+}
--- 

[RFC 01/26] [PATCH 14/18] shmem: convert to using splice instead of sendfile()

2007-07-30 Thread Jan Blunck
From: Hugh Dickins [EMAIL PROTECTED]

Remove shmem_file_sendfile and resurrect shmem_readpage, as used by tmpfs
to support loop and sendfile in 2.4 and 2.5.  Now tmpfs can support splice,
loop and sendfile in the simplest way, using generic_file_splice_read and
generic_file_splice_write (with the aid of shmem_prepare_write).

We could make some efficiency tweaks later, if there's a real need;
but this is stable and works well as is.

Signed-off-by: Hugh Dickins [EMAIL PROTECTED]
Signed-off-by: Jens Axboe [EMAIL PROTECTED]
---
 mm/shmem.c |   40 
 1 file changed, 16 insertions(+), 24 deletions(-)

--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1109,8 +1109,8 @@ static int shmem_getpage(struct inode *i
 * Normally, filepage is NULL on entry, and either found
 * uptodate immediately, or allocated and zeroed, or read
 * in under swappage, which is then assigned to filepage.
-* But shmem_write_begin passes in a locked filepage,
-* which may be found not uptodate by other callers too,
+* But shmem_readpage and shmem_write_begin pass in a locked
+* filepage, which may be found not uptodate by other callers too,
 * and may need to be copied from the swappage read in.
 */
 repeat:
@@ -1454,9 +1454,18 @@ static const struct inode_operations shm
 static const struct inode_operations shmem_symlink_inline_operations;
 
 /*
- * Normally tmpfs makes no use of shmem_write_begin, but it
- * lets a tmpfs file be used read-write below the loop driver.
+ * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin;
+ * but providing them allows a tmpfs file to be used for splice, sendfile, and
+ * below the loop driver, in the generic fashion that many filesystems support.
  */
+static int shmem_readpage(struct file *file, struct page *page)
+{
+   struct inode *inode = page-mapping-host;
+   int error = shmem_getpage(inode, page-index, page, SGP_CACHE, NULL);
+   unlock_page(page);
+   return error;
+}
+
 static int
 shmem_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
@@ -1701,25 +1710,6 @@ static ssize_t shmem_file_read(struct fi
return desc.error;
 }
 
-static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos,
-size_t count, read_actor_t actor, void *target)
-{
-   read_descriptor_t desc;
-
-   if (!count)
-   return 0;
-
-   desc.written = 0;
-   desc.count = count;
-   desc.arg.data = target;
-   desc.error = 0;
-
-   do_shmem_file_read(in_file, ppos, desc, actor);
-   if (desc.written)
-   return desc.written;
-   return desc.error;
-}
-
 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
struct shmem_sb_info *sbinfo = SHMEM_SB(dentry-d_sb);
@@ -2376,6 +2366,7 @@ static const struct address_space_operat
.writepage  = shmem_writepage,
.set_page_dirty = __set_page_dirty_no_writeback,
 #ifdef CONFIG_TMPFS
+   .readpage   = shmem_readpage,
.write_begin= shmem_write_begin,
.write_end  = shmem_write_end,
 #endif
@@ -2389,7 +2380,8 @@ static const struct file_operations shme
.read   = shmem_file_read,
.write  = shmem_file_write,
.fsync  = simple_sync_file,
-   .sendfile   = shmem_file_sendfile,
+   .splice_read= generic_file_splice_read,
+   .splice_write   = generic_file_splice_write,
 #endif
 };
 

-- 

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC 20/26] union-mount: Simple union-mount readdir implementation

2007-07-30 Thread Jan Blunck
This is a very simple union mount readdir implementation. It modifies the
readdir routine to merge the entries of union mounted directories and
eliminate duplicates while walking the union stack.

  FIXME:
  This patch needs to be reworked! At the moment this only works for ext2 and
  tmpfs. All kinds of index directories that return d_off > i_size don't work
  with this.

The directory entries are read starting from the top layer and they are
maintained in a cache. Subsequently when the entries from the bottom layers
of the union stack are read they are checked for duplicates (in the cache)
before being passed out to the user space. There can be multiple calls
to readdir/getdents routines for reading the entries of a single directory.
But the union directory cache is not maintained across these calls. Instead,
for every call, the previously read entries are re-read into the cache
and newly read entries are compared against these for duplicates before
they are returned to user space.

Signed-off-by: Jan Blunck [EMAIL PROTECTED]
Signed-off-by: Bharata B Rao [EMAIL PROTECTED]
---
 fs/readdir.c  |   11 -
 fs/union.c|  336 ++
 include/linux/union.h |   25 +++
 3 files changed, 364 insertions(+), 8 deletions(-)

--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -16,13 +16,14 @@
 #include linux/security.h
 #include linux/syscalls.h
 #include linux/unistd.h
+#include linux/union.h
 
 #include asm/uaccess.h
 
 int vfs_readdir(struct file *file, filldir_t filler, void *buf)
 {
-   struct inode *inode = file-f_path.dentry-d_inode;
int res = -ENOTDIR;
+
if (!file-f_op || !file-f_op-readdir)
goto out;
 
@@ -30,13 +31,7 @@ int vfs_readdir(struct file *file, filld
if (res)
goto out;
 
-   mutex_lock(inode-i_mutex);
-   res = -ENOENT;
-   if (!IS_DEADDIR(inode)) {
-   res = file-f_op-readdir(file, buf, filler);
-   file_accessed(file);
-   }
-   mutex_unlock(inode-i_mutex);
+   res = do_readdir(file, buf, filler);
 out:
return res;
 }
--- a/fs/union.c
+++ b/fs/union.c
@@ -18,6 +18,8 @@
 #include linux/hash.h
 #include linux/fs.h
 #include linux/union.h
+#include linux/module.h
+#include linux/file.h
 
 /*
  * This is borrowed from fs/inode.c. The hashtable for lookups. Somebody
@@ -462,3 +464,337 @@ void detach_mnt_union(struct vfsmount *m
union_put(um);
return;
 }
+
+
+/*
+ * Union mounts support for readdir.
+ */
+
+/* This is a copy from fs/readdir.c */
+struct getdents_callback {
+   struct linux_dirent __user *current_dir;
+   struct linux_dirent __user *previous;
+   int count;
+   int error;
+};
+
+/* The readdir union cache object */
+struct union_cache_entry {
+   struct list_head list;
+   struct qstr name;
+};
+
+static int union_cache_add_entry(struct list_head *list,
+const char *name, int namelen)
+{
+   struct union_cache_entry *this;
+   char *tmp_name;
+
+   this = kmalloc(sizeof(*this), GFP_KERNEL);
+   if (!this) {
+   printk(KERN_CRIT
+  union_cache_add_entry(): out of kernel memory\n);
+   return -ENOMEM;
+   }
+
+   tmp_name = kmalloc(namelen + 1, GFP_KERNEL);
+   if (!tmp_name) {
+   printk(KERN_CRIT
+  union_cache_add_entry(): out of kernel memory\n);
+   kfree(this);
+   return -ENOMEM;
+   }
+
+   this-name.name = tmp_name;
+   this-name.len = namelen;
+   this-name.hash = 0;
+   memcpy(tmp_name, name, namelen);
+   tmp_name[namelen] = 0;
+   INIT_LIST_HEAD(this-list);
+   list_add(this-list, list);
+   return 0;
+}
+
+static void union_cache_free(struct list_head *uc_list)
+{
+   struct list_head *p;
+   struct list_head *ptmp;
+   int count = 0;
+
+   list_for_each_safe(p, ptmp, uc_list) {
+   struct union_cache_entry *this;
+
+   this = list_entry(p, struct union_cache_entry, list);
+   list_del_init(this-list);
+   kfree(this-name.name);
+   kfree(this);
+   count++;
+   }
+   return;
+}
+
+static int union_cache_find_entry(struct list_head *uc_list,
+ const char *name, int namelen)
+{
+   struct union_cache_entry *p;
+   int ret = 0;
+
+   list_for_each_entry(p, uc_list, list) {
+   if (p-name.len != namelen)
+   continue;
+   if (strncmp(p-name.name, name, namelen) == 0) {
+   ret = 1;
+   break;
+   }
+   }
+
+   return ret;
+}
+
+/*
+ * There are four filldir() wrappers necessary for the union mount readdir
+ * implementation:
+ *
+ * - filldir_topmost(): fills the union's readdir cache and the user space
+ * buffer. This is 

[RFC 16/26] union-mount: Introduce union_mount structure

2007-07-30 Thread Jan Blunck
This patch adds the basic structures of VFS based union mounts. It is a new
implementation based on some of my old ideas that influenced Bharata B Rao
[EMAIL PROTECTED] who came up with the proposal to let the
union_mount struct only point to the next layer in the union stack. I rewrote
nearly all of the central patches around lookup and the dcache interaction.

Advantages of the new implementation:
- the new union stack is no longer tied directly to one dentry
- the union stack enables dentries to be part of more than one union
  (bind mounts)
- it is unnecessary to traverse the union stack when de/referencing a dentry
- caching of union stack information still driven by dentry cache

Signed-off-by: Jan Blunck [EMAIL PROTECTED]
---
 fs/Kconfig |8 +
 fs/Makefile|2 
 fs/dcache.c|4 
 fs/union.c |  335 +
 include/linux/dcache.h |9 +
 include/linux/union.h  |   61 
 6 files changed, 419 insertions(+)

--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -551,6 +551,14 @@ config INOTIFY_USER
 
  If unsure, say Y.
 
+config UNION_MOUNT
+   bool Union mount support (EXPERIMENTAL)
+   depends on EXPERIMENTAL
+   ---help---
+ If you say Y here, you will be able to mount file systems as
+ union mount stacks. This is a VFS based implementation and
+ should work with all file systems. If unsure, say N.
+
 config QUOTA
bool Quota support
help
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -49,6 +49,8 @@ obj-$(CONFIG_FS_POSIX_ACL)+= posix_acl.
 obj-$(CONFIG_NFS_COMMON)   += nfs_common/
 obj-$(CONFIG_GENERIC_ACL)  += generic_acl.o
 
+obj-$(CONFIG_UNION_MOUNT)  += union.o
+
 obj-$(CONFIG_QUOTA)+= dquot.o
 obj-$(CONFIG_QFMT_V1)  += quota_v1.o
 obj-$(CONFIG_QFMT_V2)  += quota_v2.o
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -985,6 +985,10 @@ struct dentry *d_alloc(struct dentry * p
 #ifdef CONFIG_PROFILING
dentry-d_cookie = NULL;
 #endif
+#ifdef CONFIG_UNION_MOUNT
+   INIT_LIST_HEAD(dentry-d_unions);
+   dentry-d_unionized = 0;
+#endif
INIT_HLIST_NODE(dentry-d_hash);
INIT_LIST_HEAD(dentry-d_lru);
INIT_LIST_HEAD(dentry-d_subdirs);
--- /dev/null
+++ b/fs/union.c
@@ -0,0 +1,335 @@
+/*
+ * VFS based union mount for Linux
+ *
+ * Copyright (C) 2004-2007 IBM Corporation, IBM Deutschland Entwicklung GmbH.
+ * Copyright (C) 2007 Novell Inc.
+ *
+ *   Author(s): Jan Blunck ([EMAIL PROTECTED])
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include linux/bootmem.h
+#include linux/init.h
+#include linux/types.h
+#include linux/hash.h
+#include linux/fs.h
+#include linux/union.h
+
+/*
+ * This is borrowed from fs/inode.c. The hashtable for lookups. Somebody
+ * should try to make this good - I've just made it work.
+ */
+static unsigned int union_hash_mask __read_mostly;
+static unsigned int union_hash_shift __read_mostly;
+static struct hlist_head *union_hashtable __read_mostly;
+static unsigned int union_rhash_mask __read_mostly;
+static unsigned int union_rhash_shift __read_mostly;
+static struct hlist_head *union_rhashtable __read_mostly;
+
+/*
+ * Locking Rules:
+ * - dcache_lock (for union_rlookup() only)
+ * - union_lock
+ */
+DEFINE_SPINLOCK(union_lock);
+
+static struct kmem_cache *union_cache __read_mostly;
+
+static unsigned long hash(struct dentry *dentry, struct vfsmount *mnt)
+{
+   unsigned long tmp;
+
+   tmp = ((unsigned long)mnt * (unsigned long)dentry) ^
+   (GOLDEN_RATIO_PRIME + (unsigned long)mnt) / L1_CACHE_BYTES;
+   tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME)  union_hash_shift);
+   return tmp  union_hash_mask;
+}
+
+static __initdata unsigned long union_hash_entries;
+
+static int __init set_union_hash_entries(char *str)
+{
+   if (!str)
+   return 0;
+   union_hash_entries = simple_strtoul(str, str, 0);
+   return 1;
+}
+
+__setup(union_hash_entries=, set_union_hash_entries);
+
+static int __init init_union(void)
+{
+   int loop;
+
+   union_cache = kmem_cache_create(union_mount,
+   sizeof(struct union_mount), 0,
+   SLAB_HWCACHE_ALIGN | SLAB_PANIC,
+   NULL, NULL);
+
+   union_hashtable = alloc_large_system_hash(Union-cache,
+ sizeof(struct hlist_head),
+ union_hash_entries,
+ 14,
+ 0,
+ union_hash_shift,
+ 

[RFC 18/26] union-mount: Changes to the namespace handling

2007-07-30 Thread Jan Blunck
Creates the proper struct union_mount when mounting something into a
union. If the topmost filesystem isn't capable of handling the white-out
filetype it can only be mounted read-only.

Signed-off-by: Jan Blunck [EMAIL PROTECTED]
---
 fs/namespace.c|   46 ++--
 fs/union.c|   57 ++
 include/linux/mount.h |3 ++
 include/linux/union.h |6 +
 4 files changed, 110 insertions(+), 2 deletions(-)

--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -25,6 +25,7 @@
 #include linux/security.h
 #include linux/mount.h
 #include linux/ramfs.h
+#include linux/union.h
 #include asm/uaccess.h
 #include asm/unistd.h
 #include pnode.h
@@ -68,6 +69,9 @@ struct vfsmount *alloc_vfsmnt(const char
INIT_LIST_HEAD(mnt-mnt_share);
INIT_LIST_HEAD(mnt-mnt_slave_list);
INIT_LIST_HEAD(mnt-mnt_slave);
+#ifdef CONFIG_UNION_MOUNT
+   INIT_LIST_HEAD(mnt-mnt_unions);
+#endif
if (name) {
int size = strlen(name) + 1;
char *newname = kmalloc(size, GFP_KERNEL);
@@ -157,6 +161,7 @@ static void __touch_mnt_namespace(struct
 
 static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd)
 {
+   detach_mnt_union(mnt);
old_nd-dentry = mnt-mnt_mountpoint;
old_nd-mnt = mnt-mnt_parent;
mnt-mnt_parent = mnt;
@@ -180,6 +185,7 @@ static void attach_mnt(struct vfsmount *
list_add_tail(mnt-mnt_hash, mount_hashtable +
hash(nd-mnt, nd-dentry));
list_add_tail(mnt-mnt_child, nd-mnt-mnt_mounts);
+   attach_mnt_union(mnt, nd-mnt, nd-dentry);
 }
 
 /*
@@ -202,6 +208,7 @@ static void commit_tree(struct vfsmount 
list_add_tail(mnt-mnt_hash, mount_hashtable +
hash(parent, mnt-mnt_mountpoint));
list_add_tail(mnt-mnt_child, parent-mnt_mounts);
+   attach_mnt_union(mnt, mnt-mnt_parent, mnt-mnt_mountpoint);
touch_mnt_namespace(n);
 }
 
@@ -577,6 +584,7 @@ void release_mounts(struct list_head *he
struct dentry *dentry;
struct vfsmount *m;
spin_lock(vfsmount_lock);
+   detach_mnt_union(mnt);
dentry = mnt-mnt_mountpoint;
m = mnt-mnt_parent;
mnt-mnt_mountpoint = mnt-mnt_root;
@@ -999,6 +1007,10 @@ static int do_change_type(struct nameida
if (nd-dentry != nd-mnt-mnt_root)
return -EINVAL;
 
+   /* Don't change the type of union mounts */
+   if (IS_MNT_UNION(nd-mnt))
+   return -EINVAL;
+
down_write(namespace_sem);
spin_lock(vfsmount_lock);
for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
@@ -1011,7 +1023,8 @@ static int do_change_type(struct nameida
 /*
  * do loopback mount.
  */
-static int do_loopback(struct nameidata *nd, char *old_name, int flags)
+static int do_loopback(struct nameidata *nd, char *old_name, int flags,
+  int mnt_flags)
 {
int clone_flags = 0;
uid_t owner = 0;
@@ -1049,6 +1062,18 @@ static int do_loopback(struct nameidata 
if (IS_ERR(mnt))
goto out;
 
+   /*
+* Unions couldn't be writable if the filesystem doesn't know about
+* whiteouts
+*/
+   err = -ENOTSUPP;
+   if ((mnt_flags  MNT_UNION) 
+   !(mnt-mnt_sb-s_flags  (MS_WHITEOUT|MS_RDONLY)))
+   goto out;
+
+   if (mnt_flags  MNT_UNION)
+   mnt-mnt_flags |= MNT_UNION;
+
err = graft_tree(mnt, nd);
if (err) {
LIST_HEAD(umount_list);
@@ -1121,6 +1146,13 @@ static int do_move_mount(struct nameidat
if (err)
return err;
 
+   /* moving to or from a union mount is not supported */
+   err = -EINVAL;
+   if (IS_MNT_UNION(nd-mnt))
+   goto exit;
+   if (IS_MNT_UNION(old_nd.mnt))
+   goto exit;
+
down_write(namespace_sem);
while (d_mountpoint(nd-dentry)  follow_down(nd-mnt, nd-dentry))
;
@@ -1176,6 +1208,7 @@ out:
up_write(namespace_sem);
if (!err)
path_release(parent_nd);
+exit:
path_release(old_nd);
return err;
 }
@@ -1253,6 +1286,15 @@ int do_add_mount(struct vfsmount *newmnt
if (S_ISLNK(newmnt-mnt_root-d_inode-i_mode))
goto unlock;
 
+   /*
+* Unions couldn't be writable if the filesystem doesn't know about
+* whiteouts
+*/
+   err = -ENOTSUPP;
+   if ((mnt_flags  MNT_UNION) 
+   !(newmnt-mnt_sb-s_flags  (MS_WHITEOUT|MS_RDONLY)))
+   goto unlock;
+
/* some flags may have been set earlier */
newmnt-mnt_flags |= mnt_flags;
if ((err = graft_tree(newmnt, nd)))
@@ -1579,7 +1621,7 @@ long do_mount(char 

[RFC 15/26] union-mount: Add union-mount mount flag

2007-07-30 Thread Jan Blunck
Introduce MNT_UNION and MS_UNION flags. You need additional patches for
util-linux for that to work.

Signed-off-by: Jan Blunck [EMAIL PROTECTED]
---
 fs/namespace.c|6 +-
 include/linux/fs.h|1 +
 include/linux/mount.h |1 +
 3 files changed, 7 insertions(+), 1 deletion(-)

--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -437,6 +437,7 @@ static int show_vfsmnt(struct seq_file *
{ MNT_NODIRATIME, ,nodiratime },
{ MNT_RELATIME, ,relatime },
{ MNT_NOMNT, ,nomnt },
+   { MNT_UNION, ,union },
{ 0, NULL }
};
struct proc_fs_info *fs_infop;
@@ -1558,9 +1559,12 @@ long do_mount(char *dev_name, char *dir_
mnt_flags |= MNT_RELATIME;
if (flags  MS_NOMNT)
mnt_flags |= MNT_NOMNT;
+   if (flags  MS_UNION)
+   mnt_flags |= MNT_UNION;
 
flags = ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
-  MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_NOMNT);
+  MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_NOMNT |
+  MS_UNION );
 
/* ... and get the mountpoint */
retval = path_lookup(dir_name, LOOKUP_FOLLOW, nd);
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -114,6 +114,7 @@ extern int dir_notify_enable;
 #define MS_REMOUNT 32  /* Alter flags of a mounted FS */
 #define MS_MANDLOCK64  /* Allow mandatory locks on an FS */
 #define MS_DIRSYNC 128 /* Directory modifications are synchronous */
+#define MS_UNION   256
 #define MS_NOATIME 1024/* Do not update access times. */
 #define MS_NODIRATIME  2048/* Do not update directory access times */
 #define MS_BIND4096
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -36,6 +36,7 @@ struct mnt_namespace;
 #define MNT_SHARED 0x1000  /* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE 0x2000  /* if the vfsmount is a unbindable mount */
 #define MNT_PNODE_MASK 0x3000  /* propagation flag mask */
+#define MNT_UNION  0x4000  /* if the vfsmount is a union mount */
 
 struct vfsmount {
struct list_head mnt_hash;

-- 

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC 26/26] union-mount: Debug code

2007-07-30 Thread Jan Blunck
Add debugging output (UM_DEBUG* macros) to the union-mount code paths.

Signed-off-by: Jan Blunck [EMAIL PROTECTED]
---
 fs/namei.c|   26 ++
 fs/union.c|   27 +++
 include/linux/namei.h |4 
 3 files changed, 57 insertions(+)

--- a/fs/namei.c
+++ b/fs/namei.c
@@ -32,6 +32,7 @@
 #include linux/fcntl.h
 #include linux/namei.h
 #include linux/union.h
+#include linux/union_debug.h
 #include asm/namei.h
 #include asm/uaccess.h
 
@@ -1794,11 +1795,15 @@ int hash_lookup_union(struct nameidata *
struct path safe = { .dentry = nd-dentry, .mnt = nd-mnt };
int res ;
 
+   UM_DEBUG_LOOKUP(name = \%*s\\n, name-len, name-name);
+
pathget(safe);
res = __hash_lookup_topmost(nd, name, path);
if (res)
goto out;
 
+   UM_DEBUG_LOOKUP_DENTRY(path-dentry);
+
/* only directories can be part of a union stack */
if (!path-dentry-d_inode ||
!S_ISDIR(path-dentry-d_inode-i_mode))
@@ -1813,6 +1818,7 @@ int hash_lookup_union(struct nameidata *
goto out;
}
 
+   UM_DEBUG_LOOKUP_DENTRY(path-dentry);
 out:
path_release(nd);
nd-dentry = safe.dentry;
@@ -2765,6 +2771,8 @@ out_freename:
kfree(name.name);
 out:
pathput(safe);
+   UM_DEBUG(err = %d\n, err);
+   UM_DEBUG_DENTRY(dentry);
return err;
 }
 
@@ -2802,6 +2810,9 @@ int vfs_unlink_whiteout(struct inode *di
}
mutex_unlock(dentry-d_inode-i_mutex);
 
+   UM_DEBUG(err = %d\n, error);
+   UM_DEBUG_DENTRY(dentry);
+
/*
 * We can call dentry_iput() since nobody could actually do something
 * useful with a whiteout. So dropping the reference to the inode
@@ -3490,6 +3501,10 @@ int vfs_rename_union(struct nameidata *o
struct dentry *dentry;
int error;
 
+   UM_DEBUG_DENTRY(old-dentry);
+   UM_DEBUG_DENTRY(new-dentry);
+/* return -EPERM; */
+
if (old-dentry-d_inode == new-dentry-d_inode)
return 0;
 
@@ -3530,6 +3545,9 @@ int vfs_rename_union(struct nameidata *o
 
/* possibly delete the existing new file */
if ((newnd-dentry == new-dentry-d_parent)  new-dentry-d_inode) {
+   UM_DEBUG(unlink:\n);
+   UM_DEBUG_DENTRY(new-dentry);
+
/* FIXME: inode may be truncated while we hold a lock */
error = vfs_unlink(new_dir, new-dentry);
if (error)
@@ -3540,6 +3558,9 @@ int vfs_rename_union(struct nameidata *o
if (IS_ERR(dentry))
goto freename;
 
+   UM_DEBUG(new target:\n);
+   UM_DEBUG_DENTRY(new-dentry);
+
dput(new-dentry);
new-dentry = dentry;
}
@@ -3554,6 +3575,10 @@ int vfs_rename_union(struct nameidata *o
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto freename;
+
+   UM_DEBUG(whiteout:\n);
+   UM_DEBUG_DENTRY(dentry);
+
error = vfs_whiteout(old_dir, dentry);
dput(dentry);
 
@@ -3567,6 +3592,7 @@ int vfs_rename_union(struct nameidata *o
 */
 freename:
kfree(old_name.name);
+   UM_DEBUG(err = %d\n, error);
return error;
 }
 
--- a/fs/union.c
+++ b/fs/union.c
@@ -18,6 +18,7 @@
 #include linux/hash.h
 #include linux/fs.h
 #include linux/union.h
+#include linux/union_debug.h
 #include linux/module.h
 #include linux/file.h
 #include linux/mm.h
@@ -253,6 +254,9 @@ int append_to_union(struct vfsmount *mnt
 
BUG_ON(!IS_MNT_UNION(mnt));
 
+   UM_DEBUG_DENTRY(dentry);
+   UM_DEBUG_DENTRY(dest_dentry);
+
this = union_alloc(dentry, mnt, dest_dentry, dest_mnt);
if (!this)
return -ENOMEM;
@@ -822,6 +826,8 @@ int union_relookup_topmost(struct nameid
char *kbuf, *name;
struct nameidata this;
 
+   UM_DEBUG_DENTRY(nd-dentry);
+
kbuf = (char *)__get_free_page(GFP_KERNEL);
if (!kbuf)
return -ENOMEM;
@@ -838,6 +844,7 @@ int union_relookup_topmost(struct nameid
path_release(nd);
nd-dentry = this.dentry;
nd-mnt = this.mnt;
+   UM_DEBUG_DENTRY(nd-dentry);
 
/*
 * the nd-flags should be unchanged
@@ -846,6 +853,7 @@ int union_relookup_topmost(struct nameid
nd-um_flags = ~LAST_LOWLEVEL;
  free_page:
free_page((unsigned long)kbuf);
+   UM_DEBUG(err = %d\n, err);
return err;
 }
 
@@ -895,6 +903,8 @@ struct dentry *union_create_topmost(stru
if (IS_ERR(dentry))
goto out_unlock;
 
+   UM_DEBUG_DENTRY(dentry);
+
switch (mode  S_IFMT) {
case S_IFREG:
/*
@@ -916,6 +926,9 @@ struct dentry *union_create_topmost(stru
dentry = ERR_PTR(res);
goto out_unlock;
}
+
+   UM_DEBUG_DENTRY(dentry);
+
break;
case S_IFDIR:
res = 

[RFC 19/26] union-mount: Make lookup work for union-mounted file systems

2007-07-30 Thread Jan Blunck
On union-mounted file systems the lookup function must also visit lower layers
of the union-stack when doing a lookup. This patch adds support for
union-mounts to cached lookups and real lookups.

We have 3 different styles of lookup functions now:
- multiple pathname components, follow mounts, follow union, follow symlinks
- single pathname component, doesn't follow mounts, follow union, doesn't
  follow symlinks
- single pathname component doesn't follow mounts, doesn't follow unions,
  doesn't follow symlinks

Signed-off-by: Jan Blunck [EMAIL PROTECTED]
---
 fs/namei.c|  467 +-
 include/linux/namei.h |6 
 2 files changed, 465 insertions(+), 8 deletions(-)

--- a/fs/namei.c
+++ b/fs/namei.c
@@ -31,6 +31,7 @@
 #include linux/file.h
 #include linux/fcntl.h
 #include linux/namei.h
+#include linux/union.h
 #include asm/namei.h
 #include asm/uaccess.h
 
@@ -415,6 +416,167 @@ static struct dentry *cache_lookup(struc
 }
 
 /*
+ * cache_lookup_topmost - lookup the topmost (non-)negative dentry
+ *
+ * This is used for union mount lookups from dcache. The first non-negative
+ * dentry is searched on all layers of the union stack. Otherwise the topmost
+ * negative dentry is returned.
+ */
+static int __cache_lookup_topmost(struct nameidata *nd, struct qstr *name,
+ struct path *path)
+{
+   struct dentry *dentry;
+
+   dentry = d_lookup(nd-dentry, name);
+   if (dentry  dentry-d_op  dentry-d_op-d_revalidate)
+   dentry = do_revalidate(dentry, nd);
+
+   /*
+* Remember the topmost negative dentry in case we don't find anything
+*/
+   path-dentry = dentry;
+   path-mnt = dentry ? nd-mnt : NULL;
+
+   if (!dentry || dentry-d_inode)
+   return !dentry;
+
+   /* look for the first non-negative dentry */
+
+   while (follow_union_down(nd-mnt, nd-dentry)) {
+   dentry = d_hash_and_lookup(nd-dentry, name);
+
+   /*
+* If parts of the union stack are not in the dcache we need
+* to do a real lookup
+*/
+   if (!dentry)
+   goto out_dput;
+
+   /*
+* If parts of the union don't survive the revalidation we
+* need to do a real lookup
+*/
+   if (dentry-d_op  dentry-d_op-d_revalidate) {
+   dentry = do_revalidate(dentry, nd);
+   if (!dentry)
+   goto out_dput;
+   }
+
+   if (dentry-d_inode)
+   goto out_dput;
+
+   dput(dentry);
+   }
+
+   return !dentry;
+
+out_dput:
+   dput(path-dentry);
+   path-dentry = dentry;
+   path-mnt = dentry ? mntget(nd-mnt) : NULL;
+   return !dentry;
+}
+
+/*
+ * cache_lookup_union - lookup the rest of the union stack
+ *
+ * This is called after you have the topmost dentry in @path.
+ */
+static int __cache_lookup_union(struct nameidata *nd, struct qstr *name,
+   struct path *path)
+{
+   struct path last = *path;
+   struct dentry *dentry;
+
+   while (follow_union_down(nd-mnt, nd-dentry)) {
+   dentry = d_hash_and_lookup(nd-dentry, name);
+   if (!dentry)
+   return 1;
+
+   if (dentry-d_op  dentry-d_op-d_revalidate) {
+   dentry = do_revalidate(dentry, nd);
+   if (!dentry)
+   return 1;
+   }
+
+   if (!dentry-d_inode) {
+   dput(dentry);
+   continue;
+   }
+
+   /* only directories can be part of a union stack */
+   if (!S_ISDIR(dentry-d_inode-i_mode)) {
+   dput(dentry);
+   break;
+   }
+
+   /* now we know we found something real  */
+   append_to_union(last.mnt, last.dentry, nd-mnt, dentry);
+
+   if (last.dentry != path-dentry)
+   pathput(last);
+   last.dentry = dentry;
+   last.mnt = mntget(nd-mnt);
+   }
+
+   if (last.dentry != path-dentry)
+   pathput(last);
+
+   return 0;
+}
+
+/*
+ * cache_lookup - lookup a single pathname part from dcache
+ *
+ * This is a union mount capable version of what d_lookup()  revalidate()
+ * would do. This function returns a valid (union) dentry on success.
+ *
+ * Remember: On failure it means that parts of the union aren't cached. You
+ * should call real_lookup() afterwards to find the proper (union) dentry.
+ */
+static int cache_lookup_union(struct nameidata *nd, struct qstr *name,
+ struct path *path)
+{
+   int res ;
+
+   if (!IS_MNT_UNION(nd-mnt)) {
+   path-dentry = 

[RFC 21/26] union-mount: in-kernel file copy between union mounted filesystems

2007-07-30 Thread Jan Blunck
This patch introduces in-kernel file copy between union mounted
filesystems. When a file is opened for writing but resides on a lower (thus
read-only) layer of the union stack it is copied to the topmost union layer
first.

This patch uses do_splice() to do the in-kernel file copy.

Signed-off-by: Bharata B Rao [EMAIL PROTECTED]
Signed-off-by: Jan Blunck [EMAIL PROTECTED]
---
 fs/namei.c|   73 ++-
 fs/union.c|  312 ++
 include/linux/union.h |9 +
 3 files changed, 389 insertions(+), 5 deletions(-)

--- a/fs/namei.c
+++ b/fs/namei.c
@@ -994,7 +994,7 @@ static int __follow_mount(struct path *p
return res;
 }
 
-static void follow_mount(struct vfsmount **mnt, struct dentry **dentry)
+void follow_mount(struct vfsmount **mnt, struct dentry **dentry)
 {
while (d_mountpoint(*dentry)) {
struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
@@ -1213,6 +1213,21 @@ static fastcall int __link_path_walk(con
if (err)
break;
 
+   if ((nd-flags  LOOKUP_TOPMOST) 
+   (nd-um_flags  LAST_LOWLEVEL)) {
+   struct dentry *dentry;
+
+   dentry = union_create_topmost(nd, this, next);
+   if (IS_ERR(dentry)) {
+   err = PTR_ERR(dentry);
+   goto out_dput;
+   }
+   dput_path(next, nd);
+   next.mnt = nd-mnt;
+   next.dentry = dentry;
+   nd-um_flags = ~LAST_LOWLEVEL;
+   }
+
err = -ENOENT;
inode = next.dentry-d_inode;
if (!inode || S_ISWHT(inode-i_mode))
@@ -1267,6 +1282,22 @@ last_component:
err = do_lookup(nd, this, next);
if (err)
break;
+
+   if ((nd-flags  LOOKUP_TOPMOST) 
+   (nd-um_flags  LAST_LOWLEVEL)) {
+   struct dentry *dentry;
+
+   dentry = union_create_topmost(nd, this, next);
+   if (IS_ERR(dentry)) {
+   err = PTR_ERR(dentry);
+   goto out_dput;
+   }
+   dput_path(next, nd);
+   next.mnt = nd-mnt;
+   next.dentry = dentry;
+   nd-um_flags = ~LAST_LOWLEVEL;
+   }
+
inode = next.dentry-d_inode;
if ((lookup_flags  LOOKUP_FOLLOW)
 inode  inode-i_op  inode-i_op-follow_link) {
@@ -1755,7 +1786,7 @@ out:
return err;
 }
 
-static int hash_lookup_union(struct nameidata *nd, struct qstr *name,
+int hash_lookup_union(struct nameidata *nd, struct qstr *name,
 struct path *path)
 {
struct path safe = { .dentry = nd-dentry, .mnt = nd-mnt };
@@ -2169,6 +2200,11 @@ int open_namei(int dfd, const char *path
 nd, flag);
if (error)
return error;
+   if (flag  FMODE_WRITE) {
+   error = union_copyup(nd, flag);
+   if (error)
+   return error;
+   }
goto ok;
}
 
@@ -2188,6 +2224,16 @@ int open_namei(int dfd, const char *path
if (nd-last_type != LAST_NORM || nd-last.name[nd-last.len])
goto exit;
 
+   /*
+* If this dentry is on an union mount we need the topmost dentry here.
+* This creates all topmost directories on the path to this dentry too.
+*/
+   if (is_unionized(nd-dentry, nd-mnt)) {
+   error = union_relookup_topmost(nd, nd-flags  ~LOOKUP_PARENT);
+   if (error)
+   goto exit;
+   }
+
dir = nd-dentry;
nd-flags = ~LOOKUP_PARENT;
mutex_lock(dir-d_inode-i_mutex);
@@ -2235,10 +2281,21 @@ do_last:
if (path.dentry-d_inode-i_op  
path.dentry-d_inode-i_op-follow_link)
goto do_link;
 
-   path_to_nameidata(path, nd);
error = -EISDIR;
if (path.dentry-d_inode  S_ISDIR(path.dentry-d_inode-i_mode))
-   goto exit;
+   goto exit_dput;
+
+   /*
+* If this file is on a lower layer of the union stack, copy it to the
+* topmost layer before opening it
+*/
+   if (path.dentry-d_inode  (path.dentry-d_parent != dir)) {
+   error = __union_copyup(path, nd, path);
+   if (error)
+   goto exit_dput;
+   }
+
+   path_to_nameidata(path, nd);
 ok:
error = may_open(nd, acc_mode, flag);
if (error)
@@ -3437,9 +3494,15 @@ static int do_rename(int olddfd, const c
error = -ENOTEMPTY;
if (new.dentry == 

[RFC 25/26] union-mount: Debug Infrastructure

2007-07-30 Thread Jan Blunck
This adds a debugfs/relay-based debugging infrastructure that is helpful when
developing the union-mount code itself. The debugging output can be enabled
at runtime by:

 echo 1 > /proc/sys/fs/union-debug

This registers the relayfs files where the debug code is writing its output
to. There are different levels of debugging output available which can be ORed
together. For the valid sysctl values see include/linux/union_debug.h.

Signed-off-by: Jan Blunck [EMAIL PROTECTED]
---
 include/linux/union_debug.h |   91 ++
 lib/Kconfig.debug   |9 +
 lib/Makefile|2 
 lib/union_debug.c   |  268 
 4 files changed, 370 insertions(+)

--- /dev/null
+++ b/include/linux/union_debug.h
@@ -0,0 +1,91 @@
+/*
+ * VFS based union mount for Linux
+ *
+ * Copyright (C) 2004-2007 IBM Corporation, IBM Deutschland Entwicklung GmbH.
+ * Copyright (C) 2007 Novell Inc.
+ *   Author(s): Jan Blunck ([EMAIL PROTECTED])
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+#ifndef __LINUX_UNION_DEBUG_H
+#define __LINUX_UNION_DEBUG_H
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_DEBUG_UNION_MOUNT
+
+#include linux/sched.h
+
+/* This is taken from klog debugging facility */
+extern void klog(const void *data, int len);
+extern void klog_printk(const char *fmt, ...);
+extern void klog_printk_dentry(const char *func, struct dentry *dentry);
+
+extern int sysctl_union_debug;
+
+#define UNION_MOUNT_DEBUG  1
+#define UNION_MOUNT_DEBUG_DCACHE   2
+#define UNION_MOUNT_DEBUG_LOCK 4
+#define UNION_MOUNT_DEBUG_READDIR  8
+#define UNION_MOUNT_DEBUG_LOOKUP   16
+
+#define UM_DEBUG(fmt, args...) \
+do {   \
+   if (sysctl_union_debug  UNION_MOUNT_DEBUG) \
+   klog_printk(%s:  fmt, __FUNCTION__, ## args); \
+} while (0)
+#define UM_DEBUG_DENTRY(dentry)
\
+do {   \
+   if (sysctl_union_debug  UNION_MOUNT_DEBUG) \
+   klog_printk_dentry(__FUNCTION__, (dentry)); \
+} while (0)
+#define UM_DEBUG_DCACHE(fmt, args...)  \
+do {   \
+   if (sysctl_union_debug  UNION_MOUNT_DEBUG_DCACHE)  \
+   klog_printk(%s:  fmt, __FUNCTION__, ## args); \
+} while (0)
+#define UM_DEBUG_DCACHE_DENTRY(dentry) \
+do {   \
+   if (sysctl_union_debug  UNION_MOUNT_DEBUG_DCACHE)  \
+   klog_printk_dentry(__FUNCTION__, (dentry)); \
+} while (0)
+#define UM_DEBUG_LOCK(fmt, args...)\
+do {   \
+   if (sysctl_union_debug  UNION_MOUNT_DEBUG_LOCK)\
+   klog_printk(%s:  fmt, __FUNCTION__, ## args); \
+} while (0)
+#define UM_DEBUG_READDIR(fmt, args...) \
+do {   \
+   if (sysctl_union_debug  UNION_MOUNT_DEBUG_READDIR) \
+   klog_printk(%s:  fmt, __FUNCTION__, ## args); \
+} while (0)
+#define UM_DEBUG_LOOKUP(fmt, args...)  \
+do {   \
+   if (sysctl_union_debug  UNION_MOUNT_DEBUG_LOOKUP)  \
+   klog_printk(%s:  fmt, __FUNCTION__, ## args); \
+} while (0)
+#define UM_DEBUG_LOOKUP_DENTRY(dentry) \
+do {   \
+   if (sysctl_union_debug  UNION_MOUNT_DEBUG_LOOKUP)  \
+   klog_printk_dentry(__FUNCTION__, (dentry)); \
+} while (0)
+
+#else  /* CONFIG_DEBUG_UNION_MOUNT */
+
+#define UM_DEBUG(fmt, args...) do { /* empty */ } while (0)
+#define UM_DEBUG_DENTRY(fmt, args...)  do { /* empty */ } while (0)
+#define UM_DEBUG_DCACHE(fmt, args...)  do { /* empty */ } while (0)
+#define UM_DEBUG_DCACHE_DENTRY(fmt, args...)   do { /* empty */ } while (0)
+#define UM_DEBUG_LOCK(fmt, args...)do { /* empty */ } while (0)
+#define UM_DEBUG_READDIR(fmt, args...) do { /* empty */ } while (0)
+#define UM_DEBUG_LOOKUP_DENTRY(fmt, args...)   do { /* empty */ } while (0)
+#define UM_DEBUG_LOOKUP_DENTRY(fmt, args...)   do { /* empty */ } while 

[RFC 06/26] VFS: Make real_lookup() return a struct path

2007-07-30 Thread Jan Blunck
This patch changes real_lookup() into returning a struct path.

Signed-off-by: Jan Blunck [EMAIL PROTECTED]
---
 fs/namei.c |   77 ++---
 1 file changed, 48 insertions(+), 29 deletions(-)

--- a/fs/namei.c
+++ b/fs/namei.c
@@ -462,10 +462,11 @@ ok:
  * make sure that nobody added the entry to the dcache in the meantime..
  * SMP-safe
  */
-static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, 
struct nameidata *nd)
+static int real_lookup(struct nameidata *nd, struct qstr *name,
+  struct path *path)
 {
-   struct dentry * result;
-   struct inode *dir = parent-d_inode;
+   struct inode *dir = nd-dentry-d_inode;
+   int res = 0;
 
mutex_lock(dir-i_mutex);
/*
@@ -482,19 +483,27 @@ static struct dentry * real_lookup(struc
 *
 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
 */
-   result = d_lookup(parent, name);
-   if (!result) {
-   struct dentry * dentry = d_alloc(parent, name);
-   result = ERR_PTR(-ENOMEM);
+   path-dentry = d_lookup(nd-dentry, name);
+   path-mnt = nd-mnt;
+   if (!path-dentry) {
+   struct dentry *dentry = d_alloc(nd-dentry, name);
if (dentry) {
-   result = dir-i_op-lookup(dir, dentry, nd);
-   if (result)
+   path-dentry = dir-i_op-lookup(dir, dentry, nd);
+   if (path-dentry) {
dput(dentry);
-   else
-   result = dentry;
+   if (IS_ERR(path-dentry)) {
+   res = PTR_ERR(path-dentry);
+   path-dentry = NULL;
+   path-mnt = NULL;
+   }
+   } else
+   path-dentry = dentry;
+   } else {
+   res = -ENOMEM;
+   path-mnt = NULL;
}
mutex_unlock(dir-i_mutex);
-   return result;
+   return res;
}
 
/*
@@ -502,12 +511,20 @@ static struct dentry * real_lookup(struc
 * we waited on the semaphore. Need to revalidate.
 */
mutex_unlock(dir-i_mutex);
-   if (result-d_op  result-d_op-d_revalidate) {
-   result = do_revalidate(result, nd);
-   if (!result)
-   result = ERR_PTR(-ENOENT);
+   if (path-dentry-d_op  path-dentry-d_op-d_revalidate) {
+   path-dentry = do_revalidate(path-dentry, nd);
+   if (!path-dentry) {
+   res = -ENOENT;
+   path-mnt = NULL;
+   }
+   if (IS_ERR(path-dentry)) {
+   res = PTR_ERR(path-dentry);
+   path-dentry = NULL;
+   path-mnt = NULL;
+   }
}
-   return result;
+
+   return res;
 }
 
 static int __emul_lookup_dentry(const char *, struct nameidata *);
@@ -748,35 +765,37 @@ static __always_inline void follow_dotdo
 static int do_lookup(struct nameidata *nd, struct qstr *name,
 struct path *path)
 {
-   struct vfsmount *mnt = nd-mnt;
-   struct dentry *dentry = __d_lookup(nd-dentry, name);
+   int err;
 
-   if (!dentry)
+   path-dentry = __d_lookup(nd-dentry, name);
+   path-mnt = nd-mnt;
+   if (!path-dentry)
goto need_lookup;
-   if (dentry-d_op  dentry-d_op-d_revalidate)
+   if (path-dentry-d_op  path-dentry-d_op-d_revalidate)
goto need_revalidate;
+
 done:
-   path-mnt = mnt;
-   path-dentry = dentry;
__follow_mount(path);
return 0;
 
 need_lookup:
-   dentry = real_lookup(nd-dentry, name, nd);
-   if (IS_ERR(dentry))
+   err = real_lookup(nd, name, path);
+   if (err)
goto fail;
goto done;
 
 need_revalidate:
-   dentry = do_revalidate(dentry, nd);
-   if (!dentry)
+   path-dentry = do_revalidate(path-dentry, nd);
+   if (!path-dentry)
goto need_lookup;
-   if (IS_ERR(dentry))
+   if (IS_ERR(path-dentry)) {
+   err = PTR_ERR(path-dentry);
goto fail;
+   }
goto done;
 
 fail:
-   return PTR_ERR(dentry);
+   return err;
 }
 
 /*

-- 

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC 04/26] VFS: Make lookup_create() return a struct path

2007-07-30 Thread Jan Blunck
This patch changes lookup_create() into returning a struct path.

Signed-off-by: Jan Blunck [EMAIL PROTECTED]
---
 arch/powerpc/platforms/cell/spufs/inode.c |   15 ++
 fs/namei.c|   75 +-
 include/linux/dcache.h|1 
 include/linux/namei.h |1 
 net/unix/af_unix.c|   17 +++---
 5 files changed, 50 insertions(+), 59 deletions(-)

--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -456,7 +456,7 @@ static struct file_system_type spufs_typ
 
 long spufs_create(struct nameidata *nd, unsigned int flags, mode_t mode)
 {
-   struct dentry *dentry;
+   struct path path;
int ret;
 
ret = -EINVAL;
@@ -475,26 +475,25 @@ long spufs_create(struct nameidata *nd, 
goto out;
}
 
-   dentry = lookup_create(nd, 1);
-   ret = PTR_ERR(dentry);
-   if (IS_ERR(dentry))
+   ret = lookup_create(nd, 1, path);
+   if (ret)
goto out_dir;
 
ret = -EEXIST;
-   if (dentry-d_inode)
+   if (path.dentry-d_inode)
goto out_dput;
 
mode = ~current-fs-umask;
 
if (flags  SPU_CREATE_GANG)
return spufs_create_gang(nd-dentry-d_inode,
-   dentry, nd-mnt, mode);
+path.dentry, path.mnt, mode);
else
return spufs_create_context(nd-dentry-d_inode,
-   dentry, nd-mnt, flags, mode);
+   path.dentry, path.mnt, flags, mode);
 
 out_dput:
-   dput(dentry);
+   dput_path(path, nd);
 out_dir:
mutex_unlock(nd-dentry-d_inode-i_mutex);
 out:
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1833,10 +1833,9 @@ do_link:
  *
  * Returns with nd-dentry-d_inode-i_mutex locked.
  */
-struct dentry *lookup_create(struct nameidata *nd, int is_dir)
+int lookup_create(struct nameidata *nd, int is_dir, struct path *path)
 {
-   struct path path = { .dentry = ERR_PTR(-EEXIST) } ;
-   int err;
+   int err = -EEXIST;
 
mutex_lock_nested(nd-dentry-d_inode-i_mutex, I_MUTEX_PARENT);
/*
@@ -1852,11 +1851,9 @@ struct dentry *lookup_create(struct name
/*
 * Do the final lookup.
 */
-   err = lookup_hash(nd, nd-last, path);
-   if (err) {
-   path.dentry = ERR_PTR(err);
+   err = lookup_hash(nd, nd-last, path);
+   if (err)
goto fail;
-   }
 
/*
 * Special case - lookup gave negative, but... we had foo/bar/
@@ -1864,16 +1861,14 @@ struct dentry *lookup_create(struct name
 * all is fine. Let's be bastards - you had / on the end, you've
 * been asking for (non-existent) directory. -ENOENT for you.
 */
-   if (!is_dir  nd-last.name[nd-last.len]  !path.dentry-d_inode)
+   if (!is_dir  nd-last.name[nd-last.len]  !path-dentry-d_inode)
goto enoent;
-   if (nd-mnt != path.mnt)
-   mntput(path.mnt);
-   return path.dentry;
+   return 0;
 enoent:
-   dput_path(path, nd);
-   path.dentry = ERR_PTR(-ENOENT);
+   dput_path(path, nd);
+   err = -ENOENT;
 fail:
-   return path.dentry;
+   return err;
 }
 EXPORT_SYMBOL_GPL(lookup_create);
 
@@ -1906,7 +1901,7 @@ asmlinkage long sys_mknodat(int dfd, con
 {
int error = 0;
char * tmp;
-   struct dentry * dentry;
+   struct path path;
struct nameidata nd;
 
if (S_ISDIR(mode))
@@ -1918,22 +1913,23 @@ asmlinkage long sys_mknodat(int dfd, con
error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, nd);
if (error)
goto out;
-   dentry = lookup_create(nd, 0);
-   error = PTR_ERR(dentry);
+   error = lookup_create(nd, 0, path);
 
if (!IS_POSIXACL(nd.dentry-d_inode))
mode = ~current-fs-umask;
-   if (!IS_ERR(dentry)) {
+   if (!error) {
switch (mode  S_IFMT) {
case 0: case S_IFREG:
-   error = vfs_create(nd.dentry-d_inode,dentry,mode,nd);
+   error = vfs_create(nd.dentry-d_inode, path.dentry,
+  mode, nd);
break;
case S_IFCHR: case S_IFBLK:
-   error = vfs_mknod(nd.dentry-d_inode,dentry,mode,
-   new_decode_dev(dev));
+   error = vfs_mknod(nd.dentry-d_inode, path.dentry,
+ mode, new_decode_dev(dev));
break;
case S_IFIFO: case S_IFSOCK:
-   error = vfs_mknod(nd.dentry-d_inode,dentry,mode,0);
+   error = vfs_mknod(nd.dentry-d_inode, path.dentry,
+ mode, 0);
  

[RFC 23/26] union-mount: copyup on rename

2007-07-30 Thread Jan Blunck
Add copyup renaming of regular files on union mounts. Directories are still
lazily copied with the help of user-space.

Signed-off-by: Jan Blunck [EMAIL PROTECTED]
---
 fs/namei.c |  133 -
 fs/union.c |8 ++-
 2 files changed, 129 insertions(+), 12 deletions(-)

--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1491,6 +1491,8 @@ static int fastcall do_path_lookup(int d
nd-mnt = mntget(fs-pwdmnt);
nd-dentry = dget(fs-pwd);
read_unlock(fs-lock);
+   /* Force a union_relookup() */
+   nd-um_flags = LAST_LOWLEVEL;
} else {
struct dentry *dentry;
 
@@ -3478,6 +3480,97 @@ int vfs_rename(struct inode *old_dir, st
return error;
 }
 
+int vfs_rename_union(struct nameidata *oldnd, struct path *old,
+struct nameidata *newnd, struct path *new)
+{
+   struct inode *old_dir = oldnd-dentry-d_inode;
+   struct inode *new_dir = newnd-dentry-d_inode;
+   struct qstr old_name;
+   char *name;
+   struct dentry *dentry;
+   int error;
+
+   if (old-dentry-d_inode == new-dentry-d_inode)
+   return 0;
+
+   error = may_whiteout(old-dentry, 0);
+   if (error)
+   return error;
+   if (!old_dir-i_op || !old_dir-i_op-whiteout)
+   return -EPERM;
+
+   if (!new-dentry-d_inode)
+   error = may_create(new_dir, new-dentry, NULL);
+   else
+   error = may_delete(new_dir, new-dentry, 0);
+   if (error)
+   return error;
+
+   DQUOT_INIT(old_dir);
+   DQUOT_INIT(new_dir);
+
+   error = security_inode_rename(old_dir, old-dentry,
+ new_dir, new-dentry);
+   if (error)
+   return error;
+
+   error = -EBUSY;
+   if (d_mountpoint(old-dentry) || d_mountpoint(new-dentry))
+   return error;
+
+   error = -ENOMEM;
+   name = kmalloc(old-dentry-d_name.len, GFP_KERNEL);
+   if (!name)
+   return error;
+   strncpy(name, old-dentry-d_name.name, old-dentry-d_name.len);
+   name[old-dentry-d_name.len] = 0;
+   old_name.len = old-dentry-d_name.len;
+   old_name.hash = old-dentry-d_name.hash;
+   old_name.name = name;
+
+   /* possibly delete the existing new file */
+   if ((newnd-dentry == new-dentry-d_parent)  new-dentry-d_inode) {
+   /* FIXME: inode may be truncated while we hold a lock */
+   error = vfs_unlink(new_dir, new-dentry);
+   if (error)
+   goto freename;
+
+   dentry = __lookup_hash_kern(new-dentry-d_name,
+   newnd-dentry, newnd);
+   if (IS_ERR(dentry))
+   goto freename;
+
+   dput(new-dentry);
+   new-dentry = dentry;
+   }
+
+   /* copyup to the new file */
+   error = __union_copyup(old, newnd, new);
+   if (error)
+   goto freename;
+
+   /* whiteout the old file */
+   dentry = __lookup_hash_kern(old_name, oldnd-dentry, oldnd);
+   error = PTR_ERR(dentry);
+   if (IS_ERR(dentry))
+   goto freename;
+   error = vfs_whiteout(old_dir, dentry);
+   dput(dentry);
+
+   /* FIXME: This is actually unlink()  create() ... */
+/*
+   if (!error) {
+   const char *new_name = old_dentry-d_name.name;
+   fsnotify_move(old_dir, new_dir, old_name.name, new_name, 0,
+ new_dentry-d_inode, old_dentry-d_inode);
+   }
+*/
+freename:
+   kfree(old_name.name);
+   return error;
+}
+
+
 static int do_rename(int olddfd, const char *oldname,
int newdfd, const char *newname)
 {
@@ -3495,10 +3588,7 @@ static int do_rename(int olddfd, const c
if (error)
goto exit1;
 
-   error = -EXDEV;
-   if (oldnd.mnt != newnd.mnt)
-   goto exit2;
-
+lock:
old_dir = oldnd.dentry;
error = -EBUSY;
if (oldnd.last_type != LAST_NORM)
@@ -3536,15 +3626,40 @@ static int do_rename(int olddfd, const c
error = -ENOTEMPTY;
if (new.dentry == trap)
goto exit5;
-   /* renaming on unions is done by the user-space */
+   /* renaming of directories on unions is done by the user-space */
error = -EXDEV;
-   if (is_unionized(oldnd.dentry, oldnd.mnt))
+   if (is_unionized(oldnd.dentry, oldnd.mnt) 
+   S_ISDIR(old.dentry-d_inode-i_mode))
goto exit5;
-   if (is_unionized(newnd.dentry, newnd.mnt))
+   /* renaming of other files on unions is done by copyup */
+   if ((is_unionized(oldnd.dentry, oldnd.mnt) 
+(oldnd.um_flags  LAST_LOWLEVEL)) ||
+   (is_unionized(newnd.dentry, newnd.mnt) 
+(newnd.um_flags  LAST_LOWLEVEL))) {
+   dput_path(new, 

[RFC 11/26] tmpfs white-out support

2007-07-30 Thread Jan Blunck
Introduce white-out support to tmpfs.

Signed-off-by: Jan Blunck [EMAIL PROTECTED]
---
 include/linux/shmem_fs.h |1 
 mm/shmem.c   |   54 +++
 2 files changed, 55 insertions(+)

--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -33,6 +33,7 @@ struct shmem_sb_info {
int policy; /* Default NUMA memory alloc policy */
nodemask_t policy_nodes;/* nodemask for preferred and bind */
spinlock_tstat_lock;
+   struct inode *whiteout_inode;
 };
 
 static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1784,6 +1784,42 @@ static int shmem_create(struct inode *di
 }
 
 /*
+ * This is the whiteout support for tmpfs. It uses one singleton whiteout
+ * inode per superblock thus it is very similar to shmem_link().
+ */
+static int shmem_whiteout(struct inode *dir, struct dentry *dentry)
+{
+   struct shmem_sb_info *sbinfo = SHMEM_SB(dir-i_sb);
+   struct inode *inode = sbinfo-whiteout_inode;
+
+   if (!(dir-i_sb-s_flags  MS_WHITEOUT))
+   return -EPERM;
+
+   /*
+* No ordinary (disk based) filesystem counts whiteouts as inodes;
+* but each new link needs a new dentry, pinning lowmem, and
+* tmpfs dentries cannot be pruned until they are unlinked.
+*/
+   if (sbinfo-max_inodes) {
+   spin_lock(sbinfo-stat_lock);
+   if (!sbinfo-free_inodes) {
+   spin_unlock(sbinfo-stat_lock);
+   return -ENOSPC;
+   }
+   sbinfo-free_inodes--;
+   spin_unlock(sbinfo-stat_lock);
+   }
+
+   dir-i_size += BOGO_DIRENT_SIZE;
+   inode-i_ctime = dir-i_ctime = dir-i_mtime = CURRENT_TIME;
+   inc_nlink(inode);
+   atomic_inc(inode-i_count);/* New dentry reference */
+   dget(dentry);   /* Extra pinning count for the created dentry */
+   d_instantiate(dentry, inode);
+   return 0;
+}
+
+/*
  * Link a file..
  */
 static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct 
dentry *dentry)
@@ -2231,6 +2267,9 @@ out:
 
 static void shmem_put_super(struct super_block *sb)
 {
+   struct shmem_sb_info *sbinfo = sb-s_fs_info;
+
+   iput(sbinfo-whiteout_inode);
kfree(sb-s_fs_info);
sb-s_fs_info = NULL;
 }
@@ -2305,6 +2344,19 @@ static int shmem_fill_super(struct super
if (!root)
goto failed_iput;
sb-s_root = root;
+
+#ifdef CONFIG_TMPFS
+   if (!(sb-s_flags  MS_NOUSER)) {
+   inode = shmem_get_inode(sb, S_IRUGO | S_IWUGO | S_IFWHT, 0);
+   if (!inode) {
+   dput(root);
+   goto failed;
+   }
+   sbinfo-whiteout_inode = inode;
+   sb-s_flags |= MS_WHITEOUT;
+   }
+#endif
+
return 0;
 
 failed_iput:
@@ -2410,6 +2462,7 @@ static const struct inode_operations shm
.rmdir  = shmem_rmdir,
.mknod  = shmem_mknod,
.rename = shmem_rename,
+   .whiteout   = shmem_whiteout,
 #endif
 #ifdef CONFIG_TMPFS_POSIX_ACL
.setattr= shmem_notify_change,
@@ -2464,6 +2517,7 @@ static struct file_system_type tmpfs_fs_
.name   = tmpfs,
.get_sb = shmem_get_sb,
.kill_sb= kill_litter_super,
+   .fs_flags   = FS_WHT,
 };
 static struct vfsmount *shm_mnt;
 

-- 

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC 24/26] union-mount: dont report EROFS for union mounts

2007-07-30 Thread Jan Blunck
SUSv2 requires that we report a read-only fs too. For union mounts this is a
very expensive check, so I'm lazy and just disable the check if we are on a
lower layer of a union.

Signed-off-by: Jan Blunck [EMAIL PROTECTED]
---
 fs/open.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/fs/open.c
+++ b/fs/open.c
@@ -483,7 +483,7 @@ asmlinkage long sys_faccessat(int dfd, c
   special_file(nd.dentry-d_inode-i_mode))
goto out_path_release;
 
-   if(IS_RDONLY(nd.dentry-d_inode))
+   if (!(nd.um_flags  LAST_LOWLEVEL)  IS_RDONLY(nd.dentry-d_inode))
res = -EROFS;
 
 out_path_release:

-- 

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC 03/26] VFS: Make lookup_hash() return a struct path

2007-07-30 Thread Jan Blunck
This patch changes lookup_hash() into returning a struct path.

Signed-off-by: Jan Blunck [EMAIL PROTECTED]
---
 fs/namei.c |  113 ++---
 1 file changed, 57 insertions(+), 56 deletions(-)

--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1297,27 +1297,27 @@ out:
  * needs parent already locked. Doesn't follow mounts.
  * SMP-safe.
  */
-static inline struct dentry * __lookup_hash(struct qstr *name, struct dentry 
*base, struct nameidata *nd)
+static int lookup_hash(struct nameidata *nd, struct qstr *name,
+  struct path *path)
 {
-   struct dentry *dentry;
struct inode *inode;
int err;
 
-   inode = base-d_inode;
+   inode = nd-dentry-d_inode;
 
err = permission(inode, MAY_EXEC, nd);
-   dentry = ERR_PTR(err);
if (err)
goto out;
 
-   dentry = __lookup_hash_kern(name, base, nd);
+   path-mnt =  nd-mnt;
+   path-dentry = __lookup_hash_kern(name, nd-dentry, nd);
+   if (IS_ERR(path-dentry)) {
+   err = PTR_ERR(path-dentry);
+   path-dentry = NULL;
+   path-mnt = NULL;
+   }
 out:
-   return dentry;
-}
-
-static struct dentry *lookup_hash(struct nameidata *nd)
-{
-   return __lookup_hash(nd-last, nd-dentry, nd);
+   return err;
 }
 
 /* SMP-safe */
@@ -1351,7 +1351,10 @@ struct dentry *lookup_one_len_nd(const c
err = __lookup_one_len(name, this, base, len);
if (err)
return ERR_PTR(err);
-   return __lookup_hash(this, base, nd);
+   err = permission(base-d_inode, MAY_EXEC, nd);
+   if (err)
+   return ERR_PTR(err);
+   return __lookup_hash_kern(this, base, nd);
 }
 
 struct dentry *lookup_one_len_kern(const char *name, struct dentry *base, int 
len)
@@ -1709,12 +1712,10 @@ int open_namei(int dfd, const char *path
dir = nd-dentry;
nd-flags = ~LOOKUP_PARENT;
mutex_lock(dir-d_inode-i_mutex);
-   path.dentry = lookup_hash(nd);
-   path.mnt = nd-mnt;
+   error = lookup_hash(nd, nd-last, path);
 
 do_last:
-   error = PTR_ERR(path.dentry);
-   if (IS_ERR(path.dentry)) {
+   if (error) {
mutex_unlock(dir-d_inode-i_mutex);
goto exit;
}
@@ -1817,8 +1818,7 @@ do_link:
}
dir = nd-dentry;
mutex_lock(dir-d_inode-i_mutex);
-   path.dentry = lookup_hash(nd);
-   path.mnt = nd-mnt;
+   error = lookup_hash(nd, nd-last, path);
__putname(nd-last.name);
goto do_last;
 }
@@ -1835,7 +1835,8 @@ do_link:
  */
 struct dentry *lookup_create(struct nameidata *nd, int is_dir)
 {
-   struct dentry *dentry = ERR_PTR(-EEXIST);
+   struct path path = { .dentry = ERR_PTR(-EEXIST) } ;
+   int err;
 
mutex_lock_nested(nd-dentry-d_inode-i_mutex, I_MUTEX_PARENT);
/*
@@ -1851,9 +1852,11 @@ struct dentry *lookup_create(struct name
/*
 * Do the final lookup.
 */
-   dentry = lookup_hash(nd);
-   if (IS_ERR(dentry))
+   err = lookup_hash(nd, nd-last, path);
+   if (err) {
+   path.dentry = ERR_PTR(err);
goto fail;
+   }
 
/*
 * Special case - lookup gave negative, but... we had foo/bar/
@@ -1861,14 +1864,16 @@ struct dentry *lookup_create(struct name
 * all is fine. Let's be bastards - you had / on the end, you've
 * been asking for (non-existent) directory. -ENOENT for you.
 */
-   if (!is_dir  nd-last.name[nd-last.len]  !dentry-d_inode)
+   if (!is_dir  nd-last.name[nd-last.len]  !path.dentry-d_inode)
goto enoent;
-   return dentry;
+   if (nd-mnt != path.mnt)
+   mntput(path.mnt);
+   return path.dentry;
 enoent:
-   dput(dentry);
-   dentry = ERR_PTR(-ENOENT);
+   dput_path(path, nd);
+   path.dentry = ERR_PTR(-ENOENT);
 fail:
-   return dentry;
+   return path.dentry;
 }
 EXPORT_SYMBOL_GPL(lookup_create);
 
@@ -2075,7 +2080,7 @@ static long do_rmdir(int dfd, const char
 {
int error = 0;
char * name;
-   struct dentry *dentry;
+   struct path path;
struct nameidata nd;
 
name = getname(pathname);
@@ -2098,12 +2103,11 @@ static long do_rmdir(int dfd, const char
goto exit1;
}
mutex_lock_nested(nd.dentry-d_inode-i_mutex, I_MUTEX_PARENT);
-   dentry = lookup_hash(nd);
-   error = PTR_ERR(dentry);
-   if (IS_ERR(dentry))
+   error = lookup_hash(nd, nd.last, path);
+   if (error)
goto exit2;
-   error = vfs_rmdir(nd.dentry-d_inode, dentry);
-   dput(dentry);
+   error = vfs_rmdir(nd.dentry-d_inode, path.dentry);
+   dput_path(path, nd);
 exit2:
mutex_unlock(nd.dentry-d_inode-i_mutex);
 exit1:
@@ -2158,7 +2162,7 @@ static long do_unlinkat(int dfd, const c
 {
int error = 0;
char * name;
-   struct 

[RFC 07/26] VFS: Introduce dput() variant that maintains a kill-list

2007-07-30 Thread Jan Blunck
This patch introduces a new variant of dput(). This becomes necessary to
prevent a recursive call to dput() from the union mount code.

  void __dput(struct dentry *dentry, struct list_head *list);

__dput() works mostly like the original dput() did. The main difference is
that it doesn't do a full d_kill() at the end but puts the dentry on a list as
soon as it isn't reachable anymore. Therefore the union mount code can safely
call __dput() when it wants to get rid of underlying dentry references during
a dput(). After calling __dput() the caller must make sure that on all
dentries __d_kill_final() is called. __d_kill_final() is actually doing the
dentry_iput() and is also dereferencing the parent.

Signed-off-by: Jan Blunck [EMAIL PROTECTED]
---
 fs/dcache.c |   60 +++-
 1 file changed, 55 insertions(+), 5 deletions(-)

--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -129,19 +129,56 @@ static void dentry_iput(struct dentry * 
  *
  * If this is the root of the dentry tree, return NULL.
  */
-static struct dentry *d_kill(struct dentry *dentry)
+static struct dentry *__d_kill(struct dentry *dentry, struct list_head *list)
 {
struct dentry *parent;
 
list_del(dentry-d_u.d_child);
dentry_stat.nr_dentry--;/* For d_free, below */
-   /*drops the locks, at that point nobody can reach this dentry */
+
+   if (list) {
+   list_del_init(dentry-d_alias);
+   /* at this point nobody can reach this dentry */
+   list_add(dentry-d_lru, list);
+   spin_unlock(dentry-d_lock);
+   spin_unlock(dcache_lock);
+   return NULL;
+   }
+
+   /* drops the locks, at that point nobody can reach this dentry */
dentry_iput(dentry);
parent = dentry-d_parent;
d_free(dentry);
return dentry == parent ? NULL : parent;
 }
 
+void __dput(struct dentry *, struct list_head *);
+
+static void __d_kill_final(struct dentry *dentry, struct list_head *list)
+{
+   struct dentry *parent = dentry-d_parent;
+   struct inode *inode = dentry-d_inode;
+
+   if (inode) {
+   dentry-d_inode = NULL;
+   if (!inode-i_nlink)
+   fsnotify_inoderemove(inode);
+   if (dentry-d_op  dentry-d_op-d_iput)
+   dentry-d_op-d_iput(dentry, inode);
+   else
+   iput(inode);
+   }
+
+   d_free(dentry);
+   if (dentry != parent)
+   __dput(parent, list);
+}
+
+static struct dentry *d_kill(struct dentry *dentry)
+{
+   return __d_kill(dentry, NULL);
+}
+
 /* 
  * This is dput
  *
@@ -171,7 +208,7 @@ static struct dentry *d_kill(struct dent
  * no dcache lock, please.
  */
 
-void dput(struct dentry *dentry)
+void __dput(struct dentry *dentry, struct list_head *list)
 {
if (!dentry)
return;
@@ -215,14 +252,27 @@ kill_it:
 * delete it from there
 */
if (!list_empty(dentry-d_lru)) {
-   list_del(dentry-d_lru);
+   list_del_init(dentry-d_lru);
dentry_stat.nr_unused--;
}
-   dentry = d_kill(dentry);
+
+   dentry = __d_kill(dentry, list);
if (dentry)
goto repeat;
 }
 
+void dput(struct dentry *dentry)
+{
+   LIST_HEAD(mortuary);
+
+   __dput(dentry, mortuary);
+   while (!list_empty(mortuary)) {
+   dentry = list_entry(mortuary.next, struct dentry, d_lru);
+   list_del(dentry-d_lru);
+   __d_kill_final(dentry, mortuary);
+   }
+}
+
 /**
  * d_invalidate - invalidate a dentry
  * @dentry: dentry to invalidate

-- 

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC 13/26] ext3 whiteout support

2007-07-30 Thread Jan Blunck
Introduce whiteout support for ext3.

- Needs a reserved inode number for white-outs
- S_OPAQUE isn't persistently stored

Signed-off-by: Jan Blunck [EMAIL PROTECTED]
---
 fs/ext3/dir.c   |3 ++-
 fs/ext3/namei.c |   33 +
 fs/ext3/super.c |5 -
 include/linux/ext3_fs.h |5 -
 4 files changed, 43 insertions(+), 3 deletions(-)

--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -29,7 +29,8 @@
 #include linux/rbtree.h
 
 static unsigned char ext3_filetype_table[] = {
-   DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+   DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK,
+   DT_WHT
 };
 
 static int ext3_readdir(struct file *, void *, filldir_t);
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1081,6 +1081,7 @@ static unsigned char ext3_type_by_mode[S
[S_IFIFO  S_SHIFT]= EXT3_FT_FIFO,
[S_IFSOCK  S_SHIFT]   = EXT3_FT_SOCK,
[S_IFLNK  S_SHIFT]= EXT3_FT_SYMLINK,
+   [S_IFWHT  S_SHIFT]= EXT3_FT_WHT,
 };
 
 static inline void ext3_set_de_type(struct super_block *sb,
@@ -2070,6 +2071,37 @@ end_rmdir:
return retval;
 }
 
+static int ext3_whiteout(struct inode *dir, struct dentry *dentry)
+{
+   struct inode *inode;
+   int err, retries = 0;
+   handle_t *handle;
+
+retry:
+   handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir-i_sb) +
+   EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+   2*EXT3_QUOTA_INIT_BLOCKS(dir-i_sb));
+   if (IS_ERR(handle))
+   return PTR_ERR(handle);
+
+   if (IS_DIRSYNC(dir))
+   handle-h_sync = 1;
+
+   inode = ext3_new_inode (handle, dir, S_IFWHT | S_IRUGO);
+   err = PTR_ERR(inode);
+   if (IS_ERR(inode))
+   goto out_stop;
+
+   init_special_inode(inode, inode-i_mode, 0);
+   err = ext3_add_nondir(handle, dentry, inode);
+
+out_stop:
+   ext3_journal_stop(handle);
+   if (err == -ENOSPC  ext3_should_retry_alloc(dir-i_sb, retries))
+   goto retry;
+   return err;
+}
+
 static int ext3_unlink(struct inode * dir, struct dentry *dentry)
 {
int retval;
@@ -2387,6 +2419,7 @@ const struct inode_operations ext3_dir_i
.mkdir  = ext3_mkdir,
.rmdir  = ext3_rmdir,
.mknod  = ext3_mknod,
+   .whiteout   = ext3_whiteout,
.rename = ext3_rename,
.setattr= ext3_setattr,
 #ifdef CONFIG_EXT3_FS_XATTR
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1500,6 +1500,9 @@ static int ext3_fill_super (struct super
sb-s_flags = (sb-s_flags  ~MS_POSIXACL) |
((sbi-s_mount_opt  EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
 
+   if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_WHITEOUT))
+   sb-s_flags |= MS_WHITEOUT;
+
if (le32_to_cpu(es-s_rev_level) == EXT3_GOOD_OLD_REV 
(EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
 EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
@@ -2764,7 +2767,7 @@ static struct file_system_type ext3_fs_t
.name   = ext3,
.get_sb = ext3_get_sb,
.kill_sb= kill_block_super,
-   .fs_flags   = FS_REQUIRES_DEV,
+   .fs_flags   = FS_REQUIRES_DEV | FS_WHT,
 };
 
 static int __init init_ext3_fs(void)
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -63,6 +63,7 @@
 #define EXT3_UNDEL_DIR_INO  6  /* Undelete directory inode */
 #define EXT3_RESIZE_INO 7  /* Reserved group descriptors 
inode */
 #define EXT3_JOURNAL_INO8  /* Journal inode */
+#define EXT3_WHT_INO9  /* Whiteout inode */
 
 /* First non-reserved inode for old ext3 filesystems */
 #define EXT3_GOOD_OLD_FIRST_INO11
@@ -582,6 +583,7 @@ static inline int ext3_valid_inum(struct
 #define EXT3_FEATURE_INCOMPAT_RECOVER  0x0004 /* Needs recovery */
 #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV  0x0008 /* Journal device */
 #define EXT3_FEATURE_INCOMPAT_META_BG  0x0010
+#define EXT3_FEATURE_INCOMPAT_WHITEOUT 0x0020
 
 #define EXT3_FEATURE_COMPAT_SUPP   EXT2_FEATURE_COMPAT_EXT_ATTR
 #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \
@@ -648,8 +650,9 @@ struct ext3_dir_entry_2 {
 #define EXT3_FT_FIFO   5
 #define EXT3_FT_SOCK   6
 #define EXT3_FT_SYMLINK7
+#define EXT3_FT_WHT8
 
-#define EXT3_FT_MAX8
+#define EXT3_FT_MAX9
 
 /*
  * EXT3_DIR_PAD defines the directory entries boundaries

-- 

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC 12/26] ext2 white-out support

2007-07-30 Thread Jan Blunck
Introduce white-out support to ext2.

Known Bugs:
- Needs a reserved inode number for white-outs
- S_OPAQUE isn't persistently stored

Signed-off-by: Jan Blunck [EMAIL PROTECTED]
---
 fs/ext2/dir.c   |2 ++
 fs/ext2/namei.c |   18 ++
 fs/ext2/super.c |5 -
 include/linux/ext2_fs.h |4 
 4 files changed, 28 insertions(+), 1 deletion(-)

--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -230,6 +230,7 @@ static unsigned char ext2_filetype_table
[EXT2_FT_FIFO]  = DT_FIFO,
[EXT2_FT_SOCK]  = DT_SOCK,
[EXT2_FT_SYMLINK]   = DT_LNK,
+   [EXT2_FT_WHT]   = DT_WHT,
 };
 
 #define S_SHIFT 12
@@ -241,6 +242,7 @@ static unsigned char ext2_type_by_mode[S
[S_IFIFO  S_SHIFT]= EXT2_FT_FIFO,
[S_IFSOCK  S_SHIFT]   = EXT2_FT_SOCK,
[S_IFLNK  S_SHIFT]= EXT2_FT_SYMLINK,
+   [S_IFWHT  S_SHIFT]= EXT2_FT_WHT,
 };
 
 static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode)
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -288,6 +288,23 @@ static int ext2_rmdir (struct inode * di
return err;
 }
 
+static int ext2_whiteout(struct inode *dir, struct dentry *dentry)
+{
+   struct inode *inode;
+   int err;
+
+   inode = ext2_new_inode (dir, S_IFWHT | S_IRUGO);
+   err = PTR_ERR(inode);
+   if (IS_ERR(inode))
+   goto out;
+
+   init_special_inode(inode, inode-i_mode, 0);
+   mark_inode_dirty(inode);
+   err = ext2_add_nondir(dentry, inode);
+out:
+   return err;
+}
+
 static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
struct inode * new_dir, struct dentry * new_dentry )
 {
@@ -382,6 +399,7 @@ const struct inode_operations ext2_dir_i
.mkdir  = ext2_mkdir,
.rmdir  = ext2_rmdir,
.mknod  = ext2_mknod,
+   .whiteout   = ext2_whiteout,
.rename = ext2_rename,
 #ifdef CONFIG_EXT2_FS_XATTR
.setxattr   = generic_setxattr,
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -752,6 +752,9 @@ static int ext2_fill_super(struct super_
ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset
EXT2_MOUNT_XIP if not */
 
+   if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_WHITEOUT))
+   sb-s_flags |= MS_WHITEOUT;
+
if (le32_to_cpu(es-s_rev_level) == EXT2_GOOD_OLD_REV 
(EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
 EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
@@ -1299,7 +1302,7 @@ static struct file_system_type ext2_fs_t
.name   = ext2,
.get_sb = ext2_get_sb,
.kill_sb= kill_block_super,
-   .fs_flags   = FS_REQUIRES_DEV,
+   .fs_flags   = FS_REQUIRES_DEV | FS_WHT,
 };
 
 static int __init init_ext2_fs(void)
--- a/include/linux/ext2_fs.h
+++ b/include/linux/ext2_fs.h
@@ -61,6 +61,7 @@
 #define EXT2_ROOT_INO   2  /* Root inode */
 #define EXT2_BOOT_LOADER_INO5  /* Boot loader inode */
 #define EXT2_UNDEL_DIR_INO  6  /* Undelete directory inode */
+#define EXT2_WHT_INO7  /* Whiteout inode */
 
 /* First non-reserved inode for old ext2 filesystems */
 #define EXT2_GOOD_OLD_FIRST_INO11
@@ -479,10 +480,12 @@ struct ext2_super_block {
 #define EXT3_FEATURE_INCOMPAT_RECOVER  0x0004
 #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV  0x0008
 #define EXT2_FEATURE_INCOMPAT_META_BG  0x0010
+#define EXT2_FEATURE_INCOMPAT_WHITEOUT 0x0020
 #define EXT2_FEATURE_INCOMPAT_ANY  0x
 
 #define EXT2_FEATURE_COMPAT_SUPP   EXT2_FEATURE_COMPAT_EXT_ATTR
 #define EXT2_FEATURE_INCOMPAT_SUPP (EXT2_FEATURE_INCOMPAT_FILETYPE| \
+EXT2_FEATURE_INCOMPAT_WHITEOUT| \
 EXT2_FEATURE_INCOMPAT_META_BG)
 #define EXT2_FEATURE_RO_COMPAT_SUPP(EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \
@@ -549,6 +552,7 @@ enum {
EXT2_FT_FIFO,
EXT2_FT_SOCK,
EXT2_FT_SYMLINK,
+   EXT2_FT_WHT,
EXT2_FT_MAX
 };
 

-- 

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC 22/26] union-mount: white-out changes for copy-on-open

2007-07-30 Thread Jan Blunck
When files on an upper layer of the union stack are removed we need to
white-out the removed filename.

Signed-off-by: Jan Blunck [EMAIL PROTECTED]
---
 fs/namei.c |   46 --
 1 file changed, 44 insertions(+), 2 deletions(-)

--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2253,6 +2253,13 @@ do_last:
 
/* Negative dentry, just create the file */
if (!path.dentry-d_inode || S_ISWHT(path.dentry-d_inode-i_mode)) {
+   if (path.dentry-d_parent != dir) {
+   dput_path(path, nd);
+   path.dentry = __lookup_hash_kern(nd-last, dir, nd);
+   path.mnt = nd-mnt;
+   goto do_last;
+   }
+
error = open_namei_create(nd, path, flag, mode);
if (error)
goto exit;
@@ -2373,6 +2380,16 @@ int lookup_create(struct nameidata *nd, 
 {
int err = -EEXIST;
 
+   if (is_unionized(nd-dentry, nd-mnt)) {
+   err = union_relookup_topmost(nd, nd-flags  ~LOOKUP_PARENT);
+   if (err) {
+   /* FIXME: This really sucks */
+   mutex_lock_nested(nd-dentry-d_inode-i_mutex,
+ I_MUTEX_PARENT);
+   goto fail;
+   }
+   }
+
mutex_lock_nested(nd-dentry-d_inode-i_mutex, I_MUTEX_PARENT);
/*
 * Yucky last component or no last component at all?
@@ -2391,6 +2408,16 @@ int lookup_create(struct nameidata *nd, 
if (err)
goto fail;
 
+   /* Special case - we found a whiteout */
+   if (path-dentry-d_inode  S_ISWHT(path-dentry-d_inode-i_mode)) {
+   if (path-dentry-d_parent != nd-dentry) {
+   dput_path(path, nd);
+   path-dentry = __lookup_hash_kern(nd-last, nd-dentry,
+ nd);
+   path-mnt = nd-mnt;
+   }
+   }
+
/*
 * Special case - lookup gave negative, but... we had foo/bar/
 * From the vfs_mknod() POV we just have a negative dentry -
@@ -2682,6 +2709,15 @@ static int do_whiteout(struct nameidata 
if (isdir  !directory_is_empty(path-dentry, path-mnt))
goto out;
 
+   mutex_unlock(nd-dentry-d_inode-i_mutex);
+   err = union_relookup_topmost(nd, nd-flags  ~LOOKUP_PARENT);
+   if (err) {
+   mutex_lock_nested(nd-dentry-d_inode-i_mutex,
+ I_MUTEX_PARENT);
+   goto out;
+   }
+   mutex_lock_nested(nd-dentry-d_inode-i_mutex, I_MUTEX_PARENT);
+
/* safe the name for a later lookup */
err = -ENOMEM;
name.name = kmalloc(dentry-d_name.len, GFP_KERNEL);
@@ -3012,7 +3048,10 @@ static long do_rmdir(int dfd, const char
error = hash_lookup_union(nd, nd.last, path);
if (error)
goto exit2;
-   error = vfs_rmdir(nd.dentry-d_inode, path.dentry);
+   if (is_unionized(nd.dentry, nd.mnt))
+   error = do_whiteout(nd, path, 1);
+   else
+   error = vfs_rmdir(nd.dentry-d_inode, path.dentry);
dput_path(path, nd);
 exit2:
mutex_unlock(nd.dentry-d_inode-i_mutex);
@@ -3091,7 +3130,10 @@ static long do_unlinkat(int dfd, const c
inode = path.dentry-d_inode;
if (inode)
atomic_inc(inode-i_count);
-   error = vfs_unlink(nd.dentry-d_inode, path.dentry);
+   if (is_unionized(nd.dentry, nd.mnt))
+   error = do_whiteout(nd, path, 0);
+   else
+   error = vfs_unlink(nd.dentry-d_inode, path.dentry);
exit2:
dput_path(path, nd);
}

-- 

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC 00/26] VFS based Union Mount (V2)

2007-07-30 Thread Jan Blunck
Here is another post of the VFS based union mount implementation. Unlike the
traditional mount which hides the contents of the mount point, union mounts
present the merged view of the mount point and the mounted filesystem.

Recent changes:
- brand new union structure no longer tied to the dentry, now works with bind
  mounts
- generic part of the whiteout patches extracted
- introduces MS_WHITEOUT to make the white-out patches independent of the
  union-mount stuff
- uses a singleton whiteout inode for the tmpfs filesystem (I need to fix this
  for ext2/3, too)
- renaming files on unions uses copyup now
- rewrote the union mount debugging code: it is now debugfs/relay based.
- random cleanups

I'm able to compile the kernel with these patches applied on a 3-layer union
mount with the separate layers bind mounted to different locations. I haven't
done any performance tests since I think there is a more important topic
ahead: better readdir() support.

This series is against 2.6.22-rc6-mm1.

Comments are welcome,
Jan

-- 

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC 14/26] union-mount: Documentation

2007-07-30 Thread Jan Blunck
Add simple documentation about union mounting in general and this
implementation in specific.

Signed-off-by: Jan Blunck [EMAIL PROTECTED]
---
 Documentation/filesystems/union-mounts.txt |  172 +
 1 file changed, 172 insertions(+)

--- /dev/null
+++ b/Documentation/filesystems/union-mounts.txt
@@ -0,0 +1,172 @@
+VFS based Union Mounts
+--
+
+ 1. What are Union Mounts
+ 2. The Union Stack
+ 3. The White-out Filetype
+ 4. Renaming Unions
+ 5. Directory Reading
+ 6. Known Problems
+ 7. References
+
+---
+
+1. What are Union Mounts
+==
+
+Please note: this is NOT about UnionFS and it is NOT derived work!
+
+Traditionally the mount operation is opaque, which means that the content of
+the mount point, the directory where the file system is mounted on, is hidden
+by the content of the mounted file system's root directory until the file
+system is unmounted again. Unlike the traditional UNIX mount mechanism, that
+hides the contents of the mount point, a union mount presents a view as if
+both filesystems are merged together. Although only the topmost layer of the
+mount stack can be altered, it appears as if transparent file system mounts
+allow any file to be created, modified or deleted.
+
+Most people know the concepts and features of union mounts from other
+operating systems like Sun's Translucent Filesystem, Plan9 or BSD.
+
+Here are the key features of this implementation:
+- completely VFS based
+- does not change the namespace stacking
+- directory listings have duplicate entries removed
+- writable unions: only the topmost file system layer may be writable
+- writable unions: new white-out filetype handled inside the kernel
+
+---
+
+2. The Union Stack
+==
+
+The mounted file systems are organized in the file system hierarchy (tree of
+vfsmount structures), which keeps track about the stacking of file systems
+upon each other. The per-directory view on the file system hierarchy is called
+mount stack and reflects the order of file systems, which are mounted on a
+specific directory.
+
+Union mounts present a single unified view of the contents of two or more file
+systems as if they are merged together. Since the information about which file
+system objects are part of a unified view is not directly available from the
+file system hierarchy, there is a need for a new structure. The file system
+objects which are part of a unified view are ordered in a so-called union
+stack. Only directories can be part of a unified view.
+
+The link between two layers of the union stack is maintained using the
+union_mount structure (#include linux/union.h):
+
+struct union_mount {
+   atomic_t u_count;   /* reference count */
+   struct mutex u_mutex;
+   struct list_head u_unions;  /* list head for d_unions */
+   struct hlist_node u_hash;   /* list head for seaching */
+   struct hlist_node u_rhash;  /* list head for reverse seaching */
+
+   struct path u_this; /* this is me */
+   struct path u_next; /* this is what I overlay */
+};
+
+The union_mount structure holds a reference (dget,mntget) to the next lower
+layer of the union stack. Since a dentry can be part of multiple unions
+(e.g. with bind mounts) they are tied together via the d_unions field of the
+dentry structure.
+
+All union_mount structures are cached in two hash tables, one for lookups of
+the next lower layer of the union stack and one for reverse lookups of the
+next upper layer of the union stack. The reverse lookup is necessary to
+resolve CWD relative path lookups. For calculation of the hash value, the
+(dentry,vfsmount) pair is used. The u_this field is used for the hash table
+which is used in forward lookups and the u_next field for the reverse lookups.
+
+During every new mount (or mount propagation), a new union_mount structure is
+allocated. A reference to the mountpoint's vfsmount and dentry is taken and
+stored in the u_next field.  In almost the same manner an union_mount
+structure is created during the first time lookup of a directory within a
+union mount point. In this case the lookup proceeds to all lower layers of the
+union. Therefore the complete union stack is constructed during lookups.
+
+The union_mount structures of a dentry are destroyed when the dentry itself is
+destroyed. Therefore the dentry cache is indirectly driving the union_mount
+cache like this is done for inodes too. Please note that lower layer
+union_mount structures are kept in memory until the topmost dentry is
+destroyed.
+
+---
+
+3. Writable Unions: The White-out Filetype and Copy-On-Open
+===
+
+The white-out 

[RFC 08/26] VFS: Export lives_below_in_same_fs()

2007-07-30 Thread Jan Blunck
Export lives_below_in_same_fs() for use in union mount code.

Signed-off-by: Jan Blunck [EMAIL PROTECTED]
---
 fs/namespace.c|3 ++-
 include/linux/mount.h |1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -793,7 +793,7 @@ static bool permit_mount(struct nameidat
return true;
 }
 
-static int lives_below_in_same_fs(struct dentry *d, struct dentry *dentry)
+int lives_below_in_same_fs(struct dentry *d, struct dentry *dentry)
 {
while (1) {
if (d == dentry)
@@ -803,6 +803,7 @@ static int lives_below_in_same_fs(struct
d = d-d_parent;
}
 }
+EXPORT_SYMBOL_GPL(lives_below_in_same_fs);
 
 struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
int flag, uid_t owner)
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -106,6 +106,7 @@ extern void shrink_submounts(struct vfsm
 
 extern spinlock_t vfsmount_lock;
 extern dev_t name_to_dev_t(char *name);
+extern int lives_below_in_same_fs(struct dentry *, struct dentry *);
 
 #endif
 #endif /* _LINUX_MOUNT_H */

-- 

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [2/3] 2.6.23-rc1: known regressions with patches v3

2007-07-30 Thread Michal Piotrowski
Hi all,

Here is a list of some known regressions in 2.6.23-rc1
with patches available.

Feel free to add new regressions/remove fixed etc.
http://kernelnewbies.org/known_regressions

List of Aces

NameRegressions fixed since 21-Jun-2007
Adrian Bunk6
Andi Kleen 4
Andrew Morton  4
Linus Torvalds 4
Al Viro3
Jens Axboe 3
Tejun Heo  3
David Woodhouse2
Hugh Dickins   2



FS

Subject : NFSv4 poops itself
References  : http://lkml.org/lkml/2007/7/27/144
Last known good : ?
Submitter   : Jeff Garzik [EMAIL PROTECTED] 
Caused-By   : ?
Handled-By  : Trond Myklebust [EMAIL PROTECTED]
Patch   : http://lkml.org/lkml/2007/7/27/183
Status  : patch available



Media

Subject : usbvision: don't return an uninitialized value
References  : http://lkml.org/lkml/2007/7/23/65
Last known good : ?
Submitter   : Adrian Bunk [EMAIL PROTECTED]
Caused-By   : Thierry MERLE [EMAIL PROTECTED]
  commit c5f48367fe54c46805774eeea8e828de54a5ad7b
Handled-By  : Trent Piepho [EMAIL PROTECTED]
Patch   : http://lkml.org/lkml/2007/7/23/114
Status  : patch available



Memory management

Subject : [bug] SLUB  freeing locks
References  : http://lkml.org/lkml/2007/7/26/90
Last known good : ?
Submitter   : Ingo Molnar [EMAIL PROTECTED]
Caused-By   : ?
Handled-By  : Peter Zijlstra [EMAIL PROTECTED]
Patch   : http://lkml.org/lkml/2007/7/26/97
Status  : patch available



SYSFS

Subject : sysfs/udev broken in 2.6.23-rc1
References  : http://lkml.org/lkml/2007/7/24/276
Last known good : ?
Submitter   : Simon Arlott
Caused-By   : ?
Handled-By  : Kay Sievers [EMAIL PROTECTED]
  Cornelia Huck [EMAIL PROTECTED]
Patch   : http://lkml.org/lkml/2007/7/25/83
Status  : patch available



Regards,
Michal

--
LOG
http://www.stardust.webpages.pl/log/
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] basic delayed allocation in VFS

2007-07-30 Thread Mingming Cao
On Sun, 2007-07-29 at 20:24 +0100, Christoph Hellwig wrote:
 On Sun, Jul 29, 2007 at 11:30:36AM -0600, Andreas Dilger wrote:
  Sigh, we HAVE a patch that was only adding delalloc to ext4, but it
  was rejected because that functionality should go into the VFS.
  Since the performance improvement of delalloc is quite large, we'd
  like to get this into the kernel one way or another.  Can we make a
  decision if the ext4-specific delalloc is acceptable?
 
 I'm a big proponent of having proper common delalloc code, but the
 one proposed here is not generic for the existing filesystem using
 delalloc.  

To be fair, what Alex has so far is probably good enough for ext2/3
delayed allocation.

 It's still on my todo list to revamp the xfs code to get
 rid of some of the existing mess and make it useable genericly.  If
 the ext4 users are fine with the end result we could move to generic
 code.
 

Are you okay with having an ext4 delayed allocation implementation (i.e.
moving the code proposed in this thread to fs/ext4) first?  Then later
when you come up with a generic delayed allocation for both ext4 and xfs
we could make use of that generic implementation. Is that an acceptable
approach? 

Andrew, what do you think?


Regards,
Mingming

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 00/26] VFS based Union Mount (V2)

2007-07-30 Thread Al Boldi
Jan Blunck wrote:
 Here is another post of the VFS based union mount implementation. Unlike
 the traditional mount which hides the contents of the mount point, union
 mounts present the merged view of the mount point and the mounted
 filesytem.

Great!

 Recent changes:
 - brand new union structure no longer tied to the dentryn, now works with
 bind mounts
 - generic part of the whiteout patches extracted
 - introduces MS_WHITEOUT to make the white-out patches independant of the
   union-mount stuff
 - uses a singleton whiteout inode for the tmpfs filesystem (I need to fix
 this for ext2/3, too)
 - renaming files on unions uses copyup now

I wonder if this copyup functionality could be generalized to induce CoW when 
modifying hard-linked files.  Does that sound feasible?

 - rewrote the union mount debugging code: it is now debugfs/relay based.
 - random cleanups

 I'm able to compile the kernel with this patches applied on a  3 layer
 union mount with the seperate layers bind mounted to different locations.
 I haven't done any performance tests since I think there is a more
 important topic ahead: better readdir() support.

What about the umount oops?  Did that get fixed?

 This series is against 2.6.22-rc6-mm1.

Things as big and important as this should probably also be diff'd against 
mainline, to increase testing input.


Thanks!

--
Al

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] ufs: implement show_options

2007-07-30 Thread Evgeniy Dushistov
This patch contains an implementation of the show_options method for UFS;
it depends on add-in-sunos-41x-compatible-mode-for-ufs.patch and
add-in-sunos-41x-compatible-mode-for-ufs-fix.patch.


Signed-off-by: Evgeniy Dushistov [EMAIL PROTECTED]

---

Index: linux-2.6.23-rc1/fs/ufs/super.c
===
--- linux-2.6.23-rc1.orig/fs/ufs/super.c
+++ linux-2.6.23-rc1/fs/ufs/super.c
@@ -88,6 +88,8 @@
 #include linux/buffer_head.h
 #include linux/vfs.h
 #include linux/log2.h
+#include linux/mount.h
+#include linux/seq_file.h
 
 #include swab.h
 #include util.h
@@ -286,10 +288,21 @@ void ufs_warning (struct super_block * s
 }
 
 enum {
-   Opt_type_old, Opt_type_sunx86, Opt_type_sun, Opt_type_sunos, 
Opt_type_44bsd,
-   Opt_type_ufs2, Opt_type_hp, Opt_type_nextstepcd, Opt_type_nextstep,
-   Opt_type_openstep, Opt_onerror_panic, Opt_onerror_lock,
-   Opt_onerror_umount, Opt_onerror_repair, Opt_err
+   Opt_type_old = UFS_MOUNT_UFSTYPE_OLD,
+   Opt_type_sunx86 = UFS_MOUNT_UFSTYPE_SUNx86,
+   Opt_type_sun = UFS_MOUNT_UFSTYPE_SUN,
+   Opt_type_sunos = UFS_MOUNT_UFSTYPE_SUNOS,
+   Opt_type_44bsd = UFS_MOUNT_UFSTYPE_44BSD,
+   Opt_type_ufs2 = UFS_MOUNT_UFSTYPE_UFS2,
+   Opt_type_hp = UFS_MOUNT_UFSTYPE_HP,
+   Opt_type_nextstepcd = UFS_MOUNT_UFSTYPE_NEXTSTEP_CD,
+   Opt_type_nextstep = UFS_MOUNT_UFSTYPE_NEXTSTEP,
+   Opt_type_openstep = UFS_MOUNT_UFSTYPE_OPENSTEP,
+   Opt_onerror_panic = UFS_MOUNT_ONERROR_PANIC,
+   Opt_onerror_lock = UFS_MOUNT_ONERROR_LOCK,
+   Opt_onerror_umount = UFS_MOUNT_ONERROR_UMOUNT,
+   Opt_onerror_repair = UFS_MOUNT_ONERROR_REPAIR,
+   Opt_err
 };
 
 static match_table_t tokens = {
@@ -304,6 +317,7 @@ static match_table_t tokens = {
{Opt_type_nextstepcd, ufstype=nextstep-cd},
{Opt_type_nextstep, ufstype=nextstep},
{Opt_type_openstep, ufstype=openstep},
+/*end of possible ufs types */
{Opt_onerror_panic, onerror=panic},
{Opt_onerror_lock, onerror=lock},
{Opt_onerror_umount, onerror=umount},
@@ -1209,6 +1223,26 @@ static int ufs_remount (struct super_blo
return 0;
 }
 
+static int ufs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+   struct ufs_sb_info *sbi = UFS_SB(vfs-mnt_sb);
+   unsigned mval = sbi-s_mount_opt  UFS_MOUNT_UFSTYPE;
+   struct match_token *tp = tokens;
+
+   while (tp-token != Opt_onerror_panic  tp-token != mval)
+   ++tp;
+   BUG_ON(tp-token == Opt_onerror_panic);
+   seq_printf(seq, ,%s, tp-pattern);
+
+   mval = sbi-s_mount_opt  UFS_MOUNT_ONERROR;
+   while (tp-token != Opt_err  tp-token != mval)
+   ++tp;
+   BUG_ON(tp-token == Opt_err);
+   seq_printf(seq, ,%s, tp-pattern);
+
+   return 0;
+}
+
 static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
struct super_block *sb = dentry-d_sb;
@@ -1301,6 +1335,7 @@ static const struct super_operations ufs
.write_super= ufs_write_super,
.statfs = ufs_statfs,
.remount_fs = ufs_remount,
+   .show_options   = ufs_show_options,
 #ifdef CONFIG_QUOTA
.quota_read = ufs_quota_read,
.quota_write= ufs_quota_write,

-- 
/Evgeniy

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: bonnie++ benchmarks for ext2,ext3,ext4,jfs,reiserfs,xfs,zfs on software raid 5

2007-07-30 Thread Al Boldi
Justin Piszcz wrote:
 CONFIG:

 Software RAID 5 (400GB x 6): Default mkfs parameters for all filesystems.
 Kernel was 2.6.21 or 2.6.22, did these awhile ago.
 Hardware was SATA with PCI-e only, nothing on the PCI bus.

 ZFS was userspace+fuse of course.

Wow! Userspace and still that efficient.

 Reiser was V3.
 EXT4 was created using the recommended options on its project page.

 RAW:

 ext2,7760M,56728,96.,180505,51,85484,17.,50946.7,80.,235541,21
.,373.667,0,16:10:16/64,2354,27,0,0,8455.67,14.6667,2211.67,26.
,0,0,9724,22.
 ext3,7760M,52702.7,94.,165005,60,82294.7,20.6667,52664,83.6667,258788,
33.,335.8,0,16:10:16/64,858.333,10.6667,10250.3,28.6667,4084,15,897
,12.6667,4024.33,12.,2754,11.
 ext4,7760M,53129.7,95,164515,59.,101678,31.6667,62194.3,98.6667,266716
,22.,405.767,0,16:10:16/64,1963.67,23.6667,0,0,20859,73.6667,1731,2
1.,9022,23.6667,16410,65.6667
 jfs,7760M,54606,92,191997,52,112764,33.6667,63585.3,99,274921,22.,383.
8,0,16:10:16/64,344,1,0,0,539.667,0,297.667,1,0,0,340,0
 reiserfs,7760M,51056.7,96,180607,67,106907,38.,61231.3,97.6667,275339,
29.,441.167,0,16:10:16/64,2516,60.6667,19174.3,60.6667,8194.33,54.3
333,2011,42.6667,6963.67,19.6667,9168.33,68.6667
 xfs,7760M,52985.7,93,158342,45,79682,14,60547.3,98,239101,20.,359.667,
0,16:10:16/64,415,4,0,0,1774.67,10.6667,454,4.7,14526.3,40,1572,12.
6667

 zfs,7760M,

Dissecting some of these numbers:

  speed %cpu  
 25601,43.,
 32198.7,4,
 13266.3, 2,
 44145.3,68.6667,
 129278,9,
 245.167,0,

 16:10:16/64,

  speed %cpu  
 218.333,2,
 2698.33,11.6667,
 7434.67,14.,
 244,2,
 2191.33,11.6667,
 5613.33,13.

Extrapolating these %cpu numbers makes ZFS the fastest.

Are you sure these numbers are correct?


Thanks!

--
Al

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: bonnie++ benchmarks for ext2,ext3,ext4,jfs,reiserfs,xfs,zfs on software raid 5

2007-07-30 Thread Justin Piszcz



On Mon, 30 Jul 2007, Miklos Szeredi wrote:


Extrapolating these %cpu number makes ZFS the fastest.

Are you sure these numbers are correct?


Note, that %cpu numbers for fuse filesystems are inherently skewed,
because the CPU usage of the filesystem process itself is not taken
into account.

So the numbers are not all that good, but according to the zfs-fuse
author it hasn't been optimized yet, so they may improve.

Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-raid in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html



This was performed on an E6300, 1 core was ZFS/FUSE (or quite a bit of it 
anyway)


-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 12/26] ext2 white-out support

2007-07-30 Thread Theodore Tso
On Mon, Jul 30, 2007 at 06:13:35PM +0200, Jan Blunck wrote:
 Introduce white-out support to ext2.
 
 Known Bugs:
 - Needs a reserved inode number for white-outs

You picked different reserved inodes for the ext2 and ext3
filesystems.  That's good for a NACK right there.  The codepoints
(i.e., reserved inode numbers, feature bit masks, etc.) for ext2,
ext3, and ext4 MUST not overlap.  After all, someone might use tune2fs
-j to convert an ext2 filesystem to ext3, and it's REALLY BAD that
you're using a reserved inode of 7 for ext2, and 9 for ext3.

Also, I note that you have created a new INCOMPAT feature flag support
for whiteouts.  That's really unfortunate; we try to avoid introducing
incompatible feature flags unless absolutely necessary; note that even
adding a COMPAT feature flag means that you need a new version of
e2fsprogs if you want e2fsck to be willing to touch that filesystem.

So --- if you're looking for a way to add whiteout support to
ext2/ext3 without needing a feature bit, here's how.  We allocate a
new inode flag in struct ext3_inode.i_flags:

#define EXT2_WHTOUT_FL   0x0004

We also allocate a new field in the ext2 superblock to store the
whiteout inode.  (Please coordinate with me so it's a superblock
field not in use by ext3/ext4, and so it's reserved so that no one
else uses it.)  The superblock field, call it s_whtout_ino, stores the
inode number for the white out inode.

When you create a new whiteout file, the code checks sb->s_whtout_ino,
and if it is zero, it allocates a new inode, and creates it as a
zero-length regular file (i_mode |= S_IFREG) with the EXT2_WHTOUT_FL
flag set in the inode, and then stores the inode number in
sb->s_whtout_ino.  If sb->s_whtout_ino is non-zero, you must read in
the inode and make sure that the EXT2_WHTOUT_FL is set.  If it is not,
then allocate a new whiteout inode as described previously.  Then link
the inode into the directory as before.

When reading an inode, if the EXT2_WHTOUT_FL flag is set, then set the
in-memory mode of the inode to be S_IFWHT.  

That's pretty much about it.  For cleanliness sake, it would be good
if ext2_delete_inode clears sb->s_whtout_ino if the last whiteout link
has been deleted, but it's strictly speaking not necessary.  If you do
it this way, the filesystem is completely backwards compatible; the
whiteout files will just appear to be links to a normal zero-length file.

I wouldn't bother with setting the directory type field to be DT_WHT,
given that they will never be returned to userspace anyway.

Regards,

- Ted
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html