Re: [PATCH] resolve duplicate flag no for PG_lazyfree

2007-05-14 Thread Fengguang Wu
On Sun, May 13, 2007 at 10:46:30PM -0700, Andrew Morton wrote:
 On Mon, 14 May 2007 10:37:18 +0800 Fengguang Wu [EMAIL PROTECTED] wrote:
 
  PG_lazyfree and PG_booked share the same bit.
  
  Either it is a bug that shall be fixed by the following patch, or
  the situation should be explicitly documented?
  
  Signed-off-by: Fengguang Wu [EMAIL PROTECTED]
  ---
   include/linux/page-flags.h |2 +-
   1 file changed, 1 insertion(+), 1 deletion(-)
  
  --- linux-2.6.21-mm2.orig/include/linux/page-flags.h
  +++ linux-2.6.21-mm2/include/linux/page-flags.h
  @@ -91,7 +91,7 @@
   #define PG_buddy   19  /* Page is free, on buddy lists */
   #define PG_booked  20  /* Has blocks reserved on-disk */
   
  -#define PG_lazyfree20  /* MADV_FREE potential 
  throwaway */
  +#define PG_lazyfree21  /* MADV_FREE potential 
  throwaway */
   
   /* PG_owner_priv_1 users should have descriptive aliases */
   #define PG_checked PG_owner_priv_1 /* Used by some filesystems */
 
 That's an accident: PG_lazyfree got added but the out-of-tree ext4 patches
 didn't get updated.
 
 otoh, the intersection between pages which are PageBooked() and pages which
 are PageLazyFree() should be zero, so it'd be good to actually formalise
 this reuse within the ext4 patches.
 
 otoh2, PageLazyFree() could have reused PG_owner_priv_1.

otoh3: PG_lazyfree and PG_readahead can reuse the same bit, too.
PG_lazyfree applies to anonymous pages, while PG_readahead applies to
file backed pages.

-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 17/41] ext2 convert to new aops.

2007-05-14 Thread npiggin
Cc: linux-ext4@vger.kernel.org
Cc: Linux Filesystems [EMAIL PROTECTED]
Signed-off-by: Nick Piggin [EMAIL PROTECTED]

 fs/ext2/dir.c   |   47 +--
 fs/ext2/ext2.h  |3 +++
 fs/ext2/inode.c |   24 +---
 3 files changed, 45 insertions(+), 29 deletions(-)

Index: linux-2.6/fs/ext2/inode.c
===
--- linux-2.6.orig/fs/ext2/inode.c
+++ linux-2.6/fs/ext2/inode.c
@@ -726,18 +726,21 @@ ext2_readpages(struct file *file, struct
return mpage_readpages(mapping, pages, nr_pages, ext2_get_block);
 }
 
-static int
-ext2_prepare_write(struct file *file, struct page *page,
-   unsigned from, unsigned to)
+int __ext2_write_begin(struct file *file, struct address_space *mapping,
+   loff_t pos, unsigned len, unsigned flags,
+   struct page **pagep, void **fsdata)
 {
-   return block_prepare_write(page,from,to,ext2_get_block);
+   return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+   ext2_get_block);
 }
 
 static int
-ext2_nobh_prepare_write(struct file *file, struct page *page,
-   unsigned from, unsigned to)
+ext2_write_begin(struct file *file, struct address_space *mapping,
+   loff_t pos, unsigned len, unsigned flags,
+   struct page **pagep, void **fsdata)
 {
-   return nobh_prepare_write(page,from,to,ext2_get_block);
+   *pagep = NULL;
+   return __ext2_write_begin(file, mapping, pos, len, flags, pagep,fsdata);
 }
 
 static int ext2_nobh_writepage(struct page *page,
@@ -773,8 +776,8 @@ const struct address_space_operations ex
.readpages  = ext2_readpages,
.writepage  = ext2_writepage,
.sync_page  = block_sync_page,
-   .prepare_write  = ext2_prepare_write,
-   .commit_write   = generic_commit_write,
+   .write_begin= ext2_write_begin,
+   .write_end  = generic_write_end,
.bmap   = ext2_bmap,
.direct_IO  = ext2_direct_IO,
.writepages = ext2_writepages,
@@ -791,8 +794,7 @@ const struct address_space_operations ex
.readpages  = ext2_readpages,
.writepage  = ext2_nobh_writepage,
.sync_page  = block_sync_page,
-   .prepare_write  = ext2_nobh_prepare_write,
-   .commit_write   = nobh_commit_write,
+   /* XXX: todo */
.bmap   = ext2_bmap,
.direct_IO  = ext2_direct_IO,
.writepages = ext2_writepages,
Index: linux-2.6/fs/ext2/dir.c
===
--- linux-2.6.orig/fs/ext2/dir.c
+++ linux-2.6/fs/ext2/dir.c
@@ -22,6 +22,7 @@
  */
 
 #include ext2.h
+#include linux/buffer_head.h
 #include linux/pagemap.h
 
 typedef struct ext2_dir_entry_2 ext2_dirent;
@@ -61,12 +62,14 @@ ext2_last_byte(struct inode *inode, unsi
return last_byte;
 }
 
-static int ext2_commit_chunk(struct page *page, unsigned from, unsigned to)
+static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
 {
-   struct inode *dir = page-mapping-host;
+   struct address_space *mapping = page-mapping;
+   struct inode *dir = mapping-host;
int err = 0;
+
dir-i_version++;
-   page-mapping-a_ops-commit_write(NULL, page, from, to);
+   block_write_end(NULL, mapping, pos, len, len, page, NULL);
if (IS_DIRSYNC(dir))
err = write_one_page(page, 1);
else
@@ -412,16 +415,18 @@ ino_t ext2_inode_by_name(struct inode * 
 void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
struct page *page, struct inode *inode)
 {
-   unsigned from = (char *) de - (char *) page_address(page);
-   unsigned to = from + le16_to_cpu(de-rec_len);
+   loff_t pos = (page-index  PAGE_CACHE_SHIFT) +
+   (char *) de - (char *) page_address(page);
+   unsigned len = le16_to_cpu(de-rec_len);
int err;
 
lock_page(page);
-   err = page-mapping-a_ops-prepare_write(NULL, page, from, to);
+   err = __ext2_write_begin(NULL, page-mapping, pos, len,
+   AOP_FLAG_UNINTERRUPTIBLE, page, NULL);
BUG_ON(err);
de-inode = cpu_to_le32(inode-i_ino);
-   ext2_set_de_type (de, inode);
-   err = ext2_commit_chunk(page, from, to);
+   ext2_set_de_type(de, inode);
+   err = ext2_commit_chunk(page, pos, len);
ext2_put_page(page);
dir-i_mtime = dir-i_ctime = CURRENT_TIME_SEC;
EXT2_I(dir)-i_flags = ~EXT2_BTREE_FL;
@@ -444,7 +449,7 @@ int ext2_add_link (struct dentry *dentry
unsigned long npages = dir_pages(dir);
unsigned long n;
char *kaddr;
-   

[patch 19/41] ext4 convert to new aops.

2007-05-14 Thread npiggin
Cc: linux-ext4@vger.kernel.org
Cc: Linux Filesystems [EMAIL PROTECTED]
Convert ext4 to use write_begin()/write_end() methods.

Signed-off-by: Badari Pulavarty [EMAIL PROTECTED]

 fs/ext4/inode.c |  147 +++-
 1 file changed, 93 insertions(+), 54 deletions(-)

Index: linux-2.6/fs/ext4/inode.c
===
--- linux-2.6.orig/fs/ext4/inode.c
+++ linux-2.6/fs/ext4/inode.c
@@ -1146,34 +1146,50 @@ static int do_journal_get_write_access(h
return ext4_journal_get_write_access(handle, bh);
 }
 
-static int ext4_prepare_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
+static int ext4_write_begin(struct file *file, struct address_space *mapping,
+   loff_t pos, unsigned len, unsigned flags,
+   struct page **pagep, void **fsdata)
 {
-   struct inode *inode = page-mapping-host;
+   struct inode *inode = mapping-host;
int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
handle_t *handle;
int retries = 0;
+   struct page *page;
+   pgoff_t index;
+   unsigned from, to;
+
+   index = pos  PAGE_CACHE_SHIFT;
+   from = pos  (PAGE_CACHE_SIZE - 1);
+   to = from + len;
 
 retry:
-   handle = ext4_journal_start(inode, needed_blocks);
-   if (IS_ERR(handle)) {
-   ret = PTR_ERR(handle);
-   goto out;
+   page = __grab_cache_page(mapping, index);
+   if (!page)
+   return -ENOMEM;
+   *pagep = page;
+
+   handle = ext4_journal_start(inode, needed_blocks);
+   if (IS_ERR(handle)) {
+   unlock_page(page);
+   page_cache_release(page);
+   ret = PTR_ERR(handle);
+   goto out;
}
-   if (test_opt(inode-i_sb, NOBH)  ext4_should_writeback_data(inode))
-   ret = nobh_prepare_write(page, from, to, ext4_get_block);
-   else
-   ret = block_prepare_write(page, from, to, ext4_get_block);
-   if (ret)
-   goto prepare_write_failed;
 
-   if (ext4_should_journal_data(inode)) {
+   ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+   ext4_get_block);
+
+   if (!ret  ext4_should_journal_data(inode)) {
ret = walk_page_buffers(handle, page_buffers(page),
from, to, NULL, do_journal_get_write_access);
}
-prepare_write_failed:
-   if (ret)
+
+   if (ret) {
ext4_journal_stop(handle);
+   unlock_page(page);
+   page_cache_release(page);
+   }
+
if (ret == -ENOSPC  ext4_should_retry_alloc(inode-i_sb, retries))
goto retry;
 out:
@@ -1185,12 +1201,12 @@ int ext4_journal_dirty_data(handle_t *ha
int err = jbd2_journal_dirty_data(handle, bh);
if (err)
ext4_journal_abort_handle(__FUNCTION__, __FUNCTION__,
-   bh, handle,err);
+   bh, handle, err);
return err;
 }
 
-/* For commit_write() in data=journal mode */
-static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
+/* For write_end() in data=journal mode */
+static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 {
if (!buffer_mapped(bh) || buffer_freed(bh))
return 0;
@@ -1205,78 +1221,100 @@ static int commit_write_fn(handle_t *han
  * ext4 never places buffers on inode-i_mapping-private_list.  metadata
  * buffers are managed internally.
  */
-static int ext4_ordered_commit_write(struct file *file, struct page *page,
-unsigned from, unsigned to)
+static int ext4_ordered_write_end(struct file *file,
+   struct address_space *mapping,
+   loff_t pos, unsigned len, unsigned copied,
+   struct page *page, void *fsdata)
 {
handle_t *handle = ext4_journal_current_handle();
-   struct inode *inode = page-mapping-host;
+   struct inode *inode = file-f_mapping-host;
+   unsigned from, to;
int ret = 0, ret2;
 
+   from = pos  (PAGE_CACHE_SIZE - 1);
+   to = from + len;
+
ret = walk_page_buffers(handle, page_buffers(page),
from, to, NULL, ext4_journal_dirty_data);
 
if (ret == 0) {
/*
-* generic_commit_write() will run mark_inode_dirty() if i_size
+* generic_write_end() will run mark_inode_dirty() if i_size
 * changes.  So let's piggyback the i_disksize mark_inode_dirty
 * into that.
 */
loff_t new_i_size;
 
-   new_i_size = ((loff_t)page-index  PAGE_CACHE_SHIFT) + to;
+

[RFC] [patch 1/2] i_version update - vfs part

2007-05-14 Thread Cordenner jean noel
Concerning the first part of the set, the i_version field of the inode
structure has been reused. The field has been redefined from unsigned
long to unsigned long long as the counter has to be 64bit.

Signed-off-by: Jean Noel Cordenner [EMAIL PROTECTED]

fs/binfmt_misc.c   |1 +
fs/libfs.c |9 +
fs/pipe.c  |1 +
include/linux/fs.h |2 +-

Index: linux-2.6.21-rc4-i_version/fs/binfmt_misc.c
===
--- linux-2.6.21-rc4-i_version.orig/fs/binfmt_misc.c2007-05-10 
14:14:48.0 +0200
+++ linux-2.6.21-rc4-i_version/fs/binfmt_misc.c 2007-05-10 14:18:45.0 
+0200
@@ -508,6 +508,7 @@
inode-i_blocks = 0;
inode-i_atime = inode-i_mtime = inode-i_ctime =
current_fs_time(inode-i_sb);
+   inode-i_version = 1;
}
return inode;
 }
Index: linux-2.6.21-rc4-i_version/fs/libfs.c
===
--- linux-2.6.21-rc4-i_version.orig/fs/libfs.c  2007-05-10 14:14:48.0 
+0200
+++ linux-2.6.21-rc4-i_version/fs/libfs.c   2007-05-10 17:26:13.0 
+0200
@@ -223,6 +223,7 @@
root-i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
root-i_uid = root-i_gid = 0;
root-i_atime = root-i_mtime = root-i_ctime = CURRENT_TIME;
+   root-i_version = 1;
dentry = d_alloc(NULL, d_name);
if (!dentry) {
iput(root);
@@ -246,6 +247,8 @@
struct inode *inode = old_dentry-d_inode;
 
inode-i_ctime = dir-i_ctime = dir-i_mtime = CURRENT_TIME;
+   inode-i_version++;
+   dir-i_version++;
inc_nlink(inode);
atomic_inc(inode-i_count);
dget(dentry);
@@ -278,6 +281,8 @@
struct inode *inode = dentry-d_inode;
 
inode-i_ctime = dir-i_ctime = dir-i_mtime = CURRENT_TIME;
+   inode-i_version++;
+   dir-i_version++;
drop_nlink(inode);
dput(dentry);
return 0;
@@ -314,6 +319,8 @@
 
old_dir-i_ctime = old_dir-i_mtime = new_dir-i_ctime =
new_dir-i_mtime = inode-i_ctime = CURRENT_TIME;
+   old_dir-i_version++;
+   new_dir-i_version++;
 
return 0;
 }
@@ -380,6 +387,7 @@
inode-i_uid = inode-i_gid = 0;
inode-i_blocks = 0;
inode-i_atime = inode-i_mtime = inode-i_ctime = CURRENT_TIME;
+   inode-i_version = 1;
inode-i_op = simple_dir_inode_operations;
inode-i_fop = simple_dir_operations;
inode-i_nlink = 2;
@@ -401,6 +409,7 @@
inode-i_uid = inode-i_gid = 0;
inode-i_blocks = 0;
inode-i_atime = inode-i_mtime = inode-i_ctime = CURRENT_TIME;
+   inode-i_version = 1;
inode-i_fop = files-ops;
inode-i_ino = i;
d_add(dentry, inode);
Index: linux-2.6.21-rc4-i_version/fs/pipe.c
===
--- linux-2.6.21-rc4-i_version.orig/fs/pipe.c   2007-05-10 14:14:48.0 
+0200
+++ linux-2.6.21-rc4-i_version/fs/pipe.c2007-05-10 14:18:45.0 
+0200
@@ -872,6 +872,7 @@
inode-i_uid = current-fsuid;
inode-i_gid = current-fsgid;
inode-i_atime = inode-i_mtime = inode-i_ctime = CURRENT_TIME;
+   inode-i_version = 1;
 
return inode;
 
Index: linux-2.6.21-rc4-i_version/include/linux/fs.h
===
--- linux-2.6.21-rc4-i_version.orig/include/linux/fs.h  2007-04-26 
16:23:59.0 +0200
+++ linux-2.6.21-rc4-i_version/include/linux/fs.h   2007-05-10 
17:23:38.0 +0200
@@ -536,7 +536,7 @@
uid_t   i_uid;
gid_t   i_gid;
dev_t   i_rdev;
-   unsigned long   i_version;
+   unsigned long long  i_version;
loff_t  i_size;
 #ifdef __NEED_I_SIZE_ORDERED
seqcount_t  i_size_seqcount;


-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/5][TAKE2] fallocate system call

2007-05-14 Thread Amit K. Arora
This is the new set of patches which take care of the review comments
received from the community (mainly from Andrew).

Description:
---
fallocate() is a new system call being proposed here which will allow
applications to preallocate space to any file(s) in a file system.
Each file system implementation that wants to use this feature will need
to support an inode operation called fallocate.

Applications can use this feature to avoid fragmentation to certain
level and thus get faster access speed. With preallocation, applications
also get a guarantee of space for particular file(s) - even if later the
system becomes full.

Currently, glibc provides an interface called posix_fallocate() which
can be used for a similar purpose. Though this has the advantage of working
on all file systems, it is quite slow (since it writes zeroes to
each block that has to be preallocated). Without a doubt, file systems
can do this more efficiently within the kernel, by implementing
the proposed fallocate() system call. It is expected that
posix_fallocate() will be modified to call this new system call first
and in case the kernel/filesystem does not implement it, it should fall
back to the current implementation of writing zeroes to the new blocks.

Interface:
-
The proposed system call's layout is:

 asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)

fd: The descriptor of the open file.

mode*: This specifies the behavior of the system call. Currently the
  system call supports two modes - FA_ALLOCATE and FA_DEALLOCATE.
  FA_ALLOCATE: Applications can use this mode to preallocate blocks to
a given file (specified by fd). This mode changes the file size if
the preallocation is done beyond the EOF. It also updates the
ctime/mtime in the inode of the corresponding file, marking a
successful allocation.
  FA_DEALLOCATE: This mode can be used by applications to deallocate the
previously preallocated blocks. This also may change the file size
and the ctime/mtime.
* New modes might get added in future. One such new mode which is
  already under discussion is FA_PREALLOCATE, which when used will
  preallocate space but will not change the filesize and [cm]time.
  Since the semantics of this new mode are not clear and agreed upon yet,
  this patchset does not implement it currently.

offset: This is the offset in bytes, from where the preallocation should
  start.

len: This is the number of bytes requested for preallocation (from
  offset).
  

sys_fallocate() on s390:
---
There is a problem with s390 ABI to implement sys_fallocate() with the
proposed order of arguments. Martin Schwidefsky has suggested a patch to
solve this problem which makes use of a wrapper in the kernel. This will
require special handling of this system call on s390 in glibc as well.
But, this seems to be the best solution so far.

Known Problem:
-
mmapped writes into uninitialized extents are a known problem with the
current ext4 patches. Like XFS, ext4 may need to implement
->page_mkwrite() to solve this. See:
http://lkml.org/lkml/2007/5/8/583

Since there is talk of ->fault() replacing ->page_mkwrite(), and since
a generic block_page_mkwrite() implementation has already been posted, we
can implement this some time later. See:
http://lkml.org/lkml/2007/3/7/161
http://lkml.org/lkml/2007/3/18/198

ToDos:
-
1) Implementation on other architectures (other than i386, x86_64,
ppc64 and s390(x)). David Chinner has already posted a patch for ia64.
2) A generic file system operation to handle fallocate
(generic_fallocate), for filesystems that do _not_ have the fallocate
inode operation implemented.
3) Changes to glibc,
   a) to support fallocate() system call
   b) to make posix_fallocate() and posix_fallocate64() call fallocate()


Changelog:
-
Each post will have an individual changelog for the particular patch.
Following posts with patches follow:

Patch 1/5 : fallocate() implementation on i86, x86_64 and powerpc
Patch 2/5 : fallocate() on s390
Patch 3/5 : ext4: Extent overlap bugfix
Patch 4/5 : ext4: fallocate support in ext4
Patch 5/5 : ext4: write support for preallocated blocks

--
Regards,
Amit Arora
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/5][TAKE2] fallocate() implementation on i86, x86_64 and powerpc

2007-05-14 Thread Amit K. Arora
This patch implements sys_fallocate() and adds support on i386, x86_64
and powerpc platforms.

Changelog:
-
Following changes were made to the previous version:
 1) Added description before sys_fallocate() definition.
 2) Return EINVAL for len<=0 (With new draft that Ulrich pointed to,
posix_fallocate should return EINVAL for len <= 0).
 3) Return EOPNOTSUPP if mode is not one of FA_ALLOCATE or FA_DEALLOCATE
 4) Do not return ENODEV for dirs (let individual file systems decide if
they want to support preallocation to directories or not).
 5) Check for wrap through zero.
 6) Update c/mtime if fallocate() succeeds.
 7) Added mode descriptions in fs.h
 8) Added variable names to function definition (fallocate inode op)

Here is the new patch:

Signed-off-by: Amit Arora [EMAIL PROTECTED]
---
 arch/i386/kernel/syscall_table.S |1 
 arch/powerpc/kernel/sys_ppc32.c  |7 +++
 arch/x86_64/kernel/functionlist  |1 
 fs/open.c|   89 +++
 include/asm-i386/unistd.h|3 -
 include/asm-powerpc/systbl.h |1 
 include/asm-powerpc/unistd.h |3 -
 include/asm-x86_64/unistd.h  |4 +
 include/linux/fs.h   |   13 +
 include/linux/syscalls.h |1 
 10 files changed, 120 insertions(+), 3 deletions(-)

Index: linux-2.6.21/arch/i386/kernel/syscall_table.S
===
--- linux-2.6.21.orig/arch/i386/kernel/syscall_table.S
+++ linux-2.6.21/arch/i386/kernel/syscall_table.S
@@ -319,3 +319,4 @@ ENTRY(sys_call_table)
.long sys_move_pages
.long sys_getcpu
.long sys_epoll_pwait
+   .long sys_fallocate /* 320 */
Index: linux-2.6.21/arch/x86_64/kernel/functionlist
===
--- linux-2.6.21.orig/arch/x86_64/kernel/functionlist
+++ linux-2.6.21/arch/x86_64/kernel/functionlist
@@ -931,6 +931,7 @@
 *(.text.sys_getitimer)
 *(.text.sys_getgroups)
 *(.text.sys_ftruncate)
+*(.text.sys_fallocate)
 *(.text.sysfs_lookup)
 *(.text.sys_exit_group)
 *(.text.stub_fork)
Index: linux-2.6.21/fs/open.c
===
--- linux-2.6.21.orig/fs/open.c
+++ linux-2.6.21/fs/open.c
@@ -351,6 +351,95 @@ asmlinkage long sys_ftruncate64(unsigned
 #endif
 
 /*
+ * sys_fallocate - preallocate blocks or free preallocated blocks
+ * @fd: the file descriptor
+ * @mode: mode specifies if fallocate should preallocate blocks OR free
+ *   (unallocate) preallocated blocks. Currently only FA_ALLOCATE and
+ *   FA_DEALLOCATE modes are supported.
+ * @offset: The offset within file, from where (un)allocation is being
+ * requested. It should not have a negative value.
+ * @len: The amount (in bytes) of space to be (un)allocated, from the offset.
+ *
+ * This system call, depending on the mode, preallocates or unallocates blocks
+ * for a file. The range of blocks depends on the value of offset and len
+ * arguments provided by the user/application. For FA_ALLOCATE mode, if this
+ * system call succeeds, subsequent writes to the file in the given range
+ * (specified by offset  len) should not fail - even if the file system
+ * later becomes full. Hence the preallocation done is persistent (valid
+ * even after reopen of the file and remount/reboot).
+ *
+ * Note: Incase the file system does not support preallocation,
+ * posix_fallocate() should fall back to the library implementation (i.e.
+ * allocating zero-filled new blocks to the file).
+ *
+ * Return Values
+ * 0   : On SUCCESS a value of zero is returned.
+ * error   : On Failure, an error code will be returned.
+ * An error code of -ENOSYS or -EOPNOTSUPP should make posix_fallocate()
+ * fall back on library implementation of fallocate.
+ *
+ * TBD Generic fallocate to be added for file systems that do not
+ *  support fallocate it.
+ */
+asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
+{
+   struct file *file;
+   struct inode *inode;
+   long ret = -EINVAL;
+
+   if (offset  0 || len = 0)
+   goto out;
+
+   /* Return error if mode is not supported */
+   ret = -EOPNOTSUPP;
+   if (mode != FA_ALLOCATE  mode !=FA_DEALLOCATE)
+   goto out;
+
+   ret = -EBADF;
+   file = fget(fd);
+   if (!file)
+   goto out;
+   if (!(file-f_mode  FMODE_WRITE))
+   goto out_fput;
+
+   inode = file-f_path.dentry-d_inode;
+
+   ret = -ESPIPE;
+   if (S_ISFIFO(inode-i_mode))
+   goto out_fput;
+
+   ret = -ENODEV;
+   /*
+* Let individual file system decide if it supports preallocation
+* for directories or not.
+*/
+   if (!S_ISREG(inode-i_mode)  !S_ISDIR(inode-i_mode))
+   goto out_fput;
+
+   ret = -EFBIG;
+   /* Check for wrap through zero too */
+   if (((offset + 

[PATCH 2/5][TAKE2] fallocate() on s390

2007-05-14 Thread Amit K. Arora
This is the patch suggested by Martin Schwidefsky. Here are the comments
and patch from him.

-
From: Martin Schwidefsky [EMAIL PROTECTED]

This patch implements support of fallocate system call on s390(x)
platform. A wrapper is added to address the issue which s390 ABI has
with the arguments of this system call.

Signed-off-by: Martin Schwidefsky [EMAIL PROTECTED]
---

 arch/s390/kernel/compat_wrapper.S |   10 ++
 arch/s390/kernel/sys_s390.c   |   29 +
 arch/s390/kernel/syscalls.S   |1 +
 include/asm-s390/unistd.h |3 ++-
 4 files changed, 42 insertions(+), 1 deletion(-)

Index: linux-2.6.21/arch/s390/kernel/compat_wrapper.S
===
--- linux-2.6.21.orig/arch/s390/kernel/compat_wrapper.S
+++ linux-2.6.21/arch/s390/kernel/compat_wrapper.S
@@ -1682,3 +1682,13 @@ compat_sys_utimes_wrapper:
llgtr   %r2,%r2 # char *
llgtr   %r3,%r3 # struct compat_timeval *
jg  compat_sys_utimes
+
+   .globl  sys_fallocate_wrapper
+sys_fallocate_wrapper:
+   lgfr%r2,%r2 # int
+   lgfr%r3,%r3 # int
+   sllg%r4,%r4,32  # get high word of 64bit loff_t
+   lr  %r4,%r5 # get low word of 64bit loff_t
+   sllg%r5,%r6,32  # get high word of 64bit loff_t
+   l   %r5,164(%r15)   # get low word of 64bit loff_t
+   jg  sys_fallocate
Index: linux-2.6.21/arch/s390/kernel/syscalls.S
===
--- linux-2.6.21.orig/arch/s390/kernel/syscalls.S
+++ linux-2.6.21/arch/s390/kernel/syscalls.S
@@ -322,3 +322,4 @@ NI_SYSCALL  
/* 310 sys_move_pages *
 SYSCALL(sys_getcpu,sys_getcpu,sys_getcpu_wrapper)
 SYSCALL(sys_epoll_pwait,sys_epoll_pwait,compat_sys_epoll_pwait_wrapper)
 SYSCALL(sys_utimes,sys_utimes,compat_sys_utimes_wrapper)
+SYSCALL(s390_fallocate,sys_fallocate,sys_fallocate_wrapper)
Index: linux-2.6.21/arch/s390/kernel/sys_s390.c
===
--- linux-2.6.21.orig/arch/s390/kernel/sys_s390.c
+++ linux-2.6.21/arch/s390/kernel/sys_s390.c
@@ -286,3 +286,32 @@ int kernel_execve(const char *filename, 
  d (__arg3) : memory);
return __svcres;
 }
+
+#ifndef CONFIG_64BIT
+/*
+ * This is a wrapper to call sys_fallocate(). For 31 bit s390 the last
+ * 64 bit argument len is split into the upper and lower 32 bits. The
+ * system call wrapper in the user space loads the value to %r6/%r7.
+ * The code in entry.S keeps the values in %r2 - %r6 where they are and
+ * stores %r7 to 96(%r15). But the standard C linkage requires that
+ * the whole 64 bit value for len is stored on the stack and doesn't
+ * use %r6 at all. So s390_fallocate has to convert the arguments from
+ *   %r2: fd, %r3: mode, %r4/%r5: offset, %r6/96(%r15)-99(%r15): len
+ * to
+ *   %r2: fd, %r3: mode, %r4/%r5: offset, 96(%r15)-103(%r15): len
+ */
+asmlinkage long s390_fallocate(int fd, int mode, loff_t offset,
+  u32 len_high, u32 len_low)
+{
+   union {
+   u64 len;
+   struct {
+   u32 high;
+   u32 low;
+   };
+   } cv;
+   cv.high = len_high;
+   cv.low = len_low;
+   return sys_fallocate(fd, mode, offset, cv.len);
+}
+#endif
Index: linux-2.6.21/include/asm-s390/unistd.h
===
--- linux-2.6.21.orig/include/asm-s390/unistd.h
+++ linux-2.6.21/include/asm-s390/unistd.h
@@ -251,8 +251,9 @@
 #define __NR_getcpu311
 #define __NR_epoll_pwait   312
 #define __NR_utimes313
+#define __NR_fallocate 314
 
-#define NR_syscalls 314
+#define NR_syscalls 315
 
 /* 
  * There are some system calls that are not present on 64 bit, some
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/5][TAKE2] ext4: Extent overlap bugfix

2007-05-14 Thread Amit K. Arora
This patch adds a check for overlap of extents and cuts short the
new extent to be inserted, if there is a chance of overlap.

Changelog:
-
As suggested by Andrew, a check for wrap through zero has been added.

Here is the new patch:

Signed-off-by: Amit Arora [EMAIL PROTECTED]
---
 fs/ext4/extents.c   |   60 ++--
 include/linux/ext4_fs_extents.h |1 
 2 files changed, 59 insertions(+), 2 deletions(-)

Index: linux-2.6.21/fs/ext4/extents.c
===
--- linux-2.6.21.orig/fs/ext4/extents.c
+++ linux-2.6.21/fs/ext4/extents.c
@@ -1129,6 +1129,55 @@ ext4_can_extents_be_merged(struct inode 
 }
 
 /*
+ * check if a portion of the newext extent overlaps with an
+ * existing extent.
+ *
+ * If there is an overlap discovered, it updates the length of the newext
+ * such that there will be no overlap, and then returns 1.
+ * If there is no overlap found, it returns 0.
+ */
+unsigned int ext4_ext_check_overlap(struct inode *inode,
+   struct ext4_extent *newext,
+   struct ext4_ext_path *path)
+{
+   unsigned long b1, b2;
+   unsigned int depth, len1;
+   unsigned int ret = 0;
+
+   b1 = le32_to_cpu(newext-ee_block);
+   len1 = le16_to_cpu(newext-ee_len);
+   depth = ext_depth(inode);
+   if (!path[depth].p_ext)
+   goto out;
+   b2 = le32_to_cpu(path[depth].p_ext-ee_block);
+
+   /*
+* get the next allocated block if the extent in the path
+* is before the requested block(s) 
+*/
+   if (b2  b1) {
+   b2 = ext4_ext_next_allocated_block(path);
+   if (b2 == EXT_MAX_BLOCK)
+   goto out;
+   }
+
+   /* check for wrap through zero */
+   if (b1 + len1  b1) {
+   len1 = EXT_MAX_BLOCK - b1;
+   newext-ee_len = cpu_to_le16(len1);
+   ret = 1;
+   }
+
+   /* check for overlap */
+   if (b1 + len1  b2) {
+   newext-ee_len = cpu_to_le16(b2 - b1);
+   ret = 1;
+   }
+out:
+   return ret;
+}
+
+/*
  * ext4_ext_insert_extent:
  * tries to merge requsted extent into the existing extent or
  * inserts requested extent as new one into the tree,
@@ -2032,7 +2081,15 @@ int ext4_ext_get_blocks(handle_t *handle
 
/* allocate new block */
goal = ext4_ext_find_goal(inode, path, iblock);
-   allocated = max_blocks;
+
+   /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
+   newex.ee_block = cpu_to_le32(iblock);
+   newex.ee_len = cpu_to_le16(max_blocks);
+   err = ext4_ext_check_overlap(inode, newex, path);
+   if (err)
+   allocated = le16_to_cpu(newex.ee_len);
+   else
+   allocated = max_blocks;
newblock = ext4_new_blocks(handle, inode, goal, allocated, err);
if (!newblock)
goto out2;
@@ -2040,7 +2097,6 @@ int ext4_ext_get_blocks(handle_t *handle
goal, newblock, allocated);
 
/* try to insert new extent into found leaf and return */
-   newex.ee_block = cpu_to_le32(iblock);
ext4_ext_store_pblock(newex, newblock);
newex.ee_len = cpu_to_le16(allocated);
err = ext4_ext_insert_extent(handle, inode, path, newex);
Index: linux-2.6.21/include/linux/ext4_fs_extents.h
===
--- linux-2.6.21.orig/include/linux/ext4_fs_extents.h
+++ linux-2.6.21/include/linux/ext4_fs_extents.h
@@ -190,6 +190,7 @@ ext4_ext_invalidate_cache(struct inode *
 
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
 extern int ext4_ext_calc_credits_for_insert(struct inode *, struct 
ext4_ext_path *);
+extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent 
*, struct ext4_ext_path *);
 extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct 
ext4_ext_path *, struct ext4_extent *);
 extern int ext4_ext_walk_space(struct inode *, unsigned long, unsigned long, 
ext_prepare_callback, void *);
 extern struct ext4_ext_path * ext4_ext_find_extent(struct inode *, int, struct 
ext4_ext_path *);
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/5][TAKE2] ext4: fallocate support in ext4

2007-05-14 Thread Amit K. Arora
This patch implements the ->fallocate() inode operation in ext4. With this
patch users of ext4 file systems will be able to use fallocate() system
call for persistent preallocation.

Current implementation only supports preallocation for regular files
(directories not supported as of date) with extent maps. This patch
does not support block-mapped files currently.

Only FA_ALLOCATE mode is being supported as of now. Supporting
FA_DEALLOCATE mode is a To Do item.

Changelog:
-
Here are the changes from the previous post:
 1) Added more description for ext4_fallocate().
 2) Now returning EOPNOTSUPP when files are block-mapped (non-extent).
 3) Moved journal_start & journal_stop inside the while loop.
 4) Replaced BUG_ON with WARN_ON & ext4_error.
 5) Make EXT4_BLOCK_ALIGN use ALIGN macro internally.
 6) Added variable names in the function declaration of ext4_fallocate()
 7) Converted macros that handle uninitialized extents into inline
functions.

Here is the updated patch:

Signed-off-by: Amit Arora [EMAIL PROTECTED]
---
 fs/ext4/extents.c   |  241 +---
 fs/ext4/file.c  |1 
 include/linux/ext4_fs.h |8 +
 include/linux/ext4_fs_extents.h |   12 +
 4 files changed, 221 insertions(+), 41 deletions(-)

Index: linux-2.6.21/fs/ext4/extents.c
===
--- linux-2.6.21.orig/fs/ext4/extents.c
+++ linux-2.6.21/fs/ext4/extents.c
@@ -283,7 +283,7 @@ static void ext4_ext_show_path(struct in
} else if (path-p_ext) {
ext_debug(  %d:%d:%llu ,
  le32_to_cpu(path-p_ext-ee_block),
- le16_to_cpu(path-p_ext-ee_len),
+ ext4_ext_get_actual_len(path-p_ext),
  ext_pblock(path-p_ext));
} else
ext_debug(  []);
@@ -306,7 +306,7 @@ static void ext4_ext_show_leaf(struct in
 
for (i = 0; i  le16_to_cpu(eh-eh_entries); i++, ex++) {
ext_debug(%d:%d:%llu , le32_to_cpu(ex-ee_block),
- le16_to_cpu(ex-ee_len), ext_pblock(ex));
+ ext4_ext_get_actual_len(ex), ext_pblock(ex));
}
ext_debug(\n);
 }
@@ -426,7 +426,7 @@ ext4_ext_binsearch(struct inode *inode, 
ext_debug(  - %d:%llu:%d ,
le32_to_cpu(path-p_ext-ee_block),
ext_pblock(path-p_ext),
-   le16_to_cpu(path-p_ext-ee_len));
+   ext4_ext_get_actual_len(path-p_ext));
 
 #ifdef CHECK_BINSEARCH
{
@@ -687,7 +687,7 @@ static int ext4_ext_split(handle_t *hand
ext_debug(move %d:%llu:%d in new leaf %llu\n,
le32_to_cpu(path[depth].p_ext-ee_block),
ext_pblock(path[depth].p_ext),
-   le16_to_cpu(path[depth].p_ext-ee_len),
+   ext4_ext_get_actual_len(path[depth].p_ext),
newblock);
/*memmove(ex++, path[depth].p_ext++,
sizeof(struct ext4_extent));
@@ -1107,7 +1107,19 @@ static int
 ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
struct ext4_extent *ex2)
 {
-   if (le32_to_cpu(ex1-ee_block) + le16_to_cpu(ex1-ee_len) !=
+   unsigned short ext1_ee_len, ext2_ee_len;
+
+   /*
+* Make sure that either both extents are uninitialized, or
+* both are _not_.
+*/
+   if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
+   return 0;
+
+   ext1_ee_len = ext4_ext_get_actual_len(ex1);
+   ext2_ee_len = ext4_ext_get_actual_len(ex2);
+
+   if (le32_to_cpu(ex1-ee_block) + ext1_ee_len !=
le32_to_cpu(ex2-ee_block))
return 0;
 
@@ -1116,14 +1128,14 @@ ext4_can_extents_be_merged(struct inode 
 * as an RO_COMPAT feature, refuse to merge to extents if
 * this can result in the top bit of ee_len being set.
 */
-   if (le16_to_cpu(ex1-ee_len) + le16_to_cpu(ex2-ee_len)  EXT_MAX_LEN)
+   if (ext1_ee_len + ext2_ee_len  EXT_MAX_LEN)
return 0;
 #ifdef AGGRESSIVE_TEST
if (le16_to_cpu(ex1-ee_len) = 4)
return 0;
 #endif
 
-   if (ext_pblock(ex1) + le16_to_cpu(ex1-ee_len) == ext_pblock(ex2))
+   if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
return 1;
return 0;
 }
@@ -1145,7 +1157,7 @@ unsigned int ext4_ext_check_overlap(stru
unsigned int ret = 0;
 
b1 = le32_to_cpu(newext-ee_block);
-   len1 = le16_to_cpu(newext-ee_len);
+   len1 = ext4_ext_get_actual_len(newext);
depth = ext_depth(inode);
if (!path[depth].p_ext)
goto out;
@@ -1192,8 +1204,9 @@ int 

[PATCH 5/5][TAKE2] ext4: write support for preallocated blocks

2007-05-14 Thread Amit K. Arora
This patch adds write support to the uninitialized extents that get
created when a preallocation is done using fallocate(). It takes care of
splitting the extents into multiple (up to three) extents and merging the
new split extents with neighbouring ones, if possible.

Changelog:
-
 1) Replaced BUG_ON with WARN_ON & ext4_error.
 2) Added variable names to the function declaration of
ext4_ext_try_to_merge().
 3) Updated variable declarations to use multiple-definitions-per-line.
 4) if((a=foo())).. was broken into a=foo(); if(a)..
 5) Removed extra spaces.

Here is the updated patch:

Signed-off-by: Amit Arora [EMAIL PROTECTED]
---
 fs/ext4/extents.c   |  234 +++-
 include/linux/ext4_fs_extents.h |3 
 2 files changed, 210 insertions(+), 27 deletions(-)

Index: linux-2.6.21/fs/ext4/extents.c
===
--- linux-2.6.21.orig/fs/ext4/extents.c
+++ linux-2.6.21/fs/ext4/extents.c
@@ -1141,6 +1141,54 @@ ext4_can_extents_be_merged(struct inode 
 }
 
 /*
+ * This function tries to merge the ex extent to the next extent in the tree.
+ * It always tries to merge towards right. If you want to merge towards
+ * left, pass ex - 1 as argument instead of ex.
+ * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
+ * 1 if they got merged.
+ */
+int ext4_ext_try_to_merge(struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *ex)
+{
+   struct ext4_extent_header *eh;
+   unsigned int depth, len;
+   int merge_done = 0;
+   int uninitialized = 0;
+
+   depth = ext_depth(inode);
+   BUG_ON(path[depth].p_hdr == NULL);
+   eh = path[depth].p_hdr;
+
+   while (ex  EXT_LAST_EXTENT(eh))
+   {
+   if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
+   break;
+   /* merge with next extent! */
+   if (ext4_ext_is_uninitialized(ex))
+   uninitialized = 1;
+   ex-ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+   + ext4_ext_get_actual_len(ex + 1));
+   if (uninitialized)
+   ext4_ext_mark_uninitialized(ex);
+
+   if (ex + 1  EXT_LAST_EXTENT(eh)) {
+   len = (EXT_LAST_EXTENT(eh) - ex - 1)
+   * sizeof(struct ext4_extent);
+   memmove(ex + 1, ex + 2, len);
+   }
+   eh-eh_entries = cpu_to_le16(le16_to_cpu(eh-eh_entries) - 1);
+   merge_done = 1;
+   WARN_ON(eh-eh_entries == 0);
+   if (!eh-eh_entries)
+   ext4_error(inode-i_sb, ext4_ext_try_to_merge,
+  inode#%lu, eh-eh_entries = 0!, inode-i_ino);
+   }
+
+   return merge_done;
+}
+
+/*
  * check if a portion of the newext extent overlaps with an
  * existing extent.
  *
@@ -1328,25 +1376,7 @@ has_space:
 
 merge:
/* try to merge extents to the right */
-   while (nearex  EXT_LAST_EXTENT(eh)) {
-   if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1))
-   break;
-   /* merge with next extent! */
-   if (ext4_ext_is_uninitialized(nearex))
-   uninitialized = 1;
-   nearex-ee_len = cpu_to_le16(ext4_ext_get_actual_len(nearex)
-   + ext4_ext_get_actual_len(nearex + 1));
-   if (uninitialized)
-   ext4_ext_mark_uninitialized(nearex);
-
-   if (nearex + 1  EXT_LAST_EXTENT(eh)) {
-   len = (EXT_LAST_EXTENT(eh) - nearex - 1)
-   * sizeof(struct ext4_extent);
-   memmove(nearex + 1, nearex + 2, len);
-   }
-   eh-eh_entries = cpu_to_le16(le16_to_cpu(eh-eh_entries)-1);
-   BUG_ON(eh-eh_entries == 0);
-   }
+   ext4_ext_try_to_merge(inode, path, nearex);
 
/* try to merge extents to the left */
 
@@ -2012,15 +2042,152 @@ void ext4_ext_release(struct super_block
 #endif
 }
 
+/*
+ * This function is called by ext4_ext_get_blocks() if someone tries to write
+ * to an uninitialized extent. It may result in splitting the uninitialized
+ * extent into multiple extents (up to three - one initialized and two
+ * uninitialized).
+ * There are three possibilities:
+ *   a There is no split required: Entire extent should be initialized
+ *   b Splits in two extents: Write is happening at either end of the extent
+ *   c Splits in three extents: Someone is writing in middle of the extent
+ */
+int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode,
+   struct ext4_ext_path *path,
+   ext4_fsblk_t iblock,
+  

Re: [PATCH 2/5][TAKE2] fallocate() on s390 - glibc wrapper

2007-05-14 Thread Amit K. Arora
On Mon, May 14, 2007 at 08:18:34PM +0530, Amit K. Arora wrote:
 This is the patch suggested by Martin Schwidefsky. Here are the comments
 and patch from him.

Martin also suggested a wrapper in glibc to handle this system call on
s390. Posting it here so that we get feedback for this too.
Here it is:

.globl __fallocate
ENTRY(__fallocate)
stm %r6,%r7,28(%r15)/* save %r6/%r7 on stack */
cfi_offset (%r7, -68)
cfi_offset (%r6, -72)
lm  %r6,%r7,96(%r15)/* load loff_t len from stack */
svc SYS_ify(fallocate)
lm  %r6,%r7,28(%r15)/* restore %r6/%r7 from stack */
br  %r14
PSEUDO_END(__fallocate)

--
Regards,
Amit Arora
 
 -
 From: Martin Schwidefsky [EMAIL PROTECTED]
 
 This patch implements support of fallocate system call on s390(x)
 platform. A wrapper is added to address the issue which s390 ABI has
 with the arguments of this system call.
 
 Signed-off-by: Martin Schwidefsky [EMAIL PROTECTED]
 ---
 
  arch/s390/kernel/compat_wrapper.S |   10 ++
  arch/s390/kernel/sys_s390.c   |   29 +
  arch/s390/kernel/syscalls.S   |1 +
  include/asm-s390/unistd.h |3 ++-
  4 files changed, 42 insertions(+), 1 deletion(-)
 
 Index: linux-2.6.21/arch/s390/kernel/compat_wrapper.S
 ===
 --- linux-2.6.21.orig/arch/s390/kernel/compat_wrapper.S
 +++ linux-2.6.21/arch/s390/kernel/compat_wrapper.S
 @@ -1682,3 +1682,13 @@ compat_sys_utimes_wrapper:
   llgtr   %r2,%r2 # char *
   llgtr   %r3,%r3 # struct compat_timeval *
   jg  compat_sys_utimes
 +
 + .globl  sys_fallocate_wrapper
 +sys_fallocate_wrapper:
 + lgfr%r2,%r2 # int
 + lgfr%r3,%r3 # int
 + sllg%r4,%r4,32  # get high word of 64bit loff_t
 + lr  %r4,%r5 # get low word of 64bit loff_t
 + sllg%r5,%r6,32  # get high word of 64bit loff_t
 + l   %r5,164(%r15)   # get low word of 64bit loff_t
 + jg  sys_fallocate
 Index: linux-2.6.21/arch/s390/kernel/syscalls.S
 ===
 --- linux-2.6.21.orig/arch/s390/kernel/syscalls.S
 +++ linux-2.6.21/arch/s390/kernel/syscalls.S
 @@ -322,3 +322,4 @@ NI_SYSCALL
 /* 310 sys_move_pages *
  SYSCALL(sys_getcpu,sys_getcpu,sys_getcpu_wrapper)
  SYSCALL(sys_epoll_pwait,sys_epoll_pwait,compat_sys_epoll_pwait_wrapper)
  SYSCALL(sys_utimes,sys_utimes,compat_sys_utimes_wrapper)
 +SYSCALL(s390_fallocate,sys_fallocate,sys_fallocate_wrapper)
 Index: linux-2.6.21/arch/s390/kernel/sys_s390.c
 ===
 --- linux-2.6.21.orig/arch/s390/kernel/sys_s390.c
 +++ linux-2.6.21/arch/s390/kernel/sys_s390.c
 @@ -286,3 +286,32 @@ int kernel_execve(const char *filename, 
 d (__arg3) : memory);
   return __svcres;
  }
 +
 +#ifndef CONFIG_64BIT
 +/*
 + * This is a wrapper to call sys_fallocate(). For 31 bit s390 the last
 + * 64 bit argument len is split into the upper and lower 32 bits. The
 + * system call wrapper in the user space loads the value to %r6/%r7.
 + * The code in entry.S keeps the values in %r2 - %r6 where they are and
 + * stores %r7 to 96(%r15). But the standard C linkage requires that
 + * the whole 64 bit value for len is stored on the stack and doesn't
 + * use %r6 at all. So s390_fallocate has to convert the arguments from
 + *   %r2: fd, %r3: mode, %r4/%r5: offset, %r6/96(%r15)-99(%r15): len
 + * to
 + *   %r2: fd, %r3: mode, %r4/%r5: offset, 96(%r15)-103(%r15): len
 + */
 +asmlinkage long s390_fallocate(int fd, int mode, loff_t offset,
 +u32 len_high, u32 len_low)
 +{
 + union {
 + u64 len;
 + struct {
 + u32 high;
 + u32 low;
 + };
 + } cv;
 + cv.high = len_high;
 + cv.low = len_low;
 + return sys_fallocate(fd, mode, offset, cv.len);
 +}
 +#endif
 Index: linux-2.6.21/include/asm-s390/unistd.h
 ===
 --- linux-2.6.21.orig/include/asm-s390/unistd.h
 +++ linux-2.6.21/include/asm-s390/unistd.h
 @@ -251,8 +251,9 @@
  #define __NR_getcpu  311
  #define __NR_epoll_pwait 312
  #define __NR_utimes  313
 +#define __NR_fallocate   314
 
 -#define NR_syscalls 314
 +#define NR_syscalls 315
 
  /* 
   * There are some system calls that are not present on 64 bit, some
 -
 To unsubscribe from this list: send the line unsubscribe linux-ext4 in
 the body of a message to [EMAIL PROTECTED]
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to 

Re: [PATCH] resolve duplicate flag no for PG_lazyfree

2007-05-14 Thread Theodore Tso
On Sun, May 13, 2007 at 10:46:30PM -0700, Andrew Morton wrote:
 otoh, the intersection between pages which are PageBooked() and pages which
 are PageLazyFree() should be zero, so it'd be good to actually formalise
 this reuse within the ext4 patches.
 
 otoh2, PageLazyFree() could have reused PG_owner_priv_1.
 
 Rik, Ted: any thoughts?  We do need to scrimp on page flags: when we
 finally run out, we're screwed.

It makes sense to me.  PG_lazyfree is currently only in -mm, right?  I
don't see it in my git tree.  It would probably be a good idea to add
some sanity checking code, if it isn't there already, to make sure that
PG_lazyfree isn't already set when we try to set PG_lazyfree (just in
case there is a bug in the future which causes the should-never-happen
case of trying to lazy free a PageBooked page).

- Ted
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: poor performance of mount due to libblkid

2007-05-14 Thread Shapor Naghibzadeh
On Thu, May 10, 2007 at 02:44:48AM -0400, Theodore Tso wrote:
 put it.  The device names of USB storage devices end up getting
 reused, so in practice what is in blkid.tab is merely the last storage
 device that was plugged in, not every single one going back forever.

My point with the USB example was that it keeps their labels around in a
world-readable cache infinitely (or until a device with the same name gets
mounted again).  It's probably not a security issue in most cases, but it's
clutter which one doesn't expect to stick around.

 One easy way of solving this problem is when we're parsing the file,
 try to stat the device file, and if it doesn't exist, to skip parsing
 the line altogether.  This would prevent blkid.tab from growing without
 bound given your workload.

This idea of doing garbage collection every time blkid.tab is read destroys
the cache if, for example, you mount /usr or /var before other block devices
have been brought up.  AoE and nbd come to mind as a potentially large number
of devices that might not exist until later in the boot process.

 The whole point of blkid.tab file was so that having searched all of
 the devices to find the particular filesystem with a specified volume
 label or UUID, that all of the information that was gathered doesn't
 have to be searched a next time you need to do a mount-by-uuid or
 mount-by-label.  And if you have a large number of disks that you
 might have to potentially spin up, you definitely want to keep this
 cache across boots, which is why we store it in /etc/blkid.tab.

Ok, but why do we bother caching the filesystem type?  The desire to optimize
the scanning for UUIDs or labels is indeed a real problem, but caching the
filesystem type has the potential for introducing bugs and doesn't seem to
have any real payoff.  I for one have been bitten by the ext2 to ext3 upgrade
bug more than once.

There should be a better way of maintaining a UUID and label cache other than
having mount keep an XML cache in /etc (which seems to violate the Linux
filesystem hierarchy standard).  Certainly having it enabled by default when
there is no desire to mount by UUID or label is wasteful and probably the most
common case.

 So it sounds like the short-term fix is to simply add a test so that
 if the device isn't present, we should just ignore the entry when we
 read it into memory.  The longer-term fix is use a more sophisticated
 in-core representation which doesn't have a linear search time, and so
 that algorithms to detect multiple lines referring to the same device
 don't take O(n**2).  We should also fix mount to avoid having it
 unconditionally read in the blkid.tab file.  The assumption was the
 overhead for doing so should not be measurable.

The first and safest step would seem to be removing the use of blkid.tab from
mount except when trying to mount by UUID or volume label to prevent the
performance issue when the cache is large.  I think garbage collection is more
complex to do safely and the whole approach might need some re-thinking.

Shapor
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] [patch 2/2] i_version update - ext4 part

2007-05-14 Thread Mingming Cao
On Mon, 2007-05-14 at 14:31 -0600, Andreas Dilger wrote:
 On May 14, 2007  14:21 -0600, Andreas Dilger wrote:
  On May 14, 2007  13:05 +0200, Cordenner jean noel wrote:
   @@ -331,12 +331,13 @@
 } osd2; /* OS dependent 2 */
 __le16  i_extra_isize;
 __le16  i_pad1;
   + __le32  i_disk_version_hi;
  
  No, this is not correct.  There are already several other fields here
  (nanosecond ctime, mtime, atime, crtime (creation time)) so you need
  to use the correct reserved field for this.
  
  __u16   i_extra_isize;
  __u16   i_pad1;
  __u32   i_ctime_extra;  /* extra Change time (nsec << 2 | epoch) */
  __u32   i_mtime_extra;  /* extra Modification time (nsec << 2 | epoch)*/
  __u32   i_atime_extra;  /* extra Access time (nsec << 2 | epoch) */
  __u32   i_crtime;   /* File creation time */
  __u32   i_crtime_extra; /* extra File creation time (nsec << 2 |epoch)*/
 
 Sorry, I meant to add (before hitting send :-) that the field after
 i_crtime_extra is supposed to be i_disk_version_hi.
 


 See the patch from Kalpak Shah [RFC] 64-bit inode version which also handles
 the case for expanding i_extra_isize to cover the needed extra fields if
 i_extra_isize is not large enough.  That patch didn't include the 64-bit
 i_version_hi yet, because there wasn't yet agreement at that time if
 the iversion_hi should be allocated separately, but that was since decided.
 

Kalpak already sent a patch [PATCH] Add i_version_hi for 64-bit version,
which added i_version_hi after i_crtime. His patch has been in the ext4
git tree for a while.

ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/ext4-patches/2.6.21-
ext4-1

Regards,
Mingming

 Without that patch, your patch will possibly corrupt the extended attributes
 by just overwriting i_disk_version_hi while ignoring the actual value of
 i_extra_isize.  This would clobber the EA magic and result in loss of all
 EAs in that inode.
 
 Cheers, Andreas
 --
 Andreas Dilger
 Principal Software Engineer
 Cluster File Systems, Inc.
 
 -
 To unsubscribe from this list: send the line unsubscribe linux-ext4 in
 the body of a message to [EMAIL PROTECTED]
 More majordomo info at  http://vger.kernel.org/majordomo-info.html

-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/5][TAKE2] fallocate() implementation on i86, x86_64 and powerpc

2007-05-14 Thread Stephen Rothwell
On Mon, 14 May 2007 20:15:24 +0530 Amit K. Arora [EMAIL PROTECTED] wrote:

 This patch implements sys_fallocate() and adds support on i386, x86_64
 and powerpc platforms.

This patch no longer applies to Linus' tree - for a start there is no file
arch/x86_64/kernel/functionlist any more.

Can you rebase it, please?

--
Cheers,
Stephen Rothwell[EMAIL PROTECTED]
http://www.canb.auug.org.au/~sfr/


pgpjkEuu0iHkJ.pgp
Description: PGP signature