[PATCH 1/3] ext4: different maxbytes functions for bitmap extent files

2007-12-04 Thread Eric Sandeen
use 2 different maxbytes functions for bitmapped  extent-based
files.

Signed-off-by: Eric Sandeen [EMAIL PROTECTED]

---

Index: linux-2.6.24-rc3/fs/ext4/super.c
===
--- linux-2.6.24-rc3.orig/fs/ext4/super.c
+++ linux-2.6.24-rc3/fs/ext4/super.c
@@ -1656,17 +1656,57 @@ static void ext4_orphan_cleanup (struct 
 }
 
 /*
- * Maximal file size.  There is a direct, and {,double-,triple-}indirect
+ * Maximal extent format file size.
+ * Resulting logical blkno at s_maxbytes must fit in our on-disk
+ * extent format containers, within a sector_t, and within i_blocks
+ * in the vfs.  ext4 inode has 48 bits of i_block in fsblock units,
+ * so that won't be a limiting factor.
+ *
+ * Note, this does *not* consider any metadata overhead for vfs i_blocks.
+ */
+static loff_t ext4_max_size(int blkbits)
+{
+   loff_t res;
+   loff_t upper_limit = MAX_LFS_FILESIZE;
+
+   /* small i_blocks in vfs inode? */
+   if (sizeof(blkcnt_t)  sizeof(u64)) {
+   /*
+* CONFIG_LSF is not enabled implies the inode
+* i_block represent total blocks in 512 bytes
+* 32 == size of vfs inode i_blocks * 8
+*/
+   upper_limit = (1LL  32) - 1;
+
+   /* total blocks in file system block size */
+   upper_limit = (blkbits - 9);
+   upper_limit = blkbits;
+   }
+
+   /* 32-bit extent-start container, ee_block */
+   res = 1LL  32;
+   res = blkbits;
+   res -= 1;
+
+   /* Sanity check against vm-  vfs- imposed limits */
+   if (res  upper_limit)
+   res = upper_limit;
+
+   return res;
+}
+
+/*
+ * Maximal bitmap file size.  There is a direct, and {,double-,triple-}indirect
  * block limit, and also a limit of (248 - 1) 512-byte sectors in i_blocks.
  * We need to be 1 filesystem block less than the 248 sector limit.
  */
-static loff_t ext4_max_size(int bits)
+static loff_t ext4_max_bitmap_size(int bits)
 {
loff_t res = EXT4_NDIR_BLOCKS;
int meta_blocks;
loff_t upper_limit;
/* This is calculated to be the largest file size for a
-* dense, file such that the total number of
+* dense, bitmapped file such that the total number of
 * sectors in the file, including data and all indirect blocks,
 * does not exceed 248 -1
 * __u32 i_blocks_lo and _u16 i_blocks_high representing the

-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/3] ext4: Handle different max offsets for bitmap extent-based files

2007-12-04 Thread Eric Sandeen
Basic approach: have both ext4_max_bitmap_size() and ext4_max_size()
functions to compute max offsets for both types of formats.

Use vfs sb-s_maxbytes for the native maxbytes, i.e. extent-format files.

Put the smaller bitmap limit in a new sbi-s_bitmap_maxbytes in the ext4
superblock info structure.

Catch bitmap files in ext4_file_write() and ext4_setattr() to limit
extending writes, llseeks, and truncates to too-large offsets which the
VFS let through due to the extent-format maxbytes.  On write, allow
writes up to the max, but then stop, by using iov_shorten() to limit the
size of the write to the maximum.

3 patches follow:

ext4_two_maxbytes_functions.patch - differentiate the maxbytes f'ns
ext4_bitmap_maxbytes_vfs.patch - export iov_shorten from kernel
ext4_bitmap_maxbytes.patch - store, and limit to, bitmap_maxbytes

Comments?

Thanks,
-Eric
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/3] ext4: (V2) store maxbytes for bitmapped files and return EFBIG as appropriate

2007-12-04 Thread Eric Sandeen
Crud, minor off-by-one in ext4_file_write.  V2 below:

===

Calculate  store the max offset for bitmapped files, and
catch too-large seeks, truncates, and writes in ext4, shortening
or rejecting as appropriate.

Signed-off-by: Eric Sandeen [EMAIL PROTECTED]

---

Index: linux-2.6.24-rc3/fs/ext4/super.c
===
--- linux-2.6.24-rc3.orig/fs/ext4/super.c
+++ linux-2.6.24-rc3/fs/ext4/super.c
@@ -1979,6 +1979,7 @@ static int ext4_fill_super (struct super
}
}
 
+   sbi-s_bitmap_maxbytes = ext4_max_bitmap_size(sb-s_blocksize_bits);
sb-s_maxbytes = ext4_max_size(sb-s_blocksize_bits);
 
if (le32_to_cpu(es-s_rev_level) == EXT4_GOOD_OLD_REV) {
Index: linux-2.6.24-rc3/fs/ext4/file.c
===
--- linux-2.6.24-rc3.orig/fs/ext4/file.c
+++ linux-2.6.24-rc3/fs/ext4/file.c
@@ -56,8 +56,29 @@ ext4_file_write(struct kiocb *iocb, cons
ssize_t ret;
int err;
 
-   ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+   /*
+* If we have encountered a bitmap-format file, the size limit
+* is smaller than s_maxbytes, which is for extent-mapped files.
+*/
 
+   if (!(EXT4_I(inode)-i_flags  EXT4_EXTENTS_FL)) {
+   struct ext4_sb_info *sbi = EXT4_SB(inode-i_sb);
+   size_t length = iov_length(iov, nr_segs);
+
+   if (pos = sbi-s_bitmap_maxbytes) {
+   if (length || pos  sbi-s_bitmap_maxbytes) {
+   return -EFBIG;
+   }
+   /* zero-length writes at maxbytes are OK */
+   }
+
+   if (pos + length  sbi-s_bitmap_maxbytes) {
+   nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
+ sbi-s_bitmap_maxbytes - pos);
+   }
+   }
+
+   ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
/*
 * Skip flushing if there was an error, or if nothing was written.
 */
Index: linux-2.6.24-rc3/fs/ext4/inode.c
===
--- linux-2.6.24-rc3.orig/fs/ext4/inode.c
+++ linux-2.6.24-rc3/fs/ext4/inode.c
@@ -309,7 +309,9 @@ static int ext4_block_to_path(struct ino
offsets[n++] = i_block  (ptrs - 1);
final = ptrs;
} else {
-   ext4_warning(inode-i_sb, ext4_block_to_path, block  big);
+   ext4_warning(inode-i_sb, ext4_block_to_path, block %u  
max,
+   i_block + direct_blocks +
+   indirect_blocks + double_blocks);
}
if (boundary)
*boundary = final - 1 - (i_block  (ptrs - 1));
@@ -3212,6 +3214,17 @@ int ext4_setattr(struct dentry *dentry, 
ext4_journal_stop(handle);
}
 
+   if (attr-ia_valid  ATTR_SIZE) {
+   if (!(EXT4_I(inode)-i_flags  EXT4_EXTENTS_FL)) {
+   struct ext4_sb_info *sbi = EXT4_SB(inode-i_sb);
+
+   if (attr-ia_size  sbi-s_bitmap_maxbytes) {
+   error = -EFBIG;
+   goto err_out;
+   }
+   }
+   }
+
if (S_ISREG(inode-i_mode) 
attr-ia_valid  ATTR_SIZE  attr-ia_size  inode-i_size) {
handle_t *handle;
Index: linux-2.6.24-rc3/include/linux/ext4_fs_sb.h
===
--- linux-2.6.24-rc3.orig/include/linux/ext4_fs_sb.h
+++ linux-2.6.24-rc3/include/linux/ext4_fs_sb.h
@@ -38,6 +38,7 @@ struct ext4_sb_info {
ext4_group_t s_groups_count;/* Number of groups in the fs */
unsigned long s_overhead_last;  /* Last calculated overhead */
unsigned long s_blocks_last;/* Last seen block count */
+   loff_t s_bitmap_maxbytes;   /* max bytes for bitmap files */
struct buffer_head * s_sbh; /* Buffer containing the super block */
struct ext4_super_block * s_es; /* Pointer to the super block in the 
buffer */
struct buffer_head ** s_group_desc;

-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Bug 9483] circular locking dependency detected

2007-12-04 Thread Andrew Morton
On Tue, 4 Dec 2007 22:25:18 +0100
Ingo Molnar [EMAIL PROTECTED] wrote:

 
 * Aneesh Kumar K.V [EMAIL PROTECTED] wrote:
 
  ===
  [ INFO: possible circular locking dependency detected ]
  2.6.24-rc3 #6
  ---
  bash/2294 is trying to acquire lock:
  (journal-j_list_lock){--..}, at: [c01eee2f] 
  journal_try_to_free_buffers+0x76/0x10c
  
  but task is already holding lock:
  (inode_lock){--..}, at: [c01864b6] drop_pagecache+0x48/0xd8
  
  which lock already depends on the new lock.
 
 Andrew, drop_pagecache() is root-only and it has some known deadlock, 
 right?
 

yup.   It takes inode_lock at too high a level so it can walk the per-sb inode
lists.
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] Flex_BG ialloc awareness.

2007-12-04 Thread Jose R. Santos
On Mon, 3 Dec 2007 13:42:47 -0700
Andreas Dilger [EMAIL PROTECTED] wrote:

 On Dec 03, 2007  13:05 -0600, Jose R. Santos wrote:
  @@ -600,6 +600,7 @@ void ext4_free_blocks_sb(handle_t *handle, struct 
  super_block *sb,
  ext4_grpblk_t group_freed;
  +   ext4_group_t meta_group;
 
 Please do not call these meta_groups.  This already means something very
 specific (i.e. desc_per_block groups) and using it for FLEX_BG is confusing.
 One possibly desirable relation is if the FLEX_BG count is some integer or
 power-of-two multiple of the metabg count.  That would allow the FLEX_BG
 code to share the same in-memory group struct as the mballoc code and save
 on some memory overhead.

Yes, need to clean the naming on some of these.  I also need to look
into the mballoc code to see if there is anything I can reuse.

  +   meta_group = ext4_meta_group(sbi, block_group);
  +   
  spin_lock(sbi-s_meta_groups[meta_group].meta_group_lock);
  +   sbi-s_meta_groups[meta_group].free_inodes++;
  +   if (is_directory)
  +   sbi-s_meta_groups[meta_group].num_dirs--;
  +   
  spin_unlock(sbi-s_meta_groups[meta_group].meta_group_lock);
 
 This can be as many as hundreds or thousands of spin locks.  Why not use
 the same hashed locking code as the group descriptors themselves?
 
   spin_lock(sb_bgl_lock(sbi, meta_group));
   spin_unlock(sb_bgl_lock(sbi, meta_group));
 
 This scales with the number of CPUs and chance of contention is very low.

Excellent.  I was thinking that one spinlock per flex_bg was overkill
as well but I did not know the existence of blockgroup_lock.h.

  +int find_group_meta(struct super_block *sb, struct inode *parent)
  +{
  +   ext4_group_t parent_mgroup = parent_group / sbi-s_groups_per_meta;
 
 This could use ext4_meta_group(sbi, parent_group)?

Yes, thanks for catching.
 
  +static inline ext4_group_t ext4_meta_group(struct ext4_sb_info *sbi,
  +ext4_group_t block_group)
  +{
  +   return block_group/sbi-s_groups_per_meta;
  +}
 
 It would be preferable to limit s_groups_per_meta to be a power-of-two
 so that this can become a shift instead of a divide.

Seems like I always fall into the same trap.  I'll change this.

 Cheers, Andreas
 --
 Andreas Dilger
 Sr. Staff Engineer, Lustre Group
 Sun Microsystems of Canada, Inc.
 

Thanks.

-JRS
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 32/35] jbd: Fix assertion failure in fs/jbd/checkpoint.c

2007-12-04 Thread akpm
From: Jan Kara [EMAIL PROTECTED]

Before we start committing a transaction, we call
__journal_clean_checkpoint_list() to cleanup transaction's written-back
buffers.

If this call happens to remove all of them (and there were already some
buffers), __journal_remove_checkpoint() will decide to free the transaction
because it isn't (yet) a committing transaction and soon we fail some
assertion - the transaction really isn't ready to be freed :).

We change the check in __journal_remove_checkpoint() to free only a
transaction in T_FINISHED state.  The locking there is subtle though (as
everywhere in JBD ;().  We use j_list_lock to protect the check and a
subsequent call to __journal_drop_transaction() and do the same in the end
of journal_commit_transaction() which is the only place where a transaction
can get to T_FINISHED state.

Probably I'm too paranoid here and such locking is not really necessary -
checkpoint lists are processed only from log_do_checkpoint() where a
transaction must be already committed to be processed or from
__journal_clean_checkpoint_list() where kjournald itself calls it and thus
transaction cannot change state either.  Better be safe if something
changes in future...

Signed-off-by: Jan Kara [EMAIL PROTECTED]
Cc: linux-ext4@vger.kernel.org
Signed-off-by: Andrew Morton [EMAIL PROTECTED]
---

 fs/jbd/checkpoint.c |   12 ++--
 fs/jbd/commit.c |8 
 include/linux/jbd.h |2 ++
 3 files changed, 12 insertions(+), 10 deletions(-)

diff -puN fs/jbd/checkpoint.c~jbd-fix-assertion-failure-in-fs-jbd-checkpointc 
fs/jbd/checkpoint.c
--- a/fs/jbd/checkpoint.c~jbd-fix-assertion-failure-in-fs-jbd-checkpointc
+++ a/fs/jbd/checkpoint.c
@@ -602,15 +602,15 @@ int __journal_remove_checkpoint(struct j
 
/*
 * There is one special case to worry about: if we have just pulled the
-* buffer off a committing transaction's forget list, then even if the
-* checkpoint list is empty, the transaction obviously cannot be
-* dropped!
+* buffer off a running or committing transaction's checkpoing list,
+* then even if the checkpoint list is empty, the transaction obviously
+* cannot be dropped!
 *
-* The locking here around j_committing_transaction is a bit sleazy.
+* The locking here around t_state is a bit sleazy.
 * See the comment at the end of journal_commit_transaction().
 */
-   if (transaction == journal-j_committing_transaction) {
-   JBUFFER_TRACE(jh, belongs to committing transaction);
+   if (transaction-t_state != T_FINISHED) {
+   JBUFFER_TRACE(jh, belongs to running/committing transaction);
goto out;
}
 
diff -puN fs/jbd/commit.c~jbd-fix-assertion-failure-in-fs-jbd-checkpointc 
fs/jbd/commit.c
--- a/fs/jbd/commit.c~jbd-fix-assertion-failure-in-fs-jbd-checkpointc
+++ a/fs/jbd/commit.c
@@ -858,10 +858,10 @@ restart_loop:
}
spin_unlock(journal-j_list_lock);
/*
-* This is a bit sleazy.  We borrow j_list_lock to protect
-* journal-j_committing_transaction in __journal_remove_checkpoint.
-* Really, __journal_remove_checkpoint should be using j_state_lock but
-* it's a bit hassle to hold that across __journal_remove_checkpoint
+* This is a bit sleazy.  We use j_list_lock to protect transition
+* of a transaction into T_FINISHED state and calling
+* __journal_drop_transaction(). Otherwise we could race with
+* other checkpointing code processing the transaction...
 */
spin_lock(journal-j_state_lock);
spin_lock(journal-j_list_lock);
diff -puN include/linux/jbd.h~jbd-fix-assertion-failure-in-fs-jbd-checkpointc 
include/linux/jbd.h
--- a/include/linux/jbd.h~jbd-fix-assertion-failure-in-fs-jbd-checkpointc
+++ a/include/linux/jbd.h
@@ -439,6 +439,8 @@ struct transaction_s
/*
 * Transaction's current state
 * [no locking - only kjournald alters this]
+* [j_list_lock] guards transition of a transaction into T_FINISHED
+* state and subsequent call of __journal_drop_transaction()
 * FIXME: needs barriers
 * KLUDGE: [use j_state_lock]
 */
_
-
To unsubscribe from this list: send the line unsubscribe linux-ext4 in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html