Re: [patch] [possible race in ext2] Re: how to write get_block?

1999-10-14 Thread tytso

   From: "Stephen C. Tweedie" [EMAIL PROTECTED]
   Date:   Mon, 11 Oct 1999 17:34:36 +0100 (BST)

   The _fast_ quick fix is to maintain a per-inode list of dirty buffers
   and to invalidate that list when we do a delete.  This works for
   directories if we only support truncate back to zero --- it obviously
   gets things wrong if we allow partial truncates of directories (but why
   would anyone want to allow that?!)

   This would have minimal performance implication and would also allow
   fast fsync() of indirect block metadata for regular files.

I've actually had patches to do fast fsync for some time, but I
thought you said you had some changes in queue which would make the
need for this obsolete, so didn't bother to submit this, since it is a
bit of a hack.  Here it is though, for folks to comment upon.  Code to
invalidate the list of dirty buffers is missing from this code, but it
wouldn't be hard to add.

- Ted

Patch generated: on Wed Aug 18 15:01:49 EDT 1999 by [EMAIL PROTECTED]
against Linux version 2.2.10
 
===
RCS file: fs/ext2/RCS/fsync.c,v
retrieving revision 1.1
diff -u -r1.1 fs/ext2/fsync.c
--- fs/ext2/fsync.c 1999/07/21 10:45:22 1.1
+++ fs/ext2/fsync.c 1999/07/21 10:45:26
@@ -250,36 +250,83 @@
return err;
 }
 
-/*
- * File may be NULL when we are called. Perhaps we shouldn't
- * even pass file to fsync ?
- */
-
-int ext2_sync_file(struct file * file, struct dentry *dentry)
+static int sync_inode(struct inode *inode)
 {
-   int wait, err = 0;
-   struct inode *inode = dentry-d_inode;
-
+   int i, wait, err = 0;
+   __u32   *blklist;
+   
if (S_ISLNK(inode-i_mode)  !(inode-i_blocks))
-   /*
-* Don't sync fast links!
-*/
-   goto skip;
+   return 0;
 
-   for (wait=0; wait=1; wait++)
-   {
-   err |= sync_direct (inode, wait);
-   err |= sync_indirect (inode,
+   if (!S_ISDIR(inode-i_mode)  inode-u.ext2_i.i_ffsync_flag) {
+   blklist = inode-u.ext2_i.i_ffsync_blklist;
+   for (wait = 0; wait =1; wait++) {
+   for (i=0; i  inode-u.ext2_i.i_ffsync_ptr; i++) {
+#if 0  /* Debugging */
+   if (!wait)
+   printk("Fasy sync: %d\n", blklist[i]);
+#endif 
+   err |= sync_block(inode, blklist[i], wait);
+   }
+   }
+   } else {
+   for (wait=0; wait=1; wait++) {
+   err |= sync_direct (inode, wait);
+   err |= sync_indirect (inode,
  inode-u.ext2_i.i_data+EXT2_IND_BLOCK,
- wait);
-   err |= sync_dindirect (inode,
+ wait);
+   err |= sync_dindirect (inode,
   inode-u.ext2_i.i_data+EXT2_DIND_BLOCK, 
-  wait);
-   err |= sync_tindirect (inode, 
+  wait);
+   err |= sync_tindirect (inode, 
   inode-u.ext2_i.i_data+EXT2_TIND_BLOCK, 
-  wait);
+  wait);
+   }
}
-skip:
+   inode-u.ext2_i.i_ffsync_flag = 1;
+   inode-u.ext2_i.i_ffsync_ptr = 0;
err |= ext2_sync_inode (inode);
+   return err;
+}
+
+/*
+ * File may be NULL when we are called by msync on a vma.  In the
+ * future, the VFS layer should be changed to not pass the struct file
+ * parameter to the fsync function, since it's not used by any of the
+ * implementations (and the dentry parameter is all that we need).
+ */
+int ext2_sync_file(struct file * file, struct dentry *dentry)
+{
+   int err = 0;
+
+   err = sync_inode(dentry-d_inode);
+   if (dentry-d_parent  dentry-d_parent-d_inode)
+   err |= sync_inode(dentry-d_parent-d_inode);
+   
return err ? -EIO : 0;
+}
+
+/*
+ * This function adds a list of blocks to be written out by fsync.  If
+ * it exceeds NUM_EXT2_FFSYNC_BLKS, then we turn off the fast fsync flag.
+ */
+void ext2_ffsync_add_blk(struct inode *inode, __u32 blk)
+{
+   int i;
+   __u32   *blklist;
+
+   if (inode-u.ext2_i.i_ffsync_flag == 0)
+   return;
+#if 0  /* Debugging */
+   printk("Add fast sync: %d\n", blk);
+#endif
+   blklist = inode-u.ext2_i.i_ffsync_blklist;
+   for (i=0; i  inode-u.ext2_i.i_ffsync_ptr; i++)
+   if (blklist[i] == blk)
+   return;
+   if (inode-u.ext2_i.i_ffsync_ptr  NUM_EXT2_FFSYNC_BLKS) {

Re: [patch] [possible race in ext2] Re: how to write get_block?

1999-10-13 Thread Raul Miller

On 11 Oct 1999 17:58:54 -0500, [EMAIL PROTECTED] (Eric W. Biederman) said:
  Ultimately we really want to have indirect blocks, and the directory
  in the page cache as it should result in more uniform code, and
  faster partial truncates (as well as faster syncs).

Stephen C. Tweedie [EMAIL PROTECTED] wrote:
 There is one major potential future problem with moving this to the
 page cache. At some point I want to be able to extend the large (64G)
 memory support on Intel to include the page cache in high memory.
 The buffer cache would still live in low memory. If we do that, then
 moving filesystem metadata out of permanently-mapped buffer memory
 and into the page cache is going to complicate directory and indirect
 operations significantly.

Because of write ordering, I presume -- the dirty buffers you want most
likely want to be able to write are (of the dirty buffers) the ones you
most likely want to move out to large memory?

Is there a problem with the simple answer (don't move dirty buffers
there)?  It seems like the only problem there would be memory pressure,
nothing algorithmic.  If so, I'm guessing that the trick is how to best
save write order for dirty blocks in large memory?

Unless I'm way off base, the second simplest answer is don't move dirty
meta-data blocks out to large memory.  That's going to cost you an if ()
-- and I suppose it may mean some extra L1 cache pressure for machines
with large memory support compiled in -- but otherwise doesn't seem any
worse than the current situation.

-- 
Raul



Re: [patch] [possible race in ext2] Re: how to write get_block?

1999-10-12 Thread Stephen C. Tweedie

Hi,

On Sat, 9 Oct 1999 23:53:01 +0200 (CEST), Andrea Arcangeli
[EMAIL PROTECTED] said:

 What I said about bforget in my old email is still true. The _only_ reason
 for using bforget instead of brelse is to get buffer performances (that in
 2.3.x are not so interesting as in 2.2.x as in 2.3.x flushpage is just
 doing the interesting stuff with the real data).

 The current design bug in 2.3.20pre2 and previous has nothing to do with
 bforget.

Nope.  We spent a fair amount of effort on this with the page cache
changes.  The ext2 truncate code is really, really careful to provide
bforget() with the correct conditions to get rid of the buffer: it is a
closely designed interaction between delete and bforget.  Remember, we
have exactly the same potential problems when freeing dirty indirect
blocks as we have when freeing directory blocks.

There's a big comment near the top of fs/ext2/truncate.c about the way
in which this works.

 The right fix is to do a query on the hash every time you overlap a
 buffer on the page cache.

Ouch --- that pays the penalty for normal data blocks every time you
pull them into cache.  No way.

The correct way to extend the current rules cleanly is to make the
truncate code do a bforget() on data blocks as well as indirect blocks
if, and only if, the file is not a regular file.  That will deal with
the symlink case too, and will mean that we are using the same mechanism
for all of our dynamically-allocatable metadata.

--Stephen



Re: [patch] [possible race in ext2] Re: how to write get_block?

1999-10-12 Thread Stephen C. Tweedie

Hi,

On 11 Oct 1999 17:58:54 -0500, [EMAIL PROTECTED] (Eric W. Biederman)
said:

 What about adding to the end of ext2_alloc_block:

 bh = get_hash_table(inode-i_dev, result, inode-i_sb-s_blocksize);
 /* something is playing with our fresh block, make them stop.  ;-) */
 if (bh) {
   if (buffer_dirty(bh)) { 
   mark_buffer_clean(bh);
   wait_on_buffer(bh);
   }
   bforget(bh);
 }

Again, it's a lot of extra unnecessary lookups.  The advantages of
having a dirty buffer list include being fast, and also massively
speeding up the metadata update part of fsync.

 Ultimately we really want to have indirect blocks, and
 the directory in the page cache as it should result in
 more uniform code, and faster partial truncates (as well as faster
 syncs).

There is one major potential future problem with moving this to the page
cache.  At some point I want to be able to extend the large (64G) memory
support on Intel to include the page cache in high memory.  The buffer
cache would still live in low memory.  If we do that, then moving
filesystem metadata out of permanently-mapped buffer memory and into the
page cache is going to complicate directory and indirect operations
significantly.

--Stephen



Re: [patch] [possible race in ext2] Re: how to write get_block?

1999-10-12 Thread Andrea Arcangeli

On Tue, 12 Oct 1999, Stephen C. Tweedie wrote:

changes.  The ext2 truncate code is really, really careful to provide

I was _not_ talking about ext2 at all. I was talking about the bforget and
brelse semantics. As bforget fallback to brelse you shouldn't expect
bforget to really destroy the buffer. It may do that but only if it's
possible.

You are using a _trick_ to let bforget to do the thing you want. If all
filesystem needs such thing it's ugly having to duplicate that current
ext2 tricks all over the place.

The correct way to extend the current rules cleanly is to make the
truncate code do a bforget() on data blocks as well as indirect blocks
if, and only if, the file is not a regular file.  That will deal with
the symlink case too, and will mean that we are using the same mechanism
for all of our dynamically-allocatable metadata.

IMHO we should drop all that race-prone tricks isntead of adding new ones.

If you use bforget to avoid fs corruption you are asking for troubles and
IMHO you are going in the wrong direction.

You may change bforget to do the right thing cleanly, but it will be badly
blocking. While you shouldn't block unless you are the one who wants to
reuse the buffer that is currently under I/O or dirty. So if you'll change
bforget then you'll block in the wrong place.

Andrea



Re: [patch] [possible race in ext2] Re: how to write get_block?

1999-10-12 Thread Andrea Arcangeli

On 11 Oct 1999, Eric W. Biederman wrote:

What about adding to the end of ext2_alloc_block:

It's _equally_ slow. Do you seen my patch? I prefer to do the query at the
higher lever to save cpu cache.

Andrea



Re: [patch] [possible race in ext2] Re: how to write get_block?

1999-10-12 Thread Stephen C. Tweedie

Hi,

On Tue, 12 Oct 1999 15:39:35 +0200 (CEST), Andrea Arcangeli
[EMAIL PROTECTED] said:

 On Tue, 12 Oct 1999, Stephen C. Tweedie wrote:
 changes.  The ext2 truncate code is really, really careful to provide

 I was _not_ talking about ext2 at all. I was talking about the bforget and
 brelse semantics. As bforget fallback to brelse you shouldn't expect
 bforget to really destroy the buffer. It may do that but only if it's
 possible.

 You are using a _trick_ to let bforget to do the thing you want. If all
 filesystem needs such thing it's ugly having to duplicate that current
 ext2 tricks all over the place.

Andrea, you are just trying to relax carefully designed buffer cache
semantics which are relied upon by the current filesystems.  Saying it
is a trick doesn't help matters much.

 If you use bforget to avoid fs corruption you are asking for troubles and
 IMHO you are going in the wrong direction.

Umm, your last proposal was to do a hash lookup on each new page cache
buffer mapping.  That is a significant performance cost, which IMHO is
not exactly the right direction either. :)

 You may change bforget to do the right thing cleanly, but it will be badly
 blocking. While you shouldn't block unless you are the one who wants to
 reuse the buffer that is currently under I/O or dirty. So if you'll change
 bforget then you'll block in the wrong place.

Unfortunately, the plain fact is that the current page cache relies on
data blocks never being in the buffer cache hash lists, and certainly
never being dirty in that cache.  _That_ is the invariant we need to
preserve.  truncate already does that for regular files.  Please, let's
try to keep that clean invariant rather than increase the cost of normal
page cache operations.

--Stephen



Re: [patch] [possible race in ext2] Re: how to write get_block?

1999-10-12 Thread Andrea Arcangeli

On Tue, 12 Oct 1999, Stephen C. Tweedie wrote:

Umm, your last proposal was to do a hash lookup on each new page cache
buffer mapping.  That is a significant performance cost, which IMHO is
not exactly the right direction either. :)

It's not obvious that the only thing to consider are performances. It's
not obvious to me that `less performances' == `wrong way' as you state
above.

That's been the simpler way I had to solve the design bug in a way that
looks obviously right to me (no way for mistakes or races with my way) and
without breaking all lowlevel filesystems.

Said that I will think soon about the better way that you are proposing
and that will break all filesystems again (except NFS and SMBFS where the
metadata lives on the server). I had not thought about this
metadata-in-page-cache way carefully yet. (I wanted to do that yesterday
but I destroyed all my filesystems with a cp /dev/zero /var/tmp due an
fdisk bug and so I wasted some good time ...). If there is just some code
available to see I'd like to see it (as it will avoid me reading lots of
emails to understand what's going on). Thanks!

Andrea



Re: [patch] [possible race in ext2] Re: how to write get_block?

1999-10-12 Thread Benjamin C.R. LaHaise

G'day!

On Tue, 12 Oct 1999, Stephen C. Tweedie wrote:
...
 Andrea, you are just trying to relax carefully designed buffer cache
 semantics which are relied upon by the current filesystems.  Saying it
 is a trick doesn't help matters much.

Andrea's right in that the semantics make it far too easy to introduce
bugs.  How about making bforget set a flag that the buffer head is to be
destroyed as soon as it's released?  If a filesystem re-uses the buffer,
mark_buffer_dirty can clear this destroyed flag (to handle the case where
a buffer head becomes legitimately reused).

-ben



Re: [patch] [possible race in ext2] Re: how to write get_block?

1999-10-11 Thread Eric W. Biederman

"Stephen C. Tweedie" [EMAIL PROTECTED] writes:

 Hi,
 
 On Sun, 10 Oct 1999 16:57:18 +0200 (CEST), Andrea Arcangeli
 [EMAIL PROTECTED] said:
 
  My point was that even being forced to do a lookup before creating
  each empty buffer, will be still faster than 2.2.x as in 2.3.x the hash
  will contain only metadata. Less elements means faster lookups.
 
 The _fast_ quick fix is to maintain a per-inode list of dirty buffers
 and to invalidate that list when we do a delete.  This works for
 directories if we only support truncate back to zero --- it obviously
 gets things wrong if we allow partial truncates of directories (but why
 would anyone want to allow that?!)
 
 This would have minimal performance implication and would also allow
 fast fsync() of indirect block metadata for regular files.

What about adding to the end of ext2_alloc_block:

bh = get_hash_table(inode-i_dev, result, inode-i_sb-s_blocksize);
/* something is playing with our fresh block, make them stop.  ;-) */
if (bh) {
if (buffer_dirty(bh)) { 
mark_buffer_clean(bh);
wait_on_buffer(bh);
}
bforget(bh);
}

This is in a relatively slow/uncommon path and also catches
races if indirect blocks are allocated to a file, instead
of just directory syncs.

Ultimately we really want to have indirect blocks, and
the directory in the page cache as it should result in
more uniform code, and faster partial truncates (as well as faster
syncs).

Eric



Re: [patch] [possible race in ext2] Re: how to write get_block?

1999-10-11 Thread Alexander Viro



On Mon, 11 Oct 1999, Stephen C. Tweedie wrote:

 Hi,
 
 On Sun, 10 Oct 1999 16:57:18 +0200 (CEST), Andrea Arcangeli
 [EMAIL PROTECTED] said:
 
  My point was that even being forced to do a lookup before creating
  each empty buffer, will be still faster than 2.2.x as in 2.3.x the hash
  will contain only metadata. Less elements means faster lookups.
 
 The _fast_ quick fix is to maintain a per-inode list of dirty buffers
 and to invalidate that list when we do a delete.  This works for
 directories if we only support truncate back to zero --- it obviously
 gets things wrong if we allow partial truncates of directories (but why
 would anyone want to allow that?!)

AFFS. OTOH there we are dropping the known bh every time.



Re: [patch] [possible race in ext2] Re: how to write get_block?

1999-10-11 Thread Stephen C. Tweedie

Hi,

On Sun, 10 Oct 1999 16:57:18 +0200 (CEST), Andrea Arcangeli
[EMAIL PROTECTED] said:

 My point was that even being forced to do a lookup before creating
 each empty buffer, will be still faster than 2.2.x as in 2.3.x the hash
 will contain only metadata. Less elements means faster lookups.

The _fast_ quick fix is to maintain a per-inode list of dirty buffers
and to invalidate that list when we do a delete.  This works for
directories if we only support truncate back to zero --- it obviously
gets things wrong if we allow partial truncates of directories (but why
would anyone want to allow that?!)

This would have minimal performance implication and would also allow
fast fsync() of indirect block metadata for regular files.

--Stephen



Re: [patch] [possible race in ext2] Re: how to write get_block?

1999-10-10 Thread Mikulas Patocka

On Sat, 9 Oct 1999, Andrea Arcangeli wrote:

 On Fri, 8 Oct 1999, Mikulas Patocka wrote:
 
 Here goes quick'n'dirty patch. It does bforget(). It should prevent file
 corruption.
 
 wrong patch. bforget give you no guarantee at all. bfoget always fallback
 to brelse if necessary.
 
 What I said about bforget in my old email is still true. The _only_ reason
 for using bforget instead of brelse is to get buffer performances (that in
 2.3.x are not so interesting as in 2.2.x as in 2.3.x flushpage is just
 doing the interesting stuff with the real data).

Yes, there is race when bforget is called while buffer is being flushed.
mark_buffer_clean();wait_on_buffer(); should be used.

Mikulas Patocka



Re: [patch] [possible race in ext2] Re: how to write get_block?

1999-10-09 Thread Andrea Arcangeli

On Fri, 8 Oct 1999, Mikulas Patocka wrote:

Here goes quick'n'dirty patch. It does bforget(). It should prevent file
corruption.

wrong patch. bforget give you no guarantee at all. bfoget always fallback
to brelse if necessary.

What I said about bforget in my old email is still true. The _only_ reason
for using bforget instead of brelse is to get buffer performances (that in
2.3.x are not so interesting as in 2.2.x as in 2.3.x flushpage is just
doing the interesting stuff with the real data).

The current design bug in 2.3.20pre2 and previous has nothing to do with
bforget.

The right fix is to do a query on the hash every time you overlap a buffer
on the page cache. If you'll find the buffer - you are going to overlap in
the page-cache - in the hash, then you'll have to flush it exactly as
flushpage does. My way has no cache coherency downside and it's completly
safe. It will impact a bit the CPU though. Still far better than 2.2.x
because the hash will be almost empty in 2.3.x compared with the huge one
of 2.2.x. I just had to do something like that to fix the ramdisk device.
Now I'll implement the thing.

Andrea



[patch] [possible race in ext2] Re: how to write get_block?

1999-10-08 Thread Mikulas Patocka

 Sheesh... Yes, you are right. It was not a problem with the old buffer
 cache, but now... Arrgh. Races in truncate(), film at 11. Pheeewww...
 Unless I'm seriously misunderstanding what happens you've just dug out a
 race in ext2. It doesn't do bforget() on the data blocks in directories.
 If I'm not mistaken the same scenario applies here just fine.

Here goes quick'n'dirty patch. It does bforget(). It should prevent file
corruption.

Mikulas Patocka

--- linux-2.3.19.tar.gz#utar/linux/fs/ext2/truncate.c   Tue Jun 29 18:22:08 1999
+++ linux/fs/ext2/truncate.cFri Oct  8 18:40:36 1999
@@ -101,6 +101,14 @@
  * -- WSH, 1998
  */
 
+/* M.P.: quick  dirty patch to prevent data corruption */
+
+static void brelse_blocks(struct inode *i, unsigned long block_to_free, unsigned long 
+count)
+{
+   while (count--)
+   bforget(get_hash_table(i-i_dev, block_to_free++, 
+i-i_sb-s_blocksize));
+}
+
 /*
  * Check whether any of the slots in an indirect block are
  * still in use, and if not free the block.
@@ -184,14 +192,17 @@
else if (block_to_free == tmp - free_count)
free_count++;
else {
+   if (!S_ISREG(inode-i_mode)) brelse_blocks(inode, 
+block_to_free, free_count);
ext2_free_blocks (inode, block_to_free, free_count);
free_this:
block_to_free = tmp;
free_count = 1;
}
}
-   if (free_count  0)
+   if (free_count  0) {
+   if (!S_ISREG(inode-i_mode)) brelse_blocks(inode, block_to_free, 
+free_count);
ext2_free_blocks (inode, block_to_free, free_count);
+   }
return retry;
 }
 
@@ -247,14 +258,17 @@
else if (block_to_free == tmp - free_count)
free_count++;
else {
+   if (!S_ISREG(inode-i_mode)) brelse_blocks(inode, 
+block_to_free, free_count);
ext2_free_blocks (inode, block_to_free, free_count);
free_this:
block_to_free = tmp;
free_count = 1;
}
}
-   if (free_count  0)
+   if (free_count  0) {
+   if (!S_ISREG(inode-i_mode)) brelse_blocks(inode, block_to_free, 
+free_count);
ext2_free_blocks (inode, block_to_free, free_count);
+   }
/*
 * Check the block and dispose of the ind_bh buffer.
 */