PATCH: WIP super lock contention mitigation in 2.4

2000-07-12 Thread Juan J. Quintela


Hi
I have ported Stephen's patch from 2.2.8 to 2.4, but now I have
contention in the inode bitmaps.  Can I use the same trick
there that he used in the block bitmap?

Later, Juan.

diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude 
base/fs/ext2/balloc.c working/fs/ext2/balloc.c
--- base/fs/ext2/balloc.c   Sat May 20 20:24:56 2000
+++ working/fs/ext2/balloc.cWed Jul 12 03:55:11 2000
@@ -9,6 +9,13 @@
  *  Enhanced block allocation by Stephen Tweedie ([EMAIL PROTECTED]), 1993
  *  Big-endian to little-endian byte-swapping/bitmaps by
  *David S. Miller ([EMAIL PROTECTED]), 1995
+ *
+ *  Dropped use of the superblock lock.
+ *  This means that we have to take extra care not to block between any
+ *  operations on the bitmap and the corresponding update to the group 
+ *  descriptors.
+ *   Stephen C. Tweedie ([EMAIL PROTECTED]), 1999
+ * 
  */
 
 #include 
@@ -70,42 +77,33 @@
 }
 
 /*
- * Read the bitmap for a given block_group, reading into the specified 
- * slot in the superblock's bitmap cache.
+ * Read the bitmap for a given block_group.
  *
- * Return >=0 on success or a -ve error code.
+ * Return the buffer_head on success or NULL on IO error.
  */
 
-static int read_block_bitmap (struct super_block * sb,
-  unsigned int block_group,
-  unsigned long bitmap_nr)
+static struct buffer_head * read_block_bitmap (struct super_block * sb,
+unsigned int block_group)
 {
struct ext2_group_desc * gdp;
struct buffer_head * bh = NULL;
-   int retval = -EIO;

gdp = ext2_get_group_desc (sb, block_group, NULL);
if (!gdp)
-   goto error_out;
-   retval = 0;
+   return NULL;
+
bh = bread (sb->s_dev, le32_to_cpu(gdp->bg_block_bitmap), sb->s_blocksize);
if (!bh) {
ext2_error (sb, "read_block_bitmap",
"Cannot read block bitmap - "
"block_group = %d, block_bitmap = %lu",
block_group, (unsigned long) gdp->bg_block_bitmap);
-   retval = -EIO;
+   return NULL;
}
-   /*
-* On IO error, just leave a zero in the superblock's block pointer for
-* this group.  The IO will be retried next time.
-*/
-error_out:
-   sb->u.ext2_sb.s_block_bitmap_number[bitmap_nr] = block_group;
-   sb->u.ext2_sb.s_block_bitmap[bitmap_nr] = bh;
-   return retval;
+   return bh;
 }
 
+
 /*
  * load_block_bitmap loads the block bitmap for a blocks group
  *
@@ -122,9 +120,9 @@
 static int __load_block_bitmap (struct super_block * sb,
unsigned int block_group)
 {
-   int i, j, retval = 0;
+   int i, j;
unsigned long block_bitmap_number;
-   struct buffer_head * block_bitmap;
+   struct buffer_head * block_bitmap, *cached_bitmap = NULL;
 
if (block_group >= sb->u.ext2_sb.s_groups_count)
ext2_panic (sb, "load_block_bitmap",
@@ -132,27 +130,34 @@
"block_group = %d, groups_count = %lu",
block_group, sb->u.ext2_sb.s_groups_count);
 
+retry:
if (sb->u.ext2_sb.s_groups_count <= EXT2_MAX_GROUP_LOADED) {
if (sb->u.ext2_sb.s_block_bitmap[block_group]) {
if (sb->u.ext2_sb.s_block_bitmap_number[block_group] ==
-   block_group)
+   block_group) {
+   brelse(cached_bitmap);
return block_group;
+   }
ext2_error (sb, "__load_block_bitmap",
"block_group != block_bitmap_number");
}
-   retval = read_block_bitmap (sb, block_group, block_group);
-   if (retval < 0)
-   return retval;
+   if (!cached_bitmap)
+   goto load;
+   sb->u.ext2_sb.s_block_bitmap_number[block_group] = block_group;
+   sb->u.ext2_sb.s_block_bitmap[block_group] = cached_bitmap;
return block_group;
}
 
for (i = 0; i < sb->u.ext2_sb.s_loaded_block_bitmaps &&
sb->u.ext2_sb.s_block_bitmap_number[i] != block_group; i++)
;
+
+   /* already cached? */
if (i < sb->u.ext2_sb.s_loaded_block_bitmaps &&
sb->u.ext2_sb.s_block_bitmap_number[i] == block_group) {
block_bitmap_number = sb->u.ext2_sb.s_block_bitmap_number[i];
block_bitmap = sb->u.ext2_sb.s_block_bitmap[i];
+
for (j = i; j > 0; j--) {
sb->u.ext2_sb.s_block_bitmap_number[j] =
sb->u.ext2_sb.s_block_bitmap_number[j - 1];
@@ -161,28 +166,36 @@
}
  

PATCH: Trying to get back IO performance (WIP)

2000-07-03 Thread Juan J. Quintela


Hi
This patch is against test3-pre2.
It gives good performance here in the first run, and very bad
performance in the following runs of dbench 48.  I am hitting problems with
the locking scheme: I get a lot of contention in __wait_on_super.
Almost all the dbench processes are waiting in:

0xc7427dcc 0xc0116fbd schedule+0x389 (0xc4840c20, 0x12901d, 0xc7427ea0, 0x123456
7, 0xc7426000)
   kernel .text 0xc010 0xc0116c34 0xc01173c0
   0xc013639c __wait_on_super+0x184 (0xc13f4c00)
   kernel .text 0xc010 0xc0136218 0xc0136410
   0xc01523e5 ext2_alloc_block+0x21 (0xc4840c20, 0x12901d, 0xc7427ea0)
   kernel .text 0xc010 0xc01523c4 0xc015245c
0xc7427e5c 0xc0152892 block_getblk+0x15e (0xc4840c20, 0xc316b5e0, 0x9, 0x15, 0xc
7427ea0)
   kernel .text 0xc010 0xc0152734 0xc0152a68
0xc7427eac 0xc0152ed0 ext2_get_block+0x468 (0xc4840c20, 0x15, 0xc2d99de0, 0x1)
   kernel .text 0xc010 0xc0152a68 0xc0152fc0
0xc7427ef4 0xc0133ae3 __block_prepare_write+0xe7 (0xc4840c20, 0xc11f7f78, 0x0, 0
x1000, 0xc0152a68)
   kernel .text 0xc010 0xc01339fc 0xc0133bf0
0xc7427f18 0xc0134121 block_prepare_write+0x21 (0xc11f7f78, 0x0, 0x1000, 0xc0152
a68)
   kernel .text 0xc010 0xc0134100 0xc013413c
[0]more> 
0xc7427f30 0xc01531d1 ext2_prepare_write+0x19 (0xc06b4f00, 0xc11f7f78, 0x0, 0x10
00)
   kernel .text 0xc010 0xc01531b8 0xc01531d8
0xc7427f90 0xc0127b8d generic_file_write+0x305 (0xc06b4f00, 0x8050461, 0xafc2, 0
xc06b4f20)
   kernel .text 0xc010 0xc0127888 0xc0127cf0
0xc7427fbc 0xc0130ea8 sys_write+0xe8 (0x9, 0x804b460, 0xffc3, 0x28, 0x1082)
   kernel .text 0xc010 0xc0130dc0 0xc0130ed0
   0xc0109874 system_call+0x34
   kernel .text 0xc010 0xc0109840 0xc0109878


This behavior also happens with vanilla kernel, only that it is not
so easy to reproduce.  Vanilla Kernel also gives normally worse IO
throughput than the first run with this patch.

Comments/suggestions/fixes are welcome.

Later, Juan.

This patch does:
 - Introduces WRITETRY logic that means: write this buffer if
   possible, but never block.  If we would have to block, skip the
   write. (mainly from Jens Axboe)
 - Uses that logic in sync_page_buffers(), i.e. never try to block
   in writes, when writing buffers.
 - Do all the blocking/waiting calling balance_dirty() in
   shrink_mmap.
 - export the sync_page_buffers function.
 - make try_to_free_buffers never generate any IO.
 - Change the caller of that two functions accordingly with the new
   semantics.


diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude 
base/drivers/block/ll_rw_blk.c working/drivers/block/ll_rw_blk.c
--- base/drivers/block/ll_rw_blk.c  Fri Jun 30 18:42:30 2000
+++ working/drivers/block/ll_rw_blk.c   Mon Jul  3 00:03:00 2000
@@ -556,7 +556,7 @@
unsigned int sector, count;
int max_segments = MAX_SEGMENTS;
struct request * req = NULL;
-   int rw_ahead, max_sectors, el_ret;
+   int r_ahead, w_ahead, max_sectors, el_ret;
struct list_head *head = &q->queue_head;
int latency;
elevator_t *elevator = &q->elevator;
@@ -584,30 +584,24 @@
}
}
 
-   rw_ahead = 0;   /* normal case; gets changed below for READA */
+   r_ahead = w_ahead = 0;
switch (rw) {
case READA:
-   rw_ahead = 1;
+   r_ahead = 1;
rw = READ;  /* drop into READ */
case READ:
if (buffer_uptodate(bh)) /* Hmmph! Already have it */
goto end_io;
kstat.pgpgin++;
break;
-   case WRITERAW:
-   rw = WRITE;
-   goto do_write;  /* Skip the buffer refile */
+   case WRITETRY:
+   w_ahead = 1;
case WRITE:
if (!test_and_clear_bit(BH_Dirty, &bh->b_state))
goto end_io;/* Hmmph! Nothing to write */
refile_buffer(bh);
-   do_write:
-   /*
-* We don't allow the write-requests to fill up the
-* queue completely:  we want some room for reads,
-* as they take precedence. The last third of the
-* requests are only for reads.
-*/
+   case WRITERAW:  /* Skip the buffer refile */
+   rw = WRITE;
kstat.pgpgout++;
break;
   

Too much contention in __wait_on_super

2000-07-03 Thread Juan J. Quintela


Hi 

   I have been changing the write of buffers in the IO layer, and I
have found that the system gets a lot of contention in
__wait_on_super().  I am using test3-pre1 + kdb patch. I see also the
stalls/vmstat strange output without the kdb patch.  I use it to be
able to see the back-traces.  You can also look at the vmstat output.
Look at the vmstat 1 output below, taken while running dbench 48; the
results don't make sense — we are doing almost no IO in an IO test:

vmstat output:
 r  b  w   swpd   free   buff  cache  si  sobibo   incs  us  sy  id
 0 48  2   5004   3112   1948 112652   0   0 0   151  221   238   2   1  96
 0 48  2   5004   3108   1948 112652   0   0 0   133  195   188   0   0  99
 0 48  2   5004   3108   1948 112652   0   0 0   141  192   184   0   0  99

when things are working properly, that field is between 1000-2000.

Now the traces:

Almost all the dbench processes are stuck with the first back-trace.

Can someone make sense of that output and/or suggest how to fix the
problem?

Any comments are welcome.  If you need more information, let me know.

Later, Juan.


Most common back-trace:

0xc721bdcc 0xc0116fbd schedule+0x389 (0xc6e3bb60, 0x98e01, 0xc721bea0, 0x1234567
, 0xc721a000)
   kernel .text 0xc010 0xc0116c34 0xc01173c0
   0xc01363ac __wait_on_super+0x184 (0xc13f4c00)
   kernel .text 0xc010 0xc0136228 0xc0136420
   0xc01523f5 ext2_alloc_block+0x21 (0xc6e3bb60, 0x98e01, 0xc721bea0)
   kernel .text 0xc010 0xc01523d4 0xc015246c
0xc721be5c 0xc01528a2 block_getblk+0x15e (0xc6e3bb60, 0xc2f75c20, 0x8d, 0x99, 0x
c721bea0)
   kernel .text 0xc010 0xc0152744 0xc0152a78
0xc721beac 0xc0152ee0 ext2_get_block+0x468 (0xc6e3bb60, 0x99, 0xc221e440, 0x1)
   kernel .text 0xc010 0xc0152a78 0xc0152fd0
0xc721bef4 0xc0133af3 __block_prepare_write+0xe7 (0xc6e3bb60, 0xc11c1ee8, 0x0, 0
x1000, 0xc0152a78)
   kernel .text 0xc010 0xc0133a0c 0xc0133c00
0xc721bf18 0xc0134131 block_prepare_write+0x21 (0xc11c1ee8, 0x0, 0x1000, 0xc0152
a78)
   kernel .text 0xc010 0xc0134110 0xc013414c
0xc721bf30 0xc01531e1 ext2_prepare_write+0x19 (0xc719b5e0, 0xc11c1ee8, 0x0, 0x10
00)
   kernel .text 0xc010 0xc01531c8 0xc01531e8
0xc721bf90 0xc0127b8d generic_file_write+0x305 (0xc719b5e0, 0x8054469, 0x6fba, 0
xc719b600)
   kernel .text 0xc010 0xc0127888 0xc0127cf0
0xc721bfbc 0xc0130ea8 sys_write+0xe8 (0x9, 0x804b460, 0xffc3, 0x28, 0x1092)
   kernel .text 0xc010 0xc0130dc0 0xc0130ed0
   0xc0109874 system_call+0x34
   kernel .text 0xc010 0xc0109840 0xc0109878

or in:
0xc320de58 0xc0116fbd schedule+0x389 (0xc4c130c0, 0xc13f4c00, 0xc4c138e0, 0x1234
567, 0xc320c000)
   kernel .text 0xc010 0xc0116c34 0xc01173c0
   0xc01363ac __wait_on_super+0x184 (0xc13f4c00)
   kernel .text 0xc010 0xc0136228 0xc0136420
   0xc0151ce9 ext2_new_inode+0x6d (0xc4c130c0, 0x8180, 0xc320df00)
   kernel .text 0xc010 0xc0151c7c 0xc01521dc
0xc320df04 0xc0154555 ext2_create+0x1d (0xc4c130c0, 0xc5c17b60, 0x8180)
   kernel .text 0xc010 0xc0154538 0xc01545f4
0xc320df28 0xc013df64 vfs_create+0xdc (0xc4c130c0, 0xc5c17b60, 0x180)
   kernel .text 0xc010 0xc013de88 0xc013dfcc
0xc320df58 0xc013e198 open_namei+0x1cc (0xc4c12000, 0x243, 0x180, 0xc320df7c)
   kernel .text 0xc010 0xc013dfcc 0xc013e6cc
0xc320df98 0xc0130376 filp_open+0x3a (0xc4c12000, 0x242, 0x180)
   kernel .text 0xc010 0xc013033c 0xc0130398
0xc320dfbc 0xc01306dc sys_open+0x68 (0xbc4c, 0x242, 0x180, 0x242, 0xbc4c
)
   kernel .text 0xc010 0xc0130674 0xc01307dc
   0xc0109874 system_call+0x34
   kernel .text 0xc010 0xc0109840 0xc0109878


or in:
0xc5a19e50 0xc0116fbd schedule+0x389 (0xc5b58a00, 0x1000, 0xfff4, 0xc5a18000
, 0x1234567)
   kernel .text 0xc010 0xc0116c34 0xc01173c0
   0xc0131db8 __wait_on_buffer+0x1d8 (0xc5b58a00)
   kernel .text 0xc010 0xc0131be0 0xc0131e34
   0xc0133773 unmap_buffer+0x33 (0xc5b58a00)
   kernel .text 0xc010 0xc0133740 0xc01337a0
0xc5a19eb8 0xc01338e1 unmap_underlying_metadata+0x29 (0xc536fc00)
[1]more> 
   kernel .text 0xc010 0xc01338b8 0xc01338f0
0xc5a19ef4 0xc0133b0f __block_prepare_write+0x103 (0xc62c25e0, 0xc110e680, 0x0, 
0x1000, 0xc0152a78)
   

Fwd: PATCH: Rewrite of truncate_inode_pages (WIP)

2000-05-29 Thread Juan J. Quintela


[Sorry if you receive this twice; I forgot to CC: fsdevel in my first mail]





Hi
the current truncate_inode_pages can actually *write* part of
the file that has just been truncated.  That makes things not
work.  I can trigger the BUG in fs/inode.c:clear_inode using mmap002
and riel aging patch in 2.4.0-test1.  The BUG triggered is:

if (inode->i_data.nrpages)
BUG();

The problem is that inode->i_data.nrpages can become negative. I have
got here -6 truncating a file of 384MB.

This patch deals with this problem, and try to do truncate_inode_pages
safe.

Notice that the new function only works for truncating complete files;
I have left the old function for the partial case.  If people think that
this function is OK, I will incorporate the partial case.  I have not
done it yet because I want feedback from the MM people and the FS people,
and the algorithms for complete files are cleaner.

I would like to know if it is possible to have a locked buffer without
the page being locked, i.e. whether I can manipulate the buffers in a
page without locking them when I hold the page lock.  (I don't think
so, and in that case I also have to lock the buffers.)

With this patch things are rock solid here.

This patch does:

- defines a new function: discard_buffer, it will lock the buffer
  (waiting if needed) and remove the buffer from all the queues.
  It is like unmap_buffer, but makes sure that we don't do any IO
  and that we remove the buffer from all the lists.

- defines a new function: block_destroy_buffers: it is a mix of
  block_flushpage and do_try_to_free_pages.  It will make all the
  buffers in that page disappear calling discard_buffer.  Notice the
  way that we iterate through all the buffer heads.  I think that it
  is not racy, but I would like to hear comments from people than know
  more about buffer heads handling.

- It changes __remove_inode_pages to check that there are not buffers
  in that page.  Users of the function must make sure about that.  It
  changes all the callers of the function satisfy that requirement.

- It changes invalidate_inode_pages (again).  Now block_destroy_buffers
  can wait, so we are *civilized* citizens and drop any lock that
  we hold before calling block_destroy_buffers, and reacquire it later.

- It has my *old* rewrite of truncate_inode_pages to use two auxiliary
  functions and lock the same way for the partial page and for the
  rest.  (That code is wrong anyway, pass to the next function).

- The new function truncate_inode_pages, that is a copy of
  invalidate_inodes, but we wait for locked pages.

Comments, please??

I am very interested in comments from the fs people — what they
think about the locking that I am using in the buffer.c file.

Acknowledgments:  This patch would not have been possible without the comments of 
arjan, bcrl, davej, riel and RogerL,  in
#kernelnewbies (irc.openprojects.net).  They did a lot
of review of previous versions of the patch.  The
errors are mine, but good ideas come from discussions
with them :)

Later, Juan.

diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude base/fs/buffer.c 
testing/fs/buffer.c
--- base/fs/buffer.cTue May 30 01:21:47 2000
+++ testing/fs/buffer.c Tue May 30 02:33:50 2000
@@ -1281,6 +1281,56 @@
}
 }
 
+/**
+ * discard_buffer - discard that buffer without doing any IO
+ * @bh: buffer to discard
+ * 
+ * This function removes a buffer from all the queues, without doing
+ * any IO, we are not interested in the contents of the buffer.  This
+ * function can block if the buffer is locked.
+ */
+static struct buffer_head *discard_buffer(struct buffer_head * bh)
+{
+   int index = BUFSIZE_INDEX(bh->b_size);
+   struct buffer_head *next;
+
+   /* grab the lru lock here to block bdflush. */
+   atomic_inc(&bh->b_count);
+   lock_buffer(bh);
+   next = bh->b_this_page;
+   clear_bit(BH_Uptodate, &bh->b_state);
+   clear_bit(BH_Mapped, &bh->b_state);
+   clear_bit(BH_Req, &bh->b_state);
+   clear_bit(BH_New, &bh->b_state);
+
+   spin_lock(&lru_list_lock);
+   write_lock(&hash_table_lock);
+   spin_lock(&free_list[index].lock);
+   spin_lock(&unused_list_lock);
+
+   if (!atomic_dec_and_test(&bh->b_count))
+   BUG();
+
+   __hash_unlink(bh);
+   /* The bunffer can be either on the regular
+* queues or on the free list..
+*/
+   if (bh->b_dev != B_FREE)
+   __remove_from_queues(bh);
+   else
+   __remove_from_free_list(bh, index);
+   __put_unused_buffer_head(bh);   
+   spin_unlock(&unused_list_lock);
+   write_unlock(&hash_table_lock);
+   spin_unlock(&free_list[index].lock);
+   spin_unlock(&lru_list_lock);
+   /* We can unlock the buffer, we have just returned it.
+* Ditto for the counter 
+ */
+   return next;
+}
+
+
 /*

PATCH: Rewrite of truncate_inode_pages (take 4)

2000-05-24 Thread Juan J. Quintela

>>>>> "juan" == Juan J Quintela <[EMAIL PROTECTED]> writes:

Hi Linus
   I am resending this patch to you; it works here, and nobody has
   complained about it the previous times that I posted it here.

Later, Juan.

juan> I have reworte the function truncate_inode_pages.  The version
juan> in vanilla pre9-3 does busy waiting in the partial page, with
juan> this version the locking for the partial page and from the
juan> rest of the pages is the same.  This make that we have less
juan> special cases.  For the rest of pages the function works the
juan> same.  The only difference is that version is cleaner IMHO.
juan> Or there are some corner case that I have failed to see?

juan> Comments?

juan> Later, Juan.

juan> I have CC: the linux-fsdevel people, they are the users of
juan> that function, could somebody give me some feedback against
juan> the change?



diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude work/mm/filemap.c 
testing/mm/filemap.c
--- work/mm/filemap.c   Fri May 12 23:46:46 2000
+++ testing/mm/filemap.cSun May 14 22:08:45 2000
@@ -146,9 +146,39 @@
spin_unlock(&pagecache_lock);
 }
 
-/*
+static inline void truncate_partial_page(struct page *page, unsigned partial)
+{
+   memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
+   
+   if (page->buffers)
+   block_flushpage(page, partial);
+
+}
+
+static inline void truncate_complete_page(struct page *page)
+{
+   if (!page->buffers || block_flushpage(page, 0))
+   lru_cache_del(page);
+   
+   /*
+* We remove the page from the page cache _after_ we have
+* destroyed all buffer-cache references to it. Otherwise some
+* other process might think this inode page is not in the
+* page cache and creates a buffer-cache alias to it causing
+* all sorts of fun problems ...  
+*/
+   remove_inode_page(page);
+   page_cache_release(page);
+}
+
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from with to truncate
+ *
  * Truncate the page cache at a set offset, removing the pages
  * that are beyond that offset (and zeroing out partial pages).
+ * If any page is locked we wait for it to become unlocked.
  */
 void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
 {
@@ -168,11 +198,10 @@
 
page = list_entry(curr, struct page, list);
curr = curr->next;
-
offset = page->index;
 
-   /* page wholly truncated - free it */
-   if (offset >= start) {
+   /* Is one of the pages to truncate? */
+   if ((offset >= start) || (partial && (offset + 1) == start)) {
if (TryLockPage(page)) {
page_cache_get(page);
spin_unlock(&pagecache_lock);
@@ -183,22 +212,14 @@
page_cache_get(page);
spin_unlock(&pagecache_lock);
 
-   if (!page->buffers || block_flushpage(page, 0))
-   lru_cache_del(page);
-
-   /*
-* We remove the page from the page cache
-* _after_ we have destroyed all buffer-cache
-* references to it. Otherwise some other process
-* might think this inode page is not in the
-* page cache and creates a buffer-cache alias
-* to it causing all sorts of fun problems ...
-*/
-   remove_inode_page(page);
+   if (partial && (offset + 1) == start) {
+   truncate_partial_page(page, partial);
+   partial = 0;
+   } else 
+   truncate_complete_page(page);
 
UnlockPage(page);
page_cache_release(page);
-   page_cache_release(page);
 
/*
 * We have done things without the pagecache lock,
@@ -209,37 +230,6 @@
 */
goto repeat;
}
-   /*
-* there is only one partial page possible.
-*/
-   if (!partial)
-   continue;
-
-   /* and it's the one preceeding the first wholly truncated page */
-   if ((offset + 1) != start)
-   continue;
-
-   /* partial truncate, clear end of page */
-

PATCH: Rewrite of truncate_inode_pages (take 3)

2000-05-22 Thread Juan J. Quintela


Hi
I have rewritten the function truncate_inode_pages.  The version
in vanilla pre9-3 does busy waiting on the partial page; with
this version the locking for the partial page and for the
rest of the pages is the same.  This means that we have fewer
special cases.  For the rest of the pages the function works the
same.  The only difference is that this version is cleaner, IMHO.
Or is there some corner case that I have failed to see?

Comments?

Later, Juan.

I have CC'd the linux-fsdevel people; they are the users of
that function.  Could somebody give me some feedback on
the change?


diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude work/mm/filemap.c 
testing/mm/filemap.c
--- work/mm/filemap.c   Fri May 12 23:46:46 2000
+++ testing/mm/filemap.cSun May 14 22:08:45 2000
@@ -146,9 +146,39 @@
spin_unlock(&pagecache_lock);
 }
 
-/*
+static inline void truncate_partial_page(struct page *page, unsigned partial)
+{
+   memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
+   
+   if (page->buffers)
+   block_flushpage(page, partial);
+
+}
+
+static inline void truncate_complete_page(struct page *page)
+{
+   if (!page->buffers || block_flushpage(page, 0))
+   lru_cache_del(page);
+   
+   /*
+* We remove the page from the page cache _after_ we have
+* destroyed all buffer-cache references to it. Otherwise some
+* other process might think this inode page is not in the
+* page cache and creates a buffer-cache alias to it causing
+* all sorts of fun problems ...  
+*/
+   remove_inode_page(page);
+   page_cache_release(page);
+}
+
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from with to truncate
+ *
  * Truncate the page cache at a set offset, removing the pages
  * that are beyond that offset (and zeroing out partial pages).
+ * If any page is locked we wait for it to become unlocked.
  */
 void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
 {
@@ -168,11 +198,10 @@
 
page = list_entry(curr, struct page, list);
curr = curr->next;
-
offset = page->index;
 
-   /* page wholly truncated - free it */
-   if (offset >= start) {
+   /* Is one of the pages to truncate? */
+   if ((offset >= start) || (partial && (offset + 1) == start)) {
if (TryLockPage(page)) {
page_cache_get(page);
spin_unlock(&pagecache_lock);
@@ -183,22 +212,14 @@
page_cache_get(page);
spin_unlock(&pagecache_lock);
 
-   if (!page->buffers || block_flushpage(page, 0))
-   lru_cache_del(page);
-
-   /*
-* We remove the page from the page cache
-* _after_ we have destroyed all buffer-cache
-* references to it. Otherwise some other process
-* might think this inode page is not in the
-* page cache and creates a buffer-cache alias
-* to it causing all sorts of fun problems ...
-*/
-   remove_inode_page(page);
+   if (partial && (offset + 1) == start) {
+   truncate_partial_page(page, partial);
+   partial = 0;
+   } else 
+   truncate_complete_page(page);
 
UnlockPage(page);
page_cache_release(page);
-   page_cache_release(page);
 
/*
 * We have done things without the pagecache lock,
@@ -209,37 +230,6 @@
 */
goto repeat;
}
-   /*
-* there is only one partial page possible.
-*/
-   if (!partial)
-   continue;
-
-   /* and it's the one preceeding the first wholly truncated page */
-   if ((offset + 1) != start)
-   continue;
-
-   /* partial truncate, clear end of page */
-   if (TryLockPage(page)) {
-   spin_unlock(&pagecache_lock);
-   goto repeat;
-   }
-   page_cache_get(page);
-   spin_unlock(&pagecache_lock);
-
-   memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
-   if (page->buffers)
-   block_flushpage(page, partial);
-
-   p

PATCH: Rewrite of truncate_inode_pages (try2)

2000-05-17 Thread Juan J. Quintela


Hi Linus
I am reposting my patch to truncate_inode_pages.  The version
in vanilla pre9-2 does busy waiting on partial pages; this one
doesn't do busy waiting and makes the locking code common for
partial and non-partial pages.

Linus, if you don't like it, could you tell me what is the
problem?

Thanks, Later.

diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude work/mm/filemap.c 
testing/mm/filemap.c
--- work/mm/filemap.c   Fri May 12 23:46:46 2000
+++ testing/mm/filemap.cSun May 14 22:08:45 2000
@@ -146,9 +146,39 @@
spin_unlock(&pagecache_lock);
 }
 
-/*
+static inline void truncate_partial_page(struct page *page, unsigned partial)
+{
+   memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
+   
+   if (page->buffers)
+   block_flushpage(page, partial);
+
+}
+
+static inline void truncate_complete_page(struct page *page)
+{
+   if (!page->buffers || block_flushpage(page, 0))
+   lru_cache_del(page);
+   
+   /*
+* We remove the page from the page cache _after_ we have
+* destroyed all buffer-cache references to it. Otherwise some
+* other process might think this inode page is not in the
+* page cache and creates a buffer-cache alias to it causing
+* all sorts of fun problems ...  
+*/
+   remove_inode_page(page);
+   page_cache_release(page);
+}
+
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from with to truncate
+ *
  * Truncate the page cache at a set offset, removing the pages
  * that are beyond that offset (and zeroing out partial pages).
+ * If any page is locked we wait for it to become unlocked.
  */
 void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
 {
@@ -168,11 +198,10 @@
 
page = list_entry(curr, struct page, list);
curr = curr->next;
-
offset = page->index;
 
-   /* page wholly truncated - free it */
-   if (offset >= start) {
+   /* Is one of the pages to truncate? */
+   if ((offset >= start) || (partial && (offset + 1) == start)) {
if (TryLockPage(page)) {
page_cache_get(page);
spin_unlock(&pagecache_lock);
@@ -183,22 +212,14 @@
page_cache_get(page);
spin_unlock(&pagecache_lock);
 
-   if (!page->buffers || block_flushpage(page, 0))
-   lru_cache_del(page);
-
-   /*
-* We remove the page from the page cache
-* _after_ we have destroyed all buffer-cache
-* references to it. Otherwise some other process
-* might think this inode page is not in the
-* page cache and creates a buffer-cache alias
-* to it causing all sorts of fun problems ...
-*/
-   remove_inode_page(page);
+   if (partial && (offset + 1) == start) {
+   truncate_partial_page(page, partial);
+   partial = 0;
+   } else 
+   truncate_complete_page(page);
 
UnlockPage(page);
page_cache_release(page);
-   page_cache_release(page);
 
/*
 * We have done things without the pagecache lock,
@@ -209,37 +230,6 @@
 */
goto repeat;
}
-   /*
-* there is only one partial page possible.
-*/
-   if (!partial)
-   continue;
-
-   /* and it's the one preceeding the first wholly truncated page */
-   if ((offset + 1) != start)
-   continue;
-
-   /* partial truncate, clear end of page */
-   if (TryLockPage(page)) {
-   spin_unlock(&pagecache_lock);
-   goto repeat;
-   }
-   page_cache_get(page);
-   spin_unlock(&pagecache_lock);
-
-   memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
-   if (page->buffers)
-   block_flushpage(page, partial);
-
-   partial = 0;
-
-   /*
-* we have dropped the spinlock so we have to
-* restart.
-*/
-   UnlockPage(page);
-   page_cache_release(page);
-   goto repeat;
}
spin_unlock(&pagecache_lock);
 }


-- 
In theory, pract

PATCH: Rewrite of truncate_inode_pages

2000-05-14 Thread Juan J. Quintela


Hi
I have just rewritten the function truncate_inode_pages; it did
busy waiting for the partial page.  With the new rewrite, we do the
same locking for normal pages and partial pages.  This will help
in having fewer special cases.

Comments?

Later, Juan.

PS. This mail is also being sent to linux-fsdevel asking for
opinions/suggestions; they helped find a bug in the rewrite of
invalidate_inode_pages. 

diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude work/mm/filemap.c 
testing/mm/filemap.c
--- work/mm/filemap.c   Fri May 12 23:46:46 2000
+++ testing/mm/filemap.cSun May 14 22:08:45 2000
@@ -146,9 +146,39 @@
spin_unlock(&pagecache_lock);
 }
 
-/*
+static inline void truncate_partial_page(struct page *page, unsigned partial)
+{
+   memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
+   
+   if (page->buffers)
+   block_flushpage(page, partial);
+
+}
+
+static inline void truncate_complete_page(struct page *page)
+{
+   if (!page->buffers || block_flushpage(page, 0))
+   lru_cache_del(page);
+   
+   /*
+* We remove the page from the page cache _after_ we have
+* destroyed all buffer-cache references to it. Otherwise some
+* other process might think this inode page is not in the
+* page cache and creates a buffer-cache alias to it causing
+* all sorts of fun problems ...  
+*/
+   remove_inode_page(page);
+   page_cache_release(page);
+}
+
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from with to truncate
+ *
  * Truncate the page cache at a set offset, removing the pages
  * that are beyond that offset (and zeroing out partial pages).
+ * If any page is locked we wait for it to become unlocked.
  */
 void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
 {
@@ -168,11 +198,10 @@
 
page = list_entry(curr, struct page, list);
curr = curr->next;
-
offset = page->index;
 
-   /* page wholly truncated - free it */
-   if (offset >= start) {
+   /* Is one of the pages to truncate? */
+   if ((offset >= start) || (partial && (offset + 1) == start)) {
if (TryLockPage(page)) {
page_cache_get(page);
spin_unlock(&pagecache_lock);
@@ -183,22 +212,14 @@
page_cache_get(page);
spin_unlock(&pagecache_lock);
 
-   if (!page->buffers || block_flushpage(page, 0))
-   lru_cache_del(page);
-
-   /*
-* We remove the page from the page cache
-* _after_ we have destroyed all buffer-cache
-* references to it. Otherwise some other process
-* might think this inode page is not in the
-* page cache and creates a buffer-cache alias
-* to it causing all sorts of fun problems ...
-*/
-   remove_inode_page(page);
+   if (partial && (offset + 1) == start) {
+   truncate_partial_page(page, partial);
+   partial = 0;
+   } else 
+   truncate_complete_page(page);
 
UnlockPage(page);
page_cache_release(page);
-   page_cache_release(page);
 
/*
 * We have done things without the pagecache lock,
@@ -209,37 +230,6 @@
 */
goto repeat;
}
-   /*
-* there is only one partial page possible.
-*/
-   if (!partial)
-   continue;
-
-   /* and it's the one preceeding the first wholly truncated page */
-   if ((offset + 1) != start)
-   continue;
-
-   /* partial truncate, clear end of page */
-   if (TryLockPage(page)) {
-   spin_unlock(&pagecache_lock);
-   goto repeat;
-   }
-   page_cache_get(page);
-   spin_unlock(&pagecache_lock);
-
-   memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
-   if (page->buffers)
-   block_flushpage(page, partial);
-
-   partial = 0;
-
-   /*
-* we have dropped the spinlock so we have to
-* restart.
-*/
-   UnlockPage(page);
-   page_cache_release(page);
-   goto repeat;

Re: PATCH: rewrite of invalidate_inode_pages

2000-05-12 Thread Juan J. Quintela

> "trond" == Trond Myklebust <[EMAIL PROTECTED]> writes:

> " " == Arjan van de Ven <[EMAIL PROTECTED]> writes:
..

>> I'd vote for "invalidate_unlocked_inode_pages", as it also
>> suggests that the locked pages aren't invalidated.

trond> That sounds very good to me. Just so long as the name becomes more
trond> self-documenting than it is now.
trond> Intelligent people are making mistakes about what we want to do with
trond> this function, so it definitely needs to be documented more clearly.

Here is a patch that changes all the references in the kernel from
invalidate_inode_pages to invalidate_unlocked_inode_pages.

I have CC'd the mail to linux-fsdevel because the SMB people also use
that function.

Comments?

Later, Juan.

PS. This patch applies on top of my previous patch to this function,
which you can get from the MM list or from:

http://carpanta.dc.fi.udc.es/~quintela/kernel/2.3.99-pre7/page_cache_get.diff


diff -u -urN --exclude=CVS --exclude=*~ --exclude=.#* --exclude=TAGS 
works/fs/nfs/inode.c testing/fs/nfs/inode.c
--- works/fs/nfs/inode.cWed Apr 26 02:28:56 2000
+++ testing/fs/nfs/inode.c  Fri May 12 19:22:00 2000
@@ -586,7 +586,7 @@
NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode);
NFS_ATTRTIMEO_UPDATE(inode) = jiffies;
 
-   invalidate_inode_pages(inode);
+   invalidate_unlocked_inode_pages(inode);
 
memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode)));
NFS_CACHEINV(inode);
@@ -982,12 +982,12 @@
  * of the server's inode.
  *
  * This is a bit tricky because we have to make sure all dirty pages
- * have been sent off to the server before calling invalidate_inode_pages.
- * To make sure no other process adds more write requests while we try
- * our best to flush them, we make them sleep during the attribute refresh.
+ * have been sent off to the server before calling
+ * invalidate_unlocked_inode_pages.  To make sure no other process
+ * adds more write requests while we try our best to flush them, we
+ * make them sleep during the attribute refresh.
  *
- * A very similar scenario holds for the dir cache.
- */
+ * A very similar scenario holds for the dir cache.  */
 int
 nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
diff -u -urN --exclude=CVS --exclude=*~ --exclude=.#* --exclude=TAGS 
works/fs/smbfs/cache.c testing/fs/smbfs/cache.c
--- works/fs/smbfs/cache.c  Sat Dec  4 09:30:56 1999
+++ testing/fs/smbfs/cache.cFri May 12 19:22:21 2000
@@ -326,7 +326,7 @@
 * Get rid of any unlocked pages, and clear the
 * 'valid' flag in case a scan is in progress.
 */
-   invalidate_inode_pages(dir);
+   invalidate_unlocked_inode_pages(dir);
dir->u.smbfs_i.cache_valid &= ~SMB_F_CACHEVALID;
dir->u.smbfs_i.oldmtime = 0;
 }
diff -u -urN --exclude=CVS --exclude=*~ --exclude=.#* --exclude=TAGS 
works/fs/smbfs/inode.c testing/fs/smbfs/inode.c
--- works/fs/smbfs/inode.c  Thu Mar 16 01:51:14 2000
+++ testing/fs/smbfs/inode.cFri May 12 19:22:12 2000
@@ -205,7 +205,7 @@
 * But we do want to invalidate the caches ...
 */
if (!S_ISDIR(inode->i_mode))
-   invalidate_inode_pages(inode);
+   invalidate_unlocked_inode_pages(inode);
else
smb_invalid_dir_cache(inode);
error = -EIO;
@@ -263,7 +263,7 @@
 (long) last_time, (long) inode->i_mtime);
 #endif
if (!S_ISDIR(inode->i_mode))
-   invalidate_inode_pages(inode);
+   invalidate_unlocked_inode_pages(inode);
else
smb_invalid_dir_cache(inode);
}
diff -u -urN --exclude=CVS --exclude=*~ --exclude=.#* --exclude=TAGS 
works/include/linux/fs.h testing/include/linux/fs.h
--- works/include/linux/fs.hFri May 12 01:11:42 2000
+++ testing/include/linux/fs.h  Fri May 12 19:22:52 2000
@@ -954,7 +954,7 @@
 extern void balance_dirty(kdev_t);
 extern int check_disk_change(kdev_t);
 extern int invalidate_inodes(struct super_block *);
-extern void invalidate_inode_pages(struct inode *);
+extern void invalidate_unlocked_inode_pages(struct inode *);
 #define invalidate_buffers(dev)__invalidate_buffers((dev), 0)
 #define destroy_buffers(dev)   __invalidate_buffers((dev), 1)
 extern void __invalidate_buffers(kdev_t dev, int);
diff -u -urN --exclude=CVS --exclude=*~ --exclude=.#* --exclude=TAGS 
works/kernel/ksyms.c testing/kernel/ksyms.c
--- works/kernel/ksyms.cFri May 12 01:11:43 2000
+++ testing/kernel/ksyms.c  Fri May 12 19:22:35 2000
@@ -175,7 +175,7 @@
 EXPORT_SYMBOL(check_disk_change);
 EXPORT_SYMBOL(__invalidate_buffers);
 EXPORT_SYMBOL(invalidate_inodes);
-EXPORT_SYMBOL(invalidate_inode_pages);
+EXPORT_SYMBOL(invalidate_unlocked_inode_pages);
 EXPORT_SYMBOL(truncate_inode_pages);
 EXPORT_SYMBOL(fsyn