PATCH: WIP super lock contention mitigation in 2.4
Hi I have ported Stephen patch from 2.2.8 to 2.4, but now I have contention in the inode bitmaps. Can I use the same trick there that he used in the block bitmap??? Later, Juan. diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude base/fs/ext2/balloc.c working/fs/ext2/balloc.c --- base/fs/ext2/balloc.c Sat May 20 20:24:56 2000 +++ working/fs/ext2/balloc.cWed Jul 12 03:55:11 2000 @@ -9,6 +9,13 @@ * Enhanced block allocation by Stephen Tweedie ([EMAIL PROTECTED]), 1993 * Big-endian to little-endian byte-swapping/bitmaps by *David S. Miller ([EMAIL PROTECTED]), 1995 + * + * Dropped use of the superblock lock. + * This means that we have to take extra care not to block between any + * operations on the bitmap and the corresponding update to the group + * descriptors. + * Stephen C. Tweedie ([EMAIL PROTECTED]), 1999 + * */ #include @@ -70,42 +77,33 @@ } /* - * Read the bitmap for a given block_group, reading into the specified - * slot in the superblock's bitmap cache. + * Read the bitmap for a given block_group. * - * Return >=0 on success or a -ve error code. + * Return the buffer_head on success or NULL on IO error. */ -static int read_block_bitmap (struct super_block * sb, - unsigned int block_group, - unsigned long bitmap_nr) +static struct buffer_head * read_block_bitmap (struct super_block * sb, +unsigned int block_group) { struct ext2_group_desc * gdp; struct buffer_head * bh = NULL; - int retval = -EIO; gdp = ext2_get_group_desc (sb, block_group, NULL); if (!gdp) - goto error_out; - retval = 0; + return NULL; + bh = bread (sb->s_dev, le32_to_cpu(gdp->bg_block_bitmap), sb->s_blocksize); if (!bh) { ext2_error (sb, "read_block_bitmap", "Cannot read block bitmap - " "block_group = %d, block_bitmap = %lu", block_group, (unsigned long) gdp->bg_block_bitmap); - retval = -EIO; + return NULL; } - /* -* On IO error, just leave a zero in the superblock's block pointer for -* this group. The IO will be retried next time. 
-*/ -error_out: - sb->u.ext2_sb.s_block_bitmap_number[bitmap_nr] = block_group; - sb->u.ext2_sb.s_block_bitmap[bitmap_nr] = bh; - return retval; + return bh; } + /* * load_block_bitmap loads the block bitmap for a blocks group * @@ -122,9 +120,9 @@ static int __load_block_bitmap (struct super_block * sb, unsigned int block_group) { - int i, j, retval = 0; + int i, j; unsigned long block_bitmap_number; - struct buffer_head * block_bitmap; + struct buffer_head * block_bitmap, *cached_bitmap = NULL; if (block_group >= sb->u.ext2_sb.s_groups_count) ext2_panic (sb, "load_block_bitmap", @@ -132,27 +130,34 @@ "block_group = %d, groups_count = %lu", block_group, sb->u.ext2_sb.s_groups_count); +retry: if (sb->u.ext2_sb.s_groups_count <= EXT2_MAX_GROUP_LOADED) { if (sb->u.ext2_sb.s_block_bitmap[block_group]) { if (sb->u.ext2_sb.s_block_bitmap_number[block_group] == - block_group) + block_group) { + brelse(cached_bitmap); return block_group; + } ext2_error (sb, "__load_block_bitmap", "block_group != block_bitmap_number"); } - retval = read_block_bitmap (sb, block_group, block_group); - if (retval < 0) - return retval; + if (!cached_bitmap) + goto load; + sb->u.ext2_sb.s_block_bitmap_number[block_group] = block_group; + sb->u.ext2_sb.s_block_bitmap[block_group] = cached_bitmap; return block_group; } for (i = 0; i < sb->u.ext2_sb.s_loaded_block_bitmaps && sb->u.ext2_sb.s_block_bitmap_number[i] != block_group; i++) ; + + /* already cached? */ if (i < sb->u.ext2_sb.s_loaded_block_bitmaps && sb->u.ext2_sb.s_block_bitmap_number[i] == block_group) { block_bitmap_number = sb->u.ext2_sb.s_block_bitmap_number[i]; block_bitmap = sb->u.ext2_sb.s_block_bitmap[i]; + for (j = i; j > 0; j--) { sb->u.ext2_sb.s_block_bitmap_number[j] = sb->u.ext2_sb.s_block_bitmap_number[j - 1]; @@ -161,28 +166,36 @@ }
PATCH: Trying to get back IO performance (WIP)
Hi This patch is against test3-pre2. It gives here good performance in the first run, and very bad in the following ones of dbench 48. I am hitting here problems with the locking scheme. I get a lot of contention in __wait_on_supper. Almost all the dbench processes are waiting in: 0xc7427dcc 0xc0116fbd schedule+0x389 (0xc4840c20, 0x12901d, 0xc7427ea0, 0x123456 7, 0xc7426000) kernel .text 0xc010 0xc0116c34 0xc01173c0 0xc013639c __wait_on_super+0x184 (0xc13f4c00) kernel .text 0xc010 0xc0136218 0xc0136410 0xc01523e5 ext2_alloc_block+0x21 (0xc4840c20, 0x12901d, 0xc7427ea0) kernel .text 0xc010 0xc01523c4 0xc015245c 0xc7427e5c 0xc0152892 block_getblk+0x15e (0xc4840c20, 0xc316b5e0, 0x9, 0x15, 0xc 7427ea0) kernel .text 0xc010 0xc0152734 0xc0152a68 0xc7427eac 0xc0152ed0 ext2_get_block+0x468 (0xc4840c20, 0x15, 0xc2d99de0, 0x1) kernel .text 0xc010 0xc0152a68 0xc0152fc0 0xc7427ef4 0xc0133ae3 __block_prepare_write+0xe7 (0xc4840c20, 0xc11f7f78, 0x0, 0 x1000, 0xc0152a68) kernel .text 0xc010 0xc01339fc 0xc0133bf0 0xc7427f18 0xc0134121 block_prepare_write+0x21 (0xc11f7f78, 0x0, 0x1000, 0xc0152 a68) kernel .text 0xc010 0xc0134100 0xc013413c [0]more> 0xc7427f30 0xc01531d1 ext2_prepare_write+0x19 (0xc06b4f00, 0xc11f7f78, 0x0, 0x10 00) kernel .text 0xc010 0xc01531b8 0xc01531d8 0xc7427f90 0xc0127b8d generic_file_write+0x305 (0xc06b4f00, 0x8050461, 0xafc2, 0 xc06b4f20) kernel .text 0xc010 0xc0127888 0xc0127cf0 0xc7427fbc 0xc0130ea8 sys_write+0xe8 (0x9, 0x804b460, 0xffc3, 0x28, 0x1082) kernel .text 0xc010 0xc0130dc0 0xc0130ed0 0xc0109874 system_call+0x34 kernel .text 0xc010 0xc0109840 0xc0109878 This behavior also happens with vanilla kernel, only that it is not so easy to reproduce. Vanilla Kernel also gives normally worse IO throughput than the first run with this patch. Comments/suggerences/fixes are welcome. Later, Juan. This patch does: - Introduces WRITETRY logic that means: write this buffer if that is possible, but never block. If we have to block, skip the write. 
(mainly from Jens axboe) - Uses that logic in sync_page_buffers(), i.e. never try to block in writes, when writing buffers. - Do all the blocking/waiting calling balance_dirty() in shrink_mmap. - export the sync_page_buffers function. - make try_to_free_buffers never generate any IO. - Change the caller of that two functions accordingly with the new semantics. diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude base/drivers/block/ll_rw_blk.c working/drivers/block/ll_rw_blk.c --- base/drivers/block/ll_rw_blk.c Fri Jun 30 18:42:30 2000 +++ working/drivers/block/ll_rw_blk.c Mon Jul 3 00:03:00 2000 @@ -556,7 +556,7 @@ unsigned int sector, count; int max_segments = MAX_SEGMENTS; struct request * req = NULL; - int rw_ahead, max_sectors, el_ret; + int r_ahead, w_ahead, max_sectors, el_ret; struct list_head *head = &q->queue_head; int latency; elevator_t *elevator = &q->elevator; @@ -584,30 +584,24 @@ } } - rw_ahead = 0; /* normal case; gets changed below for READA */ + r_ahead = w_ahead = 0; switch (rw) { case READA: - rw_ahead = 1; + r_ahead = 1; rw = READ; /* drop into READ */ case READ: if (buffer_uptodate(bh)) /* Hmmph! Already have it */ goto end_io; kstat.pgpgin++; break; - case WRITERAW: - rw = WRITE; - goto do_write; /* Skip the buffer refile */ + case WRITETRY: + w_ahead = 1; case WRITE: if (!test_and_clear_bit(BH_Dirty, &bh->b_state)) goto end_io;/* Hmmph! Nothing to write */ refile_buffer(bh); - do_write: - /* -* We don't allow the write-requests to fill up the -* queue completely: we want some room for reads, -* as they take precedence. The last third of the -* requests are only for reads. -*/ + case WRITERAW: /* Skip the buffer refile */ + rw = WRITE; kstat.pgpgout++; break;
Too much contention in __wait_on_super
Hi I have been changing the write of buffers in the IO layer, and I have found that the system gets a lot of contention in __wait_on_super(). I am using test3-pre1 + kdb patch. I see also the stalls/vmstat strange output without the kdb patch. I use it to be able to see the back-traces. You can also look at the vmstat output. I see that vmstat 1 output when running dbench 48, the results doesn't make sense that we are almost not doing IO in an IO test : vmstat output: r b w swpd free buff cache si sobibo incs us sy id 0 48 2 5004 3112 1948 112652 0 0 0 151 221 238 2 1 96 0 48 2 5004 3108 1948 112652 0 0 0 133 195 188 0 0 99 0 48 2 5004 3108 1948 112652 0 0 0 141 192 184 0 0 99 when things are working properly, that field is between 1000-2000. Now the traces: Almost all the dbench processes are stuck with the first back-trace. Someone find some sense to that output and/or know how to fix that problem. Any comment are welcome. If you need more information, let me know. Later, Juan. Most common back-trace: 0xc721bdcc 0xc0116fbd schedule+0x389 (0xc6e3bb60, 0x98e01, 0xc721bea0, 0x1234567 , 0xc721a000) kernel .text 0xc010 0xc0116c34 0xc01173c0 0xc01363ac __wait_on_super+0x184 (0xc13f4c00) kernel .text 0xc010 0xc0136228 0xc0136420 0xc01523f5 ext2_alloc_block+0x21 (0xc6e3bb60, 0x98e01, 0xc721bea0) kernel .text 0xc010 0xc01523d4 0xc015246c 0xc721be5c 0xc01528a2 block_getblk+0x15e (0xc6e3bb60, 0xc2f75c20, 0x8d, 0x99, 0x c721bea0) kernel .text 0xc010 0xc0152744 0xc0152a78 0xc721beac 0xc0152ee0 ext2_get_block+0x468 (0xc6e3bb60, 0x99, 0xc221e440, 0x1) kernel .text 0xc010 0xc0152a78 0xc0152fd0 0xc721bef4 0xc0133af3 __block_prepare_write+0xe7 (0xc6e3bb60, 0xc11c1ee8, 0x0, 0 x1000, 0xc0152a78) kernel .text 0xc010 0xc0133a0c 0xc0133c00 0xc721bf18 0xc0134131 block_prepare_write+0x21 (0xc11c1ee8, 0x0, 0x1000, 0xc0152 a78) kernel .text 0xc010 0xc0134110 0xc013414c 0xc721bf30 0xc01531e1 ext2_prepare_write+0x19 (0xc719b5e0, 0xc11c1ee8, 0x0, 0x10 00) kernel .text 0xc010 0xc01531c8 
0xc01531e8 0xc721bf90 0xc0127b8d generic_file_write+0x305 (0xc719b5e0, 0x8054469, 0x6fba, 0 xc719b600) kernel .text 0xc010 0xc0127888 0xc0127cf0 0xc721bfbc 0xc0130ea8 sys_write+0xe8 (0x9, 0x804b460, 0xffc3, 0x28, 0x1092) kernel .text 0xc010 0xc0130dc0 0xc0130ed0 0xc0109874 system_call+0x34 kernel .text 0xc010 0xc0109840 0xc0109878 or in: 0xc320de58 0xc0116fbd schedule+0x389 (0xc4c130c0, 0xc13f4c00, 0xc4c138e0, 0x1234 567, 0xc320c000) kernel .text 0xc010 0xc0116c34 0xc01173c0 0xc01363ac __wait_on_super+0x184 (0xc13f4c00) kernel .text 0xc010 0xc0136228 0xc0136420 0xc0151ce9 ext2_new_inode+0x6d (0xc4c130c0, 0x8180, 0xc320df00) kernel .text 0xc010 0xc0151c7c 0xc01521dc 0xc320df04 0xc0154555 ext2_create+0x1d (0xc4c130c0, 0xc5c17b60, 0x8180) kernel .text 0xc010 0xc0154538 0xc01545f4 0xc320df28 0xc013df64 vfs_create+0xdc (0xc4c130c0, 0xc5c17b60, 0x180) kernel .text 0xc010 0xc013de88 0xc013dfcc 0xc320df58 0xc013e198 open_namei+0x1cc (0xc4c12000, 0x243, 0x180, 0xc320df7c) kernel .text 0xc010 0xc013dfcc 0xc013e6cc 0xc320df98 0xc0130376 filp_open+0x3a (0xc4c12000, 0x242, 0x180) kernel .text 0xc010 0xc013033c 0xc0130398 0xc320dfbc 0xc01306dc sys_open+0x68 (0xbc4c, 0x242, 0x180, 0x242, 0xbc4c ) kernel .text 0xc010 0xc0130674 0xc01307dc 0xc0109874 system_call+0x34 kernel .text 0xc010 0xc0109840 0xc0109878 or in: 0xc5a19e50 0xc0116fbd schedule+0x389 (0xc5b58a00, 0x1000, 0xfff4, 0xc5a18000 , 0x1234567) kernel .text 0xc010 0xc0116c34 0xc01173c0 0xc0131db8 __wait_on_buffer+0x1d8 (0xc5b58a00) kernel .text 0xc010 0xc0131be0 0xc0131e34 0xc0133773 unmap_buffer+0x33 (0xc5b58a00) kernel .text 0xc010 0xc0133740 0xc01337a0 0xc5a19eb8 0xc01338e1 unmap_underlying_metadata+0x29 (0xc536fc00) [1]more> kernel .text 0xc010 0xc01338b8 0xc01338f0 0xc5a19ef4 0xc0133b0f __block_prepare_write+0x103 (0xc62c25e0, 0xc110e680, 0x0, 0x1000, 0xc0152a78)
Fwd: PATCH: Rewrite of truncate_inode_pages (WIP)
[Sorry if you receives it twice, I forget to CC: fsdevel in my first mail] Hi the actual truncate_inode_pages can actually *write* part of the file that has just been truncated. That makes that things doesn't work. I can trigger the BUG in fs:/inode.c:clear_inode using mmap002 and riel aging patch in 2.4.0-test1. The BUG triggered is: if (inode->i_data.nrpages) BUG(); The problem is that inode->i_data.nrpages can become negative. I have got here -6 truncating a file of 384MB. This patch deals with this problem, and try to do truncate_inode_pages safe. Notice that the function only works for truncate complete files, I have left the old function for the partial case. If people think that this function is OK, I will incorporate the partial case. I have no done it because I want feedback from the MM people and the FS people, and the algorithms for complete files are cleaner. I would like to now if it is possible to have a locked buffer without the page being locked, i.e. If I can manipulate the buffers in one page without locking them when I have the page lock. (I don't think so, and then I have to lock also buffers). With this patch things are rock solid here. This patch does: - defines a new function: discard_buffer, it will lock the buffer (waiting if needed) and remove the buffer from all the queues. It is like unmap_buffer, but makes sure that we don't do any IO and that we remove the buffer from all the lists. - defines a new function: block_destroy_buffers: it is a mix of block_flushpage and do_try_to_free_pages. It will make all the buffers in that page disappear calling discard_buffer. Notice the way that we iterate through all the buffer heads. I think that it is not racy, but I would like to hear comments from people than know more about buffer heads handling. - It changes __remove_inode_pages to check that there are not buffers in that page. Users of the function must make sure about that. It changes all the callers of the function satisfy that requirement. 
- I change invalidate_inode_pages (again). Now block_destroy_buffers can wait, then we are *civilized* citizens and drop any lock that we have before call that block_destroy_buffers, and reaquire later. - It has my *old* rewrite of truncate_inode_pages to use two auxiliary functions and lock the same way for the partial page and for the rest. (That code is wrong anyway, pass to the next function). - The new function truncate_inode_pages, that is a copy of invalidate_inodes, but we wait for locked pages. Comments, please?? I am very interested in the coments from the fs-people. What they think about the locking that I am using in the buffer.c file. Acknolegments: This patch would have not been possible without the comments or arjan, bcrl, davej, riel and RogerL, in #kernelnewbies (irc.openprojects.net). They did a lot of review of previous versions of the patch. The errors are mine, but good ideas come from discussions with them :) Later, Juan. diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude base/fs/buffer.c testing/fs/buffer.c --- base/fs/buffer.cTue May 30 01:21:47 2000 +++ testing/fs/buffer.c Tue May 30 02:33:50 2000 @@ -1281,6 +1281,56 @@ } } +/** + * discard_buffer - discard that buffer without doing any IO + * @bh: buffer to discard + * + * This function removes a buffer from all the queues, without doing + * any IO, we are not interested in the contents of the buffer. This + * function can block if the buffer is locked. + */ +static struct buffer_head *discard_buffer(struct buffer_head * bh) +{ + int index = BUFSIZE_INDEX(bh->b_size); + struct buffer_head *next; + + /* grab the lru lock here to block bdflush. 
*/ + atomic_inc(&bh->b_count); + lock_buffer(bh); + next = bh->b_this_page; + clear_bit(BH_Uptodate, &bh->b_state); + clear_bit(BH_Mapped, &bh->b_state); + clear_bit(BH_Req, &bh->b_state); + clear_bit(BH_New, &bh->b_state); + + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + spin_lock(&free_list[index].lock); + spin_lock(&unused_list_lock); + + if (!atomic_dec_and_test(&bh->b_count)) + BUG(); + + __hash_unlink(bh); + /* The bunffer can be either on the regular +* queues or on the free list.. +*/ + if (bh->b_dev != B_FREE) + __remove_from_queues(bh); + else + __remove_from_free_list(bh, index); + __put_unused_buffer_head(bh); + spin_unlock(&unused_list_lock); + write_unlock(&hash_table_lock); + spin_unlock(&free_list[index].lock); + spin_unlock(&lru_list_lock); + /* We can unlock the buffer, we have just returned it. +* Ditto for the counter + */ + return next; +} + + /*
PATCH: Rewrite of truncate_inode_pages (take 4)
>>>>> "juan" == Juan J Quintela <[EMAIL PROTECTED]> writes: Hi Linus I am resending to you this patch, it works here, and nobody has complained about it the previous times that I post it here. Later, Juan. juan> I have reworte the function truncate_inode_pages. The version juan> in vanilla pre9-3 does busy waiting in the partial page, with juan> this version the locking for the partial page and from the juan> rest of the pages is the same. This make that we have less juan> special cases. For the rest of pages the function works the juan> same. The only difference is that version is cleaner IMHO. juan> Or there are some corner case that I have failed to see? juan> Comments? juan> Later, Juan. juan> I have CC: the linux-fsdevel people, they are the users of juan> that function, could somebody give me some feedback against juan> the change? diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude work/mm/filemap.c testing/mm/filemap.c --- work/mm/filemap.c Fri May 12 23:46:46 2000 +++ testing/mm/filemap.cSun May 14 22:08:45 2000 @@ -146,9 +146,39 @@ spin_unlock(&pagecache_lock); } -/* +static inline void truncate_partial_page(struct page *page, unsigned partial) +{ + memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); + + if (page->buffers) + block_flushpage(page, partial); + +} + +static inline void truncate_complete_page(struct page *page) +{ + if (!page->buffers || block_flushpage(page, 0)) + lru_cache_del(page); + + /* +* We remove the page from the page cache _after_ we have +* destroyed all buffer-cache references to it. Otherwise some +* other process might think this inode page is not in the +* page cache and creates a buffer-cache alias to it causing +* all sorts of fun problems ... 
+*/ + remove_inode_page(page); + page_cache_release(page); +} + +/** + * truncate_inode_pages - truncate *all* the pages from an offset + * @mapping: mapping to truncate + * @lstart: offset from with to truncate + * * Truncate the page cache at a set offset, removing the pages * that are beyond that offset (and zeroing out partial pages). + * If any page is locked we wait for it to become unlocked. */ void truncate_inode_pages(struct address_space * mapping, loff_t lstart) { @@ -168,11 +198,10 @@ page = list_entry(curr, struct page, list); curr = curr->next; - offset = page->index; - /* page wholly truncated - free it */ - if (offset >= start) { + /* Is one of the pages to truncate? */ + if ((offset >= start) || (partial && (offset + 1) == start)) { if (TryLockPage(page)) { page_cache_get(page); spin_unlock(&pagecache_lock); @@ -183,22 +212,14 @@ page_cache_get(page); spin_unlock(&pagecache_lock); - if (!page->buffers || block_flushpage(page, 0)) - lru_cache_del(page); - - /* -* We remove the page from the page cache -* _after_ we have destroyed all buffer-cache -* references to it. Otherwise some other process -* might think this inode page is not in the -* page cache and creates a buffer-cache alias -* to it causing all sorts of fun problems ... -*/ - remove_inode_page(page); + if (partial && (offset + 1) == start) { + truncate_partial_page(page, partial); + partial = 0; + } else + truncate_complete_page(page); UnlockPage(page); page_cache_release(page); - page_cache_release(page); /* * We have done things without the pagecache lock, @@ -209,37 +230,6 @@ */ goto repeat; } - /* -* there is only one partial page possible. -*/ - if (!partial) - continue; - - /* and it's the one preceeding the first wholly truncated page */ - if ((offset + 1) != start) - continue; - - /* partial truncate, clear end of page */ -
PATCH: Rewrite of truncate_inode_pages (take 3)
Hi I have reworte the function truncate_inode_pages. The version in vanilla pre9-3 does busy waiting in the partial page, with this version the locking for the partial page and from the rest of the pages is the same. This make that we have less special cases. For the rest of pages the function works the same. The only difference is that version is cleaner IMHO. Or there are some corner case that I have failed to see? Comments? Later, Juan. I have CC: the linux-fsdevel people, they are the users of that function, could somebody give me some feedback against the change? diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude work/mm/filemap.c testing/mm/filemap.c --- work/mm/filemap.c Fri May 12 23:46:46 2000 +++ testing/mm/filemap.cSun May 14 22:08:45 2000 @@ -146,9 +146,39 @@ spin_unlock(&pagecache_lock); } -/* +static inline void truncate_partial_page(struct page *page, unsigned partial) +{ + memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); + + if (page->buffers) + block_flushpage(page, partial); + +} + +static inline void truncate_complete_page(struct page *page) +{ + if (!page->buffers || block_flushpage(page, 0)) + lru_cache_del(page); + + /* +* We remove the page from the page cache _after_ we have +* destroyed all buffer-cache references to it. Otherwise some +* other process might think this inode page is not in the +* page cache and creates a buffer-cache alias to it causing +* all sorts of fun problems ... +*/ + remove_inode_page(page); + page_cache_release(page); +} + +/** + * truncate_inode_pages - truncate *all* the pages from an offset + * @mapping: mapping to truncate + * @lstart: offset from with to truncate + * * Truncate the page cache at a set offset, removing the pages * that are beyond that offset (and zeroing out partial pages). + * If any page is locked we wait for it to become unlocked. 
*/ void truncate_inode_pages(struct address_space * mapping, loff_t lstart) { @@ -168,11 +198,10 @@ page = list_entry(curr, struct page, list); curr = curr->next; - offset = page->index; - /* page wholly truncated - free it */ - if (offset >= start) { + /* Is one of the pages to truncate? */ + if ((offset >= start) || (partial && (offset + 1) == start)) { if (TryLockPage(page)) { page_cache_get(page); spin_unlock(&pagecache_lock); @@ -183,22 +212,14 @@ page_cache_get(page); spin_unlock(&pagecache_lock); - if (!page->buffers || block_flushpage(page, 0)) - lru_cache_del(page); - - /* -* We remove the page from the page cache -* _after_ we have destroyed all buffer-cache -* references to it. Otherwise some other process -* might think this inode page is not in the -* page cache and creates a buffer-cache alias -* to it causing all sorts of fun problems ... -*/ - remove_inode_page(page); + if (partial && (offset + 1) == start) { + truncate_partial_page(page, partial); + partial = 0; + } else + truncate_complete_page(page); UnlockPage(page); page_cache_release(page); - page_cache_release(page); /* * We have done things without the pagecache lock, @@ -209,37 +230,6 @@ */ goto repeat; } - /* -* there is only one partial page possible. -*/ - if (!partial) - continue; - - /* and it's the one preceeding the first wholly truncated page */ - if ((offset + 1) != start) - continue; - - /* partial truncate, clear end of page */ - if (TryLockPage(page)) { - spin_unlock(&pagecache_lock); - goto repeat; - } - page_cache_get(page); - spin_unlock(&pagecache_lock); - - memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); - if (page->buffers) - block_flushpage(page, partial); - - p
PATCH: Rewrite of truncate_inode_pages (try2)
Hi Linus I just repost my patch to truncate_inode_pages. The version in vanilla pre9-2 does busy waiting in partial pages, this one didn't do busy waiting and put the locking code common for partial and non partial pages. Linus, if you don't like it, could you tell me what is the problem? Thanks, Later. diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude work/mm/filemap.c testing/mm/filemap.c --- work/mm/filemap.c Fri May 12 23:46:46 2000 +++ testing/mm/filemap.cSun May 14 22:08:45 2000 @@ -146,9 +146,39 @@ spin_unlock(&pagecache_lock); } -/* +static inline void truncate_partial_page(struct page *page, unsigned partial) +{ + memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); + + if (page->buffers) + block_flushpage(page, partial); + +} + +static inline void truncate_complete_page(struct page *page) +{ + if (!page->buffers || block_flushpage(page, 0)) + lru_cache_del(page); + + /* +* We remove the page from the page cache _after_ we have +* destroyed all buffer-cache references to it. Otherwise some +* other process might think this inode page is not in the +* page cache and creates a buffer-cache alias to it causing +* all sorts of fun problems ... +*/ + remove_inode_page(page); + page_cache_release(page); +} + +/** + * truncate_inode_pages - truncate *all* the pages from an offset + * @mapping: mapping to truncate + * @lstart: offset from with to truncate + * * Truncate the page cache at a set offset, removing the pages * that are beyond that offset (and zeroing out partial pages). + * If any page is locked we wait for it to become unlocked. */ void truncate_inode_pages(struct address_space * mapping, loff_t lstart) { @@ -168,11 +198,10 @@ page = list_entry(curr, struct page, list); curr = curr->next; - offset = page->index; - /* page wholly truncated - free it */ - if (offset >= start) { + /* Is one of the pages to truncate? 
*/ + if ((offset >= start) || (partial && (offset + 1) == start)) { if (TryLockPage(page)) { page_cache_get(page); spin_unlock(&pagecache_lock); @@ -183,22 +212,14 @@ page_cache_get(page); spin_unlock(&pagecache_lock); - if (!page->buffers || block_flushpage(page, 0)) - lru_cache_del(page); - - /* -* We remove the page from the page cache -* _after_ we have destroyed all buffer-cache -* references to it. Otherwise some other process -* might think this inode page is not in the -* page cache and creates a buffer-cache alias -* to it causing all sorts of fun problems ... -*/ - remove_inode_page(page); + if (partial && (offset + 1) == start) { + truncate_partial_page(page, partial); + partial = 0; + } else + truncate_complete_page(page); UnlockPage(page); page_cache_release(page); - page_cache_release(page); /* * We have done things without the pagecache lock, @@ -209,37 +230,6 @@ */ goto repeat; } - /* -* there is only one partial page possible. -*/ - if (!partial) - continue; - - /* and it's the one preceeding the first wholly truncated page */ - if ((offset + 1) != start) - continue; - - /* partial truncate, clear end of page */ - if (TryLockPage(page)) { - spin_unlock(&pagecache_lock); - goto repeat; - } - page_cache_get(page); - spin_unlock(&pagecache_lock); - - memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); - if (page->buffers) - block_flushpage(page, partial); - - partial = 0; - - /* -* we have dropped the spinlock so we have to -* restart. -*/ - UnlockPage(page); - page_cache_release(page); - goto repeat; } spin_unlock(&pagecache_lock); } -- In theory, pract
PATCH: Rewrite of truncate_inode_pages
Hi I have just rewrite the function truncate_inode_pages, it did busy waiting for the partial page, with the new rewrite, we do the same locking for normal pages and the partial pages. This will help in having less special cases. Comments? Later, Juan. PD. This mail is also sent to the linux-fsdevel asking for opininions/suggestions, they help to find a bug in the rewrite of invalidate_inode_pages. diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude work/mm/filemap.c testing/mm/filemap.c --- work/mm/filemap.c Fri May 12 23:46:46 2000 +++ testing/mm/filemap.cSun May 14 22:08:45 2000 @@ -146,9 +146,39 @@ spin_unlock(&pagecache_lock); } -/* +static inline void truncate_partial_page(struct page *page, unsigned partial) +{ + memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); + + if (page->buffers) + block_flushpage(page, partial); + +} + +static inline void truncate_complete_page(struct page *page) +{ + if (!page->buffers || block_flushpage(page, 0)) + lru_cache_del(page); + + /* +* We remove the page from the page cache _after_ we have +* destroyed all buffer-cache references to it. Otherwise some +* other process might think this inode page is not in the +* page cache and creates a buffer-cache alias to it causing +* all sorts of fun problems ... +*/ + remove_inode_page(page); + page_cache_release(page); +} + +/** + * truncate_inode_pages - truncate *all* the pages from an offset + * @mapping: mapping to truncate + * @lstart: offset from with to truncate + * * Truncate the page cache at a set offset, removing the pages * that are beyond that offset (and zeroing out partial pages). + * If any page is locked we wait for it to become unlocked. */ void truncate_inode_pages(struct address_space * mapping, loff_t lstart) { @@ -168,11 +198,10 @@ page = list_entry(curr, struct page, list); curr = curr->next; - offset = page->index; - /* page wholly truncated - free it */ - if (offset >= start) { + /* Is one of the pages to truncate? 
*/ + if ((offset >= start) || (partial && (offset + 1) == start)) { if (TryLockPage(page)) { page_cache_get(page); spin_unlock(&pagecache_lock); @@ -183,22 +212,14 @@ page_cache_get(page); spin_unlock(&pagecache_lock); - if (!page->buffers || block_flushpage(page, 0)) - lru_cache_del(page); - - /* -* We remove the page from the page cache -* _after_ we have destroyed all buffer-cache -* references to it. Otherwise some other process -* might think this inode page is not in the -* page cache and creates a buffer-cache alias -* to it causing all sorts of fun problems ... -*/ - remove_inode_page(page); + if (partial && (offset + 1) == start) { + truncate_partial_page(page, partial); + partial = 0; + } else + truncate_complete_page(page); UnlockPage(page); page_cache_release(page); - page_cache_release(page); /* * We have done things without the pagecache lock, @@ -209,37 +230,6 @@ */ goto repeat; } - /* -* there is only one partial page possible. -*/ - if (!partial) - continue; - - /* and it's the one preceeding the first wholly truncated page */ - if ((offset + 1) != start) - continue; - - /* partial truncate, clear end of page */ - if (TryLockPage(page)) { - spin_unlock(&pagecache_lock); - goto repeat; - } - page_cache_get(page); - spin_unlock(&pagecache_lock); - - memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); - if (page->buffers) - block_flushpage(page, partial); - - partial = 0; - - /* -* we have dropped the spinlock so we have to -* restart. -*/ - UnlockPage(page); - page_cache_release(page); - goto repeat;
Re: PATCH: rewrite of invalidate_inode_pages
> "trond" == Trond Myklebust <[EMAIL PROTECTED]> writes: > " " == Arjan van de Ven <[EMAIL PROTECTED]> writes: .. >> I'd vote for "invalidate_unlocked_inode_pages", as it also >> suggests that the locked pages aren't invalidated. trond> That sounds very good to me. Just so long as the name becomes more trond> self-documenting than it is now. trond> Intelligent people are making mistakes about what we want to do with trond> this function, so it definitely needs to be documented more clearly. Here it is a patch that changes all the refernences in the kernel from invalidate_inode_pages to invalidate_unlocked_inode_pages. I have CC: the mail to linux-fsdevel because the SMB people also use that function. Comments? Later, Juan. PD. This patch aplies in top of my previous patch to this function, that you can get from the MM-list of from: http://carpanta.dc.fi.udc.es/~quintela/kernel/2.3.99-pre7/page_cache_get.diff diff -u -urN --exclude=CVS --exclude=*~ --exclude=.#* --exclude=TAGS works/fs/nfs/inode.c testing/fs/nfs/inode.c --- works/fs/nfs/inode.cWed Apr 26 02:28:56 2000 +++ testing/fs/nfs/inode.c Fri May 12 19:22:00 2000 @@ -586,7 +586,7 @@ NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); NFS_ATTRTIMEO_UPDATE(inode) = jiffies; - invalidate_inode_pages(inode); + invalidate_unlocked_inode_pages(inode); memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); NFS_CACHEINV(inode); @@ -982,12 +982,12 @@ * of the server's inode. * * This is a bit tricky because we have to make sure all dirty pages - * have been sent off to the server before calling invalidate_inode_pages. - * To make sure no other process adds more write requests while we try - * our best to flush them, we make them sleep during the attribute refresh. + * have been sent off to the server before calling + * invalidate_unlocked_inode_pages. To make sure no other process + * adds more write requests while we try our best to flush them, we + * make them sleep during the attribute refresh. 
* - * A very similar scenario holds for the dir cache. - */ + * A very similar scenario holds for the dir cache. */ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) { diff -u -urN --exclude=CVS --exclude=*~ --exclude=.#* --exclude=TAGS works/fs/smbfs/cache.c testing/fs/smbfs/cache.c --- works/fs/smbfs/cache.c Sat Dec 4 09:30:56 1999 +++ testing/fs/smbfs/cache.cFri May 12 19:22:21 2000 @@ -326,7 +326,7 @@ * Get rid of any unlocked pages, and clear the * 'valid' flag in case a scan is in progress. */ - invalidate_inode_pages(dir); + invalidate_unlocked_inode_pages(dir); dir->u.smbfs_i.cache_valid &= ~SMB_F_CACHEVALID; dir->u.smbfs_i.oldmtime = 0; } diff -u -urN --exclude=CVS --exclude=*~ --exclude=.#* --exclude=TAGS works/fs/smbfs/inode.c testing/fs/smbfs/inode.c --- works/fs/smbfs/inode.c Thu Mar 16 01:51:14 2000 +++ testing/fs/smbfs/inode.cFri May 12 19:22:12 2000 @@ -205,7 +205,7 @@ * But we do want to invalidate the caches ... */ if (!S_ISDIR(inode->i_mode)) - invalidate_inode_pages(inode); + invalidate_unlocked_inode_pages(inode); else smb_invalid_dir_cache(inode); error = -EIO; @@ -263,7 +263,7 @@ (long) last_time, (long) inode->i_mtime); #endif if (!S_ISDIR(inode->i_mode)) - invalidate_inode_pages(inode); + invalidate_unlocked_inode_pages(inode); else smb_invalid_dir_cache(inode); } diff -u -urN --exclude=CVS --exclude=*~ --exclude=.#* --exclude=TAGS works/include/linux/fs.h testing/include/linux/fs.h --- works/include/linux/fs.hFri May 12 01:11:42 2000 +++ testing/include/linux/fs.h Fri May 12 19:22:52 2000 @@ -954,7 +954,7 @@ extern void balance_dirty(kdev_t); extern int check_disk_change(kdev_t); extern int invalidate_inodes(struct super_block *); -extern void invalidate_inode_pages(struct inode *); +extern void invalidate_unlocked_inode_pages(struct inode *); #define invalidate_buffers(dev)__invalidate_buffers((dev), 0) #define destroy_buffers(dev) __invalidate_buffers((dev), 1) extern void __invalidate_buffers(kdev_t dev, int); diff -u 
-urN --exclude=CVS --exclude=*~ --exclude=.#* --exclude=TAGS works/kernel/ksyms.c testing/kernel/ksyms.c --- works/kernel/ksyms.cFri May 12 01:11:43 2000 +++ testing/kernel/ksyms.c Fri May 12 19:22:35 2000 @@ -175,7 +175,7 @@ EXPORT_SYMBOL(check_disk_change); EXPORT_SYMBOL(__invalidate_buffers); EXPORT_SYMBOL(invalidate_inodes); -EXPORT_SYMBOL(invalidate_inode_pages); +EXPORT_SYMBOL(invalidate_unlocked_inode_pages); EXPORT_SYMBOL(truncate_inode_pages); EXPORT_SYMBOL(fsyn