Hi guys,
Could you test whether the following patch fixes the oops without causing any
other regressions?
Thanks,
Murali
On Mon, 10 Jul 2006, Number Cruncher wrote:
> I believe it was the interruption (Ctrl-C) of a compiler using files on
> the PVFS2 fs.
>
> Robert Latham wrote:
>
> >On Mon, Jul 10, 2006 at 02:46:19PM +0100, Number Cruncher wrote:
> >
> >
> >>I'm getting kernel oopses with the latest Fedora Core 4 smp kernel and
> >>pvfs2-1.5.1
> >>
> >>
> >
> >Hi Simon
> >
> >We've seen something like this too. The 2.6.17-something Fedora kernels
> >(2.6.17-1.2145_FC5 and the one prior) oops during
> >our nightly tests, but the 2.6.16-something kernels did fine. We're
> >seeing this when one of our nightly test scripts tries to exec several
> >instances of 'ls' simultaneously. What workload are you running when
> >you see this?
> >
> >thanks
> >==rob
> >
> >
> >
Index: src/kernel/linux-2.6/inode.c
===================================================================
RCS file: /anoncvs/pvfs2/src/kernel/linux-2.6/inode.c,v
retrieving revision 1.68
diff -u -r1.68 inode.c
--- src/kernel/linux-2.6/inode.c 23 Jun 2006 20:59:29 -0000 1.68
+++ src/kernel/linux-2.6/inode.c 12 Jul 2006 21:49:31 -0000
@@ -14,139 +14,27 @@
#include "pvfs2-bufmap.h"
#include "pvfs2-types.h"
-/** Read page-sized blocks from file. This code is only used in the mmap
- * path.
- *
- * \note Current implementation ignores max_blocks parameter and always
- * reads exactly one block. Because the only time this function
- * seems to be called is from pvfs2_get_block(), this appears to be
- * ok.
- */
-static int pvfs2_get_blocks(
- struct inode *inode,
- sector_t lblock,
- unsigned long max_blocks,
- struct buffer_head *bh_result,
- int create)
+static int read_one_page(struct page *page)
{
- int ret = -EIO;
- uint32_t max_block = 0;
+ void *page_data;
+ int ret, max_block;
ssize_t bytes_read = 0;
- struct page *page = NULL;
- void *page_data = NULL;
- pvfs2_inode_t *pvfs2_inode = PVFS2_I(inode);
-
- /*
- We're faking our inode block size to be PAGE_CACHE_SIZE
- to play nicely with the page cache.
-
- In some reality, inode->i_blksize != PAGE_CACHE_SIZE and
- inode->i_blkbits != PAGE_CACHE_SHIFT
- */
+ struct inode *inode = page->mapping->host;
const uint32_t blocksize = PAGE_CACHE_SIZE; /* inode->i_blksize */
const uint32_t blockbits = PAGE_CACHE_SHIFT; /* inode->i_blkbits */
- /* check assumption before continuing */
- if (max_blocks != 1) {
- pvfs2_error("pvfs2_get_blocks called with invalid max_blocks (%lu)\n",
- max_blocks);
- }
-
- pvfs2_print("pvfs2_get_blocks called for lblock %lu\n",
- (unsigned long)lblock);
-
- /*
- this is a quick workaround for the case when a sequence
- of sequential blocks are needed to be read, but the
- operation that issued it was cancelled by a signal.
- if we detect a sequential pattern of failures, don't
- even bother issuing the upcall since that's way more
- expensive to do and it will also fail due to the signal
- raised.
-
- the first failure will have the failed block as lblock-1,
- but the mpage code (the caller) tries twice in a row, so the
- second time, the failed block index will be == to lblock.
-
- NOTE: the easiest way to test this is to run a program
- from a pvfs2 volume and ctrl-c it before it's fully
- read into memory.
- */
- if ((pvfs2_inode->last_failed_block_index_read > 0) &&
- ((pvfs2_inode->last_failed_block_index_read == (lblock - 1)) ||
- (pvfs2_inode->last_failed_block_index_read == lblock)))
- {
- pvfs2_print("pvfs2: skipping get_block on index %d due "
- "to previous failure.\n", (int)lblock);
- pvfs2_inode->last_failed_block_index_read = lblock;
- /*
- NOTE: we don't need to worry about cleaning up the
- mmap_ra_cache in userspace from here because on I/O
- failure, the pvfs2-client-core will be restarted
- */
- return -EIO;
- }
-
- page = bh_result->b_page;
+ pvfs2_print("pvfs2_readpage called with page %p\n",page);
page_data = pvfs2_kmap(page);
- /*
- NOTE: this unsafely *assumes* that the size stored in the inode
- is accurate.
-
- make sure we're not looking for a page block past the end of
- this file;
- */
max_block = ((inode->i_size / blocksize) + 1);
+
if (page->index < max_block)
{
loff_t blockptr_offset =
(((loff_t)page->index) << blockbits);
-
- /*
- NOTE: This is conceptually backwards. we could be
- implementing the file_read as generic_file_read and doing
- the actual i/o here (via readpage). There are *no* plans to
- do this.
-
- The main reason it's not like that now is because of the
- mismatch of page cache size and the inode blocksize that
- we're using. It's more efficient in the general case to use
- the larger blocksize (~4M rather than ~4K) for reading &
- writing (via pvfs2_inode_read/pvfs2_file_write). For now it
- seems that this call can *only* handle reads of
- PAGE_CACHE_SIZE blocks, which is terribly slow for us.
-
- set the readahead size to be the entire file size so that
- subsequent calls have the opportunity to be userspace read
- cache hits; any readahead data the client pulls in is
- flushed (both from userspace and the page cache) on vfs file
- close
-
- ALSO NOTE: This call actually only reads one PAGE_CACHE_SIZE
- block (blocksize), even though we might have been asked to
- read more than that. -- RobR
- */
bytes_read = pvfs2_inode_read(
- inode, page_data, blocksize, &blockptr_offset, 0,
- inode->i_size);
-
- if (bytes_read < 0)
- {
- pvfs2_print("pvfs2_get_blocks: failed to read page block %d\n",
- (int)page->index);
- pvfs2_inode->last_failed_block_index_read = page->index;
- }
- else
- {
- pvfs2_print("pvfs2_get_blocks: read %d bytes | offset is %d | "
- "cur_block is %d | max_block is %d\n",
- (int)bytes_read, (int)blockptr_offset,
- (int)page->index, (int)max_block);
- pvfs2_inode->last_failed_block_index_read = 0;
- }
+ inode, page_data, blocksize, &blockptr_offset, 0, inode->i_size);
}
-
/* only zero remaining unread portions of the page data */
if (bytes_read > 0)
{
@@ -156,11 +44,8 @@
{
memset(page_data, 0, blocksize);
}
-
/* takes care of potential aliasing */
flush_dcache_page(page);
- pvfs2_kunmap(page);
-
if (bytes_read < 0)
{
ret = bytes_read;
@@ -173,44 +58,19 @@
{
ClearPageError(page);
}
-
- bh_result->b_data = page_data;
- bh_result->b_size = bytes_read;
-
-#ifdef PVFS2_LINUX_KERNEL_2_4
- set_bit(BH_Mapped, &(bh_result)->b_state);
- mark_buffer_uptodate(bh_result, 1);
- bh_result->b_blocknr = lblock;
-#else
- map_bh(bh_result, inode->i_sb, lblock);
- set_buffer_uptodate(bh_result);
-#endif
ret = 0;
}
+ pvfs2_kunmap(page);
+ /* unlock the page after the ->readpage() routine completes */
+ unlock_page(page);
return ret;
}
-/** Passes request for a block on to pvfs2_get_blocks()
- */
-static int pvfs2_get_block(
- struct inode *ip,
- get_block_block_type lblock,
- struct buffer_head *bh_result,
- int create)
-{
- pvfs2_print("pvfs2_get_block called with block %lu\n",
- (unsigned long)lblock);
- return pvfs2_get_blocks(ip, lblock, 1, bh_result, create);
-}
-
-/** Passes request for a page on to pvfs2_get_block()
- */
static int pvfs2_readpage(
struct file *file,
struct page *page)
{
- pvfs2_print("pvfs2_readpage called with page %p\n",page);
- return pvfs2_kernel_readpage(page, pvfs2_get_block);
+ return read_one_page(page);
}
#ifndef PVFS2_LINUX_KERNEL_2_4
@@ -220,8 +80,25 @@
struct list_head *pages,
unsigned nr_pages)
{
+ int page_idx;
+ int ret;
+
pvfs2_print("pvfs2_readpages called\n");
- return mpage_readpages(mapping, pages, nr_pages, pvfs2_get_block);
+
+ for (page_idx = 0; page_idx < nr_pages; page_idx++)
+ {
+ struct page *page;
+ page = list_entry(pages->prev, struct page, lru);
+ list_del(&page->lru);
+ if (!add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
+ ret = read_one_page(page);
+ }
+ else {
+ page_cache_release(page);
+ }
+ }
+ BUG_ON(!list_empty(pages));
+ return 0;
}
#ifdef HAVE_INT_RETURN_ADDRESS_SPACE_OPERATIONS_INVALIDATEPAGE
@@ -250,7 +127,6 @@
#endif
{
pvfs2_print("pvfs2_releasepage called on page %p\n", page);
- try_to_free_buffers(page);
return 0;
}
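
For anyone reviewing rather than just testing: the patch drops
pvfs2_get_block()/pvfs2_get_blocks() and the buffer_head/mpage machinery
entirely. read_one_page() now backs both ->readpage and ->readpages, calling
pvfs2_inode_read() directly and unlocking the page itself, so the
last_failed_block_index_read workaround and the try_to_free_buffers() call in
pvfs2_releasepage() are no longer needed. Below is a rough sketch of how the
patched entry points are expected to be wired into the address_space_operations
table; the struct name and the member list are written from memory, not taken
from this patch, so check the actual definition in inode.c rather than treating
this as authoritative:

/*
 * Sketch only (not part of the patch): roughly how the patched entry
 * points plug into the address_space_operations table.  The struct name
 * and the exact member list are assumptions; other members present in
 * inode.c are omitted here.
 */
struct address_space_operations pvfs2_address_operations =
{
    .readpage    = pvfs2_readpage,    /* thin wrapper around read_one_page() */
#ifndef PVFS2_LINUX_KERNEL_2_4
    .readpages   = pvfs2_readpages,   /* open-coded loop, no mpage helper     */
#endif
    .releasepage = pvfs2_releasepage  /* no buffer_heads left to free         */
};

With this wiring, the readpages path should behave like a series of
back-to-back readpage calls, just without going through mpage_readpages()
and buffer_head mapping.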
_______________________________________________
Pvfs2-users mailing list
[email protected]
http://www.beowulf-underground.org/mailman/listinfo/pvfs2-users