Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-07-02 Thread Badari Pulavarty
On Thu, 2007-06-14 at 13:06 -0700, Andrew Morton wrote:
> On Thu, 14 Jun 2007 12:38:39 -0700
> [EMAIL PROTECTED] wrote:
> 
> > This patchset cleans up the page cache handling by replacing
> > open coded shifts and adds through inline function calls.
> 

Some of us (crazy) people are trying to support read for hugetlbfs
in order to get oprofile work on large-page-backed-executables by
libhugetlbfs.

Currently, I can't use any generic support. I have this ugly patch
to get oprofile work. Christoph's clean ups would allow me to set
per-mapping pagesize and get this to work, without any hacks.

Thanks,
Badari

 fs/hugetlbfs/inode.c |  117 +++
 1 file changed, 117 insertions(+)

Index: linux/fs/hugetlbfs/inode.c
===
--- linux.orig/fs/hugetlbfs/inode.c 2007-05-18 04:16:27.0 -0700
+++ linux/fs/hugetlbfs/inode.c  2007-06-22 10:46:09.0 -0700
@@ -160,6 +160,122 @@ full_search:
 #endif
 
 /*
+ * Support for read()
+ */
+static int
+hugetlbfs_read_actor(struct page *page, unsigned long offset,
+   char __user *buf, unsigned long count,
+   unsigned long size)
+{
+   char *kaddr;
+   unsigned long to_copy;
+   int i, chunksize;
+
+   if (size > count)
+   size = count;
+
+   /* Find which 4k chunk and offset with in that chunk */
+   i = offset >> PAGE_CACHE_SHIFT;
+   offset = offset & ~PAGE_CACHE_MASK;
+   to_copy = size;
+
+   while (to_copy) {
+   chunksize = PAGE_CACHE_SIZE;
+   if (offset)
+   chunksize -= offset;
+   if (chunksize > to_copy)
+   chunksize = to_copy;
+
+#if 0
+printk("Coping i=%d page: %p offset %d chunk %d\n", i, &page[i], offset, 
chunksize);
+#endif
+   kaddr = kmap(&page[i]);
+   memcpy(buf, kaddr + offset, chunksize);
+   kunmap(&page[i]);
+   offset = 0;
+   to_copy -= chunksize;
+   buf += chunksize;
+   i++;
+   }
+   return size;
+}
+
+
+ssize_t
+hugetlbfs_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
+{
+   struct address_space *mapping = filp->f_mapping;
+   struct inode *inode = mapping->host;
+   unsigned long index = *ppos >> HPAGE_SHIFT;
+   unsigned long end_index;
+   loff_t isize;
+   unsigned long offset;
+   ssize_t retval = 0;
+
+   /* validate user buffer and len */
+   if (len == 0)
+   goto out;
+
+   isize = i_size_read(inode);
+   if (!isize)
+   goto out;
+
+   offset = *ppos & ~HPAGE_MASK;
+   end_index = (isize - 1) >> HPAGE_SHIFT;
+   for (;;) {
+   struct page *page;
+   unsigned long nr, ret;
+
+   /* nr is the maximum number of bytes to copy from this page */
+   nr = HPAGE_SIZE;
+   if (index >= end_index) {
+   if (index > end_index)
+   goto out;
+   nr = ((isize - 1) & ~HPAGE_MASK) + 1;
+   if (nr <= offset) {
+   goto out;
+   }
+   }
+   nr = nr - offset;
+
+   /* Find the page */
+   page = find_get_page(mapping, index);
+   if (unlikely(page == NULL)) {
+   /*
+* We can't find the page in the cache - bail out
+* TODO - should we zero out the user buffer ?
+*/
+   goto out;
+   }
+#if 0
+printk("Found page %p at index %d offset %d nr %d\n", page, index, offset, nr);
+#endif
+
+   /*
+* Ok, we have the page, so now we can copy it to user space...
+*/
+   ret = hugetlbfs_read_actor(page, offset, buf, len, nr);
+   if (ret < 0) {
+   retval = retval ? : ret;
+   goto out;
+   }
+
+   offset += ret;
+   retval += ret;
+   len -= ret;
+   index += offset >> HPAGE_SHIFT;
+   offset &= ~HPAGE_MASK;
+
+   page_cache_release(page);
+   if (ret == nr && len)
+   continue;
+   goto out;
+   }
+out:
+   return retval;
+}
+
+/*
  * Read a page. Again trivial. If it didn't already exist
  * in the page cache, it is zero-filled.
  */
@@ -565,6 +681,7 @@ static void init_once(void *foo, kmem_ca
 }
 
 struct file_operations hugetlbfs_file_operations = {
+   .read   = hugetlbfs_read,
.mmap   = hugetlbfs_file_mmap,
.fsync  = simple_sync_file,
.get_unmapped_area  = hugetlb_get_unmapped_area,


-
To unsubscribe from this 

Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-07-02 Thread Badari Pulavarty
On Thu, 2007-06-14 at 13:06 -0700, Andrew Morton wrote:
> On Thu, 14 Jun 2007 12:38:39 -0700
> [EMAIL PROTECTED] wrote:
> 
> > This patchset cleans up the page cache handling by replacing
> > open coded shifts and adds through inline function calls.
> 

Some of us (crazy) people are trying to support read for hugetlbfs
in order to get oprofile work on large-page-backed-executables by
libhugetlbfs.

Currently, I can't use any generic support. I have this ugly patch
to get oprofile work. Christoph's clean ups would allow me to set
per-mapping pagesize and get this to work, without any hacks.

Thanks,
Badari

 fs/hugetlbfs/inode.c |  117 +++
 1 file changed, 117 insertions(+)

Index: linux/fs/hugetlbfs/inode.c
===
--- linux.orig/fs/hugetlbfs/inode.c 2007-05-18 04:16:27.0 -0700
+++ linux/fs/hugetlbfs/inode.c  2007-06-22 10:46:09.0 -0700
@@ -160,6 +160,122 @@ full_search:
 #endif
 
 /*
+ * Support for read()
+ */
+static int
+hugetlbfs_read_actor(struct page *page, unsigned long offset,
+   char __user *buf, unsigned long count,
+   unsigned long size)
+{
+   char *kaddr;
+   unsigned long to_copy;
+   int i, chunksize;
+
+   if (size > count)
+   size = count;
+
+   /* Find which 4k chunk and offset with in that chunk */
+   i = offset >> PAGE_CACHE_SHIFT;
+   offset = offset & ~PAGE_CACHE_MASK;
+   to_copy = size;
+
+   while (to_copy) {
+   chunksize = PAGE_CACHE_SIZE;
+   if (offset)
+   chunksize -= offset;
+   if (chunksize > to_copy)
+   chunksize = to_copy;
+
+#if 0
+printk("Coping i=%d page: %p offset %d chunk %d\n", i, &page[i], offset, 
chunksize);
+#endif
+   kaddr = kmap(&page[i]);
+   memcpy(buf, kaddr + offset, chunksize);
+   kunmap(&page[i]);
+   offset = 0;
+   to_copy -= chunksize;
+   buf += chunksize;
+   i++;
+   }
+   return size;
+}
+
+
+ssize_t
+hugetlbfs_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
+{
+   struct address_space *mapping = filp->f_mapping;
+   struct inode *inode = mapping->host;
+   unsigned long index = *ppos >> HPAGE_SHIFT;
+   unsigned long end_index;
+   loff_t isize;
+   unsigned long offset;
+   ssize_t retval = 0;
+
+   /* validate user buffer and len */
+   if (len == 0)
+   goto out;
+
+   isize = i_size_read(inode);
+   if (!isize)
+   goto out;
+
+   offset = *ppos & ~HPAGE_MASK;
+   end_index = (isize - 1) >> HPAGE_SHIFT;
+   for (;;) {
+   struct page *page;
+   unsigned long nr, ret;
+
+   /* nr is the maximum number of bytes to copy from this page */
+   nr = HPAGE_SIZE;
+   if (index >= end_index) {
+   if (index > end_index)
+   goto out;
+   nr = ((isize - 1) & ~HPAGE_MASK) + 1;
+   if (nr <= offset) {
+   goto out;
+   }
+   }
+   nr = nr - offset;
+
+   /* Find the page */
+   page = find_get_page(mapping, index);
+   if (unlikely(page == NULL)) {
+   /*
+* We can't find the page in the cache - bail out
+* TODO - should we zero out the user buffer ?
+*/
+   goto out;
+   }
+#if 0
+printk("Found page %p at index %d offset %d nr %d\n", page, index, offset, nr);
+#endif
+
+   /*
+* Ok, we have the page, so now we can copy it to user space...
+*/
+   ret = hugetlbfs_read_actor(page, offset, buf, len, nr);
+   if (ret < 0) {
+   retval = retval ? : ret;
+   goto out;
+   }
+
+   offset += ret;
+   retval += ret;
+   len -= ret;
+   index += offset >> HPAGE_SHIFT;
+   offset &= ~HPAGE_MASK;
+
+   page_cache_release(page);
+   if (ret == nr && len)
+   continue;
+   goto out;
+   }
+out:
+   return retval;
+}
+
+/*
  * Read a page. Again trivial. If it didn't already exist
  * in the page cache, it is zero-filled.
  */
@@ -565,6 +681,7 @@ static void init_once(void *foo, kmem_ca
 }
 
 struct file_operations hugetlbfs_file_operations = {
+   .read   = hugetlbfs_read,
.mmap   = hugetlbfs_file_mmap,
.fsync  = simple_sync_file,
.get_unmapped_area  = hugetlb_get_unmapped_area,


-
To unsubscribe from this list: send the line 

Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-17 Thread William Lee Irwin III
On Sun, 17 Jun 2007, Matt Mackall wrote:
>> Is it? Last I looked it had reverted to handing out reverse-contiguous
>> pages.

On Sun, Jun 17, 2007 at 07:08:41PM -0700, Christoph Lameter wrote:
> I thought that was fixed? Bill Irwin was working on it.
> But the contiguous pages usually only work shortly after boot. After 
> awhile memory gets sufficiently scrambled that the coalescing in the I/O 
> layer becomes ineffective.

It fell off the bottom of my priority queue, sorry.


-- wli
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-17 Thread Arjan van de Ven
On Sun, 2007-06-17 at 19:08 -0700, Christoph Lameter wrote:
> On Sun, 17 Jun 2007, Matt Mackall wrote:
> 
> > Is it? Last I looked it had reverted to handing out reverse-contiguous
> > pages.
> 
> I thought that was fixed? Bill Irwin was working on it.
> 
> But the contiguous pages usually only work shortly after boot. After 
> awhile memory gets sufficiently scrambled that the coalescing in the I/O 
> layer becomes ineffective.

the buddy allocator at least defragments itself somewhat (granted, it's
not perfect and the per cpu page queues spoil the game too...)

-- 
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Test the interaction between Linux and your BIOS via 
http://www.linuxfirmwarekit.org

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-17 Thread Christoph Lameter
On Sun, 17 Jun 2007, Matt Mackall wrote:

> Is it? Last I looked it had reverted to handing out reverse-contiguous
> pages.

I thought that was fixed? Bill Irwin was working on it.

But the contiguous pages usually only work shortly after boot. After 
awhile memory gets sufficiently scrambled that the coalescing in the I/O 
layer becomes ineffective.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-17 Thread Christoph Lameter
On Sun, 17 Jun 2007, Matt Mackall wrote:

 Is it? Last I looked it had reverted to handing out reverse-contiguous
 pages.

I thought that was fixed? Bill Irwin was working on it.

But the contiguous pages usually only work shortly after boot. After 
awhile memory gets sufficiently scrambled that the coalescing in the I/O 
layer becomes ineffective.
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-17 Thread Arjan van de Ven
On Sun, 2007-06-17 at 19:08 -0700, Christoph Lameter wrote:
 On Sun, 17 Jun 2007, Matt Mackall wrote:
 
  Is it? Last I looked it had reverted to handing out reverse-contiguous
  pages.
 
 I thought that was fixed? Bill Irwin was working on it.
 
 But the contiguous pages usually only work shortly after boot. After 
 awhile memory gets sufficiently scrambled that the coalescing in the I/O 
 layer becomes ineffective.

the buddy allocator at least defragments itself somewhat (granted, it's
not perfect and the per cpu page queues spoil the game too...)

-- 
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Test the interaction between Linux and your BIOS via 
http://www.linuxfirmwarekit.org

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-17 Thread William Lee Irwin III
On Sun, 17 Jun 2007, Matt Mackall wrote:
>> Is it? Last I looked it had reverted to handing out reverse-contiguous
>> pages.

On Sun, Jun 17, 2007 at 07:08:41PM -0700, Christoph Lameter wrote:
> I thought that was fixed? Bill Irwin was working on it.
> But the contiguous pages usually only work shortly after boot. After 
> awhile memory gets sufficiently scrambled that the coalescing in the I/O 
> layer becomes ineffective.

It fell off the bottom of my priority queue, sorry.


-- wli
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-16 Thread Matt Mackall
On Sat, Jun 16, 2007 at 06:25:00PM -0700, Arjan van de Ven wrote:
> 
> > You: conceptully-new add-on which benefits 0.25% of the user base, provided
> > they select the right config options and filesystem.
> > 
> > Me: simpler enhancement which benefits 100% of the user base (ie: includes
> > 4k blocksize, 4k pagesize) and which also fixes your performance problem
> > with that HBA.
> 
> note that at least 2.6 is doing this "sort of", better than 2.4 at
> least. (30% hitrate or something like that).

Is it? Last I looked it had reverted to handing out reverse-contiguous
pages.

You can see this by running /proc/pid/pagemap through hexdump.

-- 
Mathematics is the supreme nostalgia of our time.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-16 Thread Arjan van de Ven

> You: conceptully-new add-on which benefits 0.25% of the user base, provided
> they select the right config options and filesystem.
> 
> Me: simpler enhancement which benefits 100% of the user base (ie: includes
> 4k blocksize, 4k pagesize) and which also fixes your performance problem
> with that HBA.

note that at least 2.6 is doing this "sort of", better than 2.4 at
least. (30% hitrate or something like that).

In addition, systems with an IOMMU (many really large systems have that,
as well as several x86 ones, with more and more of that in the future),
this is a moot point since the IOMMU will just linearize for the device.


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-16 Thread Arjan van de Ven

 You: conceptully-new add-on which benefits 0.25% of the user base, provided
 they select the right config options and filesystem.
 
 Me: simpler enhancement which benefits 100% of the user base (ie: includes
 4k blocksize, 4k pagesize) and which also fixes your performance problem
 with that HBA.

note that at least 2.6 is doing this sort of, better than 2.4 at
least. (30% hitrate or something like that).

In addition, systems with an IOMMU (many really large systems have that,
as well as several x86 ones, with more and more of that in the future),
this is a moot point since the IOMMU will just linearize for the device.


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-16 Thread Matt Mackall
On Sat, Jun 16, 2007 at 06:25:00PM -0700, Arjan van de Ven wrote:
 
  You: conceptully-new add-on which benefits 0.25% of the user base, provided
  they select the right config options and filesystem.
  
  Me: simpler enhancement which benefits 100% of the user base (ie: includes
  4k blocksize, 4k pagesize) and which also fixes your performance problem
  with that HBA.
 
 note that at least 2.6 is doing this sort of, better than 2.4 at
 least. (30% hitrate or something like that).

Is it? Last I looked it had reverted to handing out reverse-contiguous
pages.

You can see this by running /proc/pid/pagemap through hexdump.

-- 
Mathematics is the supreme nostalgia of our time.
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-15 Thread Dave Kleikamp
On Thu, 2007-06-14 at 15:04 -0700, Andrew Morton wrote:
> > On Thu, 14 Jun 2007 14:37:33 -0700 (PDT) Christoph Lameter <[EMAIL 
> > PROTECTED]> wrote:
> > On Thu, 14 Jun 2007, Andrew Morton wrote:
> > 
> > > We want the 100% case.
> > 
> > Yes that is what we intend to do. Universal support for larger blocksize. 
> > I.e. your desktop filesystem will use 64k page size and server platforms 
> > likely much larger.
> 
> With 64k pagesize the amount of memory required to hold a kernel tree (say)
> will go from 270MB to 1400MB.   This is not an optimisation.
> 
> Several 64k pagesize people have already spent time looking at various
> tail-packing schemes to get around this serious problem.  And that's on
> _server_ class machines.  Large ones.  I don't think
> laptop/desktop/samll-server machines would want to go anywhere near this.

I'm one of the ones investigating 64 KB pagesize tail-packing schemes,
and I believe Christoph's cleanups will reduce the intrusiveness and
improve the readability of a tail-packing solution.  I'll add my vote in
support of these patches.

Thanks,
Shaggy
-- 
David Kleikamp
IBM Linux Technology Center

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-15 Thread David Chinner
On Thu, Jun 14, 2007 at 07:23:40PM -0700, Andrew Morton wrote:
> On Thu, 14 Jun 2007 19:04:27 -0700 (PDT) Christoph Lameter <[EMAIL 
> PROTECTED]> wrote:
> > > > Of course there is. The seeks are reduced since there are an factor 
> > > > of 16 less metadata blocks. fsck does not read files. It just reads 
> > > > metadata structures. And the larger contiguous areas the faster.
> > > 
> > > Some metadata is contiguous: inode tables, some directories (if they got
> > > lucky), bitmap tables.  But fsck surely reads them in a single swoop
> > > anyway, so there's no gain there.
> > 
> > The metadata needs to refer to 1/16th of the earlier pages that need to be 
> > tracked. metadata is shrunk significantly.
> 
> Only if the filesystems are altered to use larger blocksizes and if the
> operator then chooses to use that feature.  Then they suck for small-sized
> (and even medium-sized) files.

Devil's Advocate:

In that case, we should remove support for block sizes smaller than
a page because they suck for large-sized (and even medium sized)
files and we shouldn't allow people to use them.

> So you're still talking about corner cases: specialised applications which
> require careful setup and administrator intervention.

Yes, like 512 byte block size filesystems using large directory
block sizes for dedicated mail servers. i.e. optimised for large
numbers of small files in each directory.

> What can we do to optimise the common case?

The common case is pretty good already for common case workloads.

What we need to do is provide options for workloads where tuning the
common case config is simply not sufficient. We already provide the
option to optimise for small file sizes, but we have no option to
optimise for large file sizes

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-15 Thread David Chinner
On Thu, Jun 14, 2007 at 07:23:40PM -0700, Andrew Morton wrote:
 On Thu, 14 Jun 2007 19:04:27 -0700 (PDT) Christoph Lameter [EMAIL 
 PROTECTED] wrote:
Of course there is. The seeks are reduced since there are an factor 
of 16 less metadata blocks. fsck does not read files. It just reads 
metadata structures. And the larger contiguous areas the faster.
   
   Some metadata is contiguous: inode tables, some directories (if they got
   lucky), bitmap tables.  But fsck surely reads them in a single swoop
   anyway, so there's no gain there.
  
  The metadata needs to refer to 1/16th of the earlier pages that need to be 
  tracked. metadata is shrunk significantly.
 
 Only if the filesystems are altered to use larger blocksizes and if the
 operator then chooses to use that feature.  Then they suck for small-sized
 (and even medium-sized) files.

Devil's Advocate:

In that case, we should remove support for block sizes smaller than
a page because they suck for large-sized (and even medium sized)
files and we shouldn't allow people to use them.

 So you're still talking about corner cases: specialised applications which
 require careful setup and administrator intervention.

Yes, like 512 byte block size filesystems using large directory
block sizes for dedicated mail servers. i.e. optimised for large
numbers of small files in each directory.

 What can we do to optimise the common case?

The common case is pretty good already for common case workloads.

What we need to do is provide options for workloads where tuning the
common case config is simply not sufficient. We already provide the
option to optimise for small file sizes, but we have no option to
optimise for large file sizes

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-15 Thread Dave Kleikamp
On Thu, 2007-06-14 at 15:04 -0700, Andrew Morton wrote:
  On Thu, 14 Jun 2007 14:37:33 -0700 (PDT) Christoph Lameter [EMAIL 
  PROTECTED] wrote:
  On Thu, 14 Jun 2007, Andrew Morton wrote:
  
   We want the 100% case.
  
  Yes that is what we intend to do. Universal support for larger blocksize. 
  I.e. your desktop filesystem will use 64k page size and server platforms 
  likely much larger.
 
 With 64k pagesize the amount of memory required to hold a kernel tree (say)
 will go from 270MB to 1400MB.   This is not an optimisation.
 
 Several 64k pagesize people have already spent time looking at various
 tail-packing schemes to get around this serious problem.  And that's on
 _server_ class machines.  Large ones.  I don't think
 laptop/desktop/samll-server machines would want to go anywhere near this.

I'm one of the ones investigating 64 KB pagesize tail-packing schemes,
and I believe Christoph's cleanups will reduce the intrusiveness and
improve the readability of a tail-packing solution.  I'll add my vote in
support of these patches.

Thanks,
Shaggy
-- 
David Kleikamp
IBM Linux Technology Center

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Christoph Lameter
On Thu, 14 Jun 2007, Andrew Morton wrote:

> > The metadata needs to refer to 1/16th of the earlier pages that need to be 
> > tracked. metadata is shrunk significantly.
> 
> Only if the filesystems are altered to use larger blocksizes and if the
> operator then chooses to use that feature.  Then they suck for small-sized
> (and even medium-sized) files.

Nope. File systems already support that. The changes to XFS and ext2 are 
basically just doing the cleanups that we are discussing here plus some 
changes to set_blocksize.
 
> So you're still talking about corner cases: specialised applications which
> require careful setup and administrator intervention.
> 
> What can we do to optimise the common case?

The common filesystem will be able to support large block sizes easily. 
Most filesystems already run on 16k and 64k page size platforms and do 
just fine. All the work is already done. Just the VM needs to give them 
support for lager page sizes on smaller page size platforms.

This is optimizing the common case.

> The alleged fsck benefit is also unrelated to variable PAGE_CACHE_SIZE. 
> It's a feature of larger (unweildy?) blocksize, and xfs already has that
> working (doesn't it?)

It has a hack with severe limitations like we have done in many other 
components of the kernel. These hacks can be removed if the large 
blocksize support is merged. XFS still has the problem that the block 
layer without page cache support for higher pages cannot easily deal with 
large contiguous pages.

> There may be some benefits to some future version of ext4.

I have already run ext4 with 64k blocksize on x86_64 with 4k. It has been 
done for years with 64k page size on IA64 and powerpc and there is no fs 
issue with running it on 4k platforms with the large blocksize patchset.
The filesystems work reliably. The core linux code is the issue that we 
need to solve and this is the beginning of doing so.
 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Andrew Morton
On Thu, 14 Jun 2007 19:04:27 -0700 (PDT) Christoph Lameter <[EMAIL PROTECTED]> 
wrote:

> > > Of course there is. The seeks are reduced since there are an factor 
> > > of 16 less metadata blocks. fsck does not read files. It just reads 
> > > metadata structures. And the larger contiguous areas the faster.
> > 
> > Some metadata is contiguous: inode tables, some directories (if they got
> > lucky), bitmap tables.  But fsck surely reads them in a single swoop
> > anyway, so there's no gain there.
> 
> The metadata needs to refer to 1/16th of the earlier pages that need to be 
> tracked. metadata is shrunk significantly.

Only if the filesystems are altered to use larger blocksizes and if the
operator then chooses to use that feature.  Then they suck for small-sized
(and even medium-sized) files.

So you're still talking about corner cases: specialised applications which
require careful setup and administrator intervention.

What can we do to optimise the common case?

> > Other metadata (indirect blocks) are 100% discontiguous, and reading those
> > with a 64k IO into 64k of memory is completely dumb.
> 
> The effect of a larger page size is that the filesystem will 
> place more meta data into a single page instead of spreading it out. 
> Reading a mass of meta data with a 64k read is an intelligent choice to 
> make in particular if there is a large series of such reads.

Again: requires larger blocksize: specialised, uninteresting for what will
remain the common case: 4k blocksize.

The alleged fsck benefit is also unrelated to variable PAGE_CACHE_SIZE. 
It's a feature of larger (unweildy?) blocksize, and xfs already has that
working (doesn't it?)

There may be some benefits to some future version of ext4.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Christoph Lameter
On Thu, 14 Jun 2007, Andrew Morton wrote:

> There will be files which should use 64k but which instead end up using 4k.
> 
> There will be files which should use 4k but which instead end up using 64k.
> 
> Because determining which size to use requires either operator intervention
> or kernel heuristics, both of which will be highly unreliable.
> 
> It's better to just make 4k pages go faster.

Initially its quite easy to have a filesystem for your 4k files (basically 
the distro you are running) and an archive for video / audio etc files 
that has 64k size for data. In the future filesystem may support sizes set 
per directory. Basically if things get to slow you can pull the lever.

> > Magical? There is nothing magical about doing transfers in the size that 
> > is supported by a device. That is good sense.
> 
> By magical heuristics I'm referring to the (required) tricks and guesses
> which the kernel will need to deploy to be able to guess which page-size it
> should use for each file.
> 
> Because without such heuristics, none of this new stuff which you're
> proposing would ever get used by 90% of apps on 90% of machines.

In the patchset V3 one f.e. simply formats a volume by specifying the 
desired blocksize. If one gets into trouble with fsck and other slowdown 
associated with large file I/O then they are going to be quite fast to 
format a partition with larger blocksize. Its a know technology in many 
Unixes.

The approach essentially gives one freedom to choose a page size. This is 
a tradeoff between desired speed, expected file sizes, filesystem behavior 
and acceptable fragmentation overhead. If we do this approach then I think 
we will see the mkfs.XXX  tools to automatically make intelligent choices
on which page size to use. They are all stuck at 4k at the moment.

> > Of course there is. The seeks are reduced since there are an factor 
> > of 16 less metadata blocks. fsck does not read files. It just reads 
> > metadata structures. And the larger contiguous areas the faster.
> 
> Some metadata is contiguous: inode tables, some directories (if they got
> lucky), bitmap tables.  But fsck surely reads them in a single swoop
> anyway, so there's no gain there.

The metadata needs to refer to 1/16th of the earlier pages that need to be 
tracked. metadata is shrunk significantly.
 
> Other metadata (indirect blocks) are 100% discontiguous, and reading those
> with a 64k IO into 64k of memory is completely dumb.

The effect of a larger page size is that the filesystem will 
place more meta data into a single page instead of spreading it out. 
Reading a mass of meta data with a 64k read is an intelligent choice to 
make in particular if there is a large series of such reads.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Andrew Morton
On Thu, 14 Jun 2007 17:45:43 -0700 (PDT) Christoph Lameter <[EMAIL PROTECTED]> 
wrote:

> On Thu, 14 Jun 2007, Andrew Morton wrote:
> 
> > > I do not think that the 100% users will do kernel compiles all day like 
> > > we do. We likely would prefer 4k page size for our small text files.
> > 
> > There are many, many applications which use small files.
> 
> There is no problem with them using 4k page size concurrently to a higher 
> page size for other files.

There will be files which should use 64k but which instead end up using 4k.

There will be files which should use 4k but which instead end up using 64k.

Because determining which size to use requires either operator intervention
or kernel heuristics, both of which will be highly unreliable.

It's better to just make 4k pages go faster.

> > > I never understood the point of that exercise. If you have variable page 
> > > size then the 64k page size can be used specific to files that benefit 
> > > from it. Typically usage scenarios are video audio streaming I/O, large 
> > > picture files, large documents with embedded images. These are the major
> > > usage scenarioes today and we suck the. Our DVD/CD subsystems are 
> > > currently not capable of directly reading from these devices into the 
> > > page 
> > > cache since they do not do I/O in 4k chunks.
> > 
> > So with sufficient magical kernel heuristics or operator intervention, some
> > people will gain some benefit from 64k pagesize.  Most people with most
> > workloads will remain where they are: shoving zillions of physically
> > discontiguous pages into fixed-size sg lists.
> 
> Magical? There is nothing magical about doing transfers in the size that 
> is supported by a device. That is good sense.

By magical heuristics I'm referring to the (required) tricks and guesses
which the kernel will need to deploy to be able to guess which page-size it
should use for each file.

Because without such heuristics, none of this new stuff which you're
proposing would ever get used by 90% of apps on 90% of machines.

> > > Every 64k block contains more information and the number of pages managed
> > > is reduced by a factor of 16. Less seeks , less tlb pressure , less 
> > > reads, 
> > > more cpu cache and cpu cache prefetch friendly behavior.
> > 
> > argh.  Everything you say is just wrong.  A fsck involves zillions of
> > discontiguous small reads.  It is largely seek-bound, so there is no
> > benefit to be had here.  Your proposed change will introduce regressions by
> > causing larger amounts of physical reading and large amounts of memory
> > consumption.
> 
> Of course there is. The seeks are reduced since there is a factor 
> of 16 fewer metadata blocks. fsck does not read files. It just reads 
> metadata structures. And the larger contiguous areas the faster.

Some metadata is contiguous: inode tables, some directories (if they got
lucky), bitmap tables.  But fsck surely reads them in a single swoop
anyway, so there's no gain there.

Other metadata (indirect blocks) are 100% discontiguous, and reading those
with a 64k IO into 64k of memory is completely dumb.

And yes, I'm referring to the 90% case again.  The one we want to
optimise for.



-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Christoph Lameter
On Thu, 14 Jun 2007, Andrew Morton wrote:

> > I do not think that the 100% users will do kernel compiles all day like 
> > we do. We likely would prefer 4k page size for our small text files.
> 
> There are many, many applications which use small files.

There is no problem with them using 4k page size concurrently to a higher 
page size for other files.

> > I never understood the point of that exercise. If you have variable page 
> > size then the 64k page size can be used specific to files that benefit 
> > from it. Typically usage scenarios are video audio streaming I/O, large 
> > picture files, large documents with embedded images. These are the major
> > usage scenarioes today and we suck the. Our DVD/CD subsystems are 
> > currently not capable of directly reading from these devices into the page 
> > cache since they do not do I/O in 4k chunks.
> 
> So with sufficient magical kernel heuristics or operator intervention, some
> people will gain some benefit from 64k pagesize.  Most people with most
> workloads will remain where they are: shoving zillions of physically
> discontiguous pages into fixed-size sg lists.

Magical? There is nothing magical about doing transfers in the size that 
is supported by a device. That is good sense.

> > Every 64k block contains more information and the number of pages managed
> > is reduced by a factor of 16. Less seeks , less tlb pressure , less reads, 
> > more cpu cache and cpu cache prefetch friendly behavior.
> 
> argh.  Everything you say is just wrong.  A fsck involves zillions of
> discontiguous small reads.  It is largely seek-bound, so there is no
> benefit to be had here.  Your proposed change will introduce regressions by
> causing larger amounts of physical reading and large amounts of memory
> consumption.

Of course there is. The seeks are reduced since there is a factor 
of 16 fewer metadata blocks. fsck does not read files. It just reads 
metadata structures. And the larger contiguous areas the faster.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread David Chinner
On Thu, Jun 14, 2007 at 04:41:18PM -0700, Andrew Morton wrote:
> > On Fri, 15 Jun 2007 09:30:02 +1000 David Chinner <[EMAIL PROTECTED]> wrote:
> > xfs_repair also uses direct I/O and does it's own userspace block
> > caching and so avoids the problems involved with low memory, context
> > unaware cache reclaim and blockdev cache thrashing.
> 
> umm, that sounds like a mistake to me.  fscks tend to get run when there's
> no swap online.  A small system with a large disk risks going oom and can
> no longer be booted. 

xfs_repair is never run at boot time - we don't force periodic
boot time checks like ext3/4 does so this isn't a problem.

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread David Chinner
On Thu, Jun 14, 2007 at 01:06:45PM -0700, Andrew Morton wrote:
> On Thu, 14 Jun 2007 12:38:39 -0700
> [EMAIL PROTECTED] wrote:
> 
> > This patchset cleans up the page cache handling by replacing
> > open coded shifts and adds through inline function calls.
> 
> If we never inflict variable PAGE_CACHE_SIZE upon the kernel, these changes
> become pointless obfuscation.

The open coding of shifts, masks, and other associated cruft is a real
problem. It leads to ugly and hard to understand code when you have to do
anything complex. That means when you come back to that code 6 months later,
you've got to take to the time to understand exactly what all that logic is
doing again.

IMO, xfs_page_state_convert() is a great example of where open coding
of PAGE_CACHE_SIZE manipulations lead to eye-bleeding code. This
patch set would go a long way to help clean up that mess.

IOWs, like hch, I think this patch set stands on its own merit
regardless of concerns over variable page cache page sizes.

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Andrew Morton
> On Fri, 15 Jun 2007 09:30:02 +1000 David Chinner <[EMAIL PROTECTED]> wrote:
> On Thu, Jun 14, 2007 at 03:04:17PM -0700, Andrew Morton wrote:
> > fsck is single-threaded (hence no locking issues) and operates against the
> > blockdev pagecache and does a _lot_ of small reads (indirect blocks,
> > especially).
> 
> Commenting purely about the above statement (and not on large pages
> or block sizes), xfs-repair has had multithreaded capability for some
> time now. E.g. from the xfs_repair man page:
> 
>-MDisable  multi-threaded  mode. Normally, xfs_repair runs with
>twice the number of threads as processors.
> 
> 
> We have the second generation multithreading code out for review
> right now. e.g:
> 
> http://oss.sgi.com/archives/xfs/2007-06/msg00069.html
> 
> xfs_repair also uses direct I/O and does it's own userspace block
> caching and so avoids the problems involved with low memory, context
> unaware cache reclaim and blockdev cache thrashing.

umm, that sounds like a mistake to me.  fscks tend to get run when there's
no swap online.  A small system with a large disk risks going oom and can
no longer be booted.  Whereas if the fsck relies upon kernel caching it'll
run slower but will complete.

> And to top it all off, some of the prefetch smarts we added result
> in reading multiple sparse metadata blocks in a single, larger I/O,
> so repair is now often bandwidth bound rather than seek bound...
> 
> All I'm trying to say here is that you shouldn't assume that the
> problems a particular filesystem fsck has is common to all the
> rest

Yup.  I was of course referring to fsck.extN.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread David Chinner
On Thu, Jun 14, 2007 at 03:04:17PM -0700, Andrew Morton wrote:
> fsck is single-threaded (hence no locking issues) and operates against the
> blockdev pagecache and does a _lot_ of small reads (indirect blocks,
> especially).

Commenting purely about the above statement (and not on large pages
or block sizes), xfs-repair has had multithreaded capability for some
time now. E.g. from the xfs_repair man page:

   -MDisable  multi-threaded  mode. Normally, xfs_repair runs with
 twice the number of threads as processors.


We have the second generation multithreading code out for review
right now. e.g:

http://oss.sgi.com/archives/xfs/2007-06/msg00069.html

xfs_repair also uses direct I/O and does its own userspace block
caching and so avoids the problems involved with low memory, context
unaware cache reclaim and blockdev cache thrashing.

And to top it all off, some of the prefetch smarts we added result
in reading multiple sparse metadata blocks in a single, larger I/O,
so repair is now often bandwidth bound rather than seek bound...

All I'm trying to say here is that you shouldn't assume that the
problems a particular filesystem's fsck has are common to all the
rest.

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Andrew Morton
> On Thu, 14 Jun 2007 15:22:46 -0700 (PDT) Christoph Lameter <[EMAIL 
> PROTECTED]> wrote:
> On Thu, 14 Jun 2007, Andrew Morton wrote:
> 
> > With 64k pagesize the amount of memory required to hold a kernel tree (say)
> > will go from 270MB to 1400MB.   This is not an optimisation.
> 
> I do not think that the 100% users will do kernel compiles all day like 
> we do. We likely would prefer 4k page size for our small text files.

There are many, many applications which use small files.

> > Several 64k pagesize people have already spent time looking at various
> > tail-packing schemes to get around this serious problem.  And that's on
> > _server_ class machines.  Large ones.  I don't think
> > laptop/desktop/samll-server machines would want to go anywhere near this.
> 
> I never understood the point of that exercise. If you have variable page 
> size then the 64k page size can be used specific to files that benefit 
> from it. Typically usage scenarios are video audio streaming I/O, large 
> picture files, large documents with embedded images. These are the major
> usage scenarios today and we suck at them. Our DVD/CD subsystems are 
> currently not capable of directly reading from these devices into the page 
> cache since they do not do I/O in 4k chunks.

So with sufficient magical kernel heuristics or operator intervention, some
people will gain some benefit from 64k pagesize.  Most people with most
workloads will remain where they are: shoving zillions of physically
discontiguous pages into fixed-size sg lists.

Whereas with contig-pagecache, all users on all machines with all workloads
will benefit from the improved merging.

> > > fsck times etc etc are becoming an issue for desktop 
> > > systems
> > 
> > I don't see what fsck has to do with it.
> > 
> > fsck is single-threaded (hence no locking issues) and operates against the
> > blockdev pagecache and does a _lot_ of small reads (indirect blocks,
> > especially).  If the memory consumption for each 4k read jumps to 64k, fsck
> > is likely to slow down due to performing a lot more additional IO and due
> > to entering page reclaim much earlier.
> 
> Every 64k block contains more information and the number of pages managed
> is reduced by a factor of 16. Less seeks , less tlb pressure , less reads, 
> more cpu cache and cpu cache prefetch friendly behavior.

argh.  Everything you say is just wrong.  A fsck involves zillions of
discontiguous small reads.  It is largely seek-bound, so there is no
benefit to be had here.  Your proposed change will introduce regressions by
causing larger amounts of physical reading and large amounts of memory
consumption.


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Christoph Lameter
On Thu, 14 Jun 2007, Andrew Morton wrote:

> With 64k pagesize the amount of memory required to hold a kernel tree (say)
> will go from 270MB to 1400MB.   This is not an optimisation.

I do not think that the 100% users will do kernel compiles all day like 
we do. We likely would prefer 4k page size for our small text files.

> Several 64k pagesize people have already spent time looking at various
> tail-packing schemes to get around this serious problem.  And that's on
> _server_ class machines.  Large ones.  I don't think
> laptop/desktop/small-server machines would want to go anywhere near this.

I never understood the point of that exercise. If you have variable page 
size then the 64k page size can be used specific to files that benefit 
from it. Typically usage scenarios are video audio streaming I/O, large 
picture files, large documents with embedded images. These are the major
usage scenarios today and we suck at them. Our DVD/CD subsystems are 
currently not capable of directly reading from these devices into the page 
cache since they do not do I/O in 4k chunks.

> > fsck times etc etc are becoming an issue for desktop 
> > systems
> 
> I don't see what fsck has to do with it.
> 
> fsck is single-threaded (hence no locking issues) and operates against the
> blockdev pagecache and does a _lot_ of small reads (indirect blocks,
> especially).  If the memory consumption for each 4k read jumps to 64k, fsck
> is likely to slow down due to performing a lot more additional IO and due
> to entering page reclaim much earlier.

Every 64k block contains more information and the number of pages managed
is reduced by a factor of 16. Less seeks , less tlb pressure , less reads, 
more cpu cache and cpu cache prefetch friendly behavior.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Andrew Morton
> On Thu, 14 Jun 2007 14:37:33 -0700 (PDT) Christoph Lameter <[EMAIL 
> PROTECTED]> wrote:
> On Thu, 14 Jun 2007, Andrew Morton wrote:
> 
> > We want the 100% case.
> 
> Yes that is what we intend to do. Universal support for larger blocksize. 
> I.e. your desktop filesystem will use 64k page size and server platforms 
> likely much larger.

With 64k pagesize the amount of memory required to hold a kernel tree (say)
will go from 270MB to 1400MB.   This is not an optimisation.

Several 64k pagesize people have already spent time looking at various
tail-packing schemes to get around this serious problem.  And that's on
_server_ class machines.  Large ones.  I don't think
laptop/desktop/small-server machines would want to go anywhere near this.

> fsck times etc etc are becoming an issue for desktop 
> systems

I don't see what fsck has to do with it.

fsck is single-threaded (hence no locking issues) and operates against the
blockdev pagecache and does a _lot_ of small reads (indirect blocks,
especially).  If the memory consumption for each 4k read jumps to 64k, fsck
is likely to slow down due to performing a lot more additional IO and due
to entering page reclaim much earlier.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Christoph Lameter
On Thu, 14 Jun 2007, Andrew Morton wrote:

> We want the 100% case.

Yes that is what we intend to do. Universal support for larger blocksize. 
I.e. your desktop filesystem will use 64k page size and server platforms 
likely much larger. fsck times etc etc are becoming an issue for desktop 
systems given the capacities and locking becomes an issue the more 
multicore your desktops become.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Andrew Morton
> On Thu, 14 Jun 2007 14:20:04 -0700 (PDT) Christoph Lameter <[EMAIL 
> PROTECTED]> wrote:
> > I think the best way to proceed would be to investigate that _general_
> > optimisation and then, based upon the results of that work, decide whether
> > further _specialised_ changes such as variable PAGE_CACHE_SIZE are needed,
> > and if so, what they should be.
> 
> As has been pointed out performance is only one beneficial issue of
> having a higher page cache. It is doubtful in principle that the proposed 
> alternative can work given that locking overhead and management overhead
> by the VM are not minimized but made more complex by your envisioned 
> solution.

Why do we have to replay all of this?

You: conceptually-new add-on which benefits 0.25% of the user base, provided
they select the right config options and filesystem.

Me: simpler enhancement which benefits 100% of the user base (ie: includes
4k blocksize, 4k pagesize) and which also fixes your performance problem
with that HBA.


We want the 100% case.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Dave McCracken
On Thursday 14 June 2007, Christoph Hellwig wrote:
> Christophs patches are an extremly useful cleanup and can stand on their
> own.  Right now PAGE_CACHE_SIZE and friends are in there and now one can
> keep them distinct because their useage is not clear at all.  By making
> the macros per-mapping at least the useage is clear.
>
> That beeing said we should do a full conversion so that PAGE_CACHE_SIZE
> just goes away, otherwise the whole excercise is rather pointless.

I agree with Christoph  and Christoph here.  The page_cache_xxx() macros are 
cleaner than PAGE_CACHE_SIZE.  Too many places have gotten it wrong too many 
times.  Let's go ahead with them even if we never implement variable cache 
page size.

Dave McCracken
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Christoph Lameter
On Thu, 14 Jun 2007, Andrew Morton wrote:

> If we never inflict variable PAGE_CACHE_SIZE upon the kernel, these changes
> become pointless obfuscation.

But there is no such reasonable scenario that I am aware of unless we 
continue to add workarounds for the issues covered here to the VM.

And it was pointed out to you that such approach can never stand in place 
of the different uses of having a larger page cache.

> I think the best way to proceed would be to investigate that _general_
> optimisation and then, based upon the results of that work, decide whether
> further _specialised_ changes such as variable PAGE_CACHE_SIZE are needed,
> and if so, what they should be.

As has been pointed out performance is only one beneficial issue of
having a higher page cache. It is doubtful in principle that the proposed 
alternative can work given that locking overhead and management overhead
by the VM are not minimized but made more complex by your envisioned 
solution.

The solution here significantly cleans up the page cache even if we never 
go to the variable page cache. If we do get there then numerous 
workarounds that we have in the tree because of not supporting larger I/O 
go away cleaning up the VM further. The large disk sizes can be handled in 
a reasonable way (f.e. fsck times would decrease) since we can handle
large contiguous chunks of memory. This is a necessary strategic move for 
the Linux kernel. It would also pave the way of managing large chunks
of contiguous memory for other ways and has the potential of getting rid
of such sore spots as the hugetlb filesystem.



-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Christoph Hellwig
On Thu, Jun 14, 2007 at 01:06:45PM -0700, Andrew Morton wrote:
> On Thu, 14 Jun 2007 12:38:39 -0700
> [EMAIL PROTECTED] wrote:
> 
> > This patchset cleans up the page cache handling by replacing
> > open coded shifts and adds through inline function calls.
> 
> If we never inflict variable PAGE_CACHE_SIZE upon the kernel, these changes
> become pointless obfuscation.
> 
> Let's put our horses ahead of our carts.  We had a lengthy discussion about
> variable PAGE_CACHE_SIZE in which I pointed out that the performance
> benefits could be replicated in a manner which doesn't add complexity to
> core VFS and which provides immediate benefit to all filesystems without
> any need to alter them: populate contiguous pagecache pages with physically
> contiguous pages.
> 
> I think the best way to proceed would be to investigate that _general_
> optimisation and then, based upon the results of that work, decide whether
> further _specialised_ changes such as variable PAGE_CACHE_SIZE are needed,
> and if so, what they should be.

Christoph's patches are an extremely useful cleanup and can stand on their
own.  Right now PAGE_CACHE_SIZE and friends are in there and no one can
keep them distinct because their usage is not clear at all.  By making
the macros per-mapping at least the usage is clear.

That being said we should do a full conversion so that PAGE_CACHE_SIZE
just goes away, otherwise the whole exercise is rather pointless.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Andrew Morton
On Thu, 14 Jun 2007 12:38:39 -0700
[EMAIL PROTECTED] wrote:

> This patchset cleans up the page cache handling by replacing
> open coded shifts and adds through inline function calls.

If we never inflict variable PAGE_CACHE_SIZE upon the kernel, these changes
become pointless obfuscation.

Let's put our horses ahead of our carts.  We had a lengthy discussion about
variable PAGE_CACHE_SIZE in which I pointed out that the performance
benefits could be replicated in a manner which doesn't add complexity to
core VFS and which provides immediate benefit to all filesystems without
any need to alter them: populate contiguous pagecache pages with physically
contiguous pages.

I think the best way to proceed would be to investigate that _general_
optimisation and then, based upon the results of that work, decide whether
further _specialised_ changes such as variable PAGE_CACHE_SIZE are needed,
and if so, what they should be.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Andrew Morton
On Thu, 14 Jun 2007 12:38:39 -0700
[EMAIL PROTECTED] wrote:

 This patchset cleans up the page cache handling by replacing
 open coded shifts and adds through inline function calls.

If we never inflict variable PAGE_CACHE_SIZE upon the kernel, these changes
become pointless obfuscation.

Let's put our horses ahead of our carts.  We had a lengthy discussion about
variable PAGE_CACHE_SIZE in which I pointed out that the performance
benefits could be replicated in a manner which doesn't add complexity to
core VFS and which provides immediate benefit to all filesystems without
any need to alter them: populate contiguous pagecache pages with physically
contiguous pages.

I think the best way to proceed would be to investigate that _general_
optimisation and then, based upon the results of that work, decide whether
further _specialised_ changes such as variable PAGE_CACHE_SIZE are needed,
and if so, what they should be.

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Christoph Hellwig
On Thu, Jun 14, 2007 at 01:06:45PM -0700, Andrew Morton wrote:
 On Thu, 14 Jun 2007 12:38:39 -0700
 [EMAIL PROTECTED] wrote:
 
  This patchset cleans up the page cache handling by replacing
  open coded shifts and adds through inline function calls.
 
 If we never inflict variable PAGE_CACHE_SIZE upon the kernel, these changes
 become pointless obfuscation.
 
 Let's put our horses ahead of our carts.  We had a lengthy discussion about
 variable PAGE_CACHE_SIZE in which I pointed out that the performance
 benefits could be replicated in a manner which doesn't add complexity to
 core VFS and which provides immediate benefit to all filesystems without
 any need to alter them: populate contiguous pagecache pages with physically
 contiguous pages.
 
 I think the best way to proceed would be to investigate that _general_
 optimisation and then, based upon the results of that work, decide whether
 further _specialised_ changes such as variable PAGE_CACHE_SIZE are needed,
 and if so, what they should be.

Christoph's patches are an extremely useful cleanup and can stand on their
own.  Right now PAGE_CACHE_SIZE and friends are in there and no one can
keep them distinct because their usage is not clear at all.  By making
the macros per-mapping at least the usage is clear.

That being said we should do a full conversion so that PAGE_CACHE_SIZE
just goes away, otherwise the whole exercise is rather pointless.
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Christoph Lameter
On Thu, 14 Jun 2007, Andrew Morton wrote:

 If we never inflict variable PAGE_CACHE_SIZE upon the kernel, these changes
 become pointless obfuscation.

But there is no such reasonable scenario that I am aware of unless we 
continue to add workarounds for the issues covered here to the VM.

And it was pointed out to you that such approach can never stand in place 
of the different uses of having a larger page cache.

 I think the best way to proceed would be to investigate that _general_
 optimisation and then, based upon the results of that work, decide whether
 further _specialised_ changes such as variable PAGE_CACHE_SIZE are needed,
 and if so, what they should be.

As has been pointed out performance is only one beneficial issue of
having a higher page cache. It is doubtful in principle that the proposed 
alternative can work given that locking overhead and management overhead
by the VM are not minimized but made more complex by your envisioned 
solution.

The solution here significantly cleans up the page cache even if we never 
go to the variable page cache. If we do get there then numerous 
workarounds that we have in the tree because of not supporting larger I/O 
go away cleaning up the VM further. The large disk sizes can be handled in 
a reasonable way (f.e. fsck times would decrease) since we can handle
large contiguous chunks of memory. This is a necessary strategic move for 
the Linux kernel. It would also pave the way of managing large chunks
of contiguous memory for other ways and has the potential of getting rid
of such sore spots as the hugetlb filesystem.



-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Dave McCracken
On Thursday 14 June 2007, Christoph Hellwig wrote:
 Christoph's patches are an extremely useful cleanup and can stand on their
 own.  Right now PAGE_CACHE_SIZE and friends are in there and no one can
 keep them distinct because their usage is not clear at all.  By making
 the macros per-mapping at least the usage is clear.

 That being said we should do a full conversion so that PAGE_CACHE_SIZE
 just goes away, otherwise the whole exercise is rather pointless.

I agree with Christoph and Christoph here.  The page_cache_xxx() macros are 
cleaner than PAGE_CACHE_SIZE.  Too many places have gotten it wrong too many 
times.  Let's go ahead with them even if we never implement variable cache 
page size.

Dave McCracken
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Andrew Morton
 On Thu, 14 Jun 2007 14:20:04 -0700 (PDT) Christoph Lameter [EMAIL 
 PROTECTED] wrote:
  I think the best way to proceed would be to investigate that _general_
  optimisation and then, based upon the results of that work, decide whether
  further _specialised_ changes such as variable PAGE_CACHE_SIZE are needed,
  and if so, what they should be.
 
 As has been pointed out performance is only one beneficial issue of
 having a higher page cache. It is doubtful in principle that the proposed 
 alternative can work given that locking overhead and management overhead
 by the VM are not minimized but made more complex by your envisioned 
 solution.

Why do we have to replay all of this?

You: conceptually-new add-on which benefits 0.25% of the user base, provided
they select the right config options and filesystem.

Me: simpler enhancement which benefits 100% of the user base (ie: includes
4k blocksize, 4k pagesize) and which also fixes your performance problem
with that HBA.


We want the 100% case.
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Christoph Lameter
On Thu, 14 Jun 2007, Andrew Morton wrote:

 We want the 100% case.

Yes that is what we intend to do. Universal support for larger blocksize. 
I.e. your desktop filesystem will use 64k page size and server platforms 
likely much larger. fsck times etc etc are becoming an issue for desktop 
systems given the capacities and locking becomes an issue the more 
multicore your desktops become.

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Andrew Morton
 On Thu, 14 Jun 2007 14:37:33 -0700 (PDT) Christoph Lameter [EMAIL 
 PROTECTED] wrote:
 On Thu, 14 Jun 2007, Andrew Morton wrote:
 
  We want the 100% case.
 
 Yes that is what we intend to do. Universal support for larger blocksize. 
 I.e. your desktop filesystem will use 64k page size and server platforms 
 likely much larger.

With 64k pagesize the amount of memory required to hold a kernel tree (say)
will go from 270MB to 1400MB.   This is not an optimisation.

Several 64k pagesize people have already spent time looking at various
tail-packing schemes to get around this serious problem.  And that's on
_server_ class machines.  Large ones.  I don't think
laptop/desktop/small-server machines would want to go anywhere near this.

 fsck times etc etc are becoming an issue for desktop 
 systems

I don't see what fsck has to do with it.

fsck is single-threaded (hence no locking issues) and operates against the
blockdev pagecache and does a _lot_ of small reads (indirect blocks,
especially).  If the memory consumption for each 4k read jumps to 64k, fsck
is likely to slow down due to performing a lot more additional IO and due
to entering page reclaim much earlier.

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Christoph Lameter
On Thu, 14 Jun 2007, Andrew Morton wrote:

 With 64k pagesize the amount of memory required to hold a kernel tree (say)
 will go from 270MB to 1400MB.   This is not an optimisation.

I do not think that the 100% users will do kernel compiles all day like 
we do. We likely would prefer 4k page size for our small text files.

 Several 64k pagesize people have already spent time looking at various
 tail-packing schemes to get around this serious problem.  And that's on
 _server_ class machines.  Large ones.  I don't think
 laptop/desktop/small-server machines would want to go anywhere near this.

I never understood the point of that exercise. If you have variable page 
size then the 64k page size can be used specific to files that benefit 
from it. Typically usage scenarios are video audio streaming I/O, large 
picture files, large documents with embedded images. These are the major
usage scenarios today and we suck at them. Our DVD/CD subsystems are 
currently not capable of directly reading from these devices into the page 
cache since they do not do I/O in 4k chunks.

  fsck times etc etc are becoming an issue for desktop 
  systems
 
 I don't see what fsck has to do with it.
 
 fsck is single-threaded (hence no locking issues) and operates against the
 blockdev pagecache and does a _lot_ of small reads (indirect blocks,
 especially).  If the memory consumption for each 4k read jumps to 64k, fsck
 is likely to slow down due to performing a lot more additional IO and due
 to entering page reclaim much earlier.

Every 64k block contains more information and the number of pages managed
is reduced by a factor of 16. Fewer seeks, less TLB pressure, fewer reads, 
more cpu cache and cpu cache prefetch friendly behavior.
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Andrew Morton
 On Thu, 14 Jun 2007 15:22:46 -0700 (PDT) Christoph Lameter [EMAIL 
 PROTECTED] wrote:
 On Thu, 14 Jun 2007, Andrew Morton wrote:
 
  With 64k pagesize the amount of memory required to hold a kernel tree (say)
  will go from 270MB to 1400MB.   This is not an optimisation.
 
 I do not think that the 100% users will do kernel compiles all day like 
 we do. We likely would prefer 4k page size for our small text files.

There are many, many applications which use small files.

  Several 64k pagesize people have already spent time looking at various
  tail-packing schemes to get around this serious problem.  And that's on
  _server_ class machines.  Large ones.  I don't think
  laptop/desktop/small-server machines would want to go anywhere near this.
 
 I never understood the point of that exercise. If you have variable page 
 size then the 64k page size can be used specific to files that benefit 
 from it. Typically usage scenarios are video audio streaming I/O, large 
 picture files, large documents with embedded images. These are the major
 usage scenarios today and we suck at them. Our DVD/CD subsystems are 
 currently not capable of directly reading from these devices into the page 
 cache since they do not do I/O in 4k chunks.

So with sufficient magical kernel heuristics or operator intervention, some
people will gain some benefit from 64k pagesize.  Most people with most
workloads will remain where they are: shoving zillions of physically
discontiguous pages into fixed-size sg lists.

Whereas with contig-pagecache, all users on all machines with all workloads
will benefit from the improved merging.

   fsck times etc etc are becoming an issue for desktop 
   systems
  
  I don't see what fsck has to do with it.
  
  fsck is single-threaded (hence no locking issues) and operates against the
  blockdev pagecache and does a _lot_ of small reads (indirect blocks,
  especially).  If the memory consumption for each 4k read jumps to 64k, fsck
  is likely to slow down due to performing a lot more additional IO and due
  to entering page reclaim much earlier.
 
 Every 64k block contains more information and the number of pages managed
 is reduced by a factor of 16. Less seeks , less tlb pressure , less reads, 
 more cpu cache and cpu cache prefetch friendly behavior.

argh.  Everything you say is just wrong.  A fsck involves zillions of
discontiguous small reads.  It is largely seek-bound, so there is no
benefit to be had here.  Your proposed change will introduce regressions by
causing larger amounts of physical reading and large amounts of memory
consumption.


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread David Chinner
On Thu, Jun 14, 2007 at 03:04:17PM -0700, Andrew Morton wrote:
 fsck is single-threaded (hence no locking issues) and operates against the
 blockdev pagecache and does a _lot_ of small reads (indirect blocks,
 especially).

Commenting purely about the above statement (and not on large pages
or block sizes), xfs-repair has had multithreaded capability for some
time now. E.g. from the xfs_repair man page:

   -MDisable  multi-threaded  mode. Normally, xfs_repair runs with
 twice the number of threads as processors.


We have the second generation multithreading code out for review
right now. e.g:

http://oss.sgi.com/archives/xfs/2007-06/msg00069.html

xfs_repair also uses direct I/O and does its own userspace block
caching and so avoids the problems involved with low memory, context
unaware cache reclaim and blockdev cache thrashing.

And to top it all off, some of the prefetch smarts we added result
in reading multiple sparse metadata blocks in a single, larger I/O,
so repair is now often bandwidth bound rather than seek bound...

All I'm trying to say here is that you shouldn't assume that the
problems a particular filesystem fsck has is common to all the
rest

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread David Chinner
On Thu, Jun 14, 2007 at 01:06:45PM -0700, Andrew Morton wrote:
 On Thu, 14 Jun 2007 12:38:39 -0700
 [EMAIL PROTECTED] wrote:
 
  This patchset cleans up the page cache handling by replacing
  open coded shifts and adds through inline function calls.
 
 If we never inflict variable PAGE_CACHE_SIZE upon the kernel, these changes
 become pointless obfuscation.

The open coding of shifts, masks, and other associated cruft is a real
problem. It leads to ugly and hard to understand code when you have to do
anything complex. That means when you come back to that code 6 months later,
you've got to take the time to understand exactly what all that logic is
doing again.

IMO, xfs_page_state_convert() is a great example of where open coding
of PAGE_CACHE_SIZE manipulations lead to eye-bleeding code. This
patch set would go a long way to help clean up that mess.

IOWs, like hch, I think this patch set stands on its own merit
regardless of concerns over variable page cache page sizes

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread David Chinner
On Thu, Jun 14, 2007 at 04:41:18PM -0700, Andrew Morton wrote:
  On Fri, 15 Jun 2007 09:30:02 +1000 David Chinner [EMAIL PROTECTED] wrote:
  xfs_repair also uses direct I/O and does its own userspace block
  caching and so avoids the problems involved with low memory, context
  unaware cache reclaim and blockdev cache thrashing.
 
 umm, that sounds like a mistake to me.  fscks tend to get run when there's
 no swap online.  A small system with a large disk risks going oom and can
 no longer be booted. 

xfs_repair is never run at boot time - we don't force periodic
boot time checks like ext3/4 does so this isn't a problem.

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Christoph Lameter
On Thu, 14 Jun 2007, Andrew Morton wrote:

  I do not think that the 100% users will do kernel compiles all day like 
  we do. We likely would prefer 4k page size for our small text files.
 
 There are many, many applications which use small files.

There is no problem with them using 4k page size concurrently to a higher 
page size for other files.

  I never understood the point of that exercise. If you have variable page 
  size then the 64k page size can be used specific to files that benefit 
  from it. Typically usage scenarios are video audio streaming I/O, large 
  picture files, large documents with embedded images. These are the major
  usage scenarios today and we suck at them. Our DVD/CD subsystems are 
  currently not capable of directly reading from these devices into the page 
  cache since they do not do I/O in 4k chunks.
 
 So with sufficient magical kernel heuristics or operator intervention, some
 people will gain some benefit from 64k pagesize.  Most people with most
 workloads will remain where they are: shoving zillions of physically
 discontiguous pages into fixed-size sg lists.

Magical? There is nothing magical about doing transfers in the size that 
is supported by a device. That is good sense.

  Every 64k block contains more information and the number of pages managed
  is reduced by a factor of 16. Less seeks , less tlb pressure , less reads, 
  more cpu cache and cpu cache prefetch friendly behavior.
 
 argh.  Everything you say is just wrong.  A fsck involves zillions of
 discontiguous small reads.  It is largely seek-bound, so there is no
 benefit to be had here.  Your proposed change will introduce regressions by
 causing larger amounts of physical reading and large amounts of memory
 consumption.

Of course there is. The seeks are reduced since there are a factor 
of 16 fewer metadata blocks. fsck does not read files. It just reads 
metadata structures. And the larger contiguous areas the faster.

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Andrew Morton
On Thu, 14 Jun 2007 17:45:43 -0700 (PDT) Christoph Lameter [EMAIL PROTECTED] 
wrote:

 On Thu, 14 Jun 2007, Andrew Morton wrote:
 
   I do not think that the 100% users will do kernel compiles all day like 
   we do. We likely would prefer 4k page size for our small text files.
  
  There are many, many applications which use small files.
 
 There is no problem with them using 4k page size concurrently to a higher 
 page size for other files.

There will be files which should use 64k but which instead end up using 4k.

There will be files which should use 4k but which instead end up using 64k.

Because determining which size to use requires either operator intervention
or kernel heuristics, both of which will be highly unreliable.

It's better to just make 4k pages go faster.

   I never understood the point of that exercise. If you have variable page 
   size then the 64k page size can be used specific to files that benefit 
   from it. Typically usage scenarios are video audio streaming I/O, large 
   picture files, large documents with embedded images. These are the major
   usage scenarios today and we suck at them. Our DVD/CD subsystems are 
   currently not capable of directly reading from these devices into the 
   page 
   cache since they do not do I/O in 4k chunks.
  
  So with sufficient magical kernel heuristics or operator intervention, some
  people will gain some benefit from 64k pagesize.  Most people with most
  workloads will remain where they are: shoving zillions of physically
  discontiguous pages into fixed-size sg lists.
 
 Magical? There is nothing magical about doing transfers in the size that 
 is supported by a device. That is good sense.

By magical heuristics I'm referring to the (required) tricks and guesses
which the kernel will need to deploy to be able to guess which page-size it
should use for each file.

Because without such heuristics, none of this new stuff which you're
proposing would ever get used by 90% of apps on 90% of machines.

   Every 64k block contains more information and the number of pages managed
   is reduced by a factor of 16. Less seeks , less tlb pressure , less 
   reads, 
   more cpu cache and cpu cache prefetch friendly behavior.
  
  argh.  Everything you say is just wrong.  A fsck involves zillions of
  discontiguous small reads.  It is largely seek-bound, so there is no
  benefit to be had here.  Your proposed change will introduce regressions by
  causing larger amounts of physical reading and large amounts of memory
  consumption.
 
 Of course there is. The seeks are reduced since there are a factor 
 of 16 fewer metadata blocks. fsck does not read files. It just reads 
 metadata structures. And the larger contiguous areas the faster.

Some metadata is contiguous: inode tables, some directories (if they got
lucky), bitmap tables.  But fsck surely reads them in a single swoop
anyway, so there's no gain there.

Other metadata (indirect blocks) are 100% discontiguous, and reading those
with a 64k IO into 64k of memory is completely dumb.

And yes, I'm referring to the 90% case again.  The one we want to
optimise for.



-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Christoph Lameter
On Thu, 14 Jun 2007, Andrew Morton wrote:

 There will be files which should use 64k but which instead end up using 4k.
 
 There will be files which should use 4k but which instead end up using 64k.
 
 Because determining which size to use requires either operator intervention
 or kernel heuristics, both of which will be highly unreliable.
 
 It's better to just make 4k pages go faster.

Initially its quite easy to have a filesystem for your 4k files (basically 
the distro you are running) and an archive for video / audio etc files 
that has 64k size for data. In the future filesystem may support sizes set 
per directory. Basically if things get too slow you can pull the lever.

  Magical? There is nothing magical about doing transfers in the size that 
  is supported by a device. That is good sense.
 
 By magical heuristics I'm referring to the (required) tricks and guesses
 which the kernel will need to deploy to be able to guess which page-size it
 should use for each file.
 
 Because without such heuristics, none of this new stuff which you're
 proposing would ever get used by 90% of apps on 90% of machines.

In the patchset V3 one f.e. simply formats a volume by specifying the 
desired blocksize. If one gets into trouble with fsck and other slowdown 
associated with large file I/O then they are going to be quite fast to 
format a partition with larger blocksize. It's a known technology in many 
Unixes.

The approach essentially gives one freedom to choose a page size. This is 
a tradeoff between desired speed, expected file sizes, filesystem behavior 
and acceptable fragmentation overhead. If we do this approach then I think 
we will see the mkfs.XXX tools automatically make intelligent choices
on which page size to use. They are all stuck at 4k at the moment.

  Of course there is. The seeks are reduced since there are an factor 
  of 16 less metadata blocks. fsck does not read files. It just reads 
  metadata structures. And the larger contiguous areas the faster.
 
 Some metadata is contiguous: inode tables, some directories (if they got
 lucky), bitmap tables.  But fsck surely reads them in a single swoop
 anyway, so there's no gain there.

The metadata needs to refer to 1/16th of the earlier pages that need to be 
tracked. metadata is shrunk significantly.
 
 Other metadata (indirect blocks) are 100% discontiguous, and reading those
 with a 64k IO into 64k of memory is completely dumb.

The effect of a larger page size is that the filesystem will 
place more meta data into a single page instead of spreading it out. 
Reading a mass of meta data with a 64k read is an intelligent choice to 
make in particular if there is a large series of such reads.
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Andrew Morton
On Thu, 14 Jun 2007 19:04:27 -0700 (PDT) Christoph Lameter [EMAIL PROTECTED] 
wrote:

   Of course there is. The seeks are reduced since there are a factor 
   of 16 fewer metadata blocks. fsck does not read files. It just reads 
   metadata structures. And the larger contiguous areas the faster.
  
  Some metadata is contiguous: inode tables, some directories (if they got
  lucky), bitmap tables.  But fsck surely reads them in a single swoop
  anyway, so there's no gain there.
 
 The metadata needs to refer to 1/16th of the earlier pages that need to be 
 tracked. metadata is shrunk significantly.

Only if the filesystems are altered to use larger blocksizes and if the
operator then chooses to use that feature.  Then they suck for small-sized
(and even medium-sized) files.

So you're still talking about corner cases: specialised applications which
require careful setup and administrator intervention.

What can we do to optimise the common case?

  Other metadata (indirect blocks) are 100% discontiguous, and reading those
  with a 64k IO into 64k of memory is completely dumb.
 
 The effect of a larger page size is that the filesystem will 
 place more meta data into a single page instead of spreading it out. 
 Reading a mass of meta data with a 64k read is an intelligent choice to 
 make in particular if there is a large series of such reads.

Again: requires larger blocksize: specialised, uninteresting for what will
remain the common case: 4k blocksize.

The alleged fsck benefit is also unrelated to variable PAGE_CACHE_SIZE. 
It's a feature of larger (unwieldy?) blocksize, and xfs already has that
working (doesn't it?)

There may be some benefits to some future version of ext4.
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch 00/14] Page cache cleanup in anticipation of Large Blocksize support

2007-06-14 Thread Christoph Lameter
On Thu, 14 Jun 2007, Andrew Morton wrote:

  The metadata needs to refer to 1/16th of the earlier pages that need to be 
  tracked. metadata is shrunk significantly.
 
 Only if the filesystems are altered to use larger blocksizes and if the
 operator then chooses to use that feature.  Then they suck for small-sized
 (and even medium-sized) files.

Nope. File systems already support that. The changes to XFS and ext2 are 
basically just doing the cleanups that we are discussing here plus some 
changes to set_blocksize.
 
 So you're still talking about corner cases: specialised applications which
 require careful setup and administrator intervention.
 
 What can we do to optimise the common case?

The common filesystem will be able to support large block sizes easily. 
Most filesystems already run on 16k and 64k page size platforms and do 
just fine. All the work is already done. Just the VM needs to give them 
support for larger page sizes on smaller page size platforms.

This is optimizing the common case.

 The alleged fsck benefit is also unrelated to variable PAGE_CACHE_SIZE. 
 It's a feature of larger (unwieldy?) blocksize, and xfs already has that
 working (doesn't it?)

It has a hack with severe limitations like we have done in many other 
components of the kernel. These hacks can be removed if the large 
blocksize support is merged. XFS still has the problem that the block 
layer without page cache support for higher pages cannot easily deal with 
large contiguous pages.

 There may be some benefits to some future version of ext4.

I have already run ext4 with 64k blocksize on x86_64 with 4k. It has been 
done for years with 64k page size on IA64 and powerpc and there is no fs 
issue with running it on 4k platforms with the large blocksize patchset.
The filesystems work reliably. The core linux code is the issue that we 
need to solve and this is the beginning of doing so.
 
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/