[RFC PATCH] vfs: optimization to /proc/pid/mountinfo patch

2008-02-04 Thread Ram Pai
1) reports deleted inode in dentry_path() consistent with that in __d_path()
2) modified __d_path() to use prepend(), reducing the size of __d_path()
3) moved all the functionality that reports mount information in /proc under
CONFIG_PROC_FS.

Could not verify if the code would work with CONFIG_PROC_FS=n, since it was
impossible to disable CONFIG_PROC_FS. Looking for ideas on how to disable
CONFIG_PROC_FS.



Signed-off-by: Ram Pai [EMAIL PROTECTED]
---
 fs/dcache.c  |   59 +++
 fs/namespace.c   |2 +
 fs/seq_file.c|2 +
 include/linux/dcache.h   |3 ++
 include/linux/seq_file.h |3 ++
 5 files changed, 34 insertions(+), 35 deletions(-)

Index: linux-2.6.23/fs/dcache.c
===
--- linux-2.6.23.orig/fs/dcache.c
+++ linux-2.6.23/fs/dcache.c
@@ -1747,6 +1747,17 @@ shouldnt_be_hashed:
goto shouldnt_be_hashed;
 }
 
+static int prepend(char **buffer, int *buflen, const char *str,
+ int namelen)
+{
+   *buflen -= namelen;
+   if (*buflen  0)
+   return 1;
+   *buffer -= namelen;
+   memcpy(*buffer, str, namelen);
+   return 0;
+}
+
 /**
  * d_path - return the path of a dentry
  * @dentry: dentry to report
@@ -1768,17 +1779,11 @@ static char *__d_path(struct dentry *den
 {
char * end = buffer+buflen;
char * retval;
-   int namelen;
 
-   *--end = '\0';
-   buflen--;
-   if (!IS_ROOT(dentry)  d_unhashed(dentry)) {
-   buflen -= 10;
-   end -= 10;
-   if (buflen  0)
+   prepend(end, buflen, \0, 1);
+   if (!IS_ROOT(dentry)  d_unhashed(dentry) 
+   prepend(end, buflen,  (deleted), 10))
goto Elong;
-   memcpy(end,  (deleted), 10);
-   }
 
if (buflen  1)
goto Elong;
@@ -1805,13 +1810,10 @@ static char *__d_path(struct dentry *den
}
parent = dentry-d_parent;
prefetch(parent);
-   namelen = dentry-d_name.len;
-   buflen -= namelen + 1;
-   if (buflen  0)
+   if (prepend(end, buflen, dentry-d_name.name,
+   dentry-d_name.len) ||
+   prepend(end, buflen, /, 1))
goto Elong;
-   end -= namelen;
-   memcpy(end, dentry-d_name.name, namelen);
-   *--end = '/';
retval = end;
dentry = parent;
}
@@ -1819,12 +1821,9 @@ static char *__d_path(struct dentry *den
return retval;
 
 global_root:
-   namelen = dentry-d_name.len;
-   buflen -= namelen;
-   if (buflen  0)
-   goto Elong;
-   retval -= namelen-1;/* hit the slash */
-   memcpy(retval, dentry-d_name.name, namelen);
+   retval += 1;/* hit the slash */
+   if (prepend(retval, buflen, dentry-d_name.name, dentry-d_name.len))
+   goto Elong;
return retval;
 Elong:
return ERR_PTR(-ENAMETOOLONG);
@@ -1890,17 +1889,8 @@ char *dynamic_dname(struct dentry *dentr
return memcpy(buffer, temp, sz);
 }
 
-static int prepend(char **buffer, int *buflen, const char *str,
- int namelen)
-{
-   *buflen -= namelen;
-   if (*buflen  0)
-   return 1;
-   *buffer -= namelen;
-   memcpy(*buffer, str, namelen);
-   return 0;
-}
 
+#ifdef CONFIG_PROC_FS
 /*
  * Write full pathname from the root of the filesystem into the buffer.
  */
@@ -1910,11 +1900,9 @@ char *dentry_path(struct dentry *dentry,
char *retval;
 
spin_lock(dcache_lock);
-   prepend(end, buflen, \0, 1);
-   if (!IS_ROOT(dentry)  d_unhashed(dentry)) {
-   if (prepend(end, buflen, //deleted, 9))
+   if (!IS_ROOT(dentry)  d_unhashed(dentry) 
+   prepend(end, buflen,  (deleted), 10))
goto Elong;
-   }
if (buflen  1)
goto Elong;
/* Get '/' right */
@@ -1943,6 +1931,7 @@ Elong:
spin_unlock(dcache_lock);
return ERR_PTR(-ENAMETOOLONG);
 }
+#endif /* CONFIG_PROC_FS */
 
 /*
  * NOTE! The user-level library version returns a
Index: linux-2.6.23/fs/namespace.c
===
--- linux-2.6.23.orig/fs/namespace.c
+++ linux-2.6.23/fs/namespace.c
@@ -609,6 +609,7 @@ void mnt_unpin(struct vfsmount *mnt)
 
 EXPORT_SYMBOL(mnt_unpin);
 
+#ifdef CONFIG_PROC_FS
 /* iterator */
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
@@ -795,6 +796,7 @@ const struct seq_operations mountstats_o
.stop   = m_stop,
.show   = show_vfsstat,
 };
+#endif  /* CONFIG_PROC_FS */
 
 /**
  * may_umount_tree - check if a mount tree is busy
Index: linux-2.6.23/fs/seq_file.c

Re: [RFC PATCH] vfs: optimization to /proc/pid/mountinfo patch

2008-02-04 Thread Andrew Morton
On Mon, 04 Feb 2008 01:15:05 -0800 Ram Pai [EMAIL PROTECTED] wrote:

 1) reports deleted inode in dentry_path() consistent with that in __d_path()
 2) modified __d_path() to use prepend(), reducing the size of __d_path()
 3) moved all the functionality that reports mount information in /proc under
   CONFIG_PROC_FS.
 
 Could not verify if the code would work with CONFIG_PROC_FS=n, since it was
 impossible to disable CONFIG_PROC_FS. Looking for ideas on how to disable
 CONFIG_PROC_FS.
   

Do `make menuconfig', then hit '/' and search for proc_fs.

It'll tell you that you need to set EMBEDDED=y to disable procfs.

  fs/dcache.c  |   59 
 +++
  fs/namespace.c   |2 +
  fs/seq_file.c|2 +
  include/linux/dcache.h   |3 ++
  include/linux/seq_file.h |3 ++

Please resend after testing that, thanks.

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RESEND] [PATCH] ext3,4:fdatasync should skip metadata writeout when overwriting

2008-02-04 Thread Hisashi Hifumi
Hi.

Currently fdatasync is identical to fsync in ext3,4.
I think fdatasync should skip journal flush in data=ordered and data=writeback 
mode
when it overwrites to already-instantiated blocks on HDD.
When I_DIRTY_DATASYNC flag is not set, fdatasync should skip journal writeout
because this indicates only atime or/and mtime updates.  

The following patch takes the same approach as ext2's fsync code (ext2_sync_file).

I did a performance test using the sysbench.

#sysbench --num-threads=128 --max-requests=5 --test=fileio 
--file-total-size=128G 
--file-test-mode=rndwr --file-fsync-mode=fdatasync run

The result was:

-2.6.24
Operations performed:  0 Read, 50080 Write, 59600 Other = 109680 Total
Read 0b  Written 782.5Mb  Total transferred 782.5Mb  (12.116Mb/sec)
  775.45 Requests/sec executed

Test execution summary:
total time:  64.5814s
total number of events:  50080
total time taken by event execution: 3713.9836
per-request statistics:
 min:0.s
 avg:0.0742s
 max:0.9375s
 approx.  95 percentile: 0.2901s

Threads fairness:
events (avg/stddev):   391.2500/23.26
execution time (avg/stddev):   29.0155/1.99


-2.6.24-patched
Operations performed:  0 Read, 50009 Write, 61596 Other = 111605 Total
Read 0b  Written 781.39Mb  Total transferred 781.39Mb  (16.419Mb/sec)
 1050.83 Requests/sec executed

Test execution summary:
total time:  47.5900s
total number of events:  50009
total time taken by event execution: 2934.5768
per-request statistics:
 min:0.s
 avg:0.0587s
 max:0.8938s
 approx.  95 percentile: 0.1993s

Threads fairness:
events (avg/stddev):   390.6953/22.64
execution time (avg/stddev):   22.9264/1.17


Filesystem I/O throughput was improved.

Thanks.

Signed-off-by :Hisashi Hifumi [EMAIL PROTECTED]

diff -Nrup linux-2.6.24.org/fs/ext3/fsync.c linux-2.6.24/fs/ext3/fsync.c
--- linux-2.6.24.org/fs/ext3/fsync.c2008-01-25 07:58:37.0 +0900
+++ linux-2.6.24/fs/ext3/fsync.c2008-02-04 12:42:42.0 +0900
@@ -72,6 +72,9 @@ int ext3_sync_file(struct file * file, s
goto out;
}
 
+   if (datasync  !(inode-i_state  I_DIRTY_DATASYNC))
+   goto out;
+
/*
 * The VFS has written the file data.  If the inode is unaltered
 * then we need not start a commit.
diff -Nrup linux-2.6.24.org/fs/ext4/fsync.c linux-2.6.24/fs/ext4/fsync.c
--- linux-2.6.24.org/fs/ext4/fsync.c2008-01-25 07:58:37.0 +0900
+++ linux-2.6.24/fs/ext4/fsync.c2008-02-04 12:43:37.0 +0900
@@ -72,6 +72,9 @@ int ext4_sync_file(struct file * file, s
goto out;
}
 
+if (datasync  !(inode-i_state  I_DIRTY_DATASYNC))
+goto out;
+
/*
 * The VFS has written the file data.  If the inode is unaltered
 * then we need not start a commit.

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 2/3] mm: Add NR_WRITEBACK_TEMP counter

2008-02-04 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Fuse will use temporary buffers to write back dirty data from memory
mappings (normal writes are done synchronously).  This is needed,
because there cannot be any guarantee about the time in which a write
will complete.

By using temporary buffers, from the MM's point of view the page is
written back immediately.  If the writeout was due to memory pressure,
this effectively migrates data from a full zone to a less full zone.

This patch adds a new counter (NR_WRITEBACK_TEMP) for the number of
pages used as temporary buffers.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/proc/proc_misc.c
===
--- linux.orig/fs/proc/proc_misc.c  2008-02-04 12:29:00.0 +0100
+++ linux/fs/proc/proc_misc.c   2008-02-04 13:01:35.0 +0100
@@ -178,6 +178,7 @@ static int meminfo_read_proc(char *page,
PageTables:   %8lu kB\n
NFS_Unstable: %8lu kB\n
Bounce:   %8lu kB\n
+   WritebackTmp: %8lu kB\n
CommitLimit:  %8lu kB\n
Committed_AS: %8lu kB\n
VmallocTotal: %8lu kB\n
@@ -209,6 +210,7 @@ static int meminfo_read_proc(char *page,
K(global_page_state(NR_PAGETABLE)),
K(global_page_state(NR_UNSTABLE_NFS)),
K(global_page_state(NR_BOUNCE)),
+   K(global_page_state(NR_WRITEBACK_TEMP)),
K(allowed),
K(committed),
(unsigned long)VMALLOC_TOTAL  10,
Index: linux/include/linux/mmzone.h
===
--- linux.orig/include/linux/mmzone.h   2008-02-04 12:29:01.0 +0100
+++ linux/include/linux/mmzone.h2008-02-04 13:01:35.0 +0100
@@ -95,6 +95,7 @@ enum zone_stat_item {
NR_UNSTABLE_NFS,/* NFS unstable pages */
NR_BOUNCE,
NR_VMSCAN_WRITE,
+   NR_WRITEBACK_TEMP,  /* Writeback using temporary buffers */
 #ifdef CONFIG_NUMA
NUMA_HIT,   /* allocated in intended node */
NUMA_MISS,  /* allocated in non intended node */
Index: linux/drivers/base/node.c
===
--- linux.orig/drivers/base/node.c  2008-02-04 12:28:53.0 +0100
+++ linux/drivers/base/node.c   2008-02-04 13:01:35.0 +0100
@@ -64,6 +64,7 @@ static ssize_t node_read_meminfo(struct 
   Node %d PageTables:   %8lu kB\n
   Node %d NFS_Unstable: %8lu kB\n
   Node %d Bounce:   %8lu kB\n
+  Node %d WritebackTmp: %8lu kB\n
   Node %d Slab: %8lu kB\n
   Node %d SReclaimable: %8lu kB\n
   Node %d SUnreclaim:   %8lu kB\n,
@@ -86,6 +87,7 @@ static ssize_t node_read_meminfo(struct 
   nid, K(node_page_state(nid, NR_PAGETABLE)),
   nid, K(node_page_state(nid, NR_UNSTABLE_NFS)),
   nid, K(node_page_state(nid, NR_BOUNCE)),
+  nid, K(node_page_state(nid, NR_WRITEBACK_TEMP)),
   nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) +
node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
   nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)),
Index: linux/mm/page-writeback.c
===
--- linux.orig/mm/page-writeback.c  2008-02-04 13:01:23.0 +0100
+++ linux/mm/page-writeback.c   2008-02-04 13:01:35.0 +0100
@@ -211,7 +211,8 @@ clip_bdi_dirty_limit(struct backing_dev_
avail_dirty = dirty -
(global_page_state(NR_FILE_DIRTY) +
 global_page_state(NR_WRITEBACK) +
-global_page_state(NR_UNSTABLE_NFS));
+global_page_state(NR_UNSTABLE_NFS) +
+global_page_state(NR_WRITEBACK_TEMP));
 
if (avail_dirty  0)
avail_dirty = 0;

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 3/3] fuse: support writable mmap

2008-02-04 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Quoting Linus (3 years ago, FUSE inclusion discussions):

  User-space filesystems are hard to get right. I'd claim that they
   are almost impossible, unless you limit them somehow (shared
   writable mappings are the nastiest part - if you don't have those,
   you can reasonably limit your problems by limiting the number of
   dirty pages you accept through normal write() calls).

Instead of attempting the impossible, I've just waited for the dirty
page accounting infrastructure to materialize (thanks to Peter
Zijlstra and others).  This nicely solved the biggest problem:
limiting the number of pages used for write caching.

Some small details remained, however, which this largish patch
attempts to address.  It provides a page writeback implementation for
fuse, which is completely safe against VM related deadlocks.
Performance may not be very good for certain usage patterns, but
generally it should be acceptable.

It has been tested extensively with fsx-linux and bash-shared-mapping.

This patch depends on
mm-bdi-allow-setting-a-maximum-for-the-bdi-dirty-limit-fix.patch


Fuse page writeback design
--

fuse_writepage() allocates a new temporary page with
GFP_NOFS|__GFP_HIGHMEM.  It copies the contents of the original page,
and queues a WRITE request to the userspace filesystem using this temp
page.

The writeback is finished instantly from the MM's point of view: the
page is removed from the radix trees, and the PageDirty and
PageWriteback flags are cleared.

For the duration of the actual write, the NR_WRITEBACK_TEMP counter is
incremented.  The per-bdi writeback count is not decremented until the
actual write completes.

On dirtying the page, fuse waits for a previous write to finish before
proceeding.  This makes sure, there can only be one temporary page used
at a time for one cached page.

This approach is wasteful in both memory and CPU bandwidth, so why is
this complication needed?

The basic problem is that there can be no guarantee about the time in
which the userspace filesystem will complete a write.  It may be buggy
or even malicious, and fail to complete WRITE requests.  We don't want
unrelated parts of the system to grind to a halt in such cases.

Also a filesystem may need additional resources (particularly memory)
to complete a WRITE request.  There's a great danger of a deadlock if
that allocation may wait for the writepage to finish.

Currently there are several cases where the kernel can block on page
writeback:

  - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER
  - page migration
  - throttle_vm_writeout (through NR_WRITEBACK)
  - sync(2)

Of course in some cases (fsync, msync) we explicitly want to allow
blocking.  So for these cases new code has to be added to fuse, since
the VM is not tracking writeback pages for us any more.

As an extra safety measure, the maximum dirty ratio allocated to a
single fuse filesystem is set to 1% by default.  This way one (or
several) buggy or malicious fuse filesystems cannot slow down the rest
of the system by hogging dirty memory.

With appropriate privileges, this limit can be raised through
'/sys/class/bdi/<bdi>/max_ratio'.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/fuse/dev.c
===
--- linux.orig/fs/fuse/dev.c2008-02-04 15:24:03.0 +0100
+++ linux/fs/fuse/dev.c 2008-02-04 15:24:47.0 +0100
@@ -47,6 +47,14 @@ struct fuse_req *fuse_request_alloc(void
return req;
 }
 
+struct fuse_req *fuse_request_alloc_nofs(void)
+{
+   struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_NOFS);
+   if (req)
+   fuse_request_init(req);
+   return req;
+}
+
 void fuse_request_free(struct fuse_req *req)
 {
kmem_cache_free(fuse_req_cachep, req);
@@ -430,6 +438,17 @@ void request_send_background(struct fuse
 }
 
 /*
+ * Called under fc-lock
+ *
+ * fc-connected must have been checked previously
+ */
+void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req)
+{
+   req-isreply = 1;
+   request_send_nowait_locked(fc, req);
+}
+
+/*
  * Lock the request.  Up to the next unlock_request() there mustn't be
  * anything that could cause a page-fault.  If the request was already
  * aborted bail out.
Index: linux/fs/fuse/dir.c
===
--- linux.orig/fs/fuse/dir.c2008-02-04 15:24:03.0 +0100
+++ linux/fs/fuse/dir.c 2008-02-04 15:24:47.0 +0100
@@ -1107,6 +1107,50 @@ static void iattr_to_fattr(struct iattr 
 }
 
 /*
+ * Prevent concurrent writepages on inode
+ *
+ * This is done by adding a negative bias to the inode write counter
+ * and waiting for all pending writes to finish.
+ */
+void fuse_set_nowrite(struct inode *inode)
+{
+   struct fuse_conn *fc = get_fuse_conn(inode);
+   struct fuse_inode *fi = get_fuse_inode(inode);
+
+  

[patch 0/3] fuse: writable mmap

2008-02-04 Thread Miklos Szeredi
This is a short series for fuse writable mmap support.

The first two patches are small additions to mm infrastructure.  The
third is a large patch for fuse.  It also depends on the mm: bdi:
export BDI attributes in sysfs series.

I don't mind if this goes into 2.6.25 (guess, that depends on whether
the bdi things go).

Thanks,
Miklos

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)

2008-02-04 Thread Peter Staubach

Miklos Szeredi wrote:
  
  

Would you describe the situation that would cause the kernel to
go into an infinite loop, please?



The patch basically does:

do {
...
error = inode->i_op->foo()
...
} while (error == ESTALE);

What is the guarantee, that ->foo() will not always return ESTALE?
  

You skimmed over some stuff, like the pathname lookup component
contained in the first set of dots...

I can't guarantee that ->foo() won't always return ESTALE.

That said, the loop is not unbreakable.  At least for NFS, a signal
to the process will interrupt the loop because the error returned
will change from ESTALE to EINTR.



In FUSE interrupts are sent to userspace, and the filesystem decides
what to do with them.  So it is entirely possible and valid for a
filesystem to ignore an interrupt.  If an operation was non-blocking
(such as one returning an error), then there would in fact be no
purpose in checking interrupts.

  


Why do you think that it is valid to ignore pending signals?
You seem to be asserting that it okay for processes to hang,
uninterruptibly, when accessing files on fuse mounted file
systems?

Perhaps the right error to return when there is a signal
pending is EINTR and not ESTALE or some other error?  There
has to be some way for the application to detect that its
system call was interrupted due to a signal pending.


So while sending a signal might reliably work in NFS to break out of
the loop, it does not necessarily work for other filesystems, and fuse
may not be the only one affected.

  


Have you noticed another one?  I would be happy to chat with the
developers for that file system to see if this support would
negatively impact them.


Also up till now, returning ESTALE in a fuse filesystem was a
perfectly valid thing to do.  This patch changes the behavior of that
rather drastically.  There might be installed systems that rely on
current behavior, and we want to avoid breaking those on a kernel
upgrade.

  


Perhaps the explanation for what ESTALE means was not clear?
If there are fuse file systems which really do support the
notion of ESTALE, then it seems to me that they would also
benefit from this support, ie. the ability to do some recovery
from the situation.


A few solutions come to mind, perhaps the best is to introduce a
kernel internal errno value (ERETRYSTALE), that forces the relevant
system calls to be retried.

NFS could transform ESTALE errors to ERETRYSTALE and get the desired
behavior, while other filesystems would not be affected.


We don't need more error numbers, we've got plenty already.  :-)

Do you have anything more specific about any real problems?
I see lots of mays and coulds, but I don't see anything
that I can do to make this support better.

   Thanx...

  ps
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 0/3] add perform_write to a_ops

2008-02-04 Thread Miklos Szeredi
a_ops->perform_write() was left out from Nick Piggin's new a_ops
patchset, as it was non-essential, and postponed for later inclusion.

This short series reintroduces it, but only adds the fuse
implementation and not simple_perform_write(), which I'm not sure
would be a significant improvement.

This allows larger than 4k buffered writes for fuse, which is one of
the most requested features.

This goes on top of the fuse: writable mmap patches.

Thanks,
Miklos

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 3/3] fuse: implement perform_write

2008-02-04 Thread Miklos Szeredi
From: Nick Piggin [EMAIL PROTECTED]

Introduce fuse_perform_write. With fusexmp (a passthrough filesystem), large
(1MB) writes into a backing tmpfs filesystem are sped up by almost 4 times
(256MB/s vs 71MB/s).

[EMAIL PROTECTED]:

 - split into smaller functions
 - testing

Signed-off-by: Nick Piggin [EMAIL PROTECTED]
Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/fuse/file.c
===
--- linux.orig/fs/fuse/file.c   2008-02-04 17:11:18.0 +0100
+++ linux/fs/fuse/file.c2008-02-04 17:11:59.0 +0100
@@ -677,6 +677,148 @@ static int fuse_write_end(struct file *f
return res;
 }
 
+static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
+   struct inode *inode, loff_t pos,
+   size_t count)
+{
+   size_t res;
+   unsigned offset;
+   unsigned i;
+
+   for (i = 0; i  req-num_pages; i++)
+   fuse_wait_on_page_writeback(inode, req-pages[i]-index);
+
+   res = fuse_send_write(req, file, inode, pos, count, NULL);
+
+   offset = req-page_offset;
+   count = res;
+   for (i = 0; i  req-num_pages; i++) {
+   struct page *page = req-pages[i];
+
+   if (!req-out.h.error  !offset  count = PAGE_CACHE_SIZE)
+   SetPageUptodate(page);
+
+   /* Just ignore count underflow on last page */
+   count -= PAGE_CACHE_SIZE - offset;
+   offset = 0;
+
+   unlock_page(page);
+   page_cache_release(page);
+   }
+
+   return res;
+}
+
+static ssize_t fuse_fill_write_pages(struct fuse_req *req,
+  struct address_space *mapping,
+  struct iov_iter *ii, loff_t pos)
+{
+   struct fuse_conn *fc = get_fuse_conn(mapping-host);
+   unsigned offset = pos  (PAGE_CACHE_SIZE - 1);
+   size_t count = 0;
+   int err;
+
+   req-page_offset = offset;
+
+   do {
+   size_t tmp;
+   struct page *page;
+   pgoff_t index = pos  PAGE_CACHE_SHIFT;
+   size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset,
+iov_iter_count(ii));
+
+   bytes = min_t(size_t, bytes, fc-max_write - count);
+
+ again:
+   err = -EFAULT;
+   if (iov_iter_fault_in_readable(ii, bytes))
+   break;
+
+   err = -ENOMEM;
+   page = __grab_cache_page(mapping, index);
+   if (!page)
+   break;
+
+   pagefault_disable();
+   tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
+   pagefault_enable();
+   flush_dcache_page(page);
+
+   if (!tmp) {
+   unlock_page(page);
+   page_cache_release(page);
+   bytes = min(bytes, iov_iter_single_seg_count(ii));
+   goto again;
+   }
+
+   err = 0;
+   req-pages[req-num_pages] = page;
+   req-num_pages++;
+
+   iov_iter_advance(ii, tmp);
+   count += tmp;
+   pos += tmp;
+   offset += tmp;
+   if (offset == PAGE_CACHE_SIZE)
+   offset = 0;
+
+   } while (iov_iter_count(ii)  count  fc-max_write 
+req-num_pages  FUSE_MAX_PAGES_PER_REQ  offset == 0);
+
+   return count  0 ? count : err;
+}
+
+static ssize_t fuse_perform_write(struct file *file,
+ struct address_space *mapping,
+ struct iov_iter *ii, loff_t pos)
+{
+   struct inode *inode = mapping-host;
+   struct fuse_conn *fc = get_fuse_conn(inode);
+   int err = 0;
+   ssize_t res = 0;
+
+   if (is_bad_inode(inode))
+   return -EIO;
+
+   do {
+   struct fuse_req *req;
+   ssize_t count;
+
+   req = fuse_get_req(fc);
+   if (IS_ERR(req)) {
+   err = PTR_ERR(req);
+   break;
+   }
+
+   count = fuse_fill_write_pages(req, mapping, ii, pos);
+   if (count = 0) {
+   err = count;
+   } else {
+   size_t num_written;
+
+   num_written = fuse_send_write_pages(req, file, inode,
+   pos, count);
+   err = req-out.h.error;
+   if (!err) {
+   res += num_written;
+   pos += num_written;
+
+   /* break out of the loop on short write */
+   if (num_written != count)
+   

Re: [RFC] ext3: per-process soft-syncing data=ordered mode

2008-02-04 Thread Jan Kara
On Sat 02-02-08 00:26:00, Al Boldi wrote:
 Chris Mason wrote:
  On Thursday 31 January 2008, Jan Kara wrote:
   On Thu 31-01-08 11:56:01, Chris Mason wrote:
On Thursday 31 January 2008, Al Boldi wrote:
 The big difference between ordered and writeback is that once the
 slowdown starts, ordered goes into ~100% iowait, whereas writeback
 continues 100% user.
   
Does data=ordered write buffers in the order they were dirtied?  This
might explain the extreme problems in transactional workloads.
  
 Well, it does but we submit them to block layer all at once so
   elevator should sort the requests for us...
 
  nr_requests is fairly small, so a long stream of random requests should
  still end up being random IO.
 
  Al, could you please compare the write throughput from vmstat for the
  data=ordered vs data=writeback runs?  I would guess the data=ordered one
  has a lower overall write throughput.
 
 That's what I would have guessed, but it's actually going up 4x fold for 
 mysql from 559mb to 2135mb, while the db-size ends up at 549mb.
  So you say we write 4-times as much data in ordered mode as in writeback
mode. Hmm, probably possible because we force all the dirty data to disk
when committing a transaction in ordered mode (and don't do this in
writeback mode). So if the workload repeatedly dirties the whole DB, we are
going to write the whole DB several times in ordered mode but in writeback
mode we just keep the data in memory all the time. But this is what you
ask for if you mount in ordered mode so I wouldn't consider it a bug.
  I still don't like your hack with per-process journal mode setting but we
could easily do per-file journal mode setting (we already have a flag to do
data journaling for a file) and that would help at least your DB
workload...

Honza
-- 
Jan Kara [EMAIL PROTECTED]
SUSE Labs, CR
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)

2008-02-04 Thread Miklos Szeredi
  In FUSE interrupts are sent to userspace, and the filesystem decides
  what to do with them.  So it is entirely possible and valid for a
  filesystem to ignore an interrupt.  If an operation was non-blocking
  (such as one returning an error), then there would in fact be no
  purpose in checking interrupts.
 

 
 Why do you think that it is valid to ignore pending signals?
 You seem to be asserting that it okay for processes to hang,
 uninterruptibly, when accessing files on fuse mounted file
 systems?
 
 Perhaps the right error to return when there is a signal
 pending is EINTR and not ESTALE or some other error?  There
 has to be some way for the application to detect that its
 system call was interrupted due to a signal pending.

Traditionally a lot of filesystem related system calls are not
interruptible, and for good reason.  For example what happens, if an
app receives a signal, while the filesystem is performing a rename()
request?  It would be very confusing if the call returned EINTR, but
the rename would successfully complete regardless.

We had a related problem with the open(O_CREAT) call in fuse, which
was interruptible between the creation and the actual open because of
a design mistake.  So it could return EINTR, after the file was
created, and this broke a real world application (don't have details
at hand, but could dig them out if you are interested).

I don't know what NFS does, but returning EINTR without actually
canceling an operation in the server is generally not a good idea.

  So while sending a signal might reliably work in NFS to break out of
  the loop, it does not necessarily work for other filesystems, and fuse
  may not be the only one affected.
 

 
 Have you noticed another one?  I would be happy to chat with the
 developers for that file system to see if this support would
 negatively impact them.

Oh, I have no idea.  And I wouldn't want to do a full audit of all the
filesystems to find out.  But if you do, please go ahead.

  A few solutions come to mind, perhaps the best is to introduce a
  kernel internal errno value (ERETRYSTALE), that forces the relevant
  system calls to be retried.
 
  NFS could transform ESTALE errors to ERETRYSTALE and get the desired
  behavior, while other filesystems would not be affected.
 
 We don't need more error numbers, we've got plenty already.  :-)

That's a rather poor excuse against a simple solution which would
spare us some backward compatibility problems.

 Do you have anything more specific about any real problems?
 I see lots of mays and coulds, but I don't see anything
 that I can do to make this support better.

Implement the above suggestion?  Or something else.

Otherwise I have to NAK this patch due to the possibility of it
breaking existing fuse installations.

Thanks,
Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)

2008-02-04 Thread Miklos Szeredi
  I don't know what NFS does, but returning EINTR without actually
  canceling an operation in the server is generally not a good idea.
 

 
 This is what NFS has been doing, for several decades, and no one
 has complained yet.

Is it really?  Man nfs says something quite different (emphasis mine):

   intr   If an NFS file operation has a *major timeout* and  it  is
   hard  mounted,  then  allow signals to interupt the file
   operation and cause it to return EINTR  to  the  calling
   program.  The *default* is to *not* allow file operations to
   be *interrupted*.

  Have you noticed another one?  I would be happy to chat with the
  developers for that file system to see if this support would
  negatively impact them.
  
 
  Oh, I have no idea.  And I wouldn't want to do a full audit of all the
  filesystems to find out.  But if you do, please go ahead.
 

 
 Well, you brought it up.  I thought that perhaps you had something
 other than FUD.

It's not FUD, it's being careful not to break an implementation when
changing an API in a backward incompatible way.

 Please describe this real and existing fuse installation so that I can
 better understand the situation and the real requirements here.

I have already done so:

  Also up till now, returning ESTALE in a fuse filesystem was a
   perfectly valid thing to do.  This patch changes the behavior of
   that rather drastically.  There might be installed systems that
   rely on current behavior, and we want to avoid breaking those on a
   kernel upgrade.

Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)

2008-02-04 Thread Peter Staubach

Miklos Szeredi wrote:

In FUSE interrupts are sent to userspace, and the filesystem decides
what to do with them.  So it is entirely possible and valid for a
filesystem to ignore an interrupt.  If an operation was non-blocking
(such as one returning an error), then there would in fact be no
purpose in checking interrupts.

  
  

Why do you think that it is valid to ignore pending signals?
You seem to be asserting that it okay for processes to hang,
uninterruptibly, when accessing files on fuse mounted file
systems?

Perhaps the right error to return when there is a signal
pending is EINTR and not ESTALE or some other error?  There
has to be some way for the application to detect that its
system call was interrupted due to a signal pending.



Traditionally a lot of filesystem related system calls are not
interruptible, and for good reason.  For example what happens, if an
app receives a signal, while the filesystem is performing a rename()
request?  It would be very confusing if the call returned EINTR, but
the rename would successfully complete regardless.

We had a related problem with the open(O_CREAT) call in fuse, which
was interruptible between the creation and the actual open because of
a design mistake.  So it could return EINTR, after the file was
created, and this broke a real world application (don't have details
at hand, but could dig them out if you are interested).

I don't know what NFS does, but returning EINTR without actually
canceling an operation in the server is generally not a good idea.

  


This is what NFS has been doing, for several decades, and no one
has complained yet.  It is just generally accepted.  I do agree
that it isn't the best of semantics, but it does seem to work and
does solve a real problem which exists if you don't allow an
operation to be interrupted.  The alternative, for NFS clients,
was potentially to block an application until a server, which
might never come back up, comes back up.  It was a serious
problem and worse than this resolution.

Yes, I'd like to hear the details and find out why it was a
problem.  If you allow the fuse file system to block waiting
on things which may never occur, then you are going to have a
problem.  I would suggest considering this now instead of waiting
until it is too late.  We can learn from the NFS experience instead
of just dismissing it.



So while sending a signal might reliably work in NFS to break out of
the loop, it does not necessarily work for other filesystems, and fuse
may not be the only one affected.

  
  

Have you noticed another one?  I would be happy to chat with the
developers for that file system to see if this support would
negatively impact them.



Oh, I have no idea.  And I wouldn't want to do a full audit of all the
filesystems to find out.  But if you do, please go ahead.

  


Well, you brought it up.  I thought that perhaps you had something
other than FUD.


A few solutions come to mind, perhaps the best is to introduce a
kernel internal errno value (ERETRYSTALE), that forces the relevant
system calls to be retried.

NFS could transform ESTALE errors to ERETRYSTALE and get the desired
behavior, while other filesystems would not be affected.
  

We don't need more error numbers, we've got plenty already.  :-)



That's a rather poor excuse against a simple solution which would
spare us some backward compatibility problems.

  


Potential backwards compatibility problems and none are even known
or even considered.

The solution here isn't to create more hacks and a new error number
for this purpose is just a hack.


Do you have anything more specific about any real problems?
I see lots of mays and coulds, but I don't see anything
that I can do to make this support better.



Implement the above suggestion?  Or something else.

Otherwise I have to NAK this patch due to the possibility of it
breaking existing fuse installations.


Please describe this real and existing fuse installation so that I can
better understand the situation and the real requirements here.

Instead of attempting to block this proposal, what about considering
how to architect fuse to handle the situation instead of pretending
that fuse won't have the same problem to solve if it isn't solved
here?  I have a real problem to solve and I need to get it resolved.
I have real customers, with real problems, and not just theoretical
and vague ones.

 ps
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 1/3] mm: bdi: export bdi_writeout_inc()

2008-02-04 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Fuse needs this for writable mmap support.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/include/linux/backing-dev.h
===
--- linux.orig/include/linux/backing-dev.h  2008-02-04 12:29:01.0 
+0100
+++ linux/include/linux/backing-dev.h   2008-02-04 13:01:23.0 +0100
@@ -149,6 +149,8 @@ static inline unsigned long bdi_stat_err
 int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio);
 int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
 
+extern void bdi_writeout_inc(struct backing_dev_info *bdi);
+
 /*
  * Flags in backing_dev_info::capability
  * - The first two flags control whether dirty pages will contribute to the
Index: linux/mm/page-writeback.c
===
--- linux.orig/mm/page-writeback.c  2008-02-04 12:29:01.0 +0100
+++ linux/mm/page-writeback.c   2008-02-04 13:01:23.0 +0100
@@ -168,6 +168,16 @@ static inline void __bdi_writeout_inc(st
  bdi-max_prop_frac);
 }
 
+void bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+   unsigned long flags;
+
+   local_irq_save(flags);
+   __bdi_writeout_inc(bdi);
+   local_irq_restore(flags);
+}
+EXPORT_SYMBOL(bdi_writeout_inc);
+
 static inline void task_dirty_inc(struct task_struct *tsk)
 {
prop_inc_single(vm_dirties, tsk-dirties);

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 1/3] vfs: introduce perform_write in a_ops

2008-02-04 Thread Miklos Szeredi
From: Nick Piggin [EMAIL PROTECTED]

Introduce a new perform_write() address space operation.

This is a single-call, bulk version of write_begin/write_end
operations.  It is only used in the buffered write path (write_begin
must still be implemented), and not for in-kernel writes to pagecache.

For some filesystems, using this can provide significant speedups.

Signed-off-by: Nick Piggin [EMAIL PROTECTED]
Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/include/linux/fs.h
===
--- linux.orig/include/linux/fs.h   2008-02-04 15:24:03.0 +0100
+++ linux/include/linux/fs.h2008-02-04 16:24:19.0 +0100
@@ -469,6 +469,9 @@ struct address_space_operations {
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
 
+   ssize_t (*perform_write)(struct file *, struct address_space *mapping,
+   struct iov_iter *i, loff_t pos);
+
/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidatepage) (struct page *, unsigned long);
Index: linux/mm/filemap.c
===
--- linux.orig/mm/filemap.c 2008-02-04 15:24:03.0 +0100
+++ linux/mm/filemap.c  2008-02-04 16:22:55.0 +0100
@@ -2312,7 +2312,9 @@ generic_file_buffered_write(struct kiocb
struct iov_iter i;
 
iov_iter_init(i, iov, nr_segs, count, written);
-   if (a_ops-write_begin)
+   if (a_ops-perform_write)
+   status = a_ops-perform_write(file, mapping, i, pos);
+   else if (a_ops-write_begin)
status = generic_perform_write(file, i, pos);
else
status = generic_perform_write_2copy(file, i, pos);
Index: linux/Documentation/filesystems/vfs.txt
===
--- linux.orig/Documentation/filesystems/vfs.txt2008-02-04 
12:28:50.0 +0100
+++ linux/Documentation/filesystems/vfs.txt 2008-02-04 16:23:44.0 
+0100
@@ -533,6 +533,9 @@ struct address_space_operations {
int (*write_end)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
+   ssize_t (*perform_write)(struct file *, struct address_space *mapping,
+   struct iov_iter *i, loff_t pos);
+
sector_t (*bmap)(struct address_space *, sector_t);
int (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, int);
@@ -664,6 +667,17 @@ struct address_space_operations {
 Returns  0 on failure, otherwise the number of bytes (= 'copied')
 that were able to be copied into pagecache.
 
+  perform_write: This is a single-call, bulk version of write_begin/write_end
+operations. It is only used in the buffered write path (write_begin
+must still be implemented), and not for in-kernel writes to pagecache.
+It takes an iov_iter structure, which provides a descriptor for the
+source data (and has associated iov_iter_xxx helpers to operate on
+that data). There are also file, mapping, and pos arguments, which
+specify the destination of the data.
+
+Returns  0 on failure if nothing was written out, otherwise returns
+the number of bytes copied into pagecache.
+
   bmap: called by the VFS to map a logical block offset within object to
physical block number. This method is used by the FIBMAP
ioctl and for working with swap-files.  To be able to swap to

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH] vfs: optimization to /proc/pid/mountinfo patch

2008-02-04 Thread Miklos Szeredi
 1) reports deleted inode in dentry_path() consistent with that in __d_path()
 2) modified __d_path() to use prepend(), reducing the size of __d_path()
 3) moved all the functionality that reports mount information in /proc under
   CONFIG_PROC_FS.
 
 Could not verify if the code would work with CONFIG_PROC_FS=n, since it was
 impossible to disable CONFIG_PROC_FS. Looking for ideas on how to disable
 CONFIG_PROC_FS.
   
 
 
 Signed-off-by: Ram Pai [EMAIL PROTECTED]
 ---
  fs/dcache.c  |   59 
 +++
  fs/namespace.c   |2 +
  fs/seq_file.c|2 +
  include/linux/dcache.h   |3 ++
  include/linux/seq_file.h |3 ++
  5 files changed, 34 insertions(+), 35 deletions(-)
 
 Index: linux-2.6.23/fs/dcache.c
 ===
 --- linux-2.6.23.orig/fs/dcache.c
 +++ linux-2.6.23/fs/dcache.c
 @@ -1747,6 +1747,17 @@ shouldnt_be_hashed:
   goto shouldnt_be_hashed;
  }
  
 +static int prepend(char **buffer, int *buflen, const char *str,
 +   int namelen)
 +{
 + *buflen -= namelen;
 + if (*buflen  0)
 + return 1;

This is confusing.  Should return -ENAMETOOLONG instead (see Chapter 16
in Documentation/CodingStyle).

 + *buffer -= namelen;
 + memcpy(*buffer, str, namelen);
 + return 0;
 +}
 +
  /**
   * d_path - return the path of a dentry
   * @dentry: dentry to report
 @@ -1768,17 +1779,11 @@ static char *__d_path(struct dentry *den
  {
   char * end = buffer+buflen;
   char * retval;
 - int namelen;
  
 - *--end = '\0';
 - buflen--;
 - if (!IS_ROOT(dentry)  d_unhashed(dentry)) {
 - buflen -= 10;
 - end -= 10;
 - if (buflen  0)
 + prepend(end, buflen, \0, 1);
 + if (!IS_ROOT(dentry)  d_unhashed(dentry) 
 + prepend(end, buflen,  (deleted), 10))

And this should test for prepend() != 0 or prepend() < 0 instead,
otherwise it could easily be misread as if prepend() succeeded,
then

And similarly for all the later calls.

Thanks,
Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 0/3] add perform_write to a_ops

2008-02-04 Thread Christoph Hellwig
On Mon, Feb 04, 2008 at 06:04:10PM +0100, Miklos Szeredi wrote:
 a_ops->perform_write() was left out from Nick Piggin's new a_ops
 patchset, as it was non-essential, and postponed for later inclusion.
 
 This short series reintroduces it, but only adds the fuse
 implementation and not simple_perform_write(), which I'm not sure
 would be a significant improvement.
 
 This allows larger than 4k buffered writes for fuse, which is one of
 the most requested features.
 
 This goes on top of the fuse: writable mmap patches.

Please don't do this, but rather implement your own .aio_write.  There's
very little in generic_file_aio_write that wouldn't be handled by
->perform_write and we should rather factor those up or move to higher
layers than adding this ill-defined abstraction.

-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 2/3] fuse: clean up setting i_size in write

2008-02-04 Thread Miklos Szeredi
From: Miklos Szeredi [EMAIL PROTECTED]

Extract common code for setting i_size in write functions into a
common helper.

Signed-off-by: Miklos Szeredi [EMAIL PROTECTED]
---

Index: linux/fs/fuse/file.c
===
--- linux.orig/fs/fuse/file.c   2008-02-04 13:01:39.0 +0100
+++ linux/fs/fuse/file.c2008-02-04 13:02:03.0 +0100
@@ -610,13 +610,24 @@ static int fuse_write_begin(struct file 
return 0;
 }
 
+static void fuse_write_update_size(struct inode *inode, loff_t pos)
+{
+   struct fuse_conn *fc = get_fuse_conn(inode);
+   struct fuse_inode *fi = get_fuse_inode(inode);
+
+   spin_lock(fc-lock);
+   fi-attr_version = ++fc-attr_version;
+   if (pos  inode-i_size)
+   i_size_write(inode, pos);
+   spin_unlock(fc-lock);
+}
+
 static int fuse_buffered_write(struct file *file, struct inode *inode,
   loff_t pos, unsigned count, struct page *page)
 {
int err;
size_t nres;
struct fuse_conn *fc = get_fuse_conn(inode);
-   struct fuse_inode *fi = get_fuse_inode(inode);
unsigned offset = pos  (PAGE_CACHE_SIZE - 1);
struct fuse_req *req;
 
@@ -643,12 +654,7 @@ static int fuse_buffered_write(struct fi
err = -EIO;
if (!err) {
pos += nres;
-   spin_lock(fc-lock);
-   fi-attr_version = ++fc-attr_version;
-   if (pos  inode-i_size)
-   i_size_write(inode, pos);
-   spin_unlock(fc-lock);
-
+   fuse_write_update_size(inode, pos);
if (count == PAGE_CACHE_SIZE)
SetPageUptodate(page);
}
@@ -766,12 +772,8 @@ static ssize_t fuse_direct_io(struct fil
}
fuse_put_request(fc, req);
if (res  0) {
-   if (write) {
-   spin_lock(fc-lock);
-   if (pos  inode-i_size)
-   i_size_write(inode, pos);
-   spin_unlock(fc-lock);
-   }
+   if (write)
+   fuse_write_update_size(inode, pos);
*ppos = pos;
}
fuse_invalidate_attr(inode);

--
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 0/3] add perform_write to a_ops

2008-02-04 Thread Miklos Szeredi
  a_ops->perform_write() was left out from Nick Piggin's new a_ops
  patchset, as it was non-essential, and postponed for later inclusion.
  
  This short series reintroduces it, but only adds the fuse
  implementation and not simple_perform_write(), which I'm not sure
  would be a significant improvement.
  
  This allows larger than 4k buffered writes for fuse, which is one of
  the most requested features.
  
  This goes on top of the fuse: writable mmap patches.
 
 Please don't do this, but rather implement your own .aio_write.  There's
 very little in generic_file_aio_write that wouldn't be handled by
 ->perform_write and we should rather factor those up or move to higher
 layers than adding this ill-defined abstraction.
 

Moving up to higher layers might not be possible, due to lock/unlock
of i_mutex being inside generic_file_aio_write().

But with fuse being the only user, it's not a huge issue duplicating
some code.

Nick, were there any other candidates, that would want to use such an
interface in the future?

Thanks,
Miklos
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch 0/3] add perform_write to a_ops

2008-02-04 Thread Christoph Hellwig
On Mon, Feb 04, 2008 at 09:52:06PM +0100, Miklos Szeredi wrote:
 Moving up to higher layers might not be possible, due to lock/unlock
 of i_mutex being inside generic_file_aio_write().

Well some bits can be moved up.  Here's my grand plan which I plan
to implement once I get some time for it (or let someone else do
if they beat me):

 - generic_segment_checks goes to fs/read_write.c before calling into
   the filesystem
 - ditto for vfs_check_frozen
 - generic_write_checks is a suitable helper already
 - ditto for remove_suid
 - ditto for file_update_time
 - after that there's not a whole lot left in generic_file_aio_write,
   except for direct I/O handling which will probably be very fs-specific
   if you have your own buffered I/O code

generic_file_buffered_write is an almost trivial wrapper around what's
->perform_write in Nick's earlier patches and a helper for the syncing
activity.



 
 But with fuse being the only user, it's not a huge issue duplicating
 some code.
 
 Nick, were there any other candidates, that would want to use such an
 interface in the future?
 
 Thanks,
 Miklos
 -
 To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
 the body of a message to [EMAIL PROTECTED]
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
---end quoted text---
-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [2.6.24 REGRESSION] BUG: Soft lockup - with VFS

2008-02-04 Thread Andrew Morton
On Mon, 28 Jan 2008 09:31:43 +0100 Oliver Pinter (Pintér Olivér)  [EMAIL 
PROTECTED] wrote:

 hi all!
 
 in the 2.6.24 become i some soft lockups with usb-phone, when i pluged
 in the mobile, then the vfs-layer crashed. am afternoon can i the
 .config send, and i bisected the kernel, when i have time.
 
 pictures from crash:
 http://students.zipernowsky.hu/~oliverp/kernel/regression_2624/

It looks like selinux's file_has_perm() is doing spin_lock() on an
uninitialised (or already locked) spinlock.


-
To unsubscribe from this list: send the line unsubscribe linux-fsdevel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] ext3: per-process soft-syncing data=ordered mode

2008-02-04 Thread Al Boldi
Jan Kara wrote:
 On Sat 02-02-08 00:26:00, Al Boldi wrote:
  Chris Mason wrote:
   Al, could you please compare the write throughput from vmstat for the
   data=ordered vs data=writeback runs?  I would guess the data=ordered
   one has a lower overall write throughput.
 
  That's what I would have guessed, but it's actually going up 4x fold for
  mysql from 559mb to 2135mb, while the db-size ends up at 549mb.

   So you say we write 4-times as much data in ordered mode as in writeback
 mode. Hmm, probably possible because we force all the dirty data to disk
 when committing a transaction in ordered mode (and don't do this in
 writeback mode). So if the workload repeatedly dirties the whole DB, we
 are going to write the whole DB several times in ordered mode but in
 writeback mode we just keep the data in memory all the time. But this is
 what you ask for if you mount in ordered mode so I wouldn't consider it a
 bug.

Ok, maybe not a bug, but a bit inefficient.  Check out this workload:

sync;

while :; do
  dd < /dev/full > /mnt/sda2/x.dmp bs=1M count=20
  rm -f /mnt/sda2/x.dmp
  usleep 1
done

vmstat 1 ( with mount /dev/sda2 /mnt/sda2 -o data=writeback)  note io-bo 

procs ---memory-- ---swap-- -io --system-- cpu
 r  b   swpd   free   buff  cache   si   sobibo   incs us sy id wa
 2  0  0 293008   5232  5743600 0 0   18   206  4 80 16  0
 1  0  0 282840   5232  6762000 0 0   18   238  3 81 16  0
 1  0  0 297032   5244  5336400 0   152   21   211  4 79 17  0
 1  0  0 285236   5244  6522400 0 0   18   232  4 80 16  0
 1  0  0 299464   5244  5088000 0 0   18   222  4 80 16  0
 1  0  0 290156   5244  6017600 0 0   18   236  3 80 17  0
 0  0  0 302124   5256  4778800 0   152   21   213  4 80 16  0
 1  0  0 292180   5256  5824800 0 0   18   239  3 81 16  0
 1  0  0 287452   5256  6244400 0 0   18   202  3 80 17  0
 1  0  0 293016   5256  5739200 0 0   18   250  4 80 16  0
 0  0  0 302052   5256  4778800 0 0   19   194  3 81 16  0
 1  0  0 297536   5268  5292800 0   152   20   233  4 79 17  0
 1  0  0 286468   5268  6387200 0 0   18   212  3 81 16  0
 1  0  0 301572   5268  4881200 0 0   18   267  4 79 17  0
 1  0  0 292636   5268  5777600 0 0   18   208  4 80 16  0
 1  0  0 302124   5280  4778800 0   152   21   237  4 80 16  0
 1  0  0 291436   5280  5897600 0 0   18   205  3 81 16  0
 1  0  0 302068   5280  4778800 0 0   18   234  3 81 16  0
 1  0  0 293008   5280  5738800 0 0   18   221  4 79 17  0
 1  0  0 297288   5292  5253200 0   156   22   233  2 81 16  1
 1  0  0 294676   5292  5572400 0 0   19   199  3 81 16  0


vmstat 1 (with mount /dev/sda2 /mnt/sda2 -o data=ordered)

procs ---memory-- ---swap-- -io --system-- cpu
 r  b   swpd   free   buff  cache   si   sobibo   incs us sy id wa
 2  0  0 291052   5156  5901600 0 0   19   223  3 82 15  0
 1  0  0 291408   5156  5870400 0 0   18   218  3 81 16  0
 1  0  0 291888   5156  5827600 020   23   229  3 80 17  0
 1  0  0 300764   5168  4947200 0 12864   91   235  3 69 13 15
 1  0  0 300740   5168  4945600 0 0   19   215  3 80 17  0
 1  0  0 301088   5168  4904400 0 0   18   241  4 80 16  0
 1  0  0 298220   5168  5187200 0 0   18   225  3 81 16  0
 0  1  0 289168   5168  6075200 0 12712   45   237  3 77 15  5
 1  0  0 300260   5180  4985200 0   152   68   211  4 72 15  9
 1  0  0 298616   5180  5146000 0 0   18   237  3 81 16  0
 1  0  0 296988   5180  5309200 0 0   18   223  3 81 16  0
 1  0  0 296608   5180  5348000 0 0   18   223  3 81 16  0
 0  0  0 301640   5192  4803600 0 12868   93   206  4 67 13 16
 0  0  0 301624   5192  4803600 0 0   21   218  3 81 16  0
 0  0  0 301600   5192  4803600 0 0   18   212  3 81 16  0
 0  0  0 301584   5192  4803600 0 0   18   209  4 80 16  0
 0  0  0 301568   5192  4803600 0 0   18   208  3 81 16  0
 1  0  0 285520   5204  6454800 0 12864   95   216  3 69 13 15
 2  0  0 285124   5204  6492400 0 0   18   222  4 80 16  0
 1  0  0 283612   5204  6639200 0 0   18   231  3 81 16  0
 1  0  0 284216   5204  6573600 0 0   18   218  4 80 16  0
 0  1  0 289160   5204  6075200 0 12712   56   213  3 74 15  8
 1  0  0 285884   5216  64128