[RFC PATCH] vfs: optimization to /proc/<pid>/mountinfo patch
1) reports deleted inode in dentry_path() consistent with that in __d_path() 2) modified __d_path() to use prepend(), reducing the size of __d_path() 3) moved all the functionality that reports mount information in /proc under CONFIG_PROC_FS. Could not verify if the code would work with CONFIG_PROC_FS=n, since it was impossible to disable CONFIG_PROC_FS. Looking for ideas on how to disable CONFIG_PROC_FS. Signed-off-by: Ram Pai <[EMAIL PROTECTED]> --- fs/dcache.c | 59 +++ fs/namespace.c |2 + fs/seq_file.c|2 + include/linux/dcache.h |3 ++ include/linux/seq_file.h |3 ++ 5 files changed, 34 insertions(+), 35 deletions(-) Index: linux-2.6.23/fs/dcache.c === --- linux-2.6.23.orig/fs/dcache.c +++ linux-2.6.23/fs/dcache.c @@ -1747,6 +1747,17 @@ shouldnt_be_hashed: goto shouldnt_be_hashed; } +static int prepend(char **buffer, int *buflen, const char *str, + int namelen) +{ + *buflen -= namelen; + if (*buflen < 0) + return 1; + *buffer -= namelen; + memcpy(*buffer, str, namelen); + return 0; +} + /** * d_path - return the path of a dentry * @dentry: dentry to report @@ -1768,17 +1779,11 @@ static char *__d_path(struct dentry *den { char * end = buffer+buflen; char * retval; - int namelen; - *--end = '\0'; - buflen--; - if (!IS_ROOT(dentry) && d_unhashed(dentry)) { - buflen -= 10; - end -= 10; - if (buflen < 0) + prepend(&end, &buflen, "\0", 1); + if (!IS_ROOT(dentry) && d_unhashed(dentry) && + prepend(&end, &buflen, " (deleted)", 10)) goto Elong; - memcpy(end, " (deleted)", 10); - } if (buflen < 1) goto Elong; @@ -1805,13 +1810,10 @@ static char *__d_path(struct dentry *den } parent = dentry->d_parent; prefetch(parent); - namelen = dentry->d_name.len; - buflen -= namelen + 1; - if (buflen < 0) + if (prepend(&end, &buflen, dentry->d_name.name, + dentry->d_name.len) || + prepend(&end, &buflen, "/", 1)) goto Elong; - end -= namelen; - memcpy(end, dentry->d_name.name, namelen); - *--end = '/'; retval = end; dentry = parent; } @@ -1819,12 +1821,9 @@ static char *__d_path(struct 
dentry *den return retval; global_root: - namelen = dentry->d_name.len; - buflen -= namelen; - if (buflen < 0) - goto Elong; - retval -= namelen-1;/* hit the slash */ - memcpy(retval, dentry->d_name.name, namelen); + retval += 1;/* hit the slash */ + if (prepend(&retval, &buflen, dentry->d_name.name, dentry->d_name.len)) + goto Elong; return retval; Elong: return ERR_PTR(-ENAMETOOLONG); @@ -1890,17 +1889,8 @@ char *dynamic_dname(struct dentry *dentr return memcpy(buffer, temp, sz); } -static int prepend(char **buffer, int *buflen, const char *str, - int namelen) -{ - *buflen -= namelen; - if (*buflen < 0) - return 1; - *buffer -= namelen; - memcpy(*buffer, str, namelen); - return 0; -} +#ifdef CONFIG_PROC_FS /* * Write full pathname from the root of the filesystem into the buffer. */ @@ -1910,11 +1900,9 @@ char *dentry_path(struct dentry *dentry, char *retval; spin_lock(&dcache_lock); - prepend(&end, &buflen, "\0", 1); - if (!IS_ROOT(dentry) && d_unhashed(dentry)) { - if (prepend(&end, &buflen, "//deleted", 9)) + if (!IS_ROOT(dentry) && d_unhashed(dentry) && + prepend(&end, &buflen, " (deleted)", 10)) goto Elong; - } if (buflen < 1) goto Elong; /* Get '/' right */ @@ -1943,6 +1931,7 @@ Elong: spin_unlock(&dcache_lock); return ERR_PTR(-ENAMETOOLONG); } +#endif /* CONFIG_PROC_FS */ /* * NOTE! The user-level library version returns a Index: linux-2.6.23/fs/namespace.c === --- linux-2.6.23.orig/fs/namespace.c +++ linux-2.6.23/fs/namespace.c @@ -609,6 +609,7 @@ void mnt_unpin(struct vfsmount *mnt) EXPORT_SYMBOL(mnt_unpin); +#ifdef CONFIG_PROC_FS /* iterator */ static void *m_start(struct seq_file *m, loff_t *pos) { @@ -795,6 +796,7 @@ const struct seq_operations mountstats_o .stop = m_stop, .show = show_vfsstat, }; +#endif /* CONFIG_PROC_FS */ /** * may_umount_tree - check if a mount tree is busy Index: linux-2.6.
Re: [RFC PATCH] vfs: optimization to /proc/<pid>/mountinfo patch
On Mon, 04 Feb 2008 01:15:05 -0800 Ram Pai <[EMAIL PROTECTED]> wrote: > 1) reports deleted inode in dentry_path() consistent with that in __d_path() > 2) modified __d_path() to use prepend(), reducing the size of __d_path() > 3) moved all the functionality that reports mount information in /proc under > CONFIG_PROC_FS. > > Could not verify if the code would work with CONFIG_PROC_FS=n, since it was > impossible to disable CONFIG_PROC_FS. Looking for ideas on how to disable > CONFIG_PROC_FS. > Do `make menuconfig', then hit '/' and search for "proc_fs". It'll tell you that you need to set EMBEDDED=y to disable procfs. > fs/dcache.c | 59 > +++ > fs/namespace.c |2 + > fs/seq_file.c|2 + > include/linux/dcache.h |3 ++ > include/linux/seq_file.h |3 ++ Please resend after testing that, thanks. - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RESEND] [PATCH] ext3,4:fdatasync should skip metadata writeout when overwriting
Hi. Currently fdatasync is identical to fsync in ext3,4. I think fdatasync should skip journal flush in data=ordered and data=writeback mode when it overwrites to already-instantiated blocks on HDD. When I_DIRTY_DATASYNC flag is not set, fdatasync should skip journal writeout because this indicates only atime or/and mtime updates. Following patch is the same approach of ext2's fsync code(ext2_sync_file). I did a performance test using the sysbench. #sysbench --num-threads=128 --max-requests=5 --test=fileio --file-total-size=128G --file-test-mode=rndwr --file-fsync-mode=fdatasync run The result was: -2.6.24 Operations performed: 0 Read, 50080 Write, 59600 Other = 109680 Total Read 0b Written 782.5Mb Total transferred 782.5Mb (12.116Mb/sec) 775.45 Requests/sec executed Test execution summary: total time: 64.5814s total number of events: 50080 total time taken by event execution: 3713.9836 per-request statistics: min:0.s avg:0.0742s max:0.9375s approx. 95 percentile: 0.2901s Threads fairness: events (avg/stddev): 391.2500/23.26 execution time (avg/stddev): 29.0155/1.99 -2.6.24-patched Operations performed: 0 Read, 50009 Write, 61596 Other = 111605 Total Read 0b Written 781.39Mb Total transferred 781.39Mb (16.419Mb/sec) 1050.83 Requests/sec executed Test execution summary: total time: 47.5900s total number of events: 50009 total time taken by event execution: 2934.5768 per-request statistics: min:0.s avg:0.0587s max:0.8938s approx. 95 percentile: 0.1993s Threads fairness: events (avg/stddev): 390.6953/22.64 execution time (avg/stddev): 22.9264/1.17 Filesystem I/O throughput was improved. Thanks. 
Signed-off-by :Hisashi Hifumi <[EMAIL PROTECTED]> diff -Nrup linux-2.6.24.org/fs/ext3/fsync.c linux-2.6.24/fs/ext3/fsync.c --- linux-2.6.24.org/fs/ext3/fsync.c2008-01-25 07:58:37.0 +0900 +++ linux-2.6.24/fs/ext3/fsync.c2008-02-04 12:42:42.0 +0900 @@ -72,6 +72,9 @@ int ext3_sync_file(struct file * file, s goto out; } + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + goto out; + /* * The VFS has written the file data. If the inode is unaltered * then we need not start a commit. diff -Nrup linux-2.6.24.org/fs/ext4/fsync.c linux-2.6.24/fs/ext4/fsync.c --- linux-2.6.24.org/fs/ext4/fsync.c2008-01-25 07:58:37.0 +0900 +++ linux-2.6.24/fs/ext4/fsync.c2008-02-04 12:43:37.0 +0900 @@ -72,6 +72,9 @@ int ext4_sync_file(struct file * file, s goto out; } +if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) +goto out; + /* * The VFS has written the file data. If the inode is unaltered * then we need not start a commit. - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 2/3] mm: Add NR_WRITEBACK_TEMP counter
From: Miklos Szeredi <[EMAIL PROTECTED]> Fuse will use temporary buffers to write back dirty data from memory mappings (normal writes are done synchronously). This is needed, because there cannot be any guarantee about the time in which a write will complete. By using temporary buffers, from the MM's point if view the page is written back immediately. If the writeout was due to memory pressure, this effectively migrates data from a full zone to a less full zone. This patch adds a new counter (NR_WRITEBACK_TEMP) for the number of pages used as temporary buffers. Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]> --- Index: linux/fs/proc/proc_misc.c === --- linux.orig/fs/proc/proc_misc.c 2008-02-04 12:29:00.0 +0100 +++ linux/fs/proc/proc_misc.c 2008-02-04 13:01:35.0 +0100 @@ -178,6 +178,7 @@ static int meminfo_read_proc(char *page, "PageTables: %8lu kB\n" "NFS_Unstable: %8lu kB\n" "Bounce: %8lu kB\n" + "WritebackTmp: %8lu kB\n" "CommitLimit: %8lu kB\n" "Committed_AS: %8lu kB\n" "VmallocTotal: %8lu kB\n" @@ -209,6 +210,7 @@ static int meminfo_read_proc(char *page, K(global_page_state(NR_PAGETABLE)), K(global_page_state(NR_UNSTABLE_NFS)), K(global_page_state(NR_BOUNCE)), + K(global_page_state(NR_WRITEBACK_TEMP)), K(allowed), K(committed), (unsigned long)VMALLOC_TOTAL >> 10, Index: linux/include/linux/mmzone.h === --- linux.orig/include/linux/mmzone.h 2008-02-04 12:29:01.0 +0100 +++ linux/include/linux/mmzone.h2008-02-04 13:01:35.0 +0100 @@ -95,6 +95,7 @@ enum zone_stat_item { NR_UNSTABLE_NFS,/* NFS unstable pages */ NR_BOUNCE, NR_VMSCAN_WRITE, + NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */ #ifdef CONFIG_NUMA NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ Index: linux/drivers/base/node.c === --- linux.orig/drivers/base/node.c 2008-02-04 12:28:53.0 +0100 +++ linux/drivers/base/node.c 2008-02-04 13:01:35.0 +0100 @@ -64,6 +64,7 @@ static ssize_t node_read_meminfo(struct "Node %d PageTables: %8lu kB\n" "Node %d 
NFS_Unstable: %8lu kB\n" "Node %d Bounce: %8lu kB\n" + "Node %d WritebackTmp: %8lu kB\n" "Node %d Slab: %8lu kB\n" "Node %d SReclaimable: %8lu kB\n" "Node %d SUnreclaim: %8lu kB\n", @@ -86,6 +87,7 @@ static ssize_t node_read_meminfo(struct nid, K(node_page_state(nid, NR_PAGETABLE)), nid, K(node_page_state(nid, NR_UNSTABLE_NFS)), nid, K(node_page_state(nid, NR_BOUNCE)), + nid, K(node_page_state(nid, NR_WRITEBACK_TEMP)), nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) + node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)), Index: linux/mm/page-writeback.c === --- linux.orig/mm/page-writeback.c 2008-02-04 13:01:23.0 +0100 +++ linux/mm/page-writeback.c 2008-02-04 13:01:35.0 +0100 @@ -211,7 +211,8 @@ clip_bdi_dirty_limit(struct backing_dev_ avail_dirty = dirty - (global_page_state(NR_FILE_DIRTY) + global_page_state(NR_WRITEBACK) + -global_page_state(NR_UNSTABLE_NFS)); +global_page_state(NR_UNSTABLE_NFS) + +global_page_state(NR_WRITEBACK_TEMP)); if (avail_dirty < 0) avail_dirty = 0; -- - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 3/3] fuse: support writable mmap
From: Miklos Szeredi <[EMAIL PROTECTED]> Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. This patch depends on mm-bdi-allow-setting-a-maximum-for-the-bdi-dirty-limit-fix.patch Fuse page writeback design -- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? 
The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi//max_ratio'. 
Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]> --- Index: linux/fs/fuse/dev.c === --- linux.orig/fs/fuse/dev.c2008-02-04 15:24:03.0 +0100 +++ linux/fs/fuse/dev.c 2008-02-04 15:24:47.0 +0100 @@ -47,6 +47,14 @@ struct fuse_req *fuse_request_alloc(void return req; } +struct fuse_req *fuse_request_alloc_nofs(void) +{ + struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_NOFS); + if (req) + fuse_request_init(req); + return req; +} + void fuse_request_free(struct fuse_req *req) { kmem_cache_free(fuse_req_cachep, req); @@ -430,6 +438,17 @@ void request_send_background(struct fuse } /* + * Called under fc->lock + * + * fc->connected must have been checked previously + */ +void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req) +{ + req->isreply = 1; + request_send_nowait_locked(fc, req); +} + +/* * Lock the request. Up to the next unlock_request() there mustn't be * anything that could cause a page-fault. If the request was already * aborted bail out. Index: linux/fs/fuse/dir.c === --- linux.orig/fs/fuse/dir.c2008-02-04 15:24:03.0 +0100 +++ linux/fs/fuse/dir.c 2008-02-04 15:24:47.0 +0100 @@ -1107,6 +1107,50 @@ static void iattr_to_fattr(struct iattr } /* + * Prevent concurrent writepages on inode + * + * This is done by adding a negative bias to the inode write counter + * and waiting for all pending writes to finish. + */ +void fuse_set_nowrite(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); +
[patch 0/3] fuse: writable mmap
This is short series for fuse writable mmap support. The first two patches are small additions to mm infrastructure. The third is a large patch for fuse. It also depends on the "mm: bdi: export BDI attributes in sysfs" series. I don't mind if this goes into 2.6.25 (guess, that depends on whether the bdi things go). Thanks, Miklos -- - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)
Miklos Szeredi wrote: Would you describe the situation that would cause the kernel to go into an infinite loop, please? The patch basically does: do { ... error = inode->i_op->foo() ... } while (error == ESTALE); What is the guarantee, that ->foo() will not always return ESTALE? You skimmed over some stuff, like the pathname lookup component contained in the first set of dots... I can't guarantee that ->foo() won't always return ESTALE. That said, the loop is not unbreakable. At least for NFS, a signal to the process will interrupt the loop because the error returned will change from ESTALE to EINTR. In FUSE interrupts are sent to userspace, and the filesystem decides what to do with them. So it is entirely possible and valid for a filesystem to ignore an interrupt. If an operation was non-blocking (such as one returning an error), then there would in fact be no purpose in checking interrupts. Why do you think that it is valid to ignore pending signals? You seem to be asserting that it okay for processes to hang, uninterruptibly, when accessing files on fuse mounted file systems? Perhaps the right error to return when there is a signal pending is EINTR and not ESTALE or some other error? There has to be some way for the application to detect that its system call was interrupted due to a signal pending. So while sending a signal might reliably work in NFS to break out of the loop, it does not necessarily work for other filesystems, and fuse may not be the only one affected. Have you noticed another one? I would be happy to chat with the developers for that file system to see if this support would negatively impact them. Also up till now, returning ESTALE in a fuse filesystem was a perfectly valid thing to do. This patch changes the behavior of that rather drastically. There might be installed systems that rely on current behavior, and we want to avoid breaking those on a kernel upgrade. Perhaps the explanation for what ESTALE means was not clear? 
If there are fuse file systems which really do support the notion of ESTALE, then it seems to me that they would also benefit from this support, ie. the ability to do some recovery from the situation. A few solutions come to mind, perhaps the best is to introduce a kernel internal errno value (ERETRYSTALE), that forces the relevant system calls to be retried. NFS could transform ESTALE errors to ERETRYSTALE and get the desired behavior, while other filesystems would not be affected. We don't need more error numbers, we've got plenty already. :-) Do you have anything more specific about any real problems? I see lots of "mays" and "coulds", but I don't see anything that I can do to make this support better. Thanx... ps - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 0/3] add perform_write to a_ops
a_ops->perform_write() was left out from Nick Piggin's new a_ops patchset, as it was non-essential, and postponed for later inclusion. This short series reintroduces it, but only adds the fuse implementation and not simple_perform_write(), which I'm not sure would be a significant improvement. This allows larger than 4k buffered writes for fuse, which is one of the most requested features. This goes on top of the "fuse: writable mmap" patches. Thanks, Miklos -- - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 3/3] fuse: implement perform_write
From: Nick Piggin <[EMAIL PROTECTED]> Introduce fuse_perform_write. With fusexmp (a passthrough filesystem), large (1MB) writes into a backing tmpfs filesystem are sped up by almost 4 times (256MB/s vs 71MB/s). [EMAIL PROTECTED]: - split into smaller functions - testing Signed-off-by: Nick Piggin <[EMAIL PROTECTED]> Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]> --- Index: linux/fs/fuse/file.c === --- linux.orig/fs/fuse/file.c 2008-02-04 17:11:18.0 +0100 +++ linux/fs/fuse/file.c2008-02-04 17:11:59.0 +0100 @@ -677,6 +677,148 @@ static int fuse_write_end(struct file *f return res; } +static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, + struct inode *inode, loff_t pos, + size_t count) +{ + size_t res; + unsigned offset; + unsigned i; + + for (i = 0; i < req->num_pages; i++) + fuse_wait_on_page_writeback(inode, req->pages[i]->index); + + res = fuse_send_write(req, file, inode, pos, count, NULL); + + offset = req->page_offset; + count = res; + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + + if (!req->out.h.error && !offset && count >= PAGE_CACHE_SIZE) + SetPageUptodate(page); + + /* Just ignore count underflow on last page */ + count -= PAGE_CACHE_SIZE - offset; + offset = 0; + + unlock_page(page); + page_cache_release(page); + } + + return res; +} + +static ssize_t fuse_fill_write_pages(struct fuse_req *req, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos) +{ + struct fuse_conn *fc = get_fuse_conn(mapping->host); + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + size_t count = 0; + int err; + + req->page_offset = offset; + + do { + size_t tmp; + struct page *page; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset, +iov_iter_count(ii)); + + bytes = min_t(size_t, bytes, fc->max_write - count); + + again: + err = -EFAULT; + if (iov_iter_fault_in_readable(ii, bytes)) + break; + + err = -ENOMEM; + page = __grab_cache_page(mapping, index); + if 
(!page) + break; + + pagefault_disable(); + tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); + pagefault_enable(); + flush_dcache_page(page); + + if (!tmp) { + unlock_page(page); + page_cache_release(page); + bytes = min(bytes, iov_iter_single_seg_count(ii)); + goto again; + } + + err = 0; + req->pages[req->num_pages] = page; + req->num_pages++; + + iov_iter_advance(ii, tmp); + count += tmp; + pos += tmp; + offset += tmp; + if (offset == PAGE_CACHE_SIZE) + offset = 0; + + } while (iov_iter_count(ii) && count < fc->max_write && +req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0); + + return count > 0 ? count : err; +} + +static ssize_t fuse_perform_write(struct file *file, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos) +{ + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + int err = 0; + ssize_t res = 0; + + if (is_bad_inode(inode)) + return -EIO; + + do { + struct fuse_req *req; + ssize_t count; + + req = fuse_get_req(fc); + if (IS_ERR(req)) { + err = PTR_ERR(req); + break; + } + + count = fuse_fill_write_pages(req, mapping, ii, pos); + if (count <= 0) { + err = count; + } else { + size_t num_written; + + num_written = fuse_send_write_pages(req, file, inode, + pos, count); + err = req->out.h.error; + if (!err) { + res += num_written; + pos += num_written; + + /* break out of the loop on short write */ + if (num_written != count)
Re: [RFC] ext3: per-process soft-syncing data=ordered mode
On Sat 02-02-08 00:26:00, Al Boldi wrote: > Chris Mason wrote: > > On Thursday 31 January 2008, Jan Kara wrote: > > > On Thu 31-01-08 11:56:01, Chris Mason wrote: > > > > On Thursday 31 January 2008, Al Boldi wrote: > > > > > The big difference between ordered and writeback is that once the > > > > > slowdown starts, ordered goes into ~100% iowait, whereas writeback > > > > > continues 100% user. > > > > > > > > Does data=ordered write buffers in the order they were dirtied? This > > > > might explain the extreme problems in transactional workloads. > > > > > > Well, it does but we submit them to block layer all at once so > > > elevator should sort the requests for us... > > > > nr_requests is fairly small, so a long stream of random requests should > > still end up being random IO. > > > > Al, could you please compare the write throughput from vmstat for the > > data=ordered vs data=writeback runs? I would guess the data=ordered one > > has a lower overall write throughput. > > That's what I would have guessed, but it's actually going up 4x fold for > mysql from 559mb to 2135mb, while the db-size ends up at 549mb. So you say we write 4-times as much data in ordered mode as in writeback mode. Hmm, probably possible because we force all the dirty data to disk when committing a transation in ordered mode (and don't do this in writeback mode). So if the workload repeatedly dirties the whole DB, we are going to write the whole DB several times in ordered mode but in writeback mode we just keep the data in memory all the time. But this is what you ask for if you mount in ordered mode so I wouldn't consider it a bug. I still don't like your hack with per-process journal mode setting but we could easily do per-file journal mode setting (we already have a flag to do data journaling for a file) and that would help at least your DB workload... 
Honza -- Jan Kara <[EMAIL PROTECTED]> SUSE Labs, CR - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)
> > In FUSE interrupts are sent to userspace, and the filesystem decides > > what to do with them. So it is entirely possible and valid for a > > filesystem to ignore an interrupt. If an operation was non-blocking > > (such as one returning an error), then there would in fact be no > > purpose in checking interrupts. > > > > > > Why do you think that it is valid to ignore pending signals? > You seem to be asserting that it okay for processes to hang, > uninterruptibly, when accessing files on fuse mounted file > systems? > > Perhaps the right error to return when there is a signal > pending is EINTR and not ESTALE or some other error? There > has to be some way for the application to detect that its > system call was interrupted due to a signal pending. Traditionally a lot of filesystem related system calls are not interruptible, and for good reason. For example what happens, if an app receives a signal, while the filesystem is performing a rename() request? It would be very confusing if the call returned EINTR, but the rename would successfully complete regardless. We had a related problem with the open(O_CREAT) call in fuse, which was interruptible between the creation and the actual open because of a design mistake. So it could return EINTR, after the file was created, and this broke a real world application (don't have details at hand, but could dig them out if you are interested). I don't know what NFS does, but returning EINTR without actually canceling an operation in the server is generally not a good idea. > > So while sending a signal might reliably work in NFS to break out of > > the loop, it does not necessarily work for other filesystems, and fuse > > may not be the only one affected. > > > > > > Have you noticed another one? I would be happy to chat with the > developers for that file system to see if this support would > negatively impact them. Oh, I have no idea. And I wouldn't want to do a full audit of all the filesystems to find out. 
But if you do, please go ahead. > > A few solutions come to mind, perhaps the best is to introduce a > > kernel internal errno value (ERETRYSTALE), that forces the relevant > > system calls to be retried. > > > > NFS could transform ESTALE errors to ERETRYSTALE and get the desired > > behavior, while other filesystems would not be affected. > > We don't need more error numbers, we've got plenty already. :-) That's a rather poor excuse against a simple solution which would spare us some backward compatibility problems. > Do you have anything more specific about any real problems? > I see lots of "mays" and "coulds", but I don't see anything > that I can do to make this support better. Implement the above suggestion? Or something else. Otherwise I have to NAK this patch due to the possibility of it breaking existing fuse installations. Thanks, Miklos - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)
> > I don't know what NFS does, but returning EINTR without actually > > canceling an operation in the server is generally not a good idea. > > > > > > This is what NFS has been doing, for several decades, and no one > has complained yet. Is it really? Man nfs says something quite different (emphasis mine): intrIf an NFS file operation has a *major timeout* and it is hard mounted, then allow signals to interupt the file operation and cause it to return EINTR to the calling program. The *default* is to *not* allow file operations to be *interrupted*. > >> Have you noticed another one? I would be happy to chat with the > >> developers for that file system to see if this support would > >> negatively impact them. > >> > > > > Oh, I have no idea. And I wouldn't want to do a full audit of all the > > filesystems to find out. But if you do, please go ahead. > > > > > > Well, you brought it up. I thought that perhaps you had something > other than FUD. It's not FUD, it's being careful not to break an implementation when changing an API in a backward incompatbile way. > Please describe this real and existing fuse installation so that I can > better understand the situation and the real requirements here. I have already done so: "Also up till now, returning ESTALE in a fuse filesystem was a perfectly valid thing to do. This patch changes the behavior of that rather drastically. There might be installed systems that rely on current behavior, and we want to avoid breaking those on a kernel upgrade." Miklos - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)
Miklos Szeredi wrote: In FUSE interrupts are sent to userspace, and the filesystem decides what to do with them. So it is entirely possible and valid for a filesystem to ignore an interrupt. If an operation was non-blocking (such as one returning an error), then there would in fact be no purpose in checking interrupts. Why do you think that it is valid to ignore pending signals? You seem to be asserting that it okay for processes to hang, uninterruptibly, when accessing files on fuse mounted file systems? Perhaps the right error to return when there is a signal pending is EINTR and not ESTALE or some other error? There has to be some way for the application to detect that its system call was interrupted due to a signal pending. Traditionally a lot of filesystem related system calls are not interruptible, and for good reason. For example what happens, if an app receives a signal, while the filesystem is performing a rename() request? It would be very confusing if the call returned EINTR, but the rename would successfully complete regardless. We had a related problem with the open(O_CREAT) call in fuse, which was interruptible between the creation and the actual open because of a design mistake. So it could return EINTR, after the file was created, and this broke a real world application (don't have details at hand, but could dig them out if you are interested). I don't know what NFS does, but returning EINTR without actually canceling an operation in the server is generally not a good idea. This is what NFS has been doing, for several decades, and no one has complained yet. It is just generally accepted. I do agree that it isn't the best of semantics, but it does seem to work and does solve a real problem which exists if you don't allow an operation to be interrupted. The alternative, for NFS clients, was potentially to block an application until a server, which might never come back up, comes back up. It was a serious problem and worse than this resolution. 
Yes, I'd like to hear the details and find out why it was a problem. If you allow the fuse file system to block waiting on things which may never occur, then you are going to have a problem. I would suggest considering this now instead of waiting until it is too late. We can learn from the NFS experience instead of just dismissing it. So while sending a signal might reliably work in NFS to break out of the loop, it does not necessarily work for other filesystems, and fuse may not be the only one affected. Have you noticed another one? I would be happy to chat with the developers for that file system to see if this support would negatively impact them. Oh, I have no idea. And I wouldn't want to do a full audit of all the filesystems to find out. But if you do, please go ahead. Well, you brought it up. I thought that perhaps you had something other than FUD. A few solutions come to mind, perhaps the best is to introduce a kernel internal errno value (ERETRYSTALE), that forces the relevant system calls to be retried. NFS could transform ESTALE errors to ERETRYSTALE and get the desired behavior, while other filesystems would not be affected. We don't need more error numbers, we've got plenty already. :-) That's a rather poor excuse against a simple solution which would spare us some backward compatibility problems. Potential backwards compatibility problems and none are even known or even considered. The solution here isn't to create more hacks and a new error number for this purpose is just a hack. Do you have anything more specific about any real problems? I see lots of "mays" and "coulds", but I don't see anything that I can do to make this support better. Implement the above suggestion? Or something else. Otherwise I have to NAK this patch due to the possibility of it breaking existing fuse installations. Please describe this real and existing fuse installation so that I can better understand the situation and the real requirements here. 
Instead of attempting to block this proposal, what about considering how to architect fuse to handle the situation instead of pretending that fuse won't have the same problem to solve if it isn't solved here? I have a real problem to solve and I need to get it resolved. I have real customers, with real problems, and not just theoretical and vague ones. ps - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 1/3] mm: bdi: export bdi_writeout_inc()
From: Miklos Szeredi <[EMAIL PROTECTED]> Fuse needs this for writable mmap support. Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]> --- Index: linux/include/linux/backing-dev.h === --- linux.orig/include/linux/backing-dev.h 2008-02-04 12:29:01.0 +0100 +++ linux/include/linux/backing-dev.h 2008-02-04 13:01:23.0 +0100 @@ -149,6 +149,8 @@ static inline unsigned long bdi_stat_err int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +extern void bdi_writeout_inc(struct backing_dev_info *bdi); + /* * Flags in backing_dev_info::capability * - The first two flags control whether dirty pages will contribute to the Index: linux/mm/page-writeback.c === --- linux.orig/mm/page-writeback.c 2008-02-04 12:29:01.0 +0100 +++ linux/mm/page-writeback.c 2008-02-04 13:01:23.0 +0100 @@ -168,6 +168,16 @@ static inline void __bdi_writeout_inc(st bdi->max_prop_frac); } +void bdi_writeout_inc(struct backing_dev_info *bdi) +{ + unsigned long flags; + + local_irq_save(flags); + __bdi_writeout_inc(bdi); + local_irq_restore(flags); +} +EXPORT_SYMBOL(bdi_writeout_inc); + static inline void task_dirty_inc(struct task_struct *tsk) { prop_inc_single(&vm_dirties, &tsk->dirties); -- - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 1/3] vfs: introduce perform_write in a_ops
From: Nick Piggin <[EMAIL PROTECTED]> Introduce a new perform_write() address space operation. This is a single-call, bulk version of write_begin/write_end operations. It is only used in the buffered write path (write_begin must still be implemented), and not for in-kernel writes to pagecache. For some filesystems, using this can provide significant speedups. Signed-off-by: Nick Piggin <[EMAIL PROTECTED]> Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]> --- Index: linux/include/linux/fs.h === --- linux.orig/include/linux/fs.h 2008-02-04 15:24:03.0 +0100 +++ linux/include/linux/fs.h2008-02-04 16:24:19.0 +0100 @@ -469,6 +469,9 @@ struct address_space_operations { loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); + ssize_t (*perform_write)(struct file *, struct address_space *mapping, + struct iov_iter *i, loff_t pos); + /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); void (*invalidatepage) (struct page *, unsigned long); Index: linux/mm/filemap.c === --- linux.orig/mm/filemap.c 2008-02-04 15:24:03.0 +0100 +++ linux/mm/filemap.c 2008-02-04 16:22:55.0 +0100 @@ -2312,7 +2312,9 @@ generic_file_buffered_write(struct kiocb struct iov_iter i; iov_iter_init(&i, iov, nr_segs, count, written); - if (a_ops->write_begin) + if (a_ops->perform_write) + status = a_ops->perform_write(file, mapping, &i, pos); + else if (a_ops->write_begin) status = generic_perform_write(file, &i, pos); else status = generic_perform_write_2copy(file, &i, pos); Index: linux/Documentation/filesystems/vfs.txt === --- linux.orig/Documentation/filesystems/vfs.txt2008-02-04 12:28:50.0 +0100 +++ linux/Documentation/filesystems/vfs.txt 2008-02-04 16:23:44.0 +0100 @@ -533,6 +533,9 @@ struct address_space_operations { int (*write_end)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); + ssize_t (*perform_write)(struct file *, struct 
address_space *mapping, + struct iov_iter *i, loff_t pos); + sector_t (*bmap)(struct address_space *, sector_t); int (*invalidatepage) (struct page *, unsigned long); int (*releasepage) (struct page *, int); @@ -664,6 +667,17 @@ struct address_space_operations { Returns < 0 on failure, otherwise the number of bytes (<= 'copied') that were able to be copied into pagecache. + perform_write: This is a single-call, bulk version of write_begin/write_end +operations. It is only used in the buffered write path (write_begin +must still be implemented), and not for in-kernel writes to pagecache. +It takes an iov_iter structure, which provides a descriptor for the +source data (and has associated iov_iter_xxx helpers to operate on +that data). There are also file, mapping, and pos arguments, which +specify the destination of the data. + +Returns < 0 on failure if nothing was written out, otherwise returns +the number of bytes copied into pagecache. + bmap: called by the VFS to map a logical block offset within object to physical block number. This method is used by the FIBMAP ioctl and for working with swap-files. To be able to swap to -- - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH] vfs: optimization to /proc//mountinfo patch
> 1) reports deleted inode in dentry_path() consistent with that in __d_path() > 2) modified __d_path() to use prepend(), reducing the size of __d_path() > 3) moved all the functionality that reports mount information in /proc under > CONFIG_PROC_FS. > > Could not verify if the code would work with CONFIG_PROC_FS=n, since it was > impossible to disable CONFIG_PROC_FS. Looking for ideas on how to disable > CONFIG_PROC_FS. > > > > Signed-off-by: Ram Pai <[EMAIL PROTECTED]> > --- > fs/dcache.c | 59 > +++ > fs/namespace.c |2 + > fs/seq_file.c|2 + > include/linux/dcache.h |3 ++ > include/linux/seq_file.h |3 ++ > 5 files changed, 34 insertions(+), 35 deletions(-) > > Index: linux-2.6.23/fs/dcache.c > === > --- linux-2.6.23.orig/fs/dcache.c > +++ linux-2.6.23/fs/dcache.c > @@ -1747,6 +1747,17 @@ shouldnt_be_hashed: > goto shouldnt_be_hashed; > } > > +static int prepend(char **buffer, int *buflen, const char *str, > + int namelen) > +{ > + *buflen -= namelen; > + if (*buflen < 0) > + return 1; This is confusing. Should return -ENAMETOOLONG instead (see Chapter 16 in Documentation/CodingStyle). > + *buffer -= namelen; > + memcpy(*buffer, str, namelen); > + return 0; > +} > + > /** > * d_path - return the path of a dentry > * @dentry: dentry to report > @@ -1768,17 +1779,11 @@ static char *__d_path(struct dentry *den > { > char * end = buffer+buflen; > char * retval; > - int namelen; > > - *--end = '\0'; > - buflen--; > - if (!IS_ROOT(dentry) && d_unhashed(dentry)) { > - buflen -= 10; > - end -= 10; > - if (buflen < 0) > + prepend(&end, &buflen, "\0", 1); > + if (!IS_ROOT(dentry) && d_unhashed(dentry) && > + prepend(&end, &buflen, " (deleted)", 10)) And this should test for "prepend() != 0" or "prepend() < 0" instead, otherwise it could easily be misread as "if prepend() succeeded, then...". And similarly for all the later calls. 
Thanks, Miklos - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 0/3] add perform_write to a_ops
On Mon, Feb 04, 2008 at 06:04:10PM +0100, Miklos Szeredi wrote: > a_ops->perform_write() was left out from Nick Piggin's new a_ops > patchset, as it was non-essential, and postponed for later inclusion. > > This short series reintroduces it, but only adds the fuse > implementation and not simple_perform_write(), which I'm not sure > would be a significant improvement. > > This allows larger than 4k buffered writes for fuse, which is one of > the most requested features. > > This goes on top of the "fuse: writable mmap" patches. Please don't do this, but rather implement your own .aio_write. There's very little in generic_file_aio_write that wouldn't be handle by ->perform_write and we should rather factor those up or move to higher layers than adding this ill-defined abstraction. - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 2/3] fuse: clean up setting i_size in write
From: Miklos Szeredi <[EMAIL PROTECTED]> Extract common code for setting i_size in write functions into a common helper. Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]> --- Index: linux/fs/fuse/file.c === --- linux.orig/fs/fuse/file.c 2008-02-04 13:01:39.0 +0100 +++ linux/fs/fuse/file.c2008-02-04 13:02:03.0 +0100 @@ -610,13 +610,24 @@ static int fuse_write_begin(struct file return 0; } +static void fuse_write_update_size(struct inode *inode, loff_t pos) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + if (pos > inode->i_size) + i_size_write(inode, pos); + spin_unlock(&fc->lock); +} + static int fuse_buffered_write(struct file *file, struct inode *inode, loff_t pos, unsigned count, struct page *page) { int err; size_t nres; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); unsigned offset = pos & (PAGE_CACHE_SIZE - 1); struct fuse_req *req; @@ -643,12 +654,7 @@ static int fuse_buffered_write(struct fi err = -EIO; if (!err) { pos += nres; - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; - if (pos > inode->i_size) - i_size_write(inode, pos); - spin_unlock(&fc->lock); - + fuse_write_update_size(inode, pos); if (count == PAGE_CACHE_SIZE) SetPageUptodate(page); } @@ -766,12 +772,8 @@ static ssize_t fuse_direct_io(struct fil } fuse_put_request(fc, req); if (res > 0) { - if (write) { - spin_lock(&fc->lock); - if (pos > inode->i_size) - i_size_write(inode, pos); - spin_unlock(&fc->lock); - } + if (write) + fuse_write_update_size(inode, pos); *ppos = pos; } fuse_invalidate_attr(inode); -- - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 0/3] add perform_write to a_ops
> > a_ops->perform_write() was left out from Nick Piggin's new a_ops > > patchset, as it was non-essential, and postponed for later inclusion. > > > > This short series reintroduces it, but only adds the fuse > > implementation and not simple_perform_write(), which I'm not sure > > would be a significant improvement. > > > > This allows larger than 4k buffered writes for fuse, which is one of > > the most requested features. > > > > This goes on top of the "fuse: writable mmap" patches. > > Please don't do this, but rather implement your own .aio_write. There's > very little in generic_file_aio_write that wouldn't be handle by > ->perform_write and we should rather factor those up or move to higher > layers than adding this ill-defined abstraction. > Moving up to higher layers might not be possible, due to lock/unlock of i_mutex being inside generic_file_aio_write(). But with fuse being the only user, it's not a huge issue duplicating some code. Nick, were there any other candidates, that would want to use such an interface in the future? Thanks, Miklos - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 0/3] add perform_write to a_ops
On Mon, Feb 04, 2008 at 09:52:06PM +0100, Miklos Szeredi wrote: > Moving up to higher layers might not be possible, due to lock/unlock > of i_mutex being inside generic_file_aio_write(). Well some bits can be moved up. Here's my grand plan which I plan to implement once I get some time for it (or let someone else do if they beat me): - generic_segment_checks goes to fs/read_write.c before calling into the filesystem - ditto for vfs_check_frozen - generic_write_checks is a suitable helper already - ditto for remove_suid - ditto for file_update_time - after that there's not a whole lot left in generic_file_aio_write, except for direct I/O handling which will probably be very fs-specific if you have your own buffered I/O code generic_file_buffered_write is an almost trivial wrapper around what's ->perform_write in Nick's earlier patches and a helper for the syncing activity. > > But with fuse being the only user, it's not a huge issue duplicating > some code. > > Nick, were there any other candidates, that would want to use such an > interface in the future? > > Thanks, > Miklos > - > To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in > the body of a message to [EMAIL PROTECTED] > More majordomo info at http://vger.kernel.org/majordomo-info.html ---end quoted text--- - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [2.6.24 REGRESSION] BUG: Soft lockup - with VFS
On Mon, 28 Jan 2008 09:31:43 +0100 "Oliver Pinter (Pintér Olivér)" <[EMAIL PROTECTED]> wrote: > hi all! > > in the 2.6.24 become i some soft lockups with usb-phone, when i pluged > in the mobile, then the vfs-layer crashed. am afternoon can i the > .config send, and i bisected the kernel, when i have time. > > pictures from crash: > http://students.zipernowsky.hu/~oliverp/kernel/regression_2624/ It looks like selinux's file_has_perm() is doing spin_lock() on an uninitialised (or already locked) spinlock. - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] ext3: per-process soft-syncing data=ordered mode
Jan Kara wrote: > On Sat 02-02-08 00:26:00, Al Boldi wrote: > > Chris Mason wrote: > > > Al, could you please compare the write throughput from vmstat for the > > > data=ordered vs data=writeback runs? I would guess the data=ordered > > > one has a lower overall write throughput. > > > > That's what I would have guessed, but it's actually going up 4x fold for > > mysql from 559mb to 2135mb, while the db-size ends up at 549mb. > > So you say we write 4-times as much data in ordered mode as in writeback > mode. Hmm, probably possible because we force all the dirty data to disk > when committing a transation in ordered mode (and don't do this in > writeback mode). So if the workload repeatedly dirties the whole DB, we > are going to write the whole DB several times in ordered mode but in > writeback mode we just keep the data in memory all the time. But this is > what you ask for if you mount in ordered mode so I wouldn't consider it a > bug. Ok, maybe not a bug, but a bit inefficient. Check out this workload: sync; while :; do dd < /dev/full > /mnt/sda2/x.dmp bs=1M count=20 rm -f /mnt/sda2/x.dmp usleep 1 done vmstat 1 ( with mount /dev/sda2 /mnt/sda2 -o data=writeback) << note io-bo >> procs ---memory-- ---swap-- -io --system-- cpu r b swpd free buff cache si sobibo incs us sy id wa 2 0 0 293008 5232 5743600 0 0 18 206 4 80 16 0 1 0 0 282840 5232 6762000 0 0 18 238 3 81 16 0 1 0 0 297032 5244 5336400 0 152 21 211 4 79 17 0 1 0 0 285236 5244 6522400 0 0 18 232 4 80 16 0 1 0 0 299464 5244 5088000 0 0 18 222 4 80 16 0 1 0 0 290156 5244 6017600 0 0 18 236 3 80 17 0 0 0 0 302124 5256 4778800 0 152 21 213 4 80 16 0 1 0 0 292180 5256 5824800 0 0 18 239 3 81 16 0 1 0 0 287452 5256 6244400 0 0 18 202 3 80 17 0 1 0 0 293016 5256 5739200 0 0 18 250 4 80 16 0 0 0 0 302052 5256 4778800 0 0 19 194 3 81 16 0 1 0 0 297536 5268 5292800 0 152 20 233 4 79 17 0 1 0 0 286468 5268 6387200 0 0 18 212 3 81 16 0 1 0 0 301572 5268 4881200 0 0 18 267 4 79 17 0 1 0 0 292636 5268 5777600 0 0 18 
208 4 80 16 0 1 0 0 302124 5280 4778800 0 152 21 237 4 80 16 0 1 0 0 291436 5280 5897600 0 0 18 205 3 81 16 0 1 0 0 302068 5280 4778800 0 0 18 234 3 81 16 0 1 0 0 293008 5280 5738800 0 0 18 221 4 79 17 0 1 0 0 297288 5292 5253200 0 156 22 233 2 81 16 1 1 0 0 294676 5292 5572400 0 0 19 199 3 81 16 0 vmstat 1 (with mount /dev/sda2 /mnt/sda2 -o data=ordered) procs ---memory-- ---swap-- -io --system-- cpu r b swpd free buff cache si sobibo incs us sy id wa 2 0 0 291052 5156 5901600 0 0 19 223 3 82 15 0 1 0 0 291408 5156 5870400 0 0 18 218 3 81 16 0 1 0 0 291888 5156 5827600 020 23 229 3 80 17 0 1 0 0 300764 5168 4947200 0 12864 91 235 3 69 13 15 1 0 0 300740 5168 4945600 0 0 19 215 3 80 17 0 1 0 0 301088 5168 4904400 0 0 18 241 4 80 16 0 1 0 0 298220 5168 5187200 0 0 18 225 3 81 16 0 0 1 0 289168 5168 6075200 0 12712 45 237 3 77 15 5 1 0 0 300260 5180 4985200 0 152 68 211 4 72 15 9 1 0 0 298616 5180 5146000 0 0 18 237 3 81 16 0 1 0 0 296988 5180 5309200 0 0 18 223 3 81 16 0 1 0 0 296608 5180 5348000 0 0 18 223 3 81 16 0 0 0 0 301640 5192 4803600 0 12868 93 206 4 67 13 16 0 0 0 301624 5192 4803600 0 0 21 218 3 81 16 0 0 0 0 301600 5192 4803600 0 0 18 212 3 81 16 0 0 0 0 301584 5192 4803600 0 0 18 209 4 80 16 0 0 0 0 301568 5192 4803600 0 0 18 208 3 81 16 0 1 0 0 285520 5204 6454800 0 12864 95 216 3 69 13 15 2 0 0 285124 5204 6492400 0 0 18 222 4 80 16 0 1 0 0 283612 5204 6639200 0 0 18 231 3 81 16 0 1 0 0 284216 5204 6573600 0 0 18 218 4 80 16 0 0 1 0 289160 5204 6075200 0 12712 56 213 3 74 15 8 1