[RFC PATCH] vfs: optimization to /proc/<pid>/mountinfo patch
1) reports deleted inode in dentry_path() consistent with that in __d_path() 2) modified __d_path() to use prepend(), reducing the size of __d_path() 3) moved all the functionality that reports mount information in /proc under CONFIG_PROC_FS. Could not verify if the code would work with CONFIG_PROC_FS=n, since it was impossible to disable CONFIG_PROC_FS. Looking for ideas on how to disable CONFIG_PROC_FS. Signed-off-by: Ram Pai <[EMAIL PROTECTED]> --- fs/dcache.c | 59 +++ fs/namespace.c |2 + fs/seq_file.c|2 + include/linux/dcache.h |3 ++ include/linux/seq_file.h |3 ++ 5 files changed, 34 insertions(+), 35 deletions(-) Index: linux-2.6.23/fs/dcache.c === --- linux-2.6.23.orig/fs/dcache.c +++ linux-2.6.23/fs/dcache.c @@ -1747,6 +1747,17 @@ shouldnt_be_hashed: goto shouldnt_be_hashed; } +static int prepend(char **buffer, int *buflen, const char *str, + int namelen) +{ + *buflen -= namelen; + if (*buflen < 0) + return 1; + *buffer -= namelen; + memcpy(*buffer, str, namelen); + return 0; +} + /** * d_path - return the path of a dentry * @dentry: dentry to report @@ -1768,17 +1779,11 @@ static char *__d_path(struct dentry *den { char * end = buffer+buflen; char * retval; - int namelen; - *--end = '\0'; - buflen--; - if (!IS_ROOT(dentry) && d_unhashed(dentry)) { - buflen -= 10; - end -= 10; - if (buflen < 0) + prepend(&end, &buflen, "\0", 1); + if (!IS_ROOT(dentry) && d_unhashed(dentry) && + prepend(&end, &buflen, " (deleted)", 10)) goto Elong; - memcpy(end, " (deleted)", 10); - } if (buflen < 1) goto Elong; @@ -1805,13 +1810,10 @@ static char *__d_path(struct dentry *den } parent = dentry->d_parent; prefetch(parent); - namelen = dentry->d_name.len; - buflen -= namelen + 1; - if (buflen < 0) + if (prepend(&end, &buflen, dentry->d_name.name, + dentry->d_name.len) || + prepend(&end, &buflen, "/", 1)) goto Elong; - end -= namelen; - memcpy(end, dentry->d_name.name, namelen); - *--end = '/'; retval = end; dentry = parent; } @@ -1819,12 +1821,9 @@ static char *__d_path(struct 
dentry *den return retval; global_root: - namelen = dentry->d_name.len; - buflen -= namelen; - if (buflen < 0) - goto Elong; - retval -= namelen-1;/* hit the slash */ - memcpy(retval, dentry->d_name.name, namelen); + retval += 1;/* hit the slash */ + if (prepend(&retval, &buflen, dentry->d_name.name, dentry->d_name.len)) + goto Elong; return retval; Elong: return ERR_PTR(-ENAMETOOLONG); @@ -1890,17 +1889,8 @@ char *dynamic_dname(struct dentry *dentr return memcpy(buffer, temp, sz); } -static int prepend(char **buffer, int *buflen, const char *str, - int namelen) -{ - *buflen -= namelen; - if (*buflen < 0) - return 1; - *buffer -= namelen; - memcpy(*buffer, str, namelen); - return 0; -} +#ifdef CONFIG_PROC_FS /* * Write full pathname from the root of the filesystem into the buffer. */ @@ -1910,11 +1900,9 @@ char *dentry_path(struct dentry *dentry, char *retval; spin_lock(&dcache_lock); - prepend(&end, &buflen, "\0", 1); - if (!IS_ROOT(dentry) && d_unhashed(dentry)) { - if (prepend(&end, &buflen, "//deleted", 9)) + if (!IS_ROOT(dentry) && d_unhashed(dentry) && + prepend(&end, &buflen, " (deleted)", 10)) goto Elong; - } if (buflen < 1) goto Elong; /* Get '/' right */ @@ -1943,6 +1931,7 @@ Elong: spin_unlock(&dcache_lock); return ERR_PTR(-ENAMETOOLONG); } +#endif /* CONFIG_PROC_FS */ /* * NOTE! The user-level library version returns a Index: linux-2.6.23/fs/namespace.c === --- linux-2.6.23.orig/fs/namespace.c +++ linux-2.6.23/fs/namespace.c @@ -609,6 +609,7 @@ void mnt_unpin(struct vfsmount *mnt) EXPORT_SYMBOL(mnt_unpin); +#ifdef CONFIG_PROC_FS /* iterator */ static void *m_start(struct seq_file *m, loff_t *pos) { @@ -795,6 +796,7 @@ const struct seq_operations mountstats_o .stop = m_stop, .show = show_vfsstat, }; +#endif /* CONFIG_PROC_FS */ /** * may_umount_tree - check if a mount tree is busy Index: linux-2.6.
Re: [RFC PATCH] vfs: optimization to /proc/<pid>/mountinfo patch
On Mon, 04 Feb 2008 01:15:05 -0800 Ram Pai <[EMAIL PROTECTED]> wrote: > 1) reports deleted inode in dentry_path() consistent with that in __d_path() > 2) modified __d_path() to use prepend(), reducing the size of __d_path() > 3) moved all the functionality that reports mount information in /proc under > CONFIG_PROC_FS. > > Could not verify if the code would work with CONFIG_PROC_FS=n, since it was > impossible to disable CONFIG_PROC_FS. Looking for ideas on how to disable > CONFIG_PROC_FS. > Do `make menuconfig', then hit '/' and search for "proc_fs". It'll tell you that you need to set EMBEDDED=y to disable procfs. > fs/dcache.c | 59 > +++ > fs/namespace.c |2 + > fs/seq_file.c|2 + > include/linux/dcache.h |3 ++ > include/linux/seq_file.h |3 ++ Please resend after testing that, thanks. - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RESEND] [PATCH] ext3,4:fdatasync should skip metadata writeout when overwriting
Hi. Currently fdatasync is identical to fsync in ext3,4. I think fdatasync should skip journal flush in data=ordered and data=writeback mode when it overwrites to already-instantiated blocks on HDD. When I_DIRTY_DATASYNC flag is not set, fdatasync should skip journal writeout because this indicates only atime or/and mtime updates. Following patch is the same approach of ext2's fsync code(ext2_sync_file). I did a performance test using the sysbench. #sysbench --num-threads=128 --max-requests=5 --test=fileio --file-total-size=128G --file-test-mode=rndwr --file-fsync-mode=fdatasync run The result was: -2.6.24 Operations performed: 0 Read, 50080 Write, 59600 Other = 109680 Total Read 0b Written 782.5Mb Total transferred 782.5Mb (12.116Mb/sec) 775.45 Requests/sec executed Test execution summary: total time: 64.5814s total number of events: 50080 total time taken by event execution: 3713.9836 per-request statistics: min:0.s avg:0.0742s max:0.9375s approx. 95 percentile: 0.2901s Threads fairness: events (avg/stddev): 391.2500/23.26 execution time (avg/stddev): 29.0155/1.99 -2.6.24-patched Operations performed: 0 Read, 50009 Write, 61596 Other = 111605 Total Read 0b Written 781.39Mb Total transferred 781.39Mb (16.419Mb/sec) 1050.83 Requests/sec executed Test execution summary: total time: 47.5900s total number of events: 50009 total time taken by event execution: 2934.5768 per-request statistics: min:0.s avg:0.0587s max:0.8938s approx. 95 percentile: 0.1993s Threads fairness: events (avg/stddev): 390.6953/22.64 execution time (avg/stddev): 22.9264/1.17 Filesystem I/O throughput was improved. Thanks. 
Signed-off-by :Hisashi Hifumi <[EMAIL PROTECTED]> diff -Nrup linux-2.6.24.org/fs/ext3/fsync.c linux-2.6.24/fs/ext3/fsync.c --- linux-2.6.24.org/fs/ext3/fsync.c2008-01-25 07:58:37.0 +0900 +++ linux-2.6.24/fs/ext3/fsync.c2008-02-04 12:42:42.0 +0900 @@ -72,6 +72,9 @@ int ext3_sync_file(struct file * file, s goto out; } + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + goto out; + /* * The VFS has written the file data. If the inode is unaltered * then we need not start a commit. diff -Nrup linux-2.6.24.org/fs/ext4/fsync.c linux-2.6.24/fs/ext4/fsync.c --- linux-2.6.24.org/fs/ext4/fsync.c2008-01-25 07:58:37.0 +0900 +++ linux-2.6.24/fs/ext4/fsync.c2008-02-04 12:43:37.0 +0900 @@ -72,6 +72,9 @@ int ext4_sync_file(struct file * file, s goto out; } +if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) +goto out; + /* * The VFS has written the file data. If the inode is unaltered * then we need not start a commit. - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 2/3] mm: Add NR_WRITEBACK_TEMP counter
From: Miklos Szeredi <[EMAIL PROTECTED]> Fuse will use temporary buffers to write back dirty data from memory mappings (normal writes are done synchronously). This is needed, because there cannot be any guarantee about the time in which a write will complete. By using temporary buffers, from the MM's point if view the page is written back immediately. If the writeout was due to memory pressure, this effectively migrates data from a full zone to a less full zone. This patch adds a new counter (NR_WRITEBACK_TEMP) for the number of pages used as temporary buffers. Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]> --- Index: linux/fs/proc/proc_misc.c === --- linux.orig/fs/proc/proc_misc.c 2008-02-04 12:29:00.0 +0100 +++ linux/fs/proc/proc_misc.c 2008-02-04 13:01:35.0 +0100 @@ -178,6 +178,7 @@ static int meminfo_read_proc(char *page, "PageTables: %8lu kB\n" "NFS_Unstable: %8lu kB\n" "Bounce: %8lu kB\n" + "WritebackTmp: %8lu kB\n" "CommitLimit: %8lu kB\n" "Committed_AS: %8lu kB\n" "VmallocTotal: %8lu kB\n" @@ -209,6 +210,7 @@ static int meminfo_read_proc(char *page, K(global_page_state(NR_PAGETABLE)), K(global_page_state(NR_UNSTABLE_NFS)), K(global_page_state(NR_BOUNCE)), + K(global_page_state(NR_WRITEBACK_TEMP)), K(allowed), K(committed), (unsigned long)VMALLOC_TOTAL >> 10, Index: linux/include/linux/mmzone.h === --- linux.orig/include/linux/mmzone.h 2008-02-04 12:29:01.0 +0100 +++ linux/include/linux/mmzone.h2008-02-04 13:01:35.0 +0100 @@ -95,6 +95,7 @@ enum zone_stat_item { NR_UNSTABLE_NFS,/* NFS unstable pages */ NR_BOUNCE, NR_VMSCAN_WRITE, + NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */ #ifdef CONFIG_NUMA NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ Index: linux/drivers/base/node.c === --- linux.orig/drivers/base/node.c 2008-02-04 12:28:53.0 +0100 +++ linux/drivers/base/node.c 2008-02-04 13:01:35.0 +0100 @@ -64,6 +64,7 @@ static ssize_t node_read_meminfo(struct "Node %d PageTables: %8lu kB\n" "Node %d 
NFS_Unstable: %8lu kB\n" "Node %d Bounce: %8lu kB\n" + "Node %d WritebackTmp: %8lu kB\n" "Node %d Slab: %8lu kB\n" "Node %d SReclaimable: %8lu kB\n" "Node %d SUnreclaim: %8lu kB\n", @@ -86,6 +87,7 @@ static ssize_t node_read_meminfo(struct nid, K(node_page_state(nid, NR_PAGETABLE)), nid, K(node_page_state(nid, NR_UNSTABLE_NFS)), nid, K(node_page_state(nid, NR_BOUNCE)), + nid, K(node_page_state(nid, NR_WRITEBACK_TEMP)), nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) + node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)), Index: linux/mm/page-writeback.c === --- linux.orig/mm/page-writeback.c 2008-02-04 13:01:23.0 +0100 +++ linux/mm/page-writeback.c 2008-02-04 13:01:35.0 +0100 @@ -211,7 +211,8 @@ clip_bdi_dirty_limit(struct backing_dev_ avail_dirty = dirty - (global_page_state(NR_FILE_DIRTY) + global_page_state(NR_WRITEBACK) + -global_page_state(NR_UNSTABLE_NFS)); +global_page_state(NR_UNSTABLE_NFS) + +global_page_state(NR_WRITEBACK_TEMP)); if (avail_dirty < 0) avail_dirty = 0; -- - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 3/3] fuse: support writable mmap
From: Miklos Szeredi <[EMAIL PROTECTED]> Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. This patch depends on mm-bdi-allow-setting-a-maximum-for-the-bdi-dirty-limit-fix.patch Fuse page writeback design -- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? 
The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi//max_ratio'. 
Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]> --- Index: linux/fs/fuse/dev.c === --- linux.orig/fs/fuse/dev.c2008-02-04 15:24:03.0 +0100 +++ linux/fs/fuse/dev.c 2008-02-04 15:24:47.0 +0100 @@ -47,6 +47,14 @@ struct fuse_req *fuse_request_alloc(void return req; } +struct fuse_req *fuse_request_alloc_nofs(void) +{ + struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_NOFS); + if (req) + fuse_request_init(req); + return req; +} + void fuse_request_free(struct fuse_req *req) { kmem_cache_free(fuse_req_cachep, req); @@ -430,6 +438,17 @@ void request_send_background(struct fuse } /* + * Called under fc->lock + * + * fc->connected must have been checked previously + */ +void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req) +{ + req->isreply = 1; + request_send_nowait_locked(fc, req); +} + +/* * Lock the request. Up to the next unlock_request() there mustn't be * anything that could cause a page-fault. If the request was already * aborted bail out. Index: linux/fs/fuse/dir.c === --- linux.orig/fs/fuse/dir.c2008-02-04 15:24:03.0 +0100 +++ linux/fs/fuse/dir.c 2008-02-04 15:24:47.0 +0100 @@ -1107,6 +1107,50 @@ static void iattr_to_fattr(struct iattr } /* + * Prevent concurrent writepages on inode + * + * This is done by adding a negative bias to the inode write counter + * and waiting for all pending writes to finish. + */ +void fuse_set_nowrite(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); +
[patch 0/3] fuse: writable mmap
This is short series for fuse writable mmap support. The first two patches are small additions to mm infrastructure. The third is a large patch for fuse. It also depends on the "mm: bdi: export BDI attributes in sysfs" series. I don't mind if this goes into 2.6.25 (guess, that depends on whether the bdi things go). Thanks, Miklos -- - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)
Miklos Szeredi wrote: Would you describe the situation that would cause the kernel to go into an infinite loop, please? The patch basically does: do { ... error = inode->i_op->foo() ... } while (error == ESTALE); What is the guarantee, that ->foo() will not always return ESTALE? You skimmed over some stuff, like the pathname lookup component contained in the first set of dots... I can't guarantee that ->foo() won't always return ESTALE. That said, the loop is not unbreakable. At least for NFS, a signal to the process will interrupt the loop because the error returned will change from ESTALE to EINTR. In FUSE interrupts are sent to userspace, and the filesystem decides what to do with them. So it is entirely possible and valid for a filesystem to ignore an interrupt. If an operation was non-blocking (such as one returning an error), then there would in fact be no purpose in checking interrupts. Why do you think that it is valid to ignore pending signals? You seem to be asserting that it okay for processes to hang, uninterruptibly, when accessing files on fuse mounted file systems? Perhaps the right error to return when there is a signal pending is EINTR and not ESTALE or some other error? There has to be some way for the application to detect that its system call was interrupted due to a signal pending. So while sending a signal might reliably work in NFS to break out of the loop, it does not necessarily work for other filesystems, and fuse may not be the only one affected. Have you noticed another one? I would be happy to chat with the developers for that file system to see if this support would negatively impact them. Also up till now, returning ESTALE in a fuse filesystem was a perfectly valid thing to do. This patch changes the behavior of that rather drastically. There might be installed systems that rely on current behavior, and we want to avoid breaking those on a kernel upgrade. Perhaps the explanation for what ESTALE means was not clear? 
If there are fuse file systems which really do support the notion of ESTALE, then it seems to me that they would also benefit from this support, ie. the ability to do some recovery from the situation. A few solutions come to mind, perhaps the best is to introduce a kernel internal errno value (ERETRYSTALE), that forces the relevant system calls to be retried. NFS could transform ESTALE errors to ERETRYSTALE and get the desired behavior, while other filesystems would not be affected. We don't need more error numbers, we've got plenty already. :-) Do you have anything more specific about any real problems? I see lots of "mays" and "coulds", but I don't see anything that I can do to make this support better. Thanx... ps - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 0/3] add perform_write to a_ops
a_ops->perform_write() was left out from Nick Piggin's new a_ops patchset, as it was non-essential, and postponed for later inclusion. This short series reintroduces it, but only adds the fuse implementation and not simple_perform_write(), which I'm not sure would be a significant improvement. This allows larger than 4k buffered writes for fuse, which is one of the most requested features. This goes on top of the "fuse: writable mmap" patches. Thanks, Miklos -- - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 3/3] fuse: implement perform_write
From: Nick Piggin <[EMAIL PROTECTED]> Introduce fuse_perform_write. With fusexmp (a passthrough filesystem), large (1MB) writes into a backing tmpfs filesystem are sped up by almost 4 times (256MB/s vs 71MB/s). [EMAIL PROTECTED]: - split into smaller functions - testing Signed-off-by: Nick Piggin <[EMAIL PROTECTED]> Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]> --- Index: linux/fs/fuse/file.c === --- linux.orig/fs/fuse/file.c 2008-02-04 17:11:18.0 +0100 +++ linux/fs/fuse/file.c2008-02-04 17:11:59.0 +0100 @@ -677,6 +677,148 @@ static int fuse_write_end(struct file *f return res; } +static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, + struct inode *inode, loff_t pos, + size_t count) +{ + size_t res; + unsigned offset; + unsigned i; + + for (i = 0; i < req->num_pages; i++) + fuse_wait_on_page_writeback(inode, req->pages[i]->index); + + res = fuse_send_write(req, file, inode, pos, count, NULL); + + offset = req->page_offset; + count = res; + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + + if (!req->out.h.error && !offset && count >= PAGE_CACHE_SIZE) + SetPageUptodate(page); + + /* Just ignore count underflow on last page */ + count -= PAGE_CACHE_SIZE - offset; + offset = 0; + + unlock_page(page); + page_cache_release(page); + } + + return res; +} + +static ssize_t fuse_fill_write_pages(struct fuse_req *req, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos) +{ + struct fuse_conn *fc = get_fuse_conn(mapping->host); + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + size_t count = 0; + int err; + + req->page_offset = offset; + + do { + size_t tmp; + struct page *page; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset, +iov_iter_count(ii)); + + bytes = min_t(size_t, bytes, fc->max_write - count); + + again: + err = -EFAULT; + if (iov_iter_fault_in_readable(ii, bytes)) + break; + + err = -ENOMEM; + page = __grab_cache_page(mapping, index); + if 
(!page) + break; + + pagefault_disable(); + tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); + pagefault_enable(); + flush_dcache_page(page); + + if (!tmp) { + unlock_page(page); + page_cache_release(page); + bytes = min(bytes, iov_iter_single_seg_count(ii)); + goto again; + } + + err = 0; + req->pages[req->num_pages] = page; + req->num_pages++; + + iov_iter_advance(ii, tmp); + count += tmp; + pos += tmp; + offset += tmp; + if (offset == PAGE_CACHE_SIZE) + offset = 0; + + } while (iov_iter_count(ii) && count < fc->max_write && +req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0); + + return count > 0 ? count : err; +} + +static ssize_t fuse_perform_write(struct file *file, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos) +{ + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + int err = 0; + ssize_t res = 0; + + if (is_bad_inode(inode)) + return -EIO; + + do { + struct fuse_req *req; + ssize_t count; + + req = fuse_get_req(fc); + if (IS_ERR(req)) { + err = PTR_ERR(req); + break; + } + + count = fuse_fill_write_pages(req, mapping, ii, pos); + if (count <= 0) { + err = count; + } else { + size_t num_written; + + num_written = fuse_send_write_pages(req, file, inode, + pos, count); + err = req->out.h.error; + if (!err) { + res += num_written; + pos += num_written; + + /* break out of the loop on short write */ + if (num_written != count)
Re: [RFC] ext3: per-process soft-syncing data=ordered mode
On Sat 02-02-08 00:26:00, Al Boldi wrote: > Chris Mason wrote: > > On Thursday 31 January 2008, Jan Kara wrote: > > > On Thu 31-01-08 11:56:01, Chris Mason wrote: > > > > On Thursday 31 January 2008, Al Boldi wrote: > > > > > The big difference between ordered and writeback is that once the > > > > > slowdown starts, ordered goes into ~100% iowait, whereas writeback > > > > > continues 100% user. > > > > > > > > Does data=ordered write buffers in the order they were dirtied? This > > > > might explain the extreme problems in transactional workloads. > > > > > > Well, it does but we submit them to block layer all at once so > > > elevator should sort the requests for us... > > > > nr_requests is fairly small, so a long stream of random requests should > > still end up being random IO. > > > > Al, could you please compare the write throughput from vmstat for the > > data=ordered vs data=writeback runs? I would guess the data=ordered one > > has a lower overall write throughput. > > That's what I would have guessed, but it's actually going up 4x fold for > mysql from 559mb to 2135mb, while the db-size ends up at 549mb. So you say we write 4-times as much data in ordered mode as in writeback mode. Hmm, probably possible because we force all the dirty data to disk when committing a transation in ordered mode (and don't do this in writeback mode). So if the workload repeatedly dirties the whole DB, we are going to write the whole DB several times in ordered mode but in writeback mode we just keep the data in memory all the time. But this is what you ask for if you mount in ordered mode so I wouldn't consider it a bug. I still don't like your hack with per-process journal mode setting but we could easily do per-file journal mode setting (we already have a flag to do data journaling for a file) and that would help at least your DB workload... 
Honza -- Jan Kara <[EMAIL PROTECTED]> SUSE Labs, CR - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)
> > In FUSE interrupts are sent to userspace, and the filesystem decides > > what to do with them. So it is entirely possible and valid for a > > filesystem to ignore an interrupt. If an operation was non-blocking > > (such as one returning an error), then there would in fact be no > > purpose in checking interrupts. > > > > > > Why do you think that it is valid to ignore pending signals? > You seem to be asserting that it okay for processes to hang, > uninterruptibly, when accessing files on fuse mounted file > systems? > > Perhaps the right error to return when there is a signal > pending is EINTR and not ESTALE or some other error? There > has to be some way for the application to detect that its > system call was interrupted due to a signal pending. Traditionally a lot of filesystem related system calls are not interruptible, and for good reason. For example what happens, if an app receives a signal, while the filesystem is performing a rename() request? It would be very confusing if the call returned EINTR, but the rename would successfully complete regardless. We had a related problem with the open(O_CREAT) call in fuse, which was interruptible between the creation and the actual open because of a design mistake. So it could return EINTR, after the file was created, and this broke a real world application (don't have details at hand, but could dig them out if you are interested). I don't know what NFS does, but returning EINTR without actually canceling an operation in the server is generally not a good idea. > > So while sending a signal might reliably work in NFS to break out of > > the loop, it does not necessarily work for other filesystems, and fuse > > may not be the only one affected. > > > > > > Have you noticed another one? I would be happy to chat with the > developers for that file system to see if this support would > negatively impact them. Oh, I have no idea. And I wouldn't want to do a full audit of all the filesystems to find out. 
But if you do, please go ahead. > > A few solutions come to mind, perhaps the best is to introduce a > > kernel internal errno value (ERETRYSTALE), that forces the relevant > > system calls to be retried. > > > > NFS could transform ESTALE errors to ERETRYSTALE and get the desired > > behavior, while other filesystems would not be affected. > > We don't need more error numbers, we've got plenty already. :-) That's a rather poor excuse against a simple solution which would spare us some backward compatibility problems. > Do you have anything more specific about any real problems? > I see lots of "mays" and "coulds", but I don't see anything > that I can do to make this support better. Implement the above suggestion? Or something else. Otherwise I have to NAK this patch due to the possibility of it breaking existing fuse installations. Thanks, Miklos - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)
> > I don't know what NFS does, but returning EINTR without actually > > canceling an operation in the server is generally not a good idea. > > > > > > This is what NFS has been doing, for several decades, and no one > has complained yet. Is it really? Man nfs says something quite different (emphasis mine): intrIf an NFS file operation has a *major timeout* and it is hard mounted, then allow signals to interupt the file operation and cause it to return EINTR to the calling program. The *default* is to *not* allow file operations to be *interrupted*. > >> Have you noticed another one? I would be happy to chat with the > >> developers for that file system to see if this support would > >> negatively impact them. > >> > > > > Oh, I have no idea. And I wouldn't want to do a full audit of all the > > filesystems to find out. But if you do, please go ahead. > > > > > > Well, you brought it up. I thought that perhaps you had something > other than FUD. It's not FUD, it's being careful not to break an implementation when changing an API in a backward incompatbile way. > Please describe this real and existing fuse installation so that I can > better understand the situation and the real requirements here. I have already done so: "Also up till now, returning ESTALE in a fuse filesystem was a perfectly valid thing to do. This patch changes the behavior of that rather drastically. There might be installed systems that rely on current behavior, and we want to avoid breaking those on a kernel upgrade." Miklos - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] enhanced syscall ESTALE error handling (v2)
Miklos Szeredi wrote: In FUSE interrupts are sent to userspace, and the filesystem decides what to do with them. So it is entirely possible and valid for a filesystem to ignore an interrupt. If an operation was non-blocking (such as one returning an error), then there would in fact be no purpose in checking interrupts. Why do you think that it is valid to ignore pending signals? You seem to be asserting that it okay for processes to hang, uninterruptibly, when accessing files on fuse mounted file systems? Perhaps the right error to return when there is a signal pending is EINTR and not ESTALE or some other error? There has to be some way for the application to detect that its system call was interrupted due to a signal pending. Traditionally a lot of filesystem related system calls are not interruptible, and for good reason. For example what happens, if an app receives a signal, while the filesystem is performing a rename() request? It would be very confusing if the call returned EINTR, but the rename would successfully complete regardless. We had a related problem with the open(O_CREAT) call in fuse, which was interruptible between the creation and the actual open because of a design mistake. So it could return EINTR, after the file was created, and this broke a real world application (don't have details at hand, but could dig them out if you are interested). I don't know what NFS does, but returning EINTR without actually canceling an operation in the server is generally not a good idea. This is what NFS has been doing, for several decades, and no one has complained yet. It is just generally accepted. I do agree that it isn't the best of semantics, but it does seem to work and does solve a real problem which exists if you don't allow an operation to be interrupted. The alternative, for NFS clients, was potentially to block an application until a server, which might never come back up, comes back up. It was a serious problem and worse than this resolution. 
Yes, I'd like to hear the details and find out why it was a problem. If you allow the fuse file system to block waiting on things which may never occur, then you are going to have a problem. I would suggest considering this now instead of waiting until it is too late. We can learn from the NFS experience instead of just dismissing it. So while sending a signal might reliably work in NFS to break out of the loop, it does not necessarily work for other filesystems, and fuse may not be the only one affected. Have you noticed another one? I would be happy to chat with the developers for that file system to see if this support would negatively impact them. Oh, I have no idea. And I wouldn't want to do a full audit of all the filesystems to find out. But if you do, please go ahead. Well, you brought it up. I thought that perhaps you had something other than FUD. A few solutions come to mind, perhaps the best is to introduce a kernel internal errno value (ERETRYSTALE), that forces the relevant system calls to be retried. NFS could transform ESTALE errors to ERETRYSTALE and get the desired behavior, while other filesystems would not be affected. We don't need more error numbers, we've got plenty already. :-) That's a rather poor excuse against a simple solution which would spare us some backward compatibility problems. Potential backwards compatibility problems and none are even known or even considered. The solution here isn't to create more hacks and a new error number for this purpose is just a hack. Do you have anything more specific about any real problems? I see lots of "mays" and "coulds", but I don't see anything that I can do to make this support better. Implement the above suggestion? Or something else. Otherwise I have to NAK this patch due to the possibility of it breaking existing fuse installations. Please describe this real and existing fuse installation so that I can better understand the situation and the real requirements here. 
Instead of attempting to block this proposal, what about considering how to architect fuse to handle the situation instead of pretending that fuse won't have the same problem to solve if it isn't solved here? I have a real problem to solve and I need to get it resolved. I have real customers, with real problems, and not just theoretical and vague ones. ps - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 1/3] mm: bdi: export bdi_writeout_inc()
From: Miklos Szeredi <[EMAIL PROTECTED]> Fuse needs this for writable mmap support. Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]> --- Index: linux/include/linux/backing-dev.h === --- linux.orig/include/linux/backing-dev.h 2008-02-04 12:29:01.0 +0100 +++ linux/include/linux/backing-dev.h 2008-02-04 13:01:23.0 +0100 @@ -149,6 +149,8 @@ static inline unsigned long bdi_stat_err int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +extern void bdi_writeout_inc(struct backing_dev_info *bdi); + /* * Flags in backing_dev_info::capability * - The first two flags control whether dirty pages will contribute to the Index: linux/mm/page-writeback.c === --- linux.orig/mm/page-writeback.c 2008-02-04 12:29:01.0 +0100 +++ linux/mm/page-writeback.c 2008-02-04 13:01:23.0 +0100 @@ -168,6 +168,16 @@ static inline void __bdi_writeout_inc(st bdi->max_prop_frac); } +void bdi_writeout_inc(struct backing_dev_info *bdi) +{ + unsigned long flags; + + local_irq_save(flags); + __bdi_writeout_inc(bdi); + local_irq_restore(flags); +} +EXPORT_SYMBOL(bdi_writeout_inc); + static inline void task_dirty_inc(struct task_struct *tsk) { prop_inc_single(&vm_dirties, &tsk->dirties); -- - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 1/3] vfs: introduce perform_write in a_ops
From: Nick Piggin <[EMAIL PROTECTED]> Introduce a new perform_write() address space operation. This is a single-call, bulk version of write_begin/write_end operations. It is only used in the buffered write path (write_begin must still be implemented), and not for in-kernel writes to pagecache. For some filesystems, using this can provide significant speedups. Signed-off-by: Nick Piggin <[EMAIL PROTECTED]> Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]> --- Index: linux/include/linux/fs.h === --- linux.orig/include/linux/fs.h 2008-02-04 15:24:03.0 +0100 +++ linux/include/linux/fs.h2008-02-04 16:24:19.0 +0100 @@ -469,6 +469,9 @@ struct address_space_operations { loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); + ssize_t (*perform_write)(struct file *, struct address_space *mapping, + struct iov_iter *i, loff_t pos); + /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); void (*invalidatepage) (struct page *, unsigned long); Index: linux/mm/filemap.c === --- linux.orig/mm/filemap.c 2008-02-04 15:24:03.0 +0100 +++ linux/mm/filemap.c 2008-02-04 16:22:55.0 +0100 @@ -2312,7 +2312,9 @@ generic_file_buffered_write(struct kiocb struct iov_iter i; iov_iter_init(&i, iov, nr_segs, count, written); - if (a_ops->write_begin) + if (a_ops->perform_write) + status = a_ops->perform_write(file, mapping, &i, pos); + else if (a_ops->write_begin) status = generic_perform_write(file, &i, pos); else status = generic_perform_write_2copy(file, &i, pos); Index: linux/Documentation/filesystems/vfs.txt === --- linux.orig/Documentation/filesystems/vfs.txt2008-02-04 12:28:50.0 +0100 +++ linux/Documentation/filesystems/vfs.txt 2008-02-04 16:23:44.0 +0100 @@ -533,6 +533,9 @@ struct address_space_operations { int (*write_end)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); + ssize_t (*perform_write)(struct file *, struct 
address_space *mapping, + struct iov_iter *i, loff_t pos); + sector_t (*bmap)(struct address_space *, sector_t); int (*invalidatepage) (struct page *, unsigned long); int (*releasepage) (struct page *, int); @@ -664,6 +667,17 @@ struct address_space_operations { Returns < 0 on failure, otherwise the number of bytes (<= 'copied') that were able to be copied into pagecache. + perform_write: This is a single-call, bulk version of write_begin/write_end +operations. It is only used in the buffered write path (write_begin +must still be implemented), and not for in-kernel writes to pagecache. +It takes an iov_iter structure, which provides a descriptor for the +source data (and has associated iov_iter_xxx helpers to operate on +that data). There are also file, mapping, and pos arguments, which +specify the destination of the data. + +Returns < 0 on failure if nothing was written out, otherwise returns +the number of bytes copied into pagecache. + bmap: called by the VFS to map a logical block offset within object to physical block number. This method is used by the FIBMAP ioctl and for working with swap-files. To be able to swap to -- - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH] vfs: optimization to /proc//mountinfo patch
> 1) reports deleted inode in dentry_path() consistent with that in __d_path() > 2) modified __d_path() to use prepend(), reducing the size of __d_path() > 3) moved all the functionality that reports mount information in /proc under > CONFIG_PROC_FS. > > Could not verify if the code would work with CONFIG_PROC_FS=n, since it was > impossible to disable CONFIG_PROC_FS. Looking for ideas on how to disable > CONFIG_PROC_FS. > > > > Signed-off-by: Ram Pai <[EMAIL PROTECTED]> > --- > fs/dcache.c | 59 > +++ > fs/namespace.c |2 + > fs/seq_file.c|2 + > include/linux/dcache.h |3 ++ > include/linux/seq_file.h |3 ++ > 5 files changed, 34 insertions(+), 35 deletions(-) > > Index: linux-2.6.23/fs/dcache.c > === > --- linux-2.6.23.orig/fs/dcache.c > +++ linux-2.6.23/fs/dcache.c > @@ -1747,6 +1747,17 @@ shouldnt_be_hashed: > goto shouldnt_be_hashed; > } > > +static int prepend(char **buffer, int *buflen, const char *str, > + int namelen) > +{ > + *buflen -= namelen; > + if (*buflen < 0) > + return 1; This is confusing. Should return -ENAMETOOLONG instead (see Chapter 16 in Documentation/CodingStyle). > + *buffer -= namelen; > + memcpy(*buffer, str, namelen); > + return 0; > +} > + > /** > * d_path - return the path of a dentry > * @dentry: dentry to report > @@ -1768,17 +1779,11 @@ static char *__d_path(struct dentry *den > { > char * end = buffer+buflen; > char * retval; > - int namelen; > > - *--end = '\0'; > - buflen--; > - if (!IS_ROOT(dentry) && d_unhashed(dentry)) { > - buflen -= 10; > - end -= 10; > - if (buflen < 0) > + prepend(&end, &buflen, "\0", 1); > + if (!IS_ROOT(dentry) && d_unhashed(dentry) && > + prepend(&end, &buflen, " (deleted)", 10)) And this should test for "prepend() != 0" or "prepend() < 0" instead, otherwise it could easily be misread as "if prepend() succeeded, then...". And similarly for all the later calls. 
Thanks, Miklos - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 0/3] add perform_write to a_ops
On Mon, Feb 04, 2008 at 06:04:10PM +0100, Miklos Szeredi wrote: > a_ops->perform_write() was left out from Nick Piggin's new a_ops > patchset, as it was non-essential, and postponed for later inclusion. > > This short series reintroduces it, but only adds the fuse > implementation and not simple_perform_write(), which I'm not sure > would be a significant improvement. > > This allows larger than 4k buffered writes for fuse, which is one of > the most requested features. > > This goes on top of the "fuse: writable mmap" patches. Please don't do this, but rather implement your own .aio_write. There's very little in generic_file_aio_write that wouldn't be handle by ->perform_write and we should rather factor those up or move to higher layers than adding this ill-defined abstraction. - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 2/3] fuse: clean up setting i_size in write
From: Miklos Szeredi <[EMAIL PROTECTED]> Extract common code for setting i_size in write functions into a common helper. Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]> --- Index: linux/fs/fuse/file.c === --- linux.orig/fs/fuse/file.c 2008-02-04 13:01:39.0 +0100 +++ linux/fs/fuse/file.c2008-02-04 13:02:03.0 +0100 @@ -610,13 +610,24 @@ static int fuse_write_begin(struct file return 0; } +static void fuse_write_update_size(struct inode *inode, loff_t pos) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + if (pos > inode->i_size) + i_size_write(inode, pos); + spin_unlock(&fc->lock); +} + static int fuse_buffered_write(struct file *file, struct inode *inode, loff_t pos, unsigned count, struct page *page) { int err; size_t nres; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); unsigned offset = pos & (PAGE_CACHE_SIZE - 1); struct fuse_req *req; @@ -643,12 +654,7 @@ static int fuse_buffered_write(struct fi err = -EIO; if (!err) { pos += nres; - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; - if (pos > inode->i_size) - i_size_write(inode, pos); - spin_unlock(&fc->lock); - + fuse_write_update_size(inode, pos); if (count == PAGE_CACHE_SIZE) SetPageUptodate(page); } @@ -766,12 +772,8 @@ static ssize_t fuse_direct_io(struct fil } fuse_put_request(fc, req); if (res > 0) { - if (write) { - spin_lock(&fc->lock); - if (pos > inode->i_size) - i_size_write(inode, pos); - spin_unlock(&fc->lock); - } + if (write) + fuse_write_update_size(inode, pos); *ppos = pos; } fuse_invalidate_attr(inode); -- - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 0/3] add perform_write to a_ops
> > a_ops->perform_write() was left out from Nick Piggin's new a_ops > > patchset, as it was non-essential, and postponed for later inclusion. > > > > This short series reintroduces it, but only adds the fuse > > implementation and not simple_perform_write(), which I'm not sure > > would be a significant improvement. > > > > This allows larger than 4k buffered writes for fuse, which is one of > > the most requested features. > > > > This goes on top of the "fuse: writable mmap" patches. > > Please don't do this, but rather implement your own .aio_write. There's > very little in generic_file_aio_write that wouldn't be handle by > ->perform_write and we should rather factor those up or move to higher > layers than adding this ill-defined abstraction. > Moving up to higher layers might not be possible, due to lock/unlock of i_mutex being inside generic_file_aio_write(). But with fuse being the only user, it's not a huge issue duplicating some code. Nick, were there any other candidates, that would want to use such an interface in the future? Thanks, Miklos - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch 0/3] add perform_write to a_ops
On Mon, Feb 04, 2008 at 09:52:06PM +0100, Miklos Szeredi wrote: > Moving up to higher layers might not be possible, due to lock/unlock > of i_mutex being inside generic_file_aio_write(). Well some bits can be moved up. Here's my grand plan which I plan to implement once I get some time for it (or let someone else do if they beat me): - generic_segment_checks goes to fs/read_write.c before calling into the filesystem - ditto for vfs_check_frozen - generic_write_checks is a suitable helper already - ditto for remove_suid - ditto for file_update_time - after that there's not a whole lot left in generic_file_aio_write, except for direct I/O handling which will probably be very fs-specific if you have your own buffered I/O code generic_file_buffered_write is an almost trivial wrapper around what's ->perform_write in Nick's earlier patches and a helper for the syncing activity. > > But with fuse being the only user, it's not a huge issue duplicating > some code. > > Nick, were there any other candidates, that would want to use such an > interface in the future? > > Thanks, > Miklos > - > To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in > the body of a message to [EMAIL PROTECTED] > More majordomo info at http://vger.kernel.org/majordomo-info.html ---end quoted text--- - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [2.6.24 REGRESSION] BUG: Soft lockup - with VFS
On Mon, 28 Jan 2008 09:31:43 +0100 "Oliver Pinter (Pintér Olivér)" <[EMAIL PROTECTED]> wrote: > hi all! > > in the 2.6.24 become i some soft lockups with usb-phone, when i pluged > in the mobile, then the vfs-layer crashed. am afternoon can i the > .config send, and i bisected the kernel, when i have time. > > pictures from crash: > http://students.zipernowsky.hu/~oliverp/kernel/regression_2624/ It looks like selinux's file_has_perm() is doing spin_lock() on an uninitialised (or already locked) spinlock. - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] ext3: per-process soft-syncing data=ordered mode
Jan Kara wrote: > On Sat 02-02-08 00:26:00, Al Boldi wrote: > > Chris Mason wrote: > > > Al, could you please compare the write throughput from vmstat for the > > > data=ordered vs data=writeback runs? I would guess the data=ordered > > > one has a lower overall write throughput. > > > > That's what I would have guessed, but it's actually going up 4x fold for > > mysql from 559mb to 2135mb, while the db-size ends up at 549mb. > > So you say we write 4-times as much data in ordered mode as in writeback > mode. Hmm, probably possible because we force all the dirty data to disk > when committing a transation in ordered mode (and don't do this in > writeback mode). So if the workload repeatedly dirties the whole DB, we > are going to write the whole DB several times in ordered mode but in > writeback mode we just keep the data in memory all the time. But this is > what you ask for if you mount in ordered mode so I wouldn't consider it a > bug. Ok, maybe not a bug, but a bit inefficient. Check out this workload: sync; while :; do dd < /dev/full > /mnt/sda2/x.dmp bs=1M count=20 rm -f /mnt/sda2/x.dmp usleep 1 done vmstat 1 ( with mount /dev/sda2 /mnt/sda2 -o data=writeback) << note io-bo >> procs ---memory-- ---swap-- -io --system-- cpu r b swpd free buff cache si sobibo incs us sy id wa 2 0 0 293008 5232 5743600 0 0 18 206 4 80 16 0 1 0 0 282840 5232 6762000 0 0 18 238 3 81 16 0 1 0 0 297032 5244 5336400 0 152 21 211 4 79 17 0 1 0 0 285236 5244 6522400 0 0 18 232 4 80 16 0 1 0 0 299464 5244 5088000 0 0 18 222 4 80 16 0 1 0 0 290156 5244 6017600 0 0 18 236 3 80 17 0 0 0 0 302124 5256 4778800 0 152 21 213 4 80 16 0 1 0 0 292180 5256 5824800 0 0 18 239 3 81 16 0 1 0 0 287452 5256 6244400 0 0 18 202 3 80 17 0 1 0 0 293016 5256 5739200 0 0 18 250 4 80 16 0 0 0 0 302052 5256 4778800 0 0 19 194 3 81 16 0 1 0 0 297536 5268 5292800 0 152 20 233 4 79 17 0 1 0 0 286468 5268 6387200 0 0 18 212 3 81 16 0 1 0 0 301572 5268 4881200 0 0 18 267 4 79 17 0 1 0 0 292636 5268 5777600 0 0 18 
208 4 80 16 0 1 0 0 302124 5280 4778800 0 152 21 237 4 80 16 0 1 0 0 291436 5280 5897600 0 0 18 205 3 81 16 0 1 0 0 302068 5280 4778800 0 0 18 234 3 81 16 0 1 0 0 293008 5280 5738800 0 0 18 221 4 79 17 0 1 0 0 297288 5292 5253200 0 156 22 233 2 81 16 1 1 0 0 294676 5292 5572400 0 0 19 199 3 81 16 0 vmstat 1 (with mount /dev/sda2 /mnt/sda2 -o data=ordered) procs ---memory-- ---swap-- -io --system-- cpu r b swpd free buff cache si sobibo incs us sy id wa 2 0 0 291052 5156 5901600 0 0 19 223 3 82 15 0 1 0 0 291408 5156 5870400 0 0 18 218 3 81 16 0 1 0 0 291888 5156 5827600 020 23 229 3 80 17 0 1 0 0 300764 5168 4947200 0 12864 91 235 3 69 13 15 1 0 0 300740 5168 4945600 0 0 19 215 3 80 17 0 1 0 0 301088 5168 4904400 0 0 18 241 4 80 16 0 1 0 0 298220 5168 5187200 0 0 18 225 3 81 16 0 0 1 0 289168 5168 6075200 0 12712 45 237 3 77 15 5 1 0 0 300260 5180 4985200 0 152 68 211 4 72 15 9 1 0 0 298616 5180 5146000 0 0 18 237 3 81 16 0 1 0 0 296988 5180 5309200 0 0 18 223 3 81 16 0 1 0 0 296608 5180 5348000 0 0 18 223 3 81 16 0 0 0 0 301640 5192 4803600 0 12868 93 206 4 67 13 16 0 0 0 301624 5192 4803600 0 0 21 218 3 81 16 0 0 0 0 301600 5192 4803600 0 0 18 212 3 81 16 0 0 0 0 301584 5192 4803600 0 0 18 209 4 80 16 0 0 0 0 301568 5192 4803600 0 0 18 208 3 81 16 0 1 0 0 285520 5204 6454800 0 12864 95 216 3 69 13 15 2 0 0 285124 5204 6492400 0 0 18 222 4 80 16 0 1 0 0 283612 5204 6639200 0 0 18 231 3 81 16 0 1 0 0 284216 5204 6573600 0 0 18 218 4 80 16 0 0 1 0 289160 5204 6075200 0 12712 56 213 3 74 15 8 1