[PATCH v2] Btrfs: fix fsync data loss after a ranged fsync

Filipe Manana Tue, 02 Sep 2014 04:51:13 -0700

When we do a full fsync (the inode has the flag
BTRFS_INODE_NEEDS_FULL_SYNC set) that is also ranged (covers only a
portion of the file), we might have ordered operations that were started
before or while we're logging the inode and that fall outside the fsync
range.


Therefore, when a full ranged fsync finishes, don't remove every extent
map from the list of modified extent maps. For extent maps that fall
outside our fsync range, the respective ordered operation may not have
finished yet, meaning the corresponding file extent item wasn't inserted
into the fs/subvol tree yet and therefore we didn't log it. We must let
the next fast fsync (one that checks only the modified list) see such an
extent map, log a matching file extent item to the log btree and wait
for its ordered operation to finish (if it's still ongoing).

Signed-off-by: Filipe Manana <fdman...@suse.com>
---

V2: No code change, only updated the changelog and the comment, to make
    them clearer and more accurate.

 fs/btrfs/file.c     |  2 +-
 fs/btrfs/tree-log.c | 54 +++++++++++++++++++++++++++++++++++++++++++----------
 fs/btrfs/tree-log.h |  2 ++
 3 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 66c4076..e5534c1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1979,7 +1979,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
        btrfs_init_log_ctx(&ctx);
 
-       ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
+       ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
        if (ret < 0) {
                /* Fallthrough and commit/free transaction. */
                ret = 1;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5a917a6..93d3c16 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -94,8 +94,10 @@
 #define LOG_WALK_REPLAY_ALL 3
 
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
-                            struct btrfs_root *root, struct inode *inode,
-                            int inode_only);
+                          struct btrfs_root *root, struct inode *inode,
+                          int inode_only,
+                          const loff_t start,
+                          const loff_t end);
 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_path *path, u64 objectid);
@@ -3856,8 +3858,10 @@ process:
  * This handles both files and directories.
  */
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
-                            struct btrfs_root *root, struct inode *inode,
-                            int inode_only)
+                          struct btrfs_root *root, struct inode *inode,
+                          int inode_only,
+                          const loff_t start,
+                          const loff_t end)
 {
        struct btrfs_path *path;
        struct btrfs_path *dst_path;
@@ -4050,8 +4054,31 @@ log_extents:
                struct extent_map *em, *n;
 
                write_lock(&tree->lock);
-               list_for_each_entry_safe(em, n, &tree->modified_extents, list)
+               /*
+                * We can't just remove every em if we're called for a ranged
+                * fsync - that is, one that doesn't cover the whole possible
+                * file range (0 to LLONG_MAX). This is because we can have
+                * em's that fall outside the range we're logging and therefore
+                * their ordered operations haven't completed yet
+                * (btrfs_finish_ordered_io() not invoked yet). This means we
+                * didn't get their respective file extent item in the fs/subvol
+                * tree yet, and need to let the next fast fsync (one which
+                * consults the list of modified extent maps) find the em so
+                * that it logs a matching file extent item and waits for the
+                * respective ordered operation to complete (if it's still
+                * running).
+                *
+                * Removing every em outside the range we're logging would make
+                * the next fast fsync not log their matching file extent items,
+                * therefore making us lose data after a log replay.
+                */
+               list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
+                       if (em->mod_start > end)
+                               continue;
+                       if (em->mod_start + em->mod_len <= start)
+                               continue;
                        list_del_init(&em->list);
+               }
                write_unlock(&tree->lock);
        }
 
@@ -4158,7 +4185,10 @@ out:
  */
 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root, struct inode *inode,
-                                 struct dentry *parent, int exists_only,
+                                 struct dentry *parent,
+                                 const loff_t start,
+                                 const loff_t end,
+                                 int exists_only,
                                  struct btrfs_log_ctx *ctx)
 {
        int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
@@ -4204,7 +4234,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
        if (ret)
                goto end_no_trans;
 
-       ret = btrfs_log_inode(trans, root, inode, inode_only);
+       ret = btrfs_log_inode(trans, root, inode, inode_only, start, end);
        if (ret)
                goto end_trans;
 
@@ -4232,7 +4262,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 
                if (BTRFS_I(inode)->generation >
                    root->fs_info->last_trans_committed) {
-                       ret = btrfs_log_inode(trans, root, inode, inode_only);
+                       ret = btrfs_log_inode(trans, root, inode, inode_only,
+                                             0, LLONG_MAX);
                        if (ret)
                                goto end_trans;
                }
@@ -4266,13 +4297,15 @@ end_no_trans:
  */
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, struct dentry *dentry,
+                         const loff_t start,
+                         const loff_t end,
                          struct btrfs_log_ctx *ctx)
 {
        struct dentry *parent = dget_parent(dentry);
        int ret;
 
        ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
-                                    0, ctx);
+                                    start, end, 0, ctx);
        dput(parent);
 
        return ret;
@@ -4509,6 +4542,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
                    root->fs_info->last_trans_committed))
                return 0;
 
-       return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
+       return btrfs_log_inode_parent(trans, root, inode, parent, 0,
+                                     LLONG_MAX, 1, NULL);
 }
 
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 7f5b41b..e2e798a 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -59,6 +59,8 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 int btrfs_recover_log_trees(struct btrfs_root *tree_root);
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, struct dentry *dentry,
+                         const loff_t start,
+                         const loff_t end,
                          struct btrfs_log_ctx *ctx);
 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v2] Btrfs: fix fsync data loss after a ranged fsync

Reply via email to