From: Filipe Manana <fdman...@suse.com>

If we delete a snapshot, delete its parent directory, create a new
directory with the same name as that parent and then fsync either that
new directory or some file inside it, we end up with a log tree that
is not possible to replay because the log replay procedure interprets
the snapshot's directory item as a regular entry and not as a root
item, resulting in the following failure and trace when mounting the
filesystem:

[52174.510532] BTRFS info (device dm-0): failed to delete reference to snap, 
inode 257 parent 257
[52174.512570] ------------[ cut here ]------------
[52174.513278] WARNING: CPU: 12 PID: 28024 at fs/btrfs/inode.c:3986 
__btrfs_unlink_inode+0x178/0x351 [btrfs]()
[52174.514681] BTRFS: Transaction aborted (error -2)
[52174.515630] Modules linked in: btrfs dm_flakey dm_mod overlay crc32c_generic 
ppdev xor raid6_pq acpi_cpufreq parport_pc tpm_tis sg parport tpm evdev 
i2c_piix4 proc
[52174.521568] CPU: 12 PID: 28024 Comm: mount Tainted: G        W       
4.5.0-rc6-btrfs-next-27+ #1
[52174.522805] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by 
qemu-project.org 04/01/2014
[52174.524053]  0000000000000000 ffff8801df2a7710 ffffffff81264e93 
ffff8801df2a7758
[52174.524053]  0000000000000009 ffff8801df2a7748 ffffffff81051618 
ffffffffa03591cd
[52174.524053]  00000000fffffffe ffff88015e6e5000 ffff88016dbc3c88 
ffff88016dbc3c88
[52174.524053] Call Trace:
[52174.524053]  [<ffffffff81264e93>] dump_stack+0x67/0x90
[52174.524053]  [<ffffffff81051618>] warn_slowpath_common+0x99/0xb2
[52174.524053]  [<ffffffffa03591cd>] ? __btrfs_unlink_inode+0x178/0x351 [btrfs]
[52174.524053]  [<ffffffff81051679>] warn_slowpath_fmt+0x48/0x50
[52174.524053]  [<ffffffffa03591cd>] __btrfs_unlink_inode+0x178/0x351 [btrfs]
[52174.524053]  [<ffffffff8118f5e9>] ? iput+0xb0/0x284
[52174.524053]  [<ffffffffa0359fe8>] btrfs_unlink_inode+0x1c/0x3d [btrfs]
[52174.524053]  [<ffffffffa038631e>] check_item_in_log+0x1fe/0x29b [btrfs]
[52174.524053]  [<ffffffffa0386522>] replay_dir_deletes+0x167/0x1cf [btrfs]
[52174.524053]  [<ffffffffa038739e>] fixup_inode_link_count+0x289/0x2aa [btrfs]
[52174.524053]  [<ffffffffa038748a>] fixup_inode_link_counts+0xcb/0x105 [btrfs]
[52174.524053]  [<ffffffffa038a5ec>] btrfs_recover_log_trees+0x258/0x32c [btrfs]
[52174.524053]  [<ffffffffa03885b2>] ? replay_one_extent+0x511/0x511 [btrfs]
[52174.524053]  [<ffffffffa034f288>] open_ctree+0x1dd4/0x21b9 [btrfs]
[52174.524053]  [<ffffffffa032b753>] btrfs_mount+0x97e/0xaed [btrfs]
[52174.524053]  [<ffffffff8108e1b7>] ? trace_hardirqs_on+0xd/0xf
[52174.524053]  [<ffffffff8117bafa>] mount_fs+0x67/0x131
[52174.524053]  [<ffffffff81193003>] vfs_kern_mount+0x6c/0xde
[52174.524053]  [<ffffffffa032af81>] btrfs_mount+0x1ac/0xaed [btrfs]
[52174.524053]  [<ffffffff8108e1b7>] ? trace_hardirqs_on+0xd/0xf
[52174.524053]  [<ffffffff8108c262>] ? lockdep_init_map+0xb9/0x1b3
[52174.524053]  [<ffffffff8117bafa>] mount_fs+0x67/0x131
[52174.524053]  [<ffffffff81193003>] vfs_kern_mount+0x6c/0xde
[52174.524053]  [<ffffffff8119590f>] do_mount+0x8a6/0x9e8
[52174.524053]  [<ffffffff811358dd>] ? strndup_user+0x3f/0x59
[52174.524053]  [<ffffffff81195c65>] SyS_mount+0x77/0x9f
[52174.524053]  [<ffffffff814935d7>] entry_SYSCALL_64_fastpath+0x12/0x6b
[52174.561288] ---[ end trace 6b53049efb1a3ea6 ]---

So when we delete a directory we need to propagate its last_unlink_trans
value (updated on snapshot deletion) to its parent and then check at
fsync time for it and fallback for a transaction commit.

A test case for fstests follows.

  seq=`basename $0`
  seqres=$RESULT_DIR/$seq
  echo "QA output created by $seq"
  tmp=/tmp/$$
  status=1      # failure is the default!
  trap "_cleanup; exit \$status" 0 1 2 3 15

  _cleanup()
  {
      _cleanup_flakey
      cd /
      rm -f $tmp.*
  }

  # get standard environment, filters and checks
  . ./common/rc
  . ./common/filter
  . ./common/dmflakey

  # real QA test starts here
  _supported_fs btrfs
  _supported_os Linux
  _require_scratch
  _require_dm_target flakey
  _require_metadata_journaling $SCRATCH_DEV

  rm -f $seqres.full

  _populate_fs()
  {
      _run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT \
      $SCRATCH_MNT/testdir/snap
      _run_btrfs_util_prog subvolume delete $SCRATCH_MNT/testdir/snap
      rmdir $SCRATCH_MNT/testdir
      mkdir $SCRATCH_MNT/testdir
  }

  _scratch_mkfs >>$seqres.full 2>&1
  _init_flakey
  _mount_flakey

  mkdir $SCRATCH_MNT/testdir
  _populate_fs
  $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir
  _flakey_drop_and_remount

  echo "Filesystem contents after the first log replay:"
  ls -R $SCRATCH_MNT | _filter_scratch

  # Now do the same as before but instead of doing an fsync against the 
directory,
  # do an fsync against a file inside the directory.

  _populate_fs
  touch $SCRATCH_MNT/testdir/foobar
  $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir/foobar
  _flakey_drop_and_remount

  echo "Filesystem contents after the second log replay:"
  ls -R $SCRATCH_MNT | _filter_scratch

  _unmount_flakey
  status=0
  exit

Signed-off-by: Filipe Manana <fdman...@suse.com>
---
 fs/btrfs/inode.c    | 22 +++++++++++++++++++++-
 fs/btrfs/tree-log.c |  6 +++++-
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 41a5688..b5c23b5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4173,6 +4173,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry 
*dentry)
        int err = 0;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_trans_handle *trans;
+       u64 last_unlink_trans;
 
        if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
                return -ENOTEMPTY;
@@ -4195,11 +4196,30 @@ static int btrfs_rmdir(struct inode *dir, struct dentry 
*dentry)
        if (err)
                goto out;
 
+       last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
+
        /* now the directory is empty */
        err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
                                 dentry->d_name.name, dentry->d_name.len);
-       if (!err)
+       if (!err) {
                btrfs_i_size_write(inode, 0);
+               /*
+                * Propagate the last_unlink_trans value of the deleted dir to
+                * its parent directory. This is to prevent an unrecoverable
+                * log tree in the case we do something like this:
+                * 1) create dir foo
+                * 2) create snapshot under dir foo
+                * 3) delete the snapshot
+                * 4) rmdir foo
+                * 5) mkdir foo
+                * 6) fsync foo or some file inside foo
+                */
+               if (last_unlink_trans >= trans->transid) {
+                       mutex_lock(&BTRFS_I(dir)->log_mutex);
+                       BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
+                       mutex_unlock(&BTRFS_I(dir)->log_mutex);
+               }
+       }
 out:
        btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 24d03c7..d3f38dd 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4875,8 +4875,12 @@ static noinline int check_parent_dirs_for_sync(struct 
btrfs_trans_handle *trans,
                if (!parent || d_really_is_negative(parent) || sb != 
d_inode(parent)->i_sb)
                        break;
 
-               if (IS_ROOT(parent))
+               if (IS_ROOT(parent)) {
+                       inode = d_inode(parent);
+                       if (btrfs_must_commit_transaction(trans, inode))
+                               ret = 1;
                        break;
+               }
 
                parent = dget_parent(parent);
                dput(old_parent);
-- 
2.7.0.rc3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to