[PATCH v3 01/10] btrfs-progs: Cleanup check_tree_block() function and split the output to print_tree_block_err() function.

2015-01-29 Thread Qu Wenruo
Before this patch, check_tree_block() will print error on bytenr
mismatch but don't output error on fsid mismatch.

This patch will modify check_tree_block(), so it will only return errno
but not print error messages.
The error message will be output by print_tree_block_err() function.

Signed-off-by: Qu Wenruo quwen...@cn.fujitsu.com
---
Changelog:
v3:
  Newly introduced.
---
 disk-io.c | 45 +
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/disk-io.c b/disk-io.c
index 7f03790..e14a143 100644
--- a/disk-io.c
+++ b/disk-io.c
@@ -22,6 +22,7 @@
 #include sys/stat.h
 #include fcntl.h
 #include unistd.h
+#include uuid/uuid.h
 #include kerncompat.h
 #include radix-tree.h
 #include ctree.h
@@ -33,17 +34,18 @@
 #include print-tree.h
 #include rbtree-utils.h
 
+/* specified errno for check_tree_block */
+#define EBYTENR1
+#define EFSID  2
+
 static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
 {
 
struct btrfs_fs_devices *fs_devices;
-   int ret = 1;
+   int ret = -EFSID;
 
-   if (buf-start != btrfs_header_bytenr(buf)) {
-   printk(Check tree block failed, want=%Lu, have=%Lu\n,
-  buf-start, btrfs_header_bytenr(buf));
-   return ret;
-   }
+   if (buf-start != btrfs_header_bytenr(buf))
+   return -EBYTENR;
 
fs_devices = root-fs_info-fs_devices;
while (fs_devices) {
@@ -58,6 +60,30 @@ static int check_tree_block(struct btrfs_root *root, struct 
extent_buffer *buf)
return ret;
 }
 
+static void print_tree_block_err(struct btrfs_root *root,
+   struct extent_buffer *eb,
+   int err)
+{
+   char fs_uuid[BTRFS_UUID_UNPARSED_SIZE] = {'\0'};
+   char found_uuid[BTRFS_UUID_UNPARSED_SIZE] = {'\0'};
+   u8 buf[BTRFS_UUID_SIZE];
+
+   switch (err) {
+   case -EFSID:
+   read_extent_buffer(eb, buf, btrfs_header_fsid(),
+  BTRFS_UUID_SIZE);
+   uuid_unparse(buf, found_uuid);
+   uuid_unparse(root-fs_info-fsid, fs_uuid);
+   fprintf(stderr, fsid mismatch, want=%s, have=%s\n,
+   fs_uuid, found_uuid);
+   break;
+   case -EBYTENR:
+   fprintf(stderr, bytenr mismatch, want=%llu, have=%llu\n,
+   eb-start, btrfs_header_bytenr(eb));
+   break;
+   }
+}
+
 u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
 {
return crc32c(seed, data, len);
@@ -280,7 +306,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root 
*root, u64 bytenr,
}
if (ignore) {
if (check_tree_block(root, eb))
-   printk(read block failed check_tree_block\n);
+   print_tree_block_err(root, eb,
+   check_tree_block(root, eb));
else
printk(Csum didn't match\n);
break;
@@ -342,8 +369,10 @@ static int write_tree_block(struct btrfs_trans_handle 
*trans,
 struct btrfs_root *root,
 struct extent_buffer *eb)
 {
-   if (check_tree_block(root, eb))
+   if (check_tree_block(root, eb)) {
+   print_tree_block_err(root, eb, check_tree_block(root, eb));
BUG();
+   }
 
if (!btrfs_buffer_uptodate(eb, trans-transid))
BUG();
-- 
2.2.2

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 02/10] btrfs-progs: Add support to suppress tree block csum error output.

2015-01-29 Thread Qu Wenruo
Add new open ctree flag OPEN_CTREE_SUPPRESS_ERROR to suppress tree block
csum error output.

Provides the basis for new btrfs-find-root and other enhancement on
btrfs offline tools output.

Signed-off-by: Qu Wenruo quwen...@cn.fujitsu.com
---
changelog:
v2:
  None
v3:
  Use 'suppress_tree_error' to replace the old name 'suppress_error' to
  avoid confusion.
---
 ctree.h   |  2 ++
 disk-io.c | 19 +--
 disk-io.h |  2 ++
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/ctree.h b/ctree.h
index c069a95..494c9ac 100644
--- a/ctree.h
+++ b/ctree.h
@@ -996,6 +996,7 @@ struct btrfs_fs_info {
unsigned int on_restoring:1;
unsigned int is_chunk_recover:1;
unsigned int quota_enabled:1;
+   unsigned int suppress_tree_err:1;
 
int (*free_extent_hook)(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -1004,6 +1005,7 @@ struct btrfs_fs_info {
int refs_to_drop);
struct cache_tree *fsck_extent_cache;
struct cache_tree *corrupt_blocks;
+
 };
 
 /*
diff --git a/disk-io.c b/disk-io.c
index e14a143..3291071 100644
--- a/disk-io.c
+++ b/disk-io.c
@@ -141,6 +141,8 @@ int csum_tree_block(struct btrfs_root *root, struct 
extent_buffer *buf,
 {
u16 csum_size =
btrfs_super_csum_size(root-fs_info-super_copy);
+   if (verify  root-fs_info-suppress_tree_err)
+   return verify_tree_block_csum_silent(buf, csum_size);
return csum_tree_block_size(buf, csum_size, verify);
 }
 
@@ -291,8 +293,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root 
*root, u64 bytenr,
 
while (1) {
ret = read_whole_eb(root-fs_info, eb, mirror_num);
-   if (ret == 0  check_tree_block(root, eb) == 0 
-   csum_tree_block(root, eb, 1) == 0 
+   if (ret == 0  csum_tree_block(root, eb, 1) == 0 
+   check_tree_block(root, eb) == 0 
verify_parent_transid(eb-tree, eb, parent_transid, ignore)
== 0) {
if (eb-flags  EXTENT_BAD_TRANSID 
@@ -305,11 +307,14 @@ struct extent_buffer *read_tree_block(struct btrfs_root 
*root, u64 bytenr,
return eb;
}
if (ignore) {
-   if (check_tree_block(root, eb))
-   print_tree_block_err(root, eb,
+   if (check_tree_block(root, eb)) {
+   if (!root-fs_info-suppress_tree_err)
+   print_tree_block_err(root, eb,
check_tree_block(root, eb));
-   else
-   printk(Csum didn't match\n);
+   } else {
+   if (!root-fs_info-suppress_tree_err)
+   fprintf(stderr, Csum didn't match\n);
+   }
break;
}
num_copies = btrfs_num_copies(root-fs_info-mapping_tree,
@@ -1138,6 +1143,8 @@ static struct btrfs_fs_info *__open_ctree_fd(int fp, 
const char *path,
}
if (flags  OPEN_CTREE_RESTORE)
fs_info-on_restoring = 1;
+   if (flags  OPEN_CTREE_SUPPRESS_ERROR)
+   fs_info-suppress_tree_err = 1;
 
ret = btrfs_scan_fs_devices(fp, path, fs_devices, sb_bytenr,
(flags  OPEN_CTREE_RECOVER_SUPER));
diff --git a/disk-io.h b/disk-io.h
index f963a96..84f878a 100644
--- a/disk-io.h
+++ b/disk-io.h
@@ -33,6 +33,8 @@ enum btrfs_open_ctree_flags {
OPEN_CTREE_RESTORE  = (1  4),
OPEN_CTREE_NO_BLOCK_GROUPS  = (1  5),
OPEN_CTREE_EXCLUSIVE= (1  6),
+   OPEN_CTREE_SUPPRESS_ERROR   = (1  7), /* Suppress tree block
+  check error */
 };
 
 static inline u64 btrfs_sb_offset(int mirror)
-- 
2.2.2

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Indefinite hang in reserve_metadata_bytes on kernel 3.18.3

2015-01-29 Thread Holger Hoffstätte
On Thu, 29 Jan 2015 01:44:02 +, Steven Schlansker wrote:

[..snip..]

 The symptoms are an endlessly increasing stream of hung tasks and high

Please try 3.18.5 (-rc1 is good) which contains the following fix:

workqueue: fix subtle pool management issue which can stall whole
 worker_pool

see: http://article.gmane.org/gmane.linux.kernel.stable/122074

No promises but it seems sufficiently causally related..

-h

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Linux documentation: btrfs.txt

2015-01-29 Thread David Sterba
On Wed, Jan 28, 2015 at 08:04:39PM +0300, Pavel Volkov wrote:
 In my 3.18.3 sources the file at Documentation/filesystems/btrfs.txt says:
 
 Btrfs is under heavy development, and is not suitable for
 any uses other than benchmarking and review. The Btrfs disk format is
 not yet finalized.
 
 btrfs is no longer marked experimental in kernel and the file format is 
 stable. It is suited for general use except some features.
 Maybe it's time to update btrfs.txt?

Yeah, the text hasn't been changed since 2009 and we've missed it during
the dangerous  experimental label removal. Thanks.
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function to get vfsmount from a given sb.

2015-01-29 Thread Al Viro
On Thu, Jan 29, 2015 at 01:37:58PM +0100, David Sterba wrote:
 Adding Al Viro into CC
 
 On Thu, Jan 29, 2015 at 10:24:39AM +0800, Qu Wenruo wrote:
  +struct vfsmount *get_vfsmount_sb(struct super_block *sb)
  +{
  +   struct vfsmount *ret_vfs = NULL;
  +   struct mount *mnt;
  +   int ret = 0;
  +
  +   lock_mount_hash();
  +   if (list_empty(sb-s_mounts))
  +   goto out;
  +   mnt = list_entry(sb-s_mounts.next, struct mount, mnt_instance);
 
 from include/linux/fs.h:
 
 struct super_block {
 ...
   struct list_heads_mounts;   /* list of mounts; _not_ for fs 
 use */
 ...
 };
 
 I hear a storm in the distance coming our direction ... so I'll
 preemptively NAK this change.

Could you explain what the devil is that for?  The primitive looks rather
bogus - if nothing else, it includes make random instance of the filesystem
in someone's namespace appear busy to umount, which doesn't look like a
part of useful interface...  The only piece of context I'd been able to find
was something vague about sysfs-inflicted operations and wanting to use
mnt_want_write() but having nothing to pass it; BTW, what if the (random)
instance you run into happens to mounted r/o?

Assuming that your superblock is guaranteed to stay alive and usable for
whatever work you are trying to do, what's wrong with sb_want_write()?  

If it's _not_ guaranteed to stay so, and this is what you are trying to
solve, you are doing that at the wrong level - just take sysfs entry
removals earlier in shutdown process and be done with that.  Beginning of
close_ctree() would probably be early enough to be safe, but if that's
not enough, you can take it into the beginning of btrfs_kill_super().
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function to get vfsmount from a given sb.

2015-01-29 Thread David Sterba
Adding Al Viro into CC

On Thu, Jan 29, 2015 at 10:24:39AM +0800, Qu Wenruo wrote:
 +struct vfsmount *get_vfsmount_sb(struct super_block *sb)
 +{
 + struct vfsmount *ret_vfs = NULL;
 + struct mount *mnt;
 + int ret = 0;
 +
 + lock_mount_hash();
 + if (list_empty(sb-s_mounts))
 + goto out;
 + mnt = list_entry(sb-s_mounts.next, struct mount, mnt_instance);

from include/linux/fs.h:

struct super_block {
...
struct list_heads_mounts;   /* list of mounts; _not_ for fs 
use */
...
};

I hear a storm in the distance coming our direction ... so I'll
preemptively NAK this change.
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: fix race between transaction commit and empty block group removal

2015-01-29 Thread Filipe Manana
Committing a transaction can race with automatic removal of empty block
groups (cleaner kthread), leading to a BUG_ON() in the transaction
commit code while running btrfs_finish_extent_commit(). The following
sequence diagram shows how it can happen:

   CPU 1   CPU 2

btrfs_commit_transaction()
  fs_info-running_transaction = NULL
  btrfs_finish_extent_commit()
find_first_extent_bit()
  - found range for block group X
 in fs_info-freed_extents[]

   btrfs_delete_unused_bgs()
 - found block group X

 Removed block group X's range
 from fs_info-freed_extents[]

 btrfs_remove_chunk()
btrfs_remove_block_group(bg 
X)

unpin_extent_range(bg X range)
   btrfs_lookup_block_group(bg X)
  - returns NULL
- BUG_ON()

The trace that results from the BUG_ON() is:

[48665.187808] [ cut here ]
[48665.188032] kernel BUG at fs/btrfs/extent-tree.c:5675!
[48665.188032] invalid opcode:  [#1] SMP DEBUG_PAGEALLOC
[48665.188032] Modules linked in: dm_flakey dm_mod crc32c_generic btrfs xor 
raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc 
loop parport_pc evdev microcode
[48665.197388] CPU: 2 PID: 31211 Comm: kworker/u32:16 Tainted: GW  
3.19.0-rc5-btrfs-next-4+ #1
[48665.197388] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
[48665.197388] Workqueue: events_unbound btrfs_async_reclaim_metadata_space 
[btrfs]
[48665.197388] task: 880222011810 ti: 8801b56a4000 task.ti: 
8801b56a4000
[48665.197388] RIP: 0010:[a0350d05]  [a0350d05] 
unpin_extent_range+0x6a/0x1ba [btrfs]
[48665.197388] RSP: 0018:8801b56a7b88  EFLAGS: 00010246
[48665.197388] RAX:  RBX: 8802143a6000 RCX: 8802220120c8
[48665.197388] RDX: 0001 RSI: 0001 RDI: 8800a3c140b0
[48665.197388] RBP: 8801b56a7bd8 R08: 0003 R09: 
[48665.197388] R10:  R11: bbac R12: 12e8e000
[48665.197388] R13: 8800a3c14000 R14:  R15: 
[48665.197388] FS:  () GS:88023ec4() 
knlGS:
[48665.197388] CS:  0010 DS:  ES:  CR0: 8005003b
[48665.197388] CR2: 7f065e42f270 CR3: 000206f7 CR4: 06e0
[48665.197388] Stack:
[48665.197388]  8801b56a7bd8 12ea 01ff8800a3c14138 
12e9
[48665.197388]  880141df3dd8 8802143a6000 8800a3c14138 
880141df3df0
[48665.197388]  880141df3dd8  8801b56a7c08 
a0354227
[48665.197388] Call Trace:
[48665.197388]  [a0354227] btrfs_finish_extent_commit+0xb0/0xd9 
[btrfs]
[48665.197388]  [a0366b4b] btrfs_commit_transaction+0x791/0x92c 
[btrfs]
[48665.197388]  [a0352432] flush_space+0x43d/0x452 [btrfs]
[48665.197388]  [814295c3] ? _raw_spin_unlock+0x28/0x33
[48665.197388]  [a035255f] 
btrfs_async_reclaim_metadata_space+0x118/0x164 [btrfs]
[48665.197388]  [81059917] ? process_one_work+0x14b/0x3ab
[48665.197388]  [810599ac] process_one_work+0x1e0/0x3ab
[48665.197388]  [81079fa9] ? trace_hardirqs_off+0xd/0xf
[48665.197388]  [8105a55b] worker_thread+0x210/0x2d0
[48665.197388]  [8105a34b] ? rescuer_thread+0x2c3/0x2c3
[48665.197388]  [8105e5c0] kthread+0xef/0xf7
[48665.197388]  [81429682] ? _raw_spin_unlock_irq+0x2d/0x39
[48665.197388]  [8105e4d1] ? __kthread_parkme+0xad/0xad
[48665.197388]  [81429dec] ret_from_fork+0x7c/0xb0
[48665.197388]  [8105e4d1] ? __kthread_parkme+0xad/0xad
[48665.197388] Code: 85 f6 74 14 49 8b 06 49 03 46 09 49 39 c4 72 1d 4c 89 f7 
e8 83 ec ff ff 4c 89 e6 4c 89 ef e8 1e f1 ff ff 48 85 c0 49 89 c6 75 02 0f 0b 
49 8b 1e 49 03 5e 09 48 8b
[48665.197388] RIP  [a0350d05] unpin_extent_range+0x6a/0x1ba [btrfs]
[48665.197388]  RSP 8801b56a7b88
[48665.272246] ---[ end trace b9c6ab9957521376 ]---

Fix this by ensuring that unpining the block group's range in
btrfs_finish_extent_commit() is done in a synchronized fashion
with removing the block group's range from freed_extents[]
in btrfs_delete_unused_bgs()

This race got introduced with the change:

Btrfs: remove empty block groups automatically
commit 47ab2a6c689913db23ccae38349714edf8365e0a

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/ctree.h   |  1 +
 fs/btrfs/disk-io.c |  1 +
 fs/btrfs/extent-tree.c | 21 -
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git 

Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function to get vfsmount from a given sb.

2015-01-29 Thread Qu Wenruo


 Original Message 
Subject: Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function 
to get vfsmount from a given sb.

From: Al Viro v...@zeniv.linux.org.uk
To: dste...@suse.cz, Qu Wenruo quwen...@cn.fujitsu.com, 
linux-btrfs@vger.kernel.org, linux-fsdevel linux-fsde...@vger.kernel.org

Date: 2015年01月29日 23:23

On Thu, Jan 29, 2015 at 01:37:58PM +0100, David Sterba wrote:

Adding Al Viro into CC

On Thu, Jan 29, 2015 at 10:24:39AM +0800, Qu Wenruo wrote:

+struct vfsmount *get_vfsmount_sb(struct super_block *sb)
+{
+   struct vfsmount *ret_vfs = NULL;
+   struct mount *mnt;
+   int ret = 0;
+
+   lock_mount_hash();
+   if (list_empty(sb-s_mounts))
+   goto out;
+   mnt = list_entry(sb-s_mounts.next, struct mount, mnt_instance);

from include/linux/fs.h:

struct super_block {
...
struct list_heads_mounts;   /* list of mounts; _not_ for fs 
use */
...
};

I hear a storm in the distance coming our direction ... so I'll
preemptively NAK this change.

Could you explain what the devil is that for?
In sysfs interface handler, we don't have struct file or vfsmount to do 
the mnt_want_write* protection,

so this function is introduced to get a vfsmount and do the protection.

The primitive looks rather
bogus - if nothing else, it includes make random instance of the filesystem
in someone's namespace appear busy to umount, which doesn't look like a
part of useful interface...

This is the problem I didn't find a good way to avoid.

If using the function, it will always use the first(or last?) vfsmount 
of the filesystem.
For 1 to 1 match case, it's OK, but if one device is mounted on multiple 
mount points,
it will delay the umount of that mount point. But we only need to keep 
at least one rw mount point exists.

   The only piece of context I'd been able to find
was something vague about sysfs-inflicted operations and wanting to use
mnt_want_write() but having nothing to pass it; BTW, what if the (random)
instance you run into happens to mounted r/o?
For the mounted ro case, that's not a problem, since if one instance is 
mounted ro,

other instances are also mounted ro, so that's not a problem.



Assuming that your superblock is guaranteed to stay alive and usable for
whatever work you are trying to do, what's wrong with sb_want_write()?
Did you mean change the function name and it's parameter to 
sb_want_write(sb) and sb_drop_write(sb).

That looks much better.
But I'm a little worried about just using sb_start_write() and 
s_readonly_remount/s_flags to do the protection,

will it be enough?

Thanks,
Qu
  


If it's _not_ guaranteed to stay so, and this is what you are trying to
solve, you are doing that at the wrong level - just take sysfs entry
removals earlier in shutdown process and be done with that.  Beginning of
close_ctree() would probably be early enough to be safe, but if that's
not enough, you can take it into the beginning of btrfs_kill_super().


--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND v4 2/8] btrfs: Make btrfs_parse_options() parse mount option in a atomic way

2015-01-29 Thread Miao Xie
On Fri, 30 Jan 2015 09:33:17 +0800, Qu Wenruo wrote:
 
  Original Message 
 Subject: Re: [PATCH RESEND v4 2/8] btrfs: Make btrfs_parse_options() parse 
 mount
 option in a atomic way
 From: Miao Xie miao...@huawei.com
 To: Qu Wenruo quwen...@cn.fujitsu.com, linux-btrfs@vger.kernel.org
 Date: 2015年01月30日 09:29
 On Fri, 30 Jan 2015 09:20:46 +0800, Qu Wenruo wrote:
 Here need ACCESS_ONCE to wrap info-mount_opt, or the complier might use
 info-mount_opt instead of new_opt.
 Thanks for pointing out this one.
 But I worried that this is not key reason of the wrong space cache. Could
 you explain the race condition which caused the wrong space cache?

 Thanks
 Miao
 CPU0:
 remount()
 |- sync_fs() - after sync_fs() we can start new trans
 |- btrfs_parse_options() CPU1:
  |- start_transaction()
  |- Do some bg allocation, not recorded in space_cache.
 I think it is a bug if a free space is not recorded in space cache. Could you
 explain why it is not recorded?

 Thanks
 Miao
 IIRC, in that window, the fs_info-mount_opt's SPACE_CACHE bit is cleared.
 So space cache is not recorded.

SPACE_CACHE is used to control cache write out, not in-memory cache. All the
free space should be recorded in in-memory cache.And when we write out
the in-memory space cache, we need protect the space cache from changing.

Thanks
Miao

 
 Thanks,
 Qu

   |- set SPACE_CACHE bit due to cache_gen

  |- commit_transaction()
  |- write space cache and update cache_gen.
  but since some of it is not recorded in space
 cache,
  the space cache missing some records.
   |- clear SPACE_CACHE bit dut to nospace_cache

 So the space cache is wrong.

 Thanks,
 Qu
 +}
kfree(orig);
return ret;
}

 .

 
 

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function to get vfsmount from a given sb.

2015-01-29 Thread Al Viro
On Fri, Jan 30, 2015 at 09:44:03AM +0800, Qu Wenruo wrote:

 This shouldn't happen. If someone is ro, the whole fs should be ro, right?

Wrong.  Individual vfsmounts over an r/w superblock might very well be r/o.
As for that trylock...  What for?  It invites transient failures for no
good reason.  Removal of sysfs entry will block while write(2) to that sucker
is in progress, so btrfs shutdown will block at that point in ctree_close().
It won't go away under you.

Now, you might want to move those sysfs entry removals to the very beginning
of btrfs_kill_super(), but that's a different story - you need only to make
sure that they are removed not later than the destruction of the data
structures they need (IOW, the current location might very well be OK - I
hadn't checked the details).

As for it won't go r/o under us - sb_want_write() will do that just fine.
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND v4 2/8] btrfs: Make btrfs_parse_options() parse mount option in a atomic way

2015-01-29 Thread Miao Xie
On Thu, 29 Jan 2015 10:24:35 +0800, Qu Wenruo wrote:
 Current btrfs_parse_options() is not atomic, which can set and clear a
 bit, especially for nospace_cache case.
 
 For example, if a fs is mounted with nospace_cache,
 btrfs_parse_options() will set SPACE_CACHE bit first(since
 cache_generation is non-zeo) and clear the SPACE_CACHE bit due to
 nospace_cache mount option.
 So under heavy operations and remount a nospace_cache btrfs, there is a
 windows for commit to create space cache.
 
 This bug can be reproduced by fstest/btrfs/071 073 074 with
 nospace_cache mount option. It has about 50% chance to create space
 cache, and about 10% chance to create wrong space cache, which can't
 pass btrfsck.
 
 This patch will do the mount option parse in a copy-and-update method.
 First copy the mount_opt from fs_info to new_opt, and only update
 options in new_opt. At last, copy the new_opt back to
 fs_info-mount_opt.
 
 This patch is already good enough to fix the above nospace_cache +
 remount bug, but need later patch to make sure mount options does not
 change during transaction.
 
 Signed-off-by: Qu Wenruo quwen...@cn.fujitsu.com
 ---
  fs/btrfs/ctree.h |  16 
  fs/btrfs/super.c | 115 
 +--
  2 files changed, 69 insertions(+), 62 deletions(-)
 
 diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
 index 5f99743..26bb47b 100644
 --- a/fs/btrfs/ctree.h
 +++ b/fs/btrfs/ctree.h
 @@ -2119,18 +2119,18 @@ struct btrfs_ioctl_defrag_range_args {
  #define btrfs_test_opt(root, opt)((root)-fs_info-mount_opt  \
BTRFS_MOUNT_##opt)
  
 -#define btrfs_set_and_info(root, opt, fmt, args...)  \
 +#define btrfs_set_and_info(fs_info, val, opt, fmt, args...)  \
  {\
 - if (!btrfs_test_opt(root, opt)) \
 - btrfs_info(root-fs_info, fmt, ##args); \
 - btrfs_set_opt(root-fs_info-mount_opt, opt);   \
 + if (!btrfs_raw_test_opt(val, opt))  \
 + btrfs_info(fs_info, fmt, ##args);   \
 + btrfs_set_opt(val, opt);\
  }
  
 -#define btrfs_clear_and_info(root, opt, fmt, args...)
 \
 +#define btrfs_clear_and_info(fs_info, val, opt, fmt, args...)
 \
  {\
 - if (btrfs_test_opt(root, opt))  \
 - btrfs_info(root-fs_info, fmt, ##args); \
 - btrfs_clear_opt(root-fs_info-mount_opt, opt); \
 + if (btrfs_raw_test_opt(val, opt))   \
 + btrfs_info(fs_info, fmt, ##args);   \
 + btrfs_clear_opt(val, opt);  \
  }
  
  /*
 diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
 index b0c45b2..490fe1f 100644
 --- a/fs/btrfs/super.c
 +++ b/fs/btrfs/super.c
 @@ -395,10 +395,13 @@ int btrfs_parse_options(struct btrfs_root *root, char 
 *options)
   int ret = 0;
   char *compress_type;
   bool compress_force = false;
 + unsigned long new_opt;
 +
 + new_opt = info-mount_opt;

Here and

  
   cache_gen = btrfs_super_cache_generation(root-fs_info-super_copy);
   if (cache_gen)
 - btrfs_set_opt(info-mount_opt, SPACE_CACHE);
[SNIP]
  out:
 - if (!ret  btrfs_test_opt(root, SPACE_CACHE))
 - btrfs_info(root-fs_info, disk space caching is enabled);
 + if (!ret) {
 + if (btrfs_raw_test_opt(new_opt, SPACE_CACHE))
 + btrfs_info(info, disk space caching is enabled);
 + info-mount_opt = new_opt;

Here need ACCESS_ONCE to wrap info-mount_opt, or the complier might use
info-mount_opt instead of new_opt.

But I worried that this is not key reason of the wrong space cache. Could
you explain the race condition which caused the wrong space cache?

Thanks
Miao

 + }
   kfree(orig);
   return ret;
  }
 

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND v4 2/8] btrfs: Make btrfs_parse_options() parse mount option in a atomic way

2015-01-29 Thread Qu Wenruo


 Original Message 
Subject: Re: [PATCH RESEND v4 2/8] btrfs: Make btrfs_parse_options() 
parse mount option in a atomic way

From: Miao Xie miao...@huawei.com
To: Qu Wenruo quwen...@cn.fujitsu.com, linux-btrfs@vger.kernel.org
Date: 2015年01月30日 08:31

On Thu, 29 Jan 2015 10:24:35 +0800, Qu Wenruo wrote:

Current btrfs_parse_options() is not atomic, which can set and clear a
bit, especially for nospace_cache case.

For example, if a fs is mounted with nospace_cache,
btrfs_parse_options() will set SPACE_CACHE bit first(since
cache_generation is non-zeo) and clear the SPACE_CACHE bit due to
nospace_cache mount option.
So under heavy operations and remount a nospace_cache btrfs, there is a
windows for commit to create space cache.

This bug can be reproduced by fstest/btrfs/071 073 074 with
nospace_cache mount option. It has about 50% chance to create space
cache, and about 10% chance to create wrong space cache, which can't
pass btrfsck.

This patch will do the mount option parse in a copy-and-update method.
First copy the mount_opt from fs_info to new_opt, and only update
options in new_opt. At last, copy the new_opt back to
fs_info-mount_opt.

This patch is already good enough to fix the above nospace_cache +
remount bug, but need later patch to make sure mount options does not
change during transaction.

Signed-off-by: Qu Wenruo quwen...@cn.fujitsu.com
---
  fs/btrfs/ctree.h |  16 
  fs/btrfs/super.c | 115 +--
  2 files changed, 69 insertions(+), 62 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5f99743..26bb47b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2119,18 +2119,18 @@ struct btrfs_ioctl_defrag_range_args {
  #define btrfs_test_opt(root, opt) ((root)-fs_info-mount_opt  \
 BTRFS_MOUNT_##opt)
  
-#define btrfs_set_and_info(root, opt, fmt, args...)			\

+#define btrfs_set_and_info(fs_info, val, opt, fmt, args...)\
  { \
-   if (!btrfs_test_opt(root, opt)) \
-   btrfs_info(root-fs_info, fmt, ##args);  \
-   btrfs_set_opt(root-fs_info-mount_opt, opt); \
+   if (!btrfs_raw_test_opt(val, opt))  \
+   btrfs_info(fs_info, fmt, ##args);   \
+   btrfs_set_opt(val, opt);\
  }
  
-#define btrfs_clear_and_info(root, opt, fmt, args...)			\

+#define btrfs_clear_and_info(fs_info, val, opt, fmt, args...)  \
  { \
-   if (btrfs_test_opt(root, opt))  \
-   btrfs_info(root-fs_info, fmt, ##args);  \
-   btrfs_clear_opt(root-fs_info-mount_opt, opt);   \
+   if (btrfs_raw_test_opt(val, opt))   \
+   btrfs_info(fs_info, fmt, ##args);   \
+   btrfs_clear_opt(val, opt);  \
  }
  
  /*

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b0c45b2..490fe1f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -395,10 +395,13 @@ int btrfs_parse_options(struct btrfs_root *root, char 
*options)
int ret = 0;
char *compress_type;
bool compress_force = false;
+   unsigned long new_opt;
+
+   new_opt = info-mount_opt;

Here and

  
  	cache_gen = btrfs_super_cache_generation(root-fs_info-super_copy);

if (cache_gen)
-   btrfs_set_opt(info-mount_opt, SPACE_CACHE);

[SNIP]

  out:
-   if (!ret  btrfs_test_opt(root, SPACE_CACHE))
-   btrfs_info(root-fs_info, disk space caching is enabled);
+   if (!ret) {
+   if (btrfs_raw_test_opt(new_opt, SPACE_CACHE))
+   btrfs_info(info, disk space caching is enabled);
+   info-mount_opt = new_opt;

Here need ACCESS_ONCE to wrap info-mount_opt, or the complier might use
info-mount_opt instead of new_opt.

Thanks for pointing out this one.


But I worried that this is not key reason of the wrong space cache. Could
you explain the race condition which caused the wrong space cache?

Thanks
Miao

CPU0:
remount()
|- sync_fs() - after sync_fs() we can start new trans
|- btrfs_parse_options() CPU1:
|- start_transaction()
|- Do some bg allocation, not recorded in space_cache.
 |- set SPACE_CACHE bit due to cache_gen

|- commit_transaction()
|- write space cache and update cache_gen.
but since some of it is not recorded in 
space cache,

the space cache missing some records.
 |- clear SPACE_CACHE bit dut to nospace_cache

So the space cache is wrong.


Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function to get vfsmount from a given sb.

2015-01-29 Thread Qu Wenruo


 Original Message 
Subject: Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function 
to get vfsmount from a given sb.

From: Al Viro v...@zeniv.linux.org.uk
To: Qu Wenruo quwen...@cn.fujitsu.com
Date: 2015年01月30日 10:09

On Fri, Jan 30, 2015 at 09:11:26AM +0800, Qu Wenruo wrote:

For the mounted ro case, that's not a problem, since if one instance
is mounted ro,
other instances are also mounted ro, so that's not a problem.

Not really.

root@satch:~# cd /tmp/
root@satch:~# mkdir /tm/a
root@satch:~# mount --bind /tmp/ /tmp/a
root@satch:~# mount --bind -o remount,ro /tmp/a
root@satch:~# mkdir /tmp/b
root@satch:~# mkdir /tmp/a/c
mkdir: cannot create directory '/tmp/a/c': Read-only file system
root@satch:~# stat /tmp/b /tmp/a/b
   File: '/tmp/b'
   Size: 4096Blocks: 8  IO Block: 4096   directory
Device: 806h/2054d  Inode: 257537  Links: 2
Access: (0755/drwxr-xr-x)  Uid: (0/root)   Gid: (0/root)
Access: 2015-01-29 21:03:35.0 -0500
Modify: 2015-01-29 21:03:35.0 -0500
Change: 2015-01-29 21:03:35.0 -0500
  Birth: -
   File: '/tmp/a/b'
   Size: 4096Blocks: 8  IO Block: 4096   directory
Device: 806h/2054d  Inode: 257537  Links: 2
Access: (0755/drwxr-xr-x)  Uid: (0/root)   Gid: (0/root)
Access: 2015-01-29 21:03:35.0 -0500
Modify: 2015-01-29 21:03:35.0 -0500
Change: 2015-01-29 21:03:35.0 -0500
  Birth: -
root@satch:~#

IOW, /tmp and /tmp/a bear the same filesystem (one from /dev/sda6), the former
is mounted r/w, the latter - r/o.

Oh, bind mount.

I was so stupid to forget bind.



Assuming that your superblock is guaranteed to stay alive and usable for
whatever work you are trying to do, what's wrong with sb_want_write()?

Did you mean change the function name and it's parameter to
sb_want_write(sb) and sb_drop_write(sb).
That looks much better.
But I'm a little worried about just using sb_start_write() and
s_readonly_remount/s_flags to do the protection,
will it be enough?

Protection against what?  If superblock is r/w, it will stay r/w until you
do sb_drop_write(); if it's r/o, sb_want_write() will fail.  There might be
any number of vfsmounts over superblock; attempt to get write access via
vfsmount will succeed only of both the vfsmount and superblock are r/w -
mnt_want_write() does sb_want_write() and fails if that has failed.

Nice.
I'll add sb_want_write() and sb_drop_write().

Tons of thanks for all your advice!
Qu
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND v4 2/8] btrfs: Make btrfs_parse_options() parse mount option in a atomic way

2015-01-29 Thread Qu Wenruo


 Original Message 
Subject: Re: [PATCH RESEND v4 2/8] btrfs: Make btrfs_parse_options() 
parse mount option in a atomic way

From: Miao Xie miao...@huawei.com
To: Qu Wenruo quwen...@cn.fujitsu.com, linux-btrfs@vger.kernel.org
Date: 2015年01月30日 09:29

On Fri, 30 Jan 2015 09:20:46 +0800, Qu Wenruo wrote:

Here need ACCESS_ONCE to wrap info-mount_opt, or the complier might use
info-mount_opt instead of new_opt.

Thanks for pointing out this one.

But I worried that this is not key reason of the wrong space cache. Could
you explain the race condition which caused the wrong space cache?

Thanks
Miao

CPU0:
remount()
|- sync_fs() - after sync_fs() we can start new trans
|- btrfs_parse_options() CPU1:
 |- start_transaction()
 |- Do some bg allocation, not recorded in space_cache.

I think it is a bug if a free space is not recorded in space cache. Could you
explain why it is not recorded?

Thanks
Miao

IIRC, in that window, the fs_info-mount_opt's SPACE_CACHE bit is cleared.
So space cache is not recorded.

Thanks,
Qu



  |- set SPACE_CACHE bit due to cache_gen

 |- commit_transaction()
 |- write space cache and update cache_gen.
 but since some of it is not recorded in space 
cache,
 the space cache missing some records.
  |- clear SPACE_CACHE bit dut to nospace_cache

So the space cache is wrong.

Thanks,
Qu

+}
   kfree(orig);
   return ret;
   }


.



--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Indefinite hang in reserve_metadata_bytes on kernel 3.18.3

2015-01-29 Thread Steven Schlansker
Hi Holger,

On Jan 29, 2015, at 2:08 AM, Holger Hoffstätte 
holger.hoffstae...@googlemail.com wrote:

 [This mail was also posted to gmane.comp.file-systems.btrfs.]
 
 On Thu, 29 Jan 2015 01:44:02 +, Steven Schlansker wrote:
 
 [..snip..]
 
 The symptoms are an endlessly increasing stream of hung tasks and high
 
 Please try 3.18.5 (-rc1 is good) which contains the following fix:
 
 workqueue: fix subtle pool management issue which can stall whole
 worker_pool
 
 see: http://article.gmane.org/gmane.linux.kernel.stable/122074
 
 No promises but it seems sufficiently causally related..

Thank you for the suggestion.  I did not find a 3.18.5 presented in any form 
other than as a large number of *.patch files, so I went for 3.19-rc6 instead 
(which I verified has this commit)

Now I am getting:

[ 1224.728313] [ cut here ]
[ 1224.728323] kernel BUG at fs/btrfs/extent-tree.c:7362!
[ 1224.728327] invalid opcode:  [#1] SMP 
[ 1224.728331] Modules linked in: dm_multipath(E) scsi_dh(E) 
x86_pkg_temp_thermal(E) coretemp(E) crct10dif_pclmul(E) crc32_pclmul(E) 
ghash_clmulni_intel(E) aesni_intel(E) aes_x86_64(E) lrw(E) gf128mul(E) 
glue_helper(E) ablk_helper(E) cryptd(E) btrfs(E)
[ 1224.728347] CPU: 3 PID: 18072 Comm: gunicorn Tainted: GE  
3.19.0-rc6 #2
[ 1224.728351] task: 8803eafeb1c0 ti: 8803e838 task.ti: 
8803e838
[ 1224.728354] RIP: e030:[a00210d3]  [a00210d3] 
btrfs_alloc_tree_block+0x3c3/0x3d0 [btrfs]
[ 1224.728372] RSP: e02b:8803e83838b8  EFLAGS: 00010202
[ 1224.728375] RAX: fff4 RBX: fff4 RCX: 32ca
[ 1224.728377] RDX: 32c9 RSI: 8802660c6b90 RDI: 8807543ace00
[ 1224.728380] RBP: 8803e8383958 R08: 0001e560 R09: 88075a2de560
[ 1224.728383] R10: a004fe02 R11: ea0009983180 R12: 88073dfaf8f0
[ 1224.728386] R13: 8807557fe170 R14:  R15: 8800fe05d000
[ 1224.728393] FS:  7fc8e0fc7740() GS:88075a2c() 
knlGS:
[ 1224.728396] CS:  e033 DS:  ES:  CR0: 8005003b
[ 1224.728399] CR2: 7fc8e0fcd000 CR3: 0003e8695000 CR4: 2660
[ 1224.728402] Stack:
[ 1224.728404]    8803e83838d8 
a002b5be
[ 1224.728409]  8803e83839a7 0061 8807557fe000 

[ 1224.728413]  01d3 4000 8802660c6a68 
00ffa005782c
[ 1224.728417] Call Trace:
[ 1224.728430]  [a002b5be] ? btree_set_page_dirty+0xe/0x10 [btrfs]
[ 1224.728442]  [a000b46c] __btrfs_cow_block+0x11c/0x530 [btrfs]
[ 1224.728454]  [a000ba3c] btrfs_cow_block+0x12c/0x1d0 [btrfs]
[ 1224.728465]  [a000f673] btrfs_search_slot+0x1f3/0xa40 [btrfs]
[ 1224.728472]  [81003610] ? xen_write_msr_safe+0x40/0x70
[ 1224.728477]  [8101260f] ? __switch_to+0x15f/0x580
[ 1224.728482]  [811fa186] ? inode_init_always+0x106/0x1e0
[ 1224.728490]  [a0011700] btrfs_insert_empty_items+0x70/0xc0 [btrfs]
[ 1224.728494]  [811fc5a2] ? insert_inode_locked4+0xe2/0x190
[ 1224.728506]  [a00413ff] btrfs_new_inode+0x1bf/0x540 [btrfs]
[ 1224.728517]  [a0042bcf] btrfs_create+0xdf/0x210 [btrfs]
[ 1224.728521]  [811ebf75] vfs_create+0xd5/0x140
[ 1224.728524]  [811efaa3] do_last+0x1013/0x1210
[ 1224.728528]  [811ecff1] ? path_init+0xc1/0x470
[ 1224.728532]  [811efd24] path_openat+0x84/0x630
[ 1224.728536]  [811e29f5] ? __sb_end_write+0x35/0x70
[ 1224.728540]  [811f146a] do_filp_open+0x3a/0x90
[ 1224.728544]  [811fe007] ? __alloc_fd+0xa7/0x130
[ 1224.728548]  [811df548] do_sys_open+0x128/0x220
[ 1224.728552]  [811df65e] SyS_open+0x1e/0x20
[ 1224.728557]  [819521ed] system_call_fastpath+0x16/0x1b
[ 1224.728560] Code: 3d 00 f0 ff ff 0f 87 b8 fd ff ff 44 8b 6d ac 4d 01 af d0 
03 00 00 48 83 c4 78 5b 41 5c 41 5d 41 5e 41 5f 5d c3 0f 0b 0f 0b 0f 0b 0f 0b 
66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48 89 
[ 1224.728597] RIP  [a00210d3] btrfs_alloc_tree_block+0x3c3/0x3d0 
[btrfs]
[ 1224.728610]  RSP 8803e83838b8


--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function to get vfsmount from a given sb.

2015-01-29 Thread Qu Wenruo


 Original Message 
Subject: Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function 
to get vfsmount from a given sb.

From: Qu Wenruo quwen...@cn.fujitsu.com
To: Miao Xie miao...@huawei.com, linux-btrfs@vger.kernel.org
Date: 2015年01月30日 09:44


 Original Message 
Subject: Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function 
to get vfsmount from a given sb.

From: Miao Xie miao...@huawei.com
To: Qu Wenruo quwen...@cn.fujitsu.com, linux-btrfs@vger.kernel.org
Date: 2015年01月30日 08:52

On Thu, 29 Jan 2015 10:24:39 +0800, Qu Wenruo wrote:
There are sysfs interfaces in some fs, only btrfs yet, which will 
modify

on-disk data.
Unlike normal file operation routine we can use 
mnt_want_write_file() to
protect the operation, change through sysfs won't to be binded to 
any file

in the filesystem.
So we can only extract the first vfsmount of a superblock and pass 
it to

mnt_want_write() to do the protection.

This method is wrong, becasue one fs  may be mounted on the multi places
at the same time, someone is R/O, someone is R/W, you may get a R/O and
fail to get the write permission.
This shouldn't happen. If someone is ro, the whole fs should be ro, 
right?
You can mount a device which is already mounted as rw to other point 
as ro,
and remount a mount point to ro will also cause all other mount point 
to ro.


So I didn't see the problem here.


I think you do label/feature change by sysfs interface by the 
following way


btrfs_sysfs_change_()
{
/* Use trylock to avoid the race with umount */
if(!mutex_trylock(sb-s_umount))
return -EBUSY;

check R/O and FREEZE

mutex_unlock(sb-s_umount);
}

This looks better since it not introduce changes to VFS.

Thanks,
Qu

Oh, wait a second, this one leads to the old problem and old solution.

If we hold s_umount mutex, we must do freeze check and can't start 
transaction since it will deadlock.


And for freeze check, we must use sb_try_start_intwrite() to hold the 
freeze lock and then add a new

btrfs_start_transaction_freeze() which will not call sb_start_write()...

Oh this seems so similar, v2 or v3 version RFC patch?
So still goes to the old method?

Thanks,
Qu


Thanks
Miao


Cc: linux-fsdevel linux-fsde...@vger.kernel.org
Signed-off-by: Qu Wenruo quwen...@cn.fujitsu.com
---
  fs/namespace.c| 25 +
  include/linux/mount.h |  1 +
  2 files changed, 26 insertions(+)

diff --git a/fs/namespace.c b/fs/namespace.c
index cd1e968..5a16a62 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1105,6 +1105,31 @@ struct vfsmount *mntget(struct vfsmount *mnt)
  }
  EXPORT_SYMBOL(mntget);
  +/*
+ * get a vfsmount from a given sb
+ *
+ * This is especially used for case where change fs' sysfs interface
+ * will lead to a write, e.g. Change label through sysfs in btrfs.
+ * So vfs can get a vfsmount and then use mnt_want_write() to protect.
+ */
+struct vfsmount *get_vfsmount_sb(struct super_block *sb)
+{
+struct vfsmount *ret_vfs = NULL;
+struct mount *mnt;
+int ret = 0;
+
+lock_mount_hash();
+if (list_empty(sb-s_mounts))
+goto out;
+mnt = list_entry(sb-s_mounts.next, struct mount, mnt_instance);
+ret_vfs = mnt-mnt;
+ret_vfs = mntget(ret_vfs);
+out:
+unlock_mount_hash();
+return ret_vfs;
+}
+EXPORT_SYMBOL(get_vfsmount_sb);
+
  struct vfsmount *mnt_clone_internal(struct path *path)
  {
  struct mount *p;
diff --git a/include/linux/mount.h b/include/linux/mount.h
index c2c561d..cf1b0f5 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -79,6 +79,7 @@ extern void mnt_drop_write_file(struct file *file);
  extern void mntput(struct vfsmount *mnt);
  extern struct vfsmount *mntget(struct vfsmount *mnt);
  extern struct vfsmount *mnt_clone_internal(struct path *path);
+extern struct vfsmount *get_vfsmount_sb(struct super_block *sb);
  extern int __mnt_is_readonly(struct vfsmount *mnt);
struct path;





--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function to get vfsmount from a given sb.

2015-01-29 Thread Al Viro
On Fri, Jan 30, 2015 at 09:11:26AM +0800, Qu Wenruo wrote:
 For the mounted ro case, that's not a problem, since if one instance
 is mounted ro,
 other instances are also mounted ro, so that's not a problem.

Not really.

root@satch:~# cd /tmp/
root@satch:~# mkdir /tm/a
root@satch:~# mount --bind /tmp/ /tmp/a
root@satch:~# mount --bind -o remount,ro /tmp/a
root@satch:~# mkdir /tmp/b  
root@satch:~# mkdir /tmp/a/c
mkdir: cannot create directory '/tmp/a/c': Read-only file system
root@satch:~# stat /tmp/b /tmp/a/b
  File: '/tmp/b'
  Size: 4096Blocks: 8  IO Block: 4096   directory
Device: 806h/2054d  Inode: 257537  Links: 2
Access: (0755/drwxr-xr-x)  Uid: (0/root)   Gid: (0/root)
Access: 2015-01-29 21:03:35.0 -0500
Modify: 2015-01-29 21:03:35.0 -0500
Change: 2015-01-29 21:03:35.0 -0500
 Birth: -
  File: '/tmp/a/b'
  Size: 4096Blocks: 8  IO Block: 4096   directory
Device: 806h/2054d  Inode: 257537  Links: 2
Access: (0755/drwxr-xr-x)  Uid: (0/root)   Gid: (0/root)
Access: 2015-01-29 21:03:35.0 -0500
Modify: 2015-01-29 21:03:35.0 -0500
Change: 2015-01-29 21:03:35.0 -0500
 Birth: -
root@satch:~#

IOW, /tmp and /tmp/a bear the same filesystem (one from /dev/sda6), the former
is mounted r/w, the latter - r/o.

 Assuming that your superblock is guaranteed to stay alive and usable for
 whatever work you are trying to do, what's wrong with sb_want_write()?
 Did you mean change the function name and it's parameter to
 sb_want_write(sb) and sb_drop_write(sb).
 That looks much better.
 But I'm a little worried about just using sb_start_write() and
 s_readonly_remount/s_flags to do the protection,
 will it be enough?

Protection against what?  If superblock is r/w, it will stay r/w until you
do sb_drop_write(); if it's r/o, sb_want_write() will fail.  There might be
any number of vfsmounts over superblock; attempt to get write access via
vfsmount will succeed only of both the vfsmount and superblock are r/w -
mnt_want_write() does sb_want_write() and fails if that has failed.
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function to get vfsmount from a given sb.

2015-01-29 Thread Qu Wenruo


 Original Message 
Subject: Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function 
to get vfsmount from a given sb.

From: Miao Xie miao...@huawei.com
To: Al Viro v...@zeniv.linux.org.uk, Qu Wenruo quwen...@cn.fujitsu.com
Date: 2015年01月30日 12:14

On Fri, 30 Jan 2015 02:14:45 +, Al Viro wrote:

On Fri, Jan 30, 2015 at 09:44:03AM +0800, Qu Wenruo wrote:


This shouldn't happen. If someone is ro, the whole fs should be ro, right?

Wrong.  Individual vfsmounts over an r/w superblock might very well be r/o.
As for that trylock...  What for?  It invites transient failures for no
good reason.  Removal of sysfs entry will block while write(2) to that sucker
is in progress, so btrfs shutdown will block at that point in ctree_close().
It won't go away under you.

could you explain the race condition? I think the deadlock won't happen, during
the btrfs shutdown, we hold s_umount, the write operation will fail to lock it,
and quit quickly, and then umount will continue.

I think sb_want_write() is similar to trylock(s_umount), the difference is that
sb_want_write() is more complex.

How?
sb_want_write() should be much like mnt_want_write(), except no need to 
increase vfsmout ref count things

and no need to check per mount ro/rw things.

Thanks,
Qu



Now, you might want to move those sysfs entry removals to the very beginning
of btrfs_kill_super(), but that's a different story - you need only to make
sure that they are removed not later than the destruction of the data
structures they need (IOW, the current location might very well be OK - I
hadn't checked the details).

Yes, we need move those sysfs entry removals, but needn't move to the very
beginning of btrfs_kill_super(), just at the beginning of close_ctree();

The current location is not right, it will introduce the use-after-free
problem. because we remove the sysfs entry after we release
transaction_kthread, use-after-free problem might happen in this case
Task1   Task2
change Label by sysfs
close_ctree
  kthread_stop(transaction_kthread);
  change label
  wake_up(transaction_kthread)


Thanks
Miao


As for it won't go r/o under us - sb_want_write() will do that just fine.
.



--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND v4 2/8] btrfs: Make btrfs_parse_options() parse mount option in a atomic way

2015-01-29 Thread Qu Wenruo


 Original Message 
Subject: Re: [PATCH RESEND v4 2/8] btrfs: Make btrfs_parse_options() 
parse mount option in a atomic way

From: Miao Xie miao...@huawei.com
To: Qu Wenruo quwen...@cn.fujitsu.com, linux-btrfs@vger.kernel.org
Date: 2015年01月30日 10:06

On Fri, 30 Jan 2015 09:33:17 +0800, Qu Wenruo wrote:

 Original Message 
Subject: Re: [PATCH RESEND v4 2/8] btrfs: Make btrfs_parse_options() parse mount
option in a atomic way
From: Miao Xie miao...@huawei.com
To: Qu Wenruo quwen...@cn.fujitsu.com, linux-btrfs@vger.kernel.org
Date: 2015年01月30日 09:29

On Fri, 30 Jan 2015 09:20:46 +0800, Qu Wenruo wrote:

Here need ACCESS_ONCE to wrap info-mount_opt, or the complier might use
info-mount_opt instead of new_opt.

Thanks for pointing out this one.

But I worried that this is not key reason of the wrong space cache. Could
you explain the race condition which caused the wrong space cache?

Thanks
Miao

CPU0:
remount()
|- sync_fs() - after sync_fs() we can start new trans
|- btrfs_parse_options() CPU1:
  |- start_transaction()
  |- Do some bg allocation, not recorded in space_cache.

I think it is a bug if a free space is not recorded in space cache. Could you
explain why it is not recorded?

Thanks
Miao

IIRC, in that window, the fs_info-mount_opt's SPACE_CACHE bit is cleared.
So space cache is not recorded.

SPACE_CACHE is used to control cache write out, not in-memory cache. All the
free space should be recorded in in-memory cache.And when we write out
the in-memory space cache, we need protect the space cache from changing.

Thanks
Miao
You're right, the wrong space cache problem is not caused by the 
non-atomic mount option problem.
But the atomic mount option change with per-transaction mount option 
will at least make it disappear

when using nospace_cache mount option.

Thanks,
Qu



Thanks,
Qu

   |- set SPACE_CACHE bit due to cache_gen

  |- commit_transaction()
  |- write space cache and update cache_gen.
  but since some of it is not recorded in space
cache,
  the space cache missing some records.
   |- clear SPACE_CACHE bit dut to nospace_cache

So the space cache is wrong.

Thanks,
Qu

+}
kfree(orig);
return ret;
}


.





--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND v4 2/8] btrfs: Make btrfs_parse_options() parse mount option in a atomic way

2015-01-29 Thread Miao Xie
On Fri, 30 Jan 2015 10:51:52 +0800, Qu Wenruo wrote:
 
  Original Message 
 Subject: Re: [PATCH RESEND v4 2/8] btrfs: Make btrfs_parse_options() parse 
 mount
 option in a atomic way
 From: Miao Xie miao...@huawei.com
 To: Qu Wenruo quwen...@cn.fujitsu.com, linux-btrfs@vger.kernel.org
 Date: 2015年01月30日 10:06
 On Fri, 30 Jan 2015 09:33:17 +0800, Qu Wenruo wrote:
  Original Message 
 Subject: Re: [PATCH RESEND v4 2/8] btrfs: Make btrfs_parse_options() parse 
 mount
 option in a atomic way
 From: Miao Xie miao...@huawei.com
 To: Qu Wenruo quwen...@cn.fujitsu.com, linux-btrfs@vger.kernel.org
 Date: 2015年01月30日 09:29
 On Fri, 30 Jan 2015 09:20:46 +0800, Qu Wenruo wrote:
 Here need ACCESS_ONCE to wrap info-mount_opt, or the complier might use
 info-mount_opt instead of new_opt.
 Thanks for pointing out this one.
 But I worried that this is not key reason of the wrong space cache. Could
 you explain the race condition which caused the wrong space cache?

 Thanks
 Miao
 CPU0:
 remount()
 |- sync_fs() - after sync_fs() we can start new trans
 |- btrfs_parse_options() CPU1:
   |- start_transaction()
   |- Do some bg allocation, not recorded in 
 space_cache.
 I think it is a bug if a free space is not recorded in space cache. Could 
 you
 explain why it is not recorded?

 Thanks
 Miao
 IIRC, in that window, the fs_info-mount_opt's SPACE_CACHE bit is cleared.
 So space cache is not recorded.
 SPACE_CACHE is used to control cache write out, not in-memory cache. All the
 free space should be recorded in in-memory cache.And when we write out
 the in-memory space cache, we need protect the space cache from changing.

 Thanks
 Miao
 You're right, the wrong space cache problem is not caused by the non-atomic
 mount option problem.
 But the atomic mount option change with per-transaction mount option will at
 least make it disappear
 when using nospace_cache mount option.

But we need fix a problem, not hide a problem.

Thanks
Miao

 
 Thanks,
 Qu

 Thanks,
 Qu
|- set SPACE_CACHE bit due to cache_gen

   |- commit_transaction()
   |- write space cache and update cache_gen.
   but since some of it is not recorded in 
 space
 cache,
   the space cache missing some records.
|- clear SPACE_CACHE bit dut to nospace_cache

 So the space cache is wrong.

 Thanks,
 Qu
 +}
 kfree(orig);
 return ret;
 }

 .


 
 .
 

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function to get vfsmount from a given sb.

2015-01-29 Thread Al Viro
On Fri, Jan 30, 2015 at 12:14:24PM +0800, Miao Xie wrote:
 On Fri, 30 Jan 2015 02:14:45 +, Al Viro wrote:
  On Fri, Jan 30, 2015 at 09:44:03AM +0800, Qu Wenruo wrote:
  
  This shouldn't happen. If someone is ro, the whole fs should be ro, right?
  
  Wrong.  Individual vfsmounts over an r/w superblock might very well be r/o.
  As for that trylock...  What for?  It invites transient failures for no
  good reason.  Removal of sysfs entry will block while write(2) to that 
  sucker
  is in progress, so btrfs shutdown will block at that point in ctree_close().
  It won't go away under you.
 
 could you explain the race condition? I think the deadlock won't happen, 
 during
 the btrfs shutdown, we hold s_umount, the write operation will fail to lock 
 it,
 and quit quickly, and then umount will continue.

First of all, -s_umount is not a mutex; it's rwsem.  So you mean
down_read_trylock().  As for the transient failures - grep for down_write
on it...  E.g. have somebody call mount() from the same device.  We call
sget(), which finds existing superblock and calls grab_super().  Sure,
that -s_umount will be released shortly, but in the meanwhile your trylock
will fail...

 I think sb_want_write() is similar to trylock(s_umount), the difference is 
 that
 sb_want_write() is more complex.
 
  
  Now, you might want to move those sysfs entry removals to the very beginning
  of btrfs_kill_super(), but that's a different story - you need only to make
  sure that they are removed not later than the destruction of the data
  structures they need (IOW, the current location might very well be OK - I
  hadn't checked the details).
 
 Yes, we need move those sysfs entry removals, but needn't move to the very
 beginning of btrfs_kill_super(), just at the beginning of close_ctree();

So move them...  It's a matter of moving one function call around a bit.
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function to get vfsmount from a given sb.

2015-01-29 Thread Miao Xie
On Fri, 30 Jan 2015 04:37:14 +, Al Viro wrote:
 On Fri, Jan 30, 2015 at 12:14:24PM +0800, Miao Xie wrote:
 On Fri, 30 Jan 2015 02:14:45 +, Al Viro wrote:
 On Fri, Jan 30, 2015 at 09:44:03AM +0800, Qu Wenruo wrote:

 This shouldn't happen. If someone is ro, the whole fs should be ro, right?

 Wrong.  Individual vfsmounts over an r/w superblock might very well be r/o.
 As for that trylock...  What for?  It invites transient failures for no
 good reason.  Removal of sysfs entry will block while write(2) to that 
 sucker
 is in progress, so btrfs shutdown will block at that point in ctree_close().
 It won't go away under you.

 could you explain the race condition? I think the deadlock won't happen, 
 during
 the btrfs shutdown, we hold s_umount, the write operation will fail to lock 
 it,
 and quit quickly, and then umount will continue.
 
   First of all, -s_umount is not a mutex; it's rwsem.  So you mean
 down_read_trylock().  As for the transient failures - grep for down_write
 on it...  E.g. have somebody call mount() from the same device.  We call
 sget(), which finds existing superblock and calls grab_super().  Sure,
 that -s_umount will be released shortly, but in the meanwhile your trylock
 will fail...

I know it, so I suggested to return -EBUSY in the previous mail.
I think it is acceptable method, mount/umount operations are not so many
after all.

Thanks
Miao

 
 I think sb_want_write() is similar to trylock(s_umount), the difference is 
 that
 sb_want_write() is more complex.


 Now, you might want to move those sysfs entry removals to the very beginning
 of btrfs_kill_super(), but that's a different story - you need only to make
 sure that they are removed not later than the destruction of the data
 structures they need (IOW, the current location might very well be OK - I
 hadn't checked the details).

 Yes, we need move those sysfs entry removals, but needn't move to the very
 beginning of btrfs_kill_super(), just at the beginning of close_ctree();
 
 So move them...  It's a matter of moving one function call around a bit.
 .
 

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND v4 2/8] btrfs: Make btrfs_parse_options() parse mount option in a atomic way

2015-01-29 Thread Qu Wenruo


 Original Message 
Subject: Re: [PATCH RESEND v4 2/8] btrfs: Make btrfs_parse_options() 
parse mount option in a atomic way

From: Miao Xie miao...@huawei.com
To: Qu Wenruo quwen...@cn.fujitsu.com, linux-btrfs@vger.kernel.org
Date: 2015年01月30日 11:21

On Fri, 30 Jan 2015 10:51:52 +0800, Qu Wenruo wrote:

 Original Message 
Subject: Re: [PATCH RESEND v4 2/8] btrfs: Make btrfs_parse_options() parse mount
option in a atomic way
From: Miao Xie miao...@huawei.com
To: Qu Wenruo quwen...@cn.fujitsu.com, linux-btrfs@vger.kernel.org
Date: 2015年01月30日 10:06

On Fri, 30 Jan 2015 09:33:17 +0800, Qu Wenruo wrote:

 Original Message 
Subject: Re: [PATCH RESEND v4 2/8] btrfs: Make btrfs_parse_options() parse mount
option in a atomic way
From: Miao Xie miao...@huawei.com
To: Qu Wenruo quwen...@cn.fujitsu.com, linux-btrfs@vger.kernel.org
Date: 2015年01月30日 09:29

On Fri, 30 Jan 2015 09:20:46 +0800, Qu Wenruo wrote:

Here need ACCESS_ONCE to wrap info-mount_opt, or the complier might use
info-mount_opt instead of new_opt.

Thanks for pointing out this one.

But I worried that this is not key reason of the wrong space cache. Could
you explain the race condition which caused the wrong space cache?

Thanks
Miao

CPU0:
remount()
|- sync_fs() - after sync_fs() we can start new trans
|- btrfs_parse_options() CPU1:
   |- start_transaction()
   |- Do some bg allocation, not recorded in space_cache.

I think it is a bug if a free space is not recorded in space cache. Could you
explain why it is not recorded?

Thanks
Miao

IIRC, in that window, the fs_info-mount_opt's SPACE_CACHE bit is cleared.
So space cache is not recorded.

SPACE_CACHE is used to control cache write out, not in-memory cache. All the
free space should be recorded in in-memory cache.And when we write out
the in-memory space cache, we need protect the space cache from changing.

Thanks
Miao

You're right, the wrong space cache problem is not caused by the non-atomic
mount option problem.
But the atomic mount option change with per-transaction mount option will at
least make it disappear
when using nospace_cache mount option.

But we need fix a problem, not hide a problem.

Thanks
Miao

Yes, But I don't mean to hide it.
If it's a bug in space cache, it will still be reproducible on with 
space_cache mount option.


The patch itself is focused on the space cache created with 
nospace_cache mount option,

at least it's doing it job.

The wrong space cahce problem is another story then.
I'll remove the wrong space cache description in the commit message in 
next version.


Thanks,
Qu



Thanks,
Qu

Thanks,
Qu

|- set SPACE_CACHE bit due to cache_gen

   |- commit_transaction()
   |- write space cache and update cache_gen.
   but since some of it is not recorded in space
cache,
   the space cache missing some records.
|- clear SPACE_CACHE bit dut to nospace_cache

So the space cache is wrong.

Thanks,
Qu

+}
 kfree(orig);
 return ret;
 }


.


.



--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function to get vfsmount from a given sb.

2015-01-29 Thread Miao Xie
On Fri, 30 Jan 2015 10:02:26 +0800, Qu Wenruo wrote:
 
  Original Message 
 Subject: Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function to get
 vfsmount from a given sb.
 From: Qu Wenruo quwen...@cn.fujitsu.com
 To: Miao Xie miao...@huawei.com, linux-btrfs@vger.kernel.org
 Date: 2015年01月30日 09:44

  Original Message 
 Subject: Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function to get
 vfsmount from a given sb.
 From: Miao Xie miao...@huawei.com
 To: Qu Wenruo quwen...@cn.fujitsu.com, linux-btrfs@vger.kernel.org
 Date: 2015年01月30日 08:52
 On Thu, 29 Jan 2015 10:24:39 +0800, Qu Wenruo wrote:
 There are sysfs interfaces in some fs, only btrfs yet, which will modify
 on-disk data.
 Unlike normal file operation routine we can use mnt_want_write_file() to
 protect the operation, change through sysfs won't to be binded to any file
 in the filesystem.
 So we can only extract the first vfsmount of a superblock and pass it to
 mnt_want_write() to do the protection.
 This method is wrong, becasue one fs  may be mounted on the multi places
 at the same time, someone is R/O, someone is R/W, you may get a R/O and
 fail to get the write permission.
 This shouldn't happen. If someone is ro, the whole fs should be ro, right?
 You can mount a device which is already mounted as rw to other point as ro,
 and remount a mount point to ro will also cause all other mount point to ro.

 So I didn't see the problem here.

 I think you do label/feature change by sysfs interface by the following way

 btrfs_sysfs_change_()
 {
 /* Use trylock to avoid the race with umount */
 if(!mutex_trylock(sb-s_umount))
 return -EBUSY;

 check R/O and FREEZE

 mutex_unlock(sb-s_umount);
 }
 This looks better since it not introduce changes to VFS.

 Thanks,
 Qu
 Oh, wait a second, this one leads to the old problem and old solution.
 
 If we hold s_umount mutex, we must do freeze check and can't start transaction
 since it will deadlock.
 
 And for freeze check, we must use sb_try_start_intwrite() to hold the freeze
 lock and then add a new
 btrfs_start_transaction_freeze() which will not call sb_start_write()...
 
 Oh this seems so similar, v2 or v3 version RFC patch?
 So still goes to the old method?

No. Just check R/O and RREEZE, if failed, go out. if the check pass,
we start_transaction. Because we do it in s_umount lock, no one can
change fs to R/O or FREEZE.

Maybe the above description is not so clear, explain it again.

btrfs_sysfs_change_()
{
/* Use trylock to avoid the race with umount */
if(!mutex_trylock(sb-s_umount))
return -EBUSY;

if (fs is R/O or FREEZED) {
mutex_unlock(sb-s_umount);
return -EACCES;
}

btrfs_start_transaction()
change label/feature
btrfs_commit_transaction()

mutex_unlock(sb-s_umount);
}

Thanks
Miao

 
 Thanks,
 Qu

 Thanks
 Miao

 Cc: linux-fsdevel linux-fsde...@vger.kernel.org
 Signed-off-by: Qu Wenruo quwen...@cn.fujitsu.com
 ---
   fs/namespace.c| 25 +
   include/linux/mount.h |  1 +
   2 files changed, 26 insertions(+)

 diff --git a/fs/namespace.c b/fs/namespace.c
 index cd1e968..5a16a62 100644
 --- a/fs/namespace.c
 +++ b/fs/namespace.c
 @@ -1105,6 +1105,31 @@ struct vfsmount *mntget(struct vfsmount *mnt)
   }
   EXPORT_SYMBOL(mntget);
   +/*
 + * get a vfsmount from a given sb
 + *
 + * This is especially used for case where change fs' sysfs interface
 + * will lead to a write, e.g. Change label through sysfs in btrfs.
 + * So vfs can get a vfsmount and then use mnt_want_write() to protect.
 + */
 +struct vfsmount *get_vfsmount_sb(struct super_block *sb)
 +{
 +struct vfsmount *ret_vfs = NULL;
 +struct mount *mnt;
 +int ret = 0;
 +
 +lock_mount_hash();
 +if (list_empty(sb-s_mounts))
 +goto out;
 +mnt = list_entry(sb-s_mounts.next, struct mount, mnt_instance);
 +ret_vfs = mnt-mnt;
 +ret_vfs = mntget(ret_vfs);
 +out:
 +unlock_mount_hash();
 +return ret_vfs;
 +}
 +EXPORT_SYMBOL(get_vfsmount_sb);
 +
   struct vfsmount *mnt_clone_internal(struct path *path)
   {
   struct mount *p;
 diff --git a/include/linux/mount.h b/include/linux/mount.h
 index c2c561d..cf1b0f5 100644
 --- a/include/linux/mount.h
 +++ b/include/linux/mount.h
 @@ -79,6 +79,7 @@ extern void mnt_drop_write_file(struct file *file);
   extern void mntput(struct vfsmount *mnt);
   extern struct vfsmount *mntget(struct vfsmount *mnt);
   extern struct vfsmount *mnt_clone_internal(struct path *path);
 +extern struct vfsmount *get_vfsmount_sb(struct super_block *sb);
   extern int __mnt_is_readonly(struct vfsmount *mnt);
 struct path;


 
 .
 

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function to get vfsmount from a given sb.

2015-01-29 Thread Miao Xie
On Fri, 30 Jan 2015 02:14:45 +, Al Viro wrote:
 On Fri, Jan 30, 2015 at 09:44:03AM +0800, Qu Wenruo wrote:
 
 This shouldn't happen. If someone is ro, the whole fs should be ro, right?
 
 Wrong.  Individual vfsmounts over an r/w superblock might very well be r/o.
 As for that trylock...  What for?  It invites transient failures for no
 good reason.  Removal of sysfs entry will block while write(2) to that sucker
 is in progress, so btrfs shutdown will block at that point in ctree_close().
 It won't go away under you.

could you explain the race condition? I think the deadlock won't happen, during
the btrfs shutdown, we hold s_umount, the write operation will fail to lock it,
and quit quickly, and then umount will continue.

I think sb_want_write() is similar to trylock(s_umount), the difference is that
sb_want_write() is more complex.

 
 Now, you might want to move those sysfs entry removals to the very beginning
 of btrfs_kill_super(), but that's a different story - you need only to make
 sure that they are removed not later than the destruction of the data
 structures they need (IOW, the current location might very well be OK - I
 hadn't checked the details).

Yes, we need move those sysfs entry removals, but needn't move to the very
beginning of btrfs_kill_super(), just at the beginning of close_ctree();

The current location is not right, it will introduce the use-after-free
problem. because we remove the sysfs entry after we release
transaction_kthread, use-after-free problem might happen in this case
Task1   Task2
change Label by sysfs
close_ctree
  kthread_stop(transaction_kthread);
  change label
  wake_up(transaction_kthread)


Thanks
Miao

 
 As for it won't go r/o under us - sb_want_write() will do that just fine.
 .
 

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function to get vfsmount from a given sb.

2015-01-29 Thread Al Viro
On Fri, Jan 30, 2015 at 01:34:41PM +0800, Miao Xie wrote:
  First of all, -s_umount is not a mutex; it's rwsem.  So you mean
  down_read_trylock().  As for the transient failures - grep for down_write
  on it...  E.g. have somebody call mount() from the same device.  We call
  sget(), which finds existing superblock and calls grab_super().  Sure,
  that -s_umount will be released shortly, but in the meanwhile your trylock
  will fail...
 
 I know it, so I suggested to return -EBUSY in the previous mail.
 I think it is acceptable method, mount/umount operations are not so many
 after all.

Er...  What for, when there's a trivial variant that doesn't suffer those
spurious -EBUSY?  Seriously, just move sysfs removals up to make sure
that any possible fs shutdown won't progress past those during sysfs IO and
use sb_want_write/sb_dont_write to make sure it'll stay r/w for the duration.
What's the problem?
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function to get vfsmount from a given sb.

2015-01-29 Thread Miao Xie
On Thu, 29 Jan 2015 10:24:39 +0800, Qu Wenruo wrote:
 There are sysfs interfaces in some fs, only btrfs yet, which will modify
 on-disk data.
 Unlike normal file operation routine we can use mnt_want_write_file() to
 protect the operation, change through sysfs won't to be binded to any file
 in the filesystem.
 So we can only extract the first vfsmount of a superblock and pass it to
 mnt_want_write() to do the protection.

This method is wrong, becasue one fs  may be mounted on the multi places
at the same time, someone is R/O, someone is R/W, you may get a R/O and
fail to get the write permission.

I think you do label/feature change by sysfs interface by the following way

btrfs_sysfs_change_()
{
/* Use trylock to avoid the race with umount */
if(!mutex_trylock(sb-s_umount))
return -EBUSY;

check R/O and FREEZE

mutex_unlock(sb-s_umount);
}

Thanks
Miao

 
 Cc: linux-fsdevel linux-fsde...@vger.kernel.org
 Signed-off-by: Qu Wenruo quwen...@cn.fujitsu.com
 ---
  fs/namespace.c| 25 +
  include/linux/mount.h |  1 +
  2 files changed, 26 insertions(+)
 
 diff --git a/fs/namespace.c b/fs/namespace.c
 index cd1e968..5a16a62 100644
 --- a/fs/namespace.c
 +++ b/fs/namespace.c
 @@ -1105,6 +1105,31 @@ struct vfsmount *mntget(struct vfsmount *mnt)
  }
  EXPORT_SYMBOL(mntget);
  
 +/*
 + * get a vfsmount from a given sb
 + *
 + * This is especially used for case where change fs' sysfs interface
 + * will lead to a write, e.g. Change label through sysfs in btrfs.
 + * So vfs can get a vfsmount and then use mnt_want_write() to protect.
 + */
 +struct vfsmount *get_vfsmount_sb(struct super_block *sb)
 +{
 + struct vfsmount *ret_vfs = NULL;
 + struct mount *mnt;
 + int ret = 0;
 +
 + lock_mount_hash();
 + if (list_empty(sb-s_mounts))
 + goto out;
 + mnt = list_entry(sb-s_mounts.next, struct mount, mnt_instance);
 + ret_vfs = mnt-mnt;
 + ret_vfs = mntget(ret_vfs);
 +out:
 + unlock_mount_hash();
 + return ret_vfs;
 +}
 +EXPORT_SYMBOL(get_vfsmount_sb);
 +
  struct vfsmount *mnt_clone_internal(struct path *path)
  {
   struct mount *p;
 diff --git a/include/linux/mount.h b/include/linux/mount.h
 index c2c561d..cf1b0f5 100644
 --- a/include/linux/mount.h
 +++ b/include/linux/mount.h
 @@ -79,6 +79,7 @@ extern void mnt_drop_write_file(struct file *file);
  extern void mntput(struct vfsmount *mnt);
  extern struct vfsmount *mntget(struct vfsmount *mnt);
  extern struct vfsmount *mnt_clone_internal(struct path *path);
 +extern struct vfsmount *get_vfsmount_sb(struct super_block *sb);
  extern int __mnt_is_readonly(struct vfsmount *mnt);
  
  struct path;
 

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function to get vfsmount from a given sb.

2015-01-29 Thread Qu Wenruo


 Original Message 
Subject: Re: [PATCH RESEND v4 6/8] vfs: Add get_vfsmount_sb() function 
to get vfsmount from a given sb.

From: Miao Xie miao...@huawei.com
To: Qu Wenruo quwen...@cn.fujitsu.com, linux-btrfs@vger.kernel.org
Date: 2015年01月30日 08:52

On Thu, 29 Jan 2015 10:24:39 +0800, Qu Wenruo wrote:

There are sysfs interfaces in some fs, only btrfs yet, which will modify
on-disk data.
Unlike normal file operation routine we can use mnt_want_write_file() to
protect the operation, change through sysfs won't to be binded to any file
in the filesystem.
So we can only extract the first vfsmount of a superblock and pass it to
mnt_want_write() to do the protection.

This method is wrong, becasue one fs  may be mounted on the multi places
at the same time, someone is R/O, someone is R/W, you may get a R/O and
fail to get the write permission.

This shouldn't happen. If someone is ro, the whole fs should be ro, right?
You can mount a device which is already mounted as rw to other point as ro,
and remount a mount point to ro will also cause all other mount point to ro.

So I didn't see the problem here.


I think you do label/feature change by sysfs interface by the following way

btrfs_sysfs_change_()
{
/* Use trylock to avoid the race with umount */
if(!mutex_trylock(sb-s_umount))
return -EBUSY;

check R/O and FREEZE

mutex_unlock(sb-s_umount);
}

This looks better since it not introduce changes to VFS.

Thanks,
Qu


Thanks
Miao


Cc: linux-fsdevel linux-fsde...@vger.kernel.org
Signed-off-by: Qu Wenruo quwen...@cn.fujitsu.com
---
  fs/namespace.c| 25 +
  include/linux/mount.h |  1 +
  2 files changed, 26 insertions(+)

diff --git a/fs/namespace.c b/fs/namespace.c
index cd1e968..5a16a62 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1105,6 +1105,31 @@ struct vfsmount *mntget(struct vfsmount *mnt)
  }
  EXPORT_SYMBOL(mntget);
  
+/*

+ * get a vfsmount from a given sb
+ *
+ * This is especially used for case where change fs' sysfs interface
+ * will lead to a write, e.g. Change label through sysfs in btrfs.
+ * So vfs can get a vfsmount and then use mnt_want_write() to protect.
+ */
+struct vfsmount *get_vfsmount_sb(struct super_block *sb)
+{
+   struct vfsmount *ret_vfs = NULL;
+   struct mount *mnt;
+   int ret = 0;
+
+   lock_mount_hash();
+   if (list_empty(sb-s_mounts))
+   goto out;
+   mnt = list_entry(sb-s_mounts.next, struct mount, mnt_instance);
+   ret_vfs = mnt-mnt;
+   ret_vfs = mntget(ret_vfs);
+out:
+   unlock_mount_hash();
+   return ret_vfs;
+}
+EXPORT_SYMBOL(get_vfsmount_sb);
+
  struct vfsmount *mnt_clone_internal(struct path *path)
  {
struct mount *p;
diff --git a/include/linux/mount.h b/include/linux/mount.h
index c2c561d..cf1b0f5 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -79,6 +79,7 @@ extern void mnt_drop_write_file(struct file *file);
  extern void mntput(struct vfsmount *mnt);
  extern struct vfsmount *mntget(struct vfsmount *mnt);
  extern struct vfsmount *mnt_clone_internal(struct path *path);
+extern struct vfsmount *get_vfsmount_sb(struct super_block *sb);
  extern int __mnt_is_readonly(struct vfsmount *mnt);
  
  struct path;




--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html