Re: Ceph on btrfs 3.4rc

2012-05-24 Thread Christian Brunner
Same thing here.

I've tried really hard, but even after 12 hours I wasn't able to get a
single warning from btrfs.

I think you cracked it!

Thanks,
Christian

2012/5/24 Martin Mailand :
> Hi,
> the ceph cluster has been running under heavy load for the last 13 hours
> without a problem; dmesg is empty and the performance is good.
>
> -martin
>
> Am 23.05.2012 21:12, schrieb Martin Mailand:
>
>> this patch has been running for 3 hours without a Bug and without the Warning.
>> I will let it run overnight and report tomorrow.
>> It looks very good ;-)
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-23 Thread Martin Mailand

Hi,
the ceph cluster has been running under heavy load for the last 13 hours
without a problem; dmesg is empty and the performance is good.


-martin

Am 23.05.2012 21:12, schrieb Martin Mailand:

this patch has been running for 3 hours without a Bug and without the Warning.
I will let it run overnight and report tomorrow.
It looks very good ;-)

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-23 Thread Martin Mailand

Hi Josef,

this patch has been running for 3 hours without a Bug and without the Warning.
I will let it run overnight and report tomorrow.
It looks very good ;-)

-martin

Am 23.05.2012 17:02, schrieb Josef Bacik:

Ok give this a shot, it should do it.  Thanks,

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-23 Thread Josef Bacik
On Wed, May 23, 2012 at 02:34:43PM +0200, Christian Brunner wrote:
> 2012/5/22 Josef Bacik :
> >>
> >
> > Yeah you would also need to change orphan_meta_reserved.  I fixed this by 
> > just
> > taking the BTRFS_I(inode)->lock when messing with these since we don't want 
> > to
> > take up all that space in the inode just for a marker.  I ran this patch 
> > for 3
> > hours with no issues, let me know if it works for you.  Thanks,
> 
> Compared to the last runs, I had to run it much longer, but somehow I
> managed to hit a BUG_ON again:
> 

Ok give this a shot, it should do it.  Thanks,

Josef


diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9b9b15f..41ddec8 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -24,6 +24,22 @@
 #include "ordered-data.h"
 #include "delayed-inode.h"
 
+/*
+ * ordered_data_close is set by truncate when a file that used
+ * to have good data has been truncated to zero.  When it is set
+ * the btrfs file release call will add this inode to the
+ * ordered operations list so that we make sure to flush out any
+ * new data the application may have written before commit.
+ */
+#define BTRFS_INODE_ORDERED_DATA_CLOSE 0
+#define BTRFS_INODE_ORPHAN_META_RESERVED   1
+#define BTRFS_INODE_DUMMY  2
+#define BTRFS_INODE_IN_DEFRAG  3
+#define BTRFS_INODE_DELALLOC_META_RESERVED 4
+#define BTRFS_INODE_HAS_ORPHAN_ITEM5
+#define BTRFS_INODE_FORCE_ZLIB 6
+#define BTRFS_INODE_FORCE_LZO  7
+
 /* in memory btrfs inode */
 struct btrfs_inode {
/* which subvolume this inode belongs to */
@@ -57,9 +73,6 @@ struct btrfs_inode {
/* used to order data wrt metadata */
struct btrfs_ordered_inode_tree ordered_tree;
 
-   /* for keeping track of orphaned inodes */
-   struct list_head i_orphan;
-
/* list of all the delalloc inodes in the FS.  There are times we need
 * to write all the delalloc pages to disk, and this list is used
 * to walk them all.
@@ -143,24 +156,7 @@ struct btrfs_inode {
 */
unsigned outstanding_extents;
unsigned reserved_extents;
-
-   /*
-* ordered_data_close is set by truncate when a file that used
-* to have good data has been truncated to zero.  When it is set
-* the btrfs file release call will add this inode to the
-* ordered operations list so that we make sure to flush out any
-* new data the application may have written before commit.
-*/
-   unsigned ordered_data_close:1;
-   unsigned orphan_meta_reserved:1;
-   unsigned dummy_inode:1;
-   unsigned in_defrag:1;
-   unsigned delalloc_meta_reserved:1;
-
-   /*
-* always compress this one file
-*/
-   unsigned force_compress:4;
+   unsigned long runtime_flags;
 
struct btrfs_delayed_node *delayed_node;
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8fd7233..aad2600 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1375,7 +1375,7 @@ struct btrfs_root {
struct list_head root_list;
 
spinlock_t orphan_lock;
-   struct list_head orphan_list;
+   atomic_t orphan_inodes;
struct btrfs_block_rsv *orphan_block_rsv;
int orphan_item_inserted;
int orphan_cleanup_state;
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 03e3748..5190861 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -669,8 +669,8 @@ static int btrfs_delayed_inode_reserve_metadata(
return ret;
} else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
spin_lock(&BTRFS_I(inode)->lock);
-   if (BTRFS_I(inode)->delalloc_meta_reserved) {
-   BTRFS_I(inode)->delalloc_meta_reserved = 0;
+   if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+  &BTRFS_I(inode)->runtime_flags)) {
spin_unlock(&BTRFS_I(inode)->lock);
release = true;
goto migrate;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a7ffc88..0ddeb0d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1153,7 +1153,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 
sectorsize,
root->orphan_block_rsv = NULL;
 
INIT_LIST_HEAD(&root->dirty_list);
-   INIT_LIST_HEAD(&root->orphan_list);
INIT_LIST_HEAD(&root->root_list);
spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
@@ -1166,6 +1165,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 
sectorsize,
atomic_set(&root->log_commit[0], 0);
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
+   atomic_set(&root->orphan_inodes, 0);
root->log_batch = 0;
root->log_transid = 0;
root->last_log_com

Re: Ceph on btrfs 3.4rc

2012-05-23 Thread Josef Bacik
On Wed, May 23, 2012 at 02:34:43PM +0200, Christian Brunner wrote:
> 2012/5/22 Josef Bacik :
> >>
> >
> > Yeah you would also need to change orphan_meta_reserved.  I fixed this by 
> > just
> > taking the BTRFS_I(inode)->lock when messing with these since we don't want 
> > to
> > take up all that space in the inode just for a marker.  I ran this patch 
> > for 3
> > hours with no issues, let me know if it works for you.  Thanks,
> 
> Compared to the last runs, I had to run it much longer, but somehow I
> managed to hit a BUG_ON again:
> 

Yeah it's because we access other parts of that bitfield with no lock at all
which is what is likely screwing us.  I'm going to have to redo that part and
then do the orphan fix, I'll have a patch shortly.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-23 Thread Christian Brunner
2012/5/22 Josef Bacik :
>>
>
> Yeah you would also need to change orphan_meta_reserved.  I fixed this by just
> taking the BTRFS_I(inode)->lock when messing with these since we don't want to
> take up all that space in the inode just for a marker.  I ran this patch for 3
> hours with no issues, let me know if it works for you.  Thanks,

Compared to the last runs, I had to run it much longer, but somehow I
managed to hit a BUG_ON again:

[448281.002087] couldn't find orphan item for 2027, nlink 1, root 308,
root being deleted no
[448281.011339] [ cut here ]
[448281.016590] kernel BUG at fs/btrfs/inode.c:2230!
[448281.021837] invalid opcode:  [#1] SMP
[448281.026525] CPU 4
[448281.028670] Modules linked in: btrfs zlib_deflate libcrc32c xfs
exportfs sunrpc bonding ipv6 sg serio_raw pcspkr iTCO_wdt
iTCO_vendor_support ixgbe dca mdio i7core_edac edac_core
iomemory_vsl(PO) hpsa squashfs [last unloaded: btrfs]
[448281.052215]
[448281.053977] Pid: 16018, comm: ceph-osd Tainted: PW  O
3.3.5-1.fits.1.el6.x86_64 #1 HP ProLiant DL180 G6
[448281.06] RIP: 0010:[]  []
btrfs_orphan_del+0x19b/0x1b0 [btrfs]
[448281.075965] RSP: 0018:880458257d18  EFLAGS: 00010292
[448281.081987] RAX: 0063 RBX: 8803a28ebc48 RCX:
2fdb
[448281.090042] RDX:  RSI: 0046 RDI:
0246
[448281.098093] RBP: 880458257d58 R08: 81af6100 R09:

[448281.106146] R10: 0004 R11:  R12:
0001
[448281.114202] R13: 88052e130400 R14: 0001 R15:
8805beae9e10
[448281.122262] FS:  7fa2e772f700() GS:88062728()
knlGS:
[448281.131386] CS:  0010 DS:  ES:  CR0: 80050033
[448281.137879] CR2: ff600400 CR3: 0005015a5000 CR4:
06e0
[448281.145929] DR0:  DR1:  DR2:

[448281.153974] DR3:  DR6: 0ff0 DR7:
0400
[448281.162043] Process ceph-osd (pid: 16018, threadinfo
880458256000, task 88055b711940)
[448281.171646] Stack:
[448281.173987]  880458257dff 8803a28eba98 880458257d58
8805beae9e10
[448281.182377]   88052e130400 88029ff33380
8803a28ebc48
[448281.190766]  880458257e08 a04ab4e6 
8803a28ebc48
[448281.199155] Call Trace:
[448281.202005]  [] btrfs_truncate+0x5f6/0x660 [btrfs]
[448281.209203]  [] btrfs_setattr+0xf6/0x1a0 [btrfs]
[448281.216202]  [] notify_change+0x18b/0x2b0
[448281.222517]  [] ? selinux_inode_permission+0xd1/0x130
[448281.229990]  [] do_truncate+0x64/0xa0
[448281.235919]  [] ? inode_permission+0x49/0x100
[448281.242617]  [] sys_truncate+0x137/0x150
[448281.248838]  [] system_call_fastpath+0x16/0x1b
[448281.255631] Code: a0 49 8b 8d f0 02 00 00 8b 53 48 4c 0f 45 c0 48
85 f6 74 1b 80 bb 60 fe ff ff 84 74 12 48 c7 c7 e8 1d 50 a0 31 c0 e8
9d ea 0d e1 <0f> 0b eb fe 48 8b 73 40 eb e8 66 66 2e 0f 1f 84 00 00 00
00 00
[448281.277435] RIP  [] btrfs_orphan_del+0x19b/0x1b0 [btrfs]
[448281.285229]  RSP 
[448281.289667] ---[ end trace 9adc7b36a3e66872 ]---

Sorry,
Christian
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-22 Thread Josef Bacik
On Tue, May 22, 2012 at 12:29:59PM +0200, Christian Brunner wrote:
> 2012/5/21 Miao Xie :
> > Hi Josef,
> >
> > On fri, 18 May 2012 15:01:05 -0400, Josef Bacik wrote:
> >> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> >> index 9b9b15f..492c74f 100644
> >> --- a/fs/btrfs/btrfs_inode.h
> >> +++ b/fs/btrfs/btrfs_inode.h
> >> @@ -57,9 +57,6 @@ struct btrfs_inode {
> >>       /* used to order data wrt metadata */
> >>       struct btrfs_ordered_inode_tree ordered_tree;
> >>
> >> -     /* for keeping track of orphaned inodes */
> >> -     struct list_head i_orphan;
> >> -
> >>       /* list of all the delalloc inodes in the FS.  There are times we 
> >> need
> >>        * to write all the delalloc pages to disk, and this list is used
> >>        * to walk them all.
> >> @@ -156,6 +153,8 @@ struct btrfs_inode {
> >>       unsigned dummy_inode:1;
> >>       unsigned in_defrag:1;
> >>       unsigned delalloc_meta_reserved:1;
> >> +     unsigned has_orphan_item:1;
> >> +     unsigned doing_truncate:1;
> >
> > I think the problem is that we should not use different locks to protect
> > bit fields which are stored in the same machine word. Otherwise some bit
> > fields may be overwritten by the others when someone changes those fields.
> > Could you try to declare ->delalloc_meta_reserved and ->has_orphan_item
> > as integers?
> 
> I have tried changing it to:
> 
> struct btrfs_inode {
> unsigned orphan_meta_reserved:1;
> unsigned dummy_inode:1;
> unsigned in_defrag:1;
> -   unsigned delalloc_meta_reserved:1;
> +   int delalloc_meta_reserved;
> +   int has_orphan_item;
> +   int doing_truncate;
> 
> The strange thing is, that I'm no longer hitting the BUG_ON, but the
> old WARNING (no additional messages):
> 

Yeah you would also need to change orphan_meta_reserved.  I fixed this by just
taking the BTRFS_I(inode)->lock when messing with these since we don't want to
take up all that space in the inode just for a marker.  I ran this patch for 3
hours with no issues, let me know if it works for you.  Thanks,

Josef


diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3771b85..559e716 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -57,9 +57,6 @@ struct btrfs_inode {
/* used to order data wrt metadata */
struct btrfs_ordered_inode_tree ordered_tree;
 
-   /* for keeping track of orphaned inodes */
-   struct list_head i_orphan;
-
/* list of all the delalloc inodes in the FS.  There are times we need
 * to write all the delalloc pages to disk, and this list is used
 * to walk them all.
@@ -153,6 +150,7 @@ struct btrfs_inode {
unsigned dummy_inode:1;
unsigned in_defrag:1;
unsigned delalloc_meta_reserved:1;
+   unsigned has_orphan_item:1;
 
/*
 * always compress this one file
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ba8743b..72cdf98 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1375,7 +1375,7 @@ struct btrfs_root {
struct list_head root_list;
 
spinlock_t orphan_lock;
-   struct list_head orphan_list;
+   atomic_t orphan_inodes;
struct btrfs_block_rsv *orphan_block_rsv;
int orphan_item_inserted;
int orphan_cleanup_state;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 19f5b45..25dba7a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1153,7 +1153,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 
sectorsize,
root->orphan_block_rsv = NULL;
 
INIT_LIST_HEAD(&root->dirty_list);
-   INIT_LIST_HEAD(&root->orphan_list);
INIT_LIST_HEAD(&root->root_list);
spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
@@ -1166,6 +1165,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 
sectorsize,
atomic_set(&root->log_commit[0], 0);
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
+   atomic_set(&root->orphan_inodes, 0);
root->log_batch = 0;
root->log_transid = 0;
root->last_log_commit = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 54ae3df..54f1b30 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2104,12 +2104,12 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle 
*trans,
struct btrfs_block_rsv *block_rsv;
int ret;
 
-   if (!list_empty(&root->orphan_list) ||
+   if (atomic_read(&root->orphan_inodes) ||
root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
return;
 
spin_lock(&root->orphan_lock);
-   if (!list_empty(&root->orphan_list)) {
+   if (atomic_read(&root->orphan_inodes)) {
spin_unlock(&root->orphan_lock);
return;
}
@@ -2166,8 +2166,9 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, 
struct inode *inode)
block_rsv = NULL;
}
 
-   if

Re: Ceph on btrfs 3.4rc

2012-05-22 Thread Josef Bacik
On Mon, May 21, 2012 at 11:59:54AM +0800, Miao Xie wrote:
> Hi Josef,
> 
> On fri, 18 May 2012 15:01:05 -0400, Josef Bacik wrote:
> > diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> > index 9b9b15f..492c74f 100644
> > --- a/fs/btrfs/btrfs_inode.h
> > +++ b/fs/btrfs/btrfs_inode.h
> > @@ -57,9 +57,6 @@ struct btrfs_inode {
> > /* used to order data wrt metadata */
> > struct btrfs_ordered_inode_tree ordered_tree;
> >  
> > -   /* for keeping track of orphaned inodes */
> > -   struct list_head i_orphan;
> > -
> > /* list of all the delalloc inodes in the FS.  There are times we need
> >  * to write all the delalloc pages to disk, and this list is used
> >  * to walk them all.
> > @@ -156,6 +153,8 @@ struct btrfs_inode {
> > unsigned dummy_inode:1;
> > unsigned in_defrag:1;
> > unsigned delalloc_meta_reserved:1;
> > +   unsigned has_orphan_item:1;
> > +   unsigned doing_truncate:1;
> 
> I think the problem is that we should not use different locks to protect
> bit fields which are stored in the same machine word. Otherwise some bit
> fields may be overwritten by the others when someone changes those fields.
> Could you try to declare ->delalloc_meta_reserved and ->has_orphan_item
> as integers?
> 

Oh freaking duh, thank you Miao, I'm an idiot.

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-22 Thread Christian Brunner
2012/5/21 Miao Xie :
> Hi Josef,
>
> On fri, 18 May 2012 15:01:05 -0400, Josef Bacik wrote:
>> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
>> index 9b9b15f..492c74f 100644
>> --- a/fs/btrfs/btrfs_inode.h
>> +++ b/fs/btrfs/btrfs_inode.h
>> @@ -57,9 +57,6 @@ struct btrfs_inode {
>>       /* used to order data wrt metadata */
>>       struct btrfs_ordered_inode_tree ordered_tree;
>>
>> -     /* for keeping track of orphaned inodes */
>> -     struct list_head i_orphan;
>> -
>>       /* list of all the delalloc inodes in the FS.  There are times we need
>>        * to write all the delalloc pages to disk, and this list is used
>>        * to walk them all.
>> @@ -156,6 +153,8 @@ struct btrfs_inode {
>>       unsigned dummy_inode:1;
>>       unsigned in_defrag:1;
>>       unsigned delalloc_meta_reserved:1;
>> +     unsigned has_orphan_item:1;
>> +     unsigned doing_truncate:1;
>
> I think the problem is that we should not use different locks to protect
> bit fields which are stored in the same machine word. Otherwise some bit
> fields may be overwritten by the others when someone changes those fields.
> Could you try to declare ->delalloc_meta_reserved and ->has_orphan_item
> as integers?

I have tried changing it to:

struct btrfs_inode {
unsigned orphan_meta_reserved:1;
unsigned dummy_inode:1;
unsigned in_defrag:1;
-   unsigned delalloc_meta_reserved:1;
+   int delalloc_meta_reserved;
+   int has_orphan_item;
+   int doing_truncate;

The strange thing is that I'm no longer hitting the BUG_ON, but I'm
hitting the old WARNING again (no additional messages):

[351021.157124] [ cut here ]
[351021.162400] WARNING: at fs/btrfs/inode.c:2103
btrfs_orphan_commit_root+0xf7/0x100 [btrfs]()
[351021.171812] Hardware name: ProLiant DL180 G6
[351021.176867] Modules linked in: btrfs zlib_deflate libcrc32c xfs
exportfs sunrpc bonding ipv6 sg serio_raw pcspkr iTCO_wdt
iTCO_vendor_support ixgbe dca mdio i7core_edac edac_core
iomemory_vsl(PO) hpsa squashfs [last unloaded: btrfs]
[351021.200236] Pid: 9837, comm: btrfs-transacti Tainted: PW
O 3.3.5-1.fits.1.el6.x86_64 #1
[351021.210126] Call Trace:
[351021.212957]  [] warn_slowpath_common+0x7f/0xc0
[351021.219758]  [] warn_slowpath_null+0x1a/0x20
[351021.226385]  []
btrfs_orphan_commit_root+0xf7/0x100 [btrfs]
[351021.234461]  [] commit_fs_roots+0xc6/0x1c0 [btrfs]
[351021.241669]  [] ?
btrfs_run_delayed_items+0xf1/0x160 [btrfs]
[351021.249841]  []
btrfs_commit_transaction+0x584/0xa50 [btrfs]
[351021.258006]  [] ? start_transaction+0x92/0x310 [btrfs]
[351021.265580]  [] ? wake_up_bit+0x40/0x40
[351021.271719]  [] transaction_kthread+0x26b/0x2e0 [btrfs]
[351021.279405]  [] ?
btrfs_destroy_marked_extents.clone.0+0x1f0/0x1f0 [btrfs]
[351021.288934]  [] ?
btrfs_destroy_marked_extents.clone.0+0x1f0/0x1f0 [btrfs]
[351021.298449]  [] kthread+0x9e/0xb0
[351021.303989]  [] kernel_thread_helper+0x4/0x10
[351021.310691]  [] ? kthread_freezable_should_stop+0x70/0x70
[351021.318555]  [] ? gs_change+0x13/0x13
[351021.324479] ---[ end trace 9adc7b36a3e66833 ]---
[351710.339482] [ cut here ]
[351710.344754] WARNING: at fs/btrfs/inode.c:2103
btrfs_orphan_commit_root+0xf7/0x100 [btrfs]()
[351710.354165] Hardware name: ProLiant DL180 G6
[351710.359222] Modules linked in: btrfs zlib_deflate libcrc32c xfs
exportfs sunrpc bonding ipv6 sg serio_raw pcspkr iTCO_wdt
iTCO_vendor_support ixgbe dca mdio i7core_edac edac_core
iomemory_vsl(PO) hpsa squashfs [last unloaded: btrfs]
[351710.382569] Pid: 9797, comm: kworker/5:0 Tainted: PW  O
3.3.5-1.fits.1.el6.x86_64 #1
[351710.392075] Call Trace:
[351710.394901]  [] warn_slowpath_common+0x7f/0xc0
[351710.401750]  [] warn_slowpath_null+0x1a/0x20
[351710.408414]  []
btrfs_orphan_commit_root+0xf7/0x100 [btrfs]
[351710.416528]  [] commit_fs_roots+0xc6/0x1c0 [btrfs]
[351710.423775]  []
btrfs_commit_transaction+0x584/0xa50 [btrfs]
[351710.431983]  [] ? __switch_to+0x153/0x440
[351710.438352]  [] ? wake_up_bit+0x40/0x40
[351710.444529]  [] ?
btrfs_commit_transaction+0xa50/0xa50 [btrfs]
[351710.452894]  [] do_async_commit+0x1f/0x30 [btrfs]
[351710.459979]  [] process_one_work+0x129/0x450
[351710.466576]  [] worker_thread+0x17b/0x3c0
[351710.472884]  [] ? manage_workers+0x220/0x220
[351710.479472]  [] kthread+0x9e/0xb0
[351710.485029]  [] kernel_thread_helper+0x4/0x10
[351710.491731]  [] ? kthread_freezable_should_stop+0x70/0x70
[351710.499640]  [] ? gs_change+0x13/0x13
[351710.505590] ---[ end trace 9adc7b36a3e66834 ]---


Regards,
Christian
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-20 Thread Miao Xie
Hi Josef,

On fri, 18 May 2012 15:01:05 -0400, Josef Bacik wrote:
> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> index 9b9b15f..492c74f 100644
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -57,9 +57,6 @@ struct btrfs_inode {
>   /* used to order data wrt metadata */
>   struct btrfs_ordered_inode_tree ordered_tree;
>  
> - /* for keeping track of orphaned inodes */
> - struct list_head i_orphan;
> -
>   /* list of all the delalloc inodes in the FS.  There are times we need
>* to write all the delalloc pages to disk, and this list is used
>* to walk them all.
> @@ -156,6 +153,8 @@ struct btrfs_inode {
>   unsigned dummy_inode:1;
>   unsigned in_defrag:1;
>   unsigned delalloc_meta_reserved:1;
> + unsigned has_orphan_item:1;
> + unsigned doing_truncate:1;

I think the problem is that we should not use different locks to protect
bit fields which are stored in the same machine word. Otherwise some bit
fields may be overwritten by the others when someone changes those fields.
Could you try to declare ->delalloc_meta_reserved and ->has_orphan_item
as integers?

Thanks
Miao

>  
>   /*
>* always compress this one file
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 8fd7233..aad2600 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1375,7 +1375,7 @@ struct btrfs_root {
>   struct list_head root_list;
>  
>   spinlock_t orphan_lock;
> - struct list_head orphan_list;
> + atomic_t orphan_inodes;
>   struct btrfs_block_rsv *orphan_block_rsv;
>   int orphan_item_inserted;
>   int orphan_cleanup_state;
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index a7ffc88..ff3bf4b 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -1153,7 +1153,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, 
> u32 sectorsize,
>   root->orphan_block_rsv = NULL;
>  
>   INIT_LIST_HEAD(&root->dirty_list);
> - INIT_LIST_HEAD(&root->orphan_list);
>   INIT_LIST_HEAD(&root->root_list);
>   spin_lock_init(&root->orphan_lock);
>   spin_lock_init(&root->inode_lock);
> @@ -1166,6 +1165,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, 
> u32 sectorsize,
>   atomic_set(&root->log_commit[0], 0);
>   atomic_set(&root->log_commit[1], 0);
>   atomic_set(&root->log_writers, 0);
> + atomic_set(&root->orphan_inodes, 0);
>   root->log_batch = 0;
>   root->log_transid = 0;
>   root->last_log_commit = 0;
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 61b16c6..572da13 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -2072,12 +2072,12 @@ void btrfs_orphan_commit_root(struct 
> btrfs_trans_handle *trans,
>   struct btrfs_block_rsv *block_rsv;
>   int ret;
>  
> - if (!list_empty(&root->orphan_list) ||
> + if (atomic_read(&root->orphan_inodes) ||
>   root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
>   return;
>  
>   spin_lock(&root->orphan_lock);
> - if (!list_empty(&root->orphan_list)) {
> + if (atomic_read(&root->orphan_inodes)) {
>   spin_unlock(&root->orphan_lock);
>   return;
>   }
> @@ -2134,8 +2134,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, 
> struct inode *inode)
>   block_rsv = NULL;
>   }
>  
> - if (list_empty(&BTRFS_I(inode)->i_orphan)) {
> - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
> + if (!BTRFS_I(inode)->has_orphan_item) {
> + BTRFS_I(inode)->has_orphan_item = 1;
>  #if 0
>   /*
>* For proper ENOSPC handling, we should do orphan
> @@ -2148,6 +2148,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, 
> struct inode *inode)
>   insert = 1;
>  #endif
>   insert = 1;
> + atomic_inc(&root->orphan_inodes);
>   }
>  
>   if (!BTRFS_I(inode)->orphan_meta_reserved) {
> @@ -2166,6 +2167,9 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, 
> struct inode *inode)
>   if (insert >= 1) {
>   ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
>   if (ret && ret != -EEXIST) {
> + spin_lock(&root->orphan_lock);
> + BTRFS_I(inode)->has_orphan_item = 0;
> + spin_unlock(&root->orphan_lock);
>   btrfs_abort_transaction(trans, root, ret);
>   return ret;
>   }
> @@ -2195,13 +2199,21 @@ int btrfs_orphan_del(struct btrfs_trans_handle 
> *trans, struct inode *inode)
>   int release_rsv = 0;
>   int ret = 0;
>  
> + /*
> +  * evict_inode gets called without holding the i_mutex so we need to
> +  * take the orphan lock to make sure we are safe in messing with these.
> +  */
>   spin_lock(&root->orphan_lock);
> - if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
> - list_del_

Re: Ceph on btrfs 3.4rc

2012-05-18 Thread Martin Mailand

Hi Josef,
now I get
[ 2081.142669] couldn't find orphan item for 2039, nlink 1, root 269, 
root being deleted no


-martin

Am 18.05.2012 21:01, schrieb Josef Bacik:

*sigh*  ok try this, hopefully it will point me in the right direction.  Thanks,



[  126.389847] Btrfs loaded
[  126.390284] device fsid 0c9d8c6d-2982-4604-b32a-fc443c4e2c50 devid 1 
transid 4 /dev/sdc

[  126.391246] btrfs: setting nodatacow
[  126.391252] btrfs: enabling auto defrag
[  126.391254] btrfs: disk space caching is enabled
[  126.391257] btrfs flagging fs with big metadata feature
[  126.405700] device fsid e8a0dc27-8714-49bd-a14f-ac37525febb1 devid 1 
transid 4 /dev/sdd

[  126.406162] btrfs: setting nodatacow
[  126.406167] btrfs: enabling auto defrag
[  126.406170] btrfs: disk space caching is enabled
[  126.406172] btrfs flagging fs with big metadata feature
[  126.419819] device fsid f67cd977-ebf4-41f2-9821-f2989e985954 devid 1 
transid 4 /dev/sde

[  126.420198] btrfs: setting nodatacow
[  126.420206] btrfs: enabling auto defrag
[  126.420210] btrfs: disk space caching is enabled
[  126.420214] btrfs flagging fs with big metadata feature
[  127.274555] device fsid 3001355e-c2e2-46c7-9eba-dfecb441d6a6 devid 1 
transid 4 /dev/sdf

[  127.274980] btrfs: setting nodatacow
[  127.274986] btrfs: enabling auto defrag
[  127.274989] btrfs: disk space caching is enabled
[  127.274992] btrfs flagging fs with big metadata feature
[ 2081.142669] couldn't find orphan item for 2039, nlink 1, root 269, 
root being deleted no

[ 2081.142735] [ cut here ]
[ 2081.142750] kernel BUG at fs/btrfs/inode.c:2228!
[ 2081.142766] invalid opcode:  [#1] SMP
[ 2081.142786] CPU 10
[ 2081.142794] Modules linked in: btrfs zlib_deflate libcrc32c ext2 
bonding coretemp ghash_clmulni_intel aesni_intel cryptd aes_x86_64 
microcode psmouse serio_raw sb_edac edac_core joydev mei(C) ioatdma ses 
enclosure mac_hid lp parport usbhid hid megaraid_sas isci libsas 
scsi_transport_sas igb ixgbe dca mdio

[ 2081.142974]
[ 2081.142985] Pid: 2966, comm: ceph-osd Tainted: G C 
3.4.0-rc7.2012051802+ #16 Supermicro X9SRi/X9SRi
[ 2081.143020] RIP: 0010:[]  [] 
btrfs_orphan_del+0x173/0x180 [btrfs]

[ 2081.143080] RSP: 0018:881016d83d18  EFLAGS: 00010292
[ 2081.143096] RAX: 0062 RBX: 881017ad4770 RCX: 

[ 2081.143115] RDX:  RSI: 0082 RDI: 
0246
[ 2081.143134] RBP: 881016d83d58 R08:  R09: 

[ 2081.143154] R10:  R11: 0116 R12: 
88101e7baf90
[ 2081.143173] R13: 88101e7bac00 R14: 0001 R15: 
0001
[ 2081.143193] FS:  7fcc1e736700() GS:88107fd4() 
knlGS:

[ 2081.143243] CS:  0010 DS:  ES:  CR0: 80050033
[ 2081.143274] CR2: 09269000 CR3: 00101ba87000 CR4: 
000407e0
[ 2081.143308] DR0:  DR1:  DR2: 

[ 2081.143341] DR3:  DR6: 0ff0 DR7: 
0400
[ 2081.143376] Process ceph-osd (pid: 2966, threadinfo 881016d82000, 
task 881023c744a0)

[ 2081.143424] Stack:
[ 2081.143447]  0c07 88101e1dac30 881016d83d38 
88101e1dac30
[ 2081.143510]   88101e7bac00 881017ad4770 
88101f0f7d60
[ 2081.143572]  881016d83e08 a026d7c8 881017ad4770 


[ 2081.143634] Call Trace:
[ 2081.143684]  [] btrfs_truncate+0x5e8/0x6d0 [btrfs]
[ 2081.143737]  [] btrfs_setattr+0xc1/0x1b0 [btrfs]
[ 2081.143773]  [] notify_change+0x183/0x320
[ 2081.143807]  [] do_truncate+0x5e/0xa0
[ 2081.143839]  [] sys_truncate+0x144/0x1b0
[ 2081.143873]  [] system_call_fastpath+0x16/0x1b
[ 2081.143903] Code: a0 49 8b 8d f0 02 00 00 8b 53 48 4c 0f 44 c0 48 85 
f6 74 19 80 bb 60 fe ff ff 84 74 10 48 c7 c7 10 88 2c a0 31 c0 e8 e5 3b 
3e e1 <0f> 0b 48 8b 73 40 eb ea 0f 1f 44 00 00 55 48 89 e5 48 83 ec 10
[ 2081.144199] RIP  [] btrfs_orphan_del+0x173/0x180 
[btrfs]

[ 2081.144258]  RSP 
[ 2081.144614] ---[ end trace 8d0829d100639242 ]---

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-18 Thread Josef Bacik
On Fri, May 18, 2012 at 07:24:25PM +0200, Martin Mailand wrote:
> Hi Josef,
> there was one line before the bug.
> 
> [  995.725105] couldn't find orphan item for 524
> 
> 

*sigh* ok try this, hopefully it will point me in the right direction.  Thanks,

Josef


diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9b9b15f..492c74f 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -57,9 +57,6 @@ struct btrfs_inode {
/* used to order data wrt metadata */
struct btrfs_ordered_inode_tree ordered_tree;
 
-   /* for keeping track of orphaned inodes */
-   struct list_head i_orphan;
-
/* list of all the delalloc inodes in the FS.  There are times we need
 * to write all the delalloc pages to disk, and this list is used
 * to walk them all.
@@ -156,6 +153,8 @@ struct btrfs_inode {
unsigned dummy_inode:1;
unsigned in_defrag:1;
unsigned delalloc_meta_reserved:1;
+   unsigned has_orphan_item:1;
+   unsigned doing_truncate:1;
 
/*
 * always compress this one file
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8fd7233..aad2600 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1375,7 +1375,7 @@ struct btrfs_root {
struct list_head root_list;
 
spinlock_t orphan_lock;
-   struct list_head orphan_list;
+   atomic_t orphan_inodes;
struct btrfs_block_rsv *orphan_block_rsv;
int orphan_item_inserted;
int orphan_cleanup_state;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a7ffc88..ff3bf4b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1153,7 +1153,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 
sectorsize,
root->orphan_block_rsv = NULL;
 
INIT_LIST_HEAD(&root->dirty_list);
-   INIT_LIST_HEAD(&root->orphan_list);
INIT_LIST_HEAD(&root->root_list);
spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
@@ -1166,6 +1165,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 
sectorsize,
atomic_set(&root->log_commit[0], 0);
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
+   atomic_set(&root->orphan_inodes, 0);
root->log_batch = 0;
root->log_transid = 0;
root->last_log_commit = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 61b16c6..572da13 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2072,12 +2072,12 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle 
*trans,
struct btrfs_block_rsv *block_rsv;
int ret;
 
-   if (!list_empty(&root->orphan_list) ||
+   if (atomic_read(&root->orphan_inodes) ||
root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
return;
 
spin_lock(&root->orphan_lock);
-   if (!list_empty(&root->orphan_list)) {
+   if (atomic_read(&root->orphan_inodes)) {
spin_unlock(&root->orphan_lock);
return;
}
@@ -2134,8 +2134,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, 
struct inode *inode)
block_rsv = NULL;
}
 
-   if (list_empty(&BTRFS_I(inode)->i_orphan)) {
-   list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+   if (!BTRFS_I(inode)->has_orphan_item) {
+   BTRFS_I(inode)->has_orphan_item = 1;
 #if 0
/*
 * For proper ENOSPC handling, we should do orphan
@@ -2148,6 +2148,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, 
struct inode *inode)
insert = 1;
 #endif
insert = 1;
+   atomic_inc(&root->orphan_inodes);
}
 
if (!BTRFS_I(inode)->orphan_meta_reserved) {
@@ -2166,6 +2167,9 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, 
struct inode *inode)
if (insert >= 1) {
ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
if (ret && ret != -EEXIST) {
+   spin_lock(&root->orphan_lock);
+   BTRFS_I(inode)->has_orphan_item = 0;
+   spin_unlock(&root->orphan_lock);
btrfs_abort_transaction(trans, root, ret);
return ret;
}
@@ -2195,13 +2199,21 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, 
struct inode *inode)
int release_rsv = 0;
int ret = 0;
 
+   /*
+* evict_inode gets called without holding the i_mutex so we need to
+* take the orphan lock to make sure we are safe in messing with these.
+*/
spin_lock(&root->orphan_lock);
-   if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-   list_del_init(&BTRFS_I(inode)->i_orphan);
-   delete_item = 1;
+   if (BTRFS_I(inode)->has_orphan_item) {
+   if (trans) {
+   BTRFS_I(inode)->has_orphan_item = 0;
+

Re: Ceph on btrfs 3.4rc

2012-05-18 Thread Martin Mailand

Hi Josef,
there was one line before the bug.

[  995.725105] couldn't find orphan item for 524


Am 18.05.2012 16:48, schrieb Josef Bacik:

Ok hopefully this will print something out that makes sense.  Thanks,


-martin

[  241.754693] Btrfs loaded
[  241.755148] device fsid 43c4ebd9-3824-4b07-a710-3ec39b012759 devid 1 
transid 4 /dev/sdc

[  241.755750] btrfs: setting nodatacow
[  241.755753] btrfs: enabling auto defrag
[  241.755754] btrfs: disk space caching is enabled
[  241.755755] btrfs flagging fs with big metadata feature
[  241.768683] device fsid e7e7f2df-6a4e-45b1-85cc-860cda849953 devid 1 
transid 4 /dev/sdd

[  241.769028] btrfs: setting nodatacow
[  241.769030] btrfs: enabling auto defrag
[  241.769031] btrfs: disk space caching is enabled
[  241.769032] btrfs flagging fs with big metadata feature
[  241.781360] device fsid 203fdd4c-baac-49f8-bfdb-08486c937989 devid 1 
transid 4 /dev/sde

[  241.781854] btrfs: setting nodatacow
[  241.781859] btrfs: enabling auto defrag
[  241.781861] btrfs: disk space caching is enabled
[  241.781864] btrfs flagging fs with big metadata feature
[  242.713741] device fsid 95c36e12-0098-48d7-a08d-9d54a299206b devid 1 
transid 4 /dev/sdf

[  242.714110] btrfs: setting nodatacow
[  242.714118] btrfs: enabling auto defrag
[  242.714121] btrfs: disk space caching is enabled
[  242.714125] btrfs flagging fs with big metadata feature
[  995.725105] couldn't find orphan item for 524
[  995.725126] [ cut here ]
[  995.725134] kernel BUG at fs/btrfs/inode.c:2227!
[  995.725143] invalid opcode:  [#1] SMP
[  995.725158] CPU 0
[  995.725162] Modules linked in: btrfs zlib_deflate libcrc32c ext2 
coretemp ghash_clmulni_intel aesni_intel bonding cryptd aes_x86_64 
microcode psmouse serio_raw sb_edac edac_core joydev mei(C) ses ioatdma 
enclosure mac_hid lp parport ixgbe usbhid hid isci libsas megaraid_sas 
scsi_transport_sas igb dca mdio

[  995.725285]
[  995.725290] Pid: 2972, comm: ceph-osd Tainted: G C 
3.4.0-rc7.2012051800+ #14 Supermicro X9SRi/X9SRi
[  995.725324] RIP: 0010:[]  [] 
btrfs_orphan_del+0x14f/0x160 [btrfs]

[  995.725354] RSP: 0018:881016ed9d18  EFLAGS: 00010292
[  995.725364] RAX: 0037 RBX: 88101485fdb0 RCX: 

[  995.725378] RDX:  RSI: 0082 RDI: 
0246
[  995.725392] RBP: 881016ed9d58 R08:  R09: 

[  995.725405] R10:  R11: 00b6 R12: 
88101efe9f90
[  995.725419] R13: 88101efe9c00 R14: 0001 R15: 
0001
[  995.725433] FS:  7f58e5dbc700() GS:88107fc0() 
knlGS:

[  995.725466] CS:  0010 DS:  ES:  CR0: 80050033
[  995.725492] CR2: 03f28000 CR3: 00101acac000 CR4: 
000407f0
[  995.725522] DR0:  DR1:  DR2: 

[  995.725551] DR3:  DR6: 0ff0 DR7: 
0400
[  995.725581] Process ceph-osd (pid: 2972, threadinfo 881016ed8000, 
task 88101618)

[  995.725626] Stack:
[  995.725646]  0c02 88101deaf550 881016ed9d38 
88101deaf550
[  995.725700]   88101efe9c00 88101485fdb0 
880be890c1e0
[  995.725757]  881016ed9e08 a02897a8 88101485fdb0 


[  995.725807] Call Trace:
[  995.725835]  [] btrfs_truncate+0x5e8/0x6d0 [btrfs]
[  995.725869]  [] btrfs_setattr+0xc1/0x1b0 [btrfs]
[  995.725898]  [] notify_change+0x183/0x320
[  995.725925]  [] do_truncate+0x5e/0xa0
[  995.725951]  [] sys_truncate+0x144/0x1b0
[  995.725979]  [] system_call_fastpath+0x16/0x1b
[  995.726006] Code: 45 31 ff e9 3c ff ff ff 48 8b b3 58 fe ff ff 48 85 
f6 74 19 80 bb 60 fe ff ff 84 74 10 48 c7 c7 08 48 2e a0 31 c0 e8 09 7c 
3c e1 <0f> 0b 48 8b 73 40 eb ea 66 0f 1f 84 00 00 00 00 00 55 48 89 e5
[  995.726221] RIP  [] btrfs_orphan_del+0x14f/0x160 
[btrfs]

[  995.726258]  RSP 
[  995.726574] ---[ end trace 4bde8f513a6d106d ]---

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-18 Thread Josef Bacik
On Thu, May 17, 2012 at 11:18:25PM +0200, Martin Mailand wrote:
> Hi Josef,
> 
> I hit exactly the same bug as Christian with your last patch.
> 

Ok hopefully this will print something out that makes sense.  Thanks,

Josef


diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9b9b15f..492c74f 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -57,9 +57,6 @@ struct btrfs_inode {
/* used to order data wrt metadata */
struct btrfs_ordered_inode_tree ordered_tree;
 
-   /* for keeping track of orphaned inodes */
-   struct list_head i_orphan;
-
/* list of all the delalloc inodes in the FS.  There are times we need
 * to write all the delalloc pages to disk, and this list is used
 * to walk them all.
@@ -156,6 +153,8 @@ struct btrfs_inode {
unsigned dummy_inode:1;
unsigned in_defrag:1;
unsigned delalloc_meta_reserved:1;
+   unsigned has_orphan_item:1;
+   unsigned doing_truncate:1;
 
/*
 * always compress this one file
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8fd7233..aad2600 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1375,7 +1375,7 @@ struct btrfs_root {
struct list_head root_list;
 
spinlock_t orphan_lock;
-   struct list_head orphan_list;
+   atomic_t orphan_inodes;
struct btrfs_block_rsv *orphan_block_rsv;
int orphan_item_inserted;
int orphan_cleanup_state;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a7ffc88..ff3bf4b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1153,7 +1153,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 
sectorsize,
root->orphan_block_rsv = NULL;
 
INIT_LIST_HEAD(&root->dirty_list);
-   INIT_LIST_HEAD(&root->orphan_list);
INIT_LIST_HEAD(&root->root_list);
spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
@@ -1166,6 +1165,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 
sectorsize,
atomic_set(&root->log_commit[0], 0);
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
+   atomic_set(&root->orphan_inodes, 0);
root->log_batch = 0;
root->log_transid = 0;
root->last_log_commit = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 61b16c6..7de7f6f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2072,12 +2072,12 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle 
*trans,
struct btrfs_block_rsv *block_rsv;
int ret;
 
-   if (!list_empty(&root->orphan_list) ||
+   if (atomic_read(&root->orphan_inodes) ||
root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
return;
 
spin_lock(&root->orphan_lock);
-   if (!list_empty(&root->orphan_list)) {
+   if (atomic_read(&root->orphan_inodes)) {
spin_unlock(&root->orphan_lock);
return;
}
@@ -2134,8 +2134,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, 
struct inode *inode)
block_rsv = NULL;
}
 
-   if (list_empty(&BTRFS_I(inode)->i_orphan)) {
-   list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+   if (!BTRFS_I(inode)->has_orphan_item) {
+   BTRFS_I(inode)->has_orphan_item = 1;
 #if 0
/*
 * For proper ENOSPC handling, we should do orphan
@@ -2148,6 +2148,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, 
struct inode *inode)
insert = 1;
 #endif
insert = 1;
+   atomic_inc(&root->orphan_inodes);
}
 
if (!BTRFS_I(inode)->orphan_meta_reserved) {
@@ -2166,6 +2167,9 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, 
struct inode *inode)
if (insert >= 1) {
ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
if (ret && ret != -EEXIST) {
+   spin_lock(&root->orphan_lock);
+   BTRFS_I(inode)->has_orphan_item = 0;
+   spin_unlock(&root->orphan_lock);
btrfs_abort_transaction(trans, root, ret);
return ret;
}
@@ -2195,13 +2199,21 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, 
struct inode *inode)
int release_rsv = 0;
int ret = 0;
 
+   /*
+* evict_inode gets called without holding the i_mutex so we need to
+* take the orphan lock to make sure we are safe in messing with these.
+*/
spin_lock(&root->orphan_lock);
-   if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-   list_del_init(&BTRFS_I(inode)->i_orphan);
-   delete_item = 1;
+   if (BTRFS_I(inode)->has_orphan_item) {
+   if (trans) {
+   BTRFS_I(inode)->has_orphan_item = 0;
+   delete_item = 1;
+  

Re: Ceph on btrfs 3.4rc

2012-05-17 Thread Martin Mailand

Hi Josef,

I hit exactly the same bug as Christian with your last patch.

-martin
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-17 Thread Christian Brunner
2012/5/17 Josef Bacik :
> On Thu, May 17, 2012 at 05:12:55PM +0200, Martin Mailand wrote:
>> Hi Josef,
>> no, there was nothing above. Here is another dmesg output.
>>
>
> Hrm ok give this a try and hopefully this is it, still couldn't reproduce.
> Thanks,
>
> Josef

Well, I hate to say it, but the new patch doesn't seem to change much...

Regards,
Christian

[  123.507444] Btrfs loaded
[  202.683630] device fsid 2aa7531c-0e3c-4955-8542-6aed7ab8c1a2 devid
1 transid 4 /dev/sda
[  202.693704] btrfs: use lzo compression
[  202.697999] btrfs: enabling inode map caching
[  202.702989] btrfs: enabling auto defrag
[  202.707190] btrfs: disk space caching is enabled
[  202.712721] btrfs flagging fs with big metadata feature
[  207.839761] device fsid f81ff6a1-c333-4daf-989f-a28139f15f08 devid
1 transid 4 /dev/sdb
[  207.849681] btrfs: use lzo compression
[  207.853987] btrfs: enabling inode map caching
[  207.858970] btrfs: enabling auto defrag
[  207.863173] btrfs: disk space caching is enabled
[  207.868635] btrfs flagging fs with big metadata feature
[  210.857328] device fsid 9b905faa-f4fa-4626-9cae-2cd0287b30f7 devid
1 transid 4 /dev/sdc
[  210.867265] btrfs: use lzo compression
[  210.871560] btrfs: enabling inode map caching
[  210.876550] btrfs: enabling auto defrag
[  210.880757] btrfs: disk space caching is enabled
[  210.886228] btrfs flagging fs with big metadata feature
[  214.296287] device fsid f7990e4c-90b0-4691-9502-92b60538574a devid
1 transid 4 /dev/sdd
[  214.306510] btrfs: use lzo compression
[  214.310855] btrfs: enabling inode map caching
[  214.315905] btrfs: enabling auto defrag
[  214.320174] btrfs: disk space caching is enabled
[  214.325706] btrfs flagging fs with big metadata feature
[ 1337.937379] [ cut here ]
[ 1337.942526] kernel BUG at fs/btrfs/inode.c:2224!
[ 1337.947671] invalid opcode:  [#1] SMP
[ 1337.952255] CPU 5
[ 1337.954300] Modules linked in: btrfs zlib_deflate libcrc32c xfs
exportfs sunrpc bonding ipv6 sg pcspkr serio_raw iTCO_wdt
iTCO_vendor_support iomemory_vsl(PO) ixgbe dca mdio i7core_edac
edac_core hpsa squashfs [last unloaded: scsi_wait_scan]
[ 1337.978570]
[ 1337.980230] Pid: 6812, comm: ceph-osd Tainted: P   O
3.3.5-1.fits.1.el6.x86_64 #1 HP ProLiant DL180 G6
[ 1337.991592] RIP: 0010:[]  []
btrfs_orphan_del+0x14c/0x150 [btrfs]
[ 1338.001897] RSP: 0018:8805e1171d38  EFLAGS: 00010282
[ 1338.007815] RAX: fffe RBX: 88061c3c8400 RCX: 00b37f48
[ 1338.015768] RDX: 00b37f47 RSI: 8805ec2a1cf0 RDI: ea0017b0a840
[ 1338.023724] RBP: 8805e1171d68 R08: 60f9d88028a0 R09: a033016a
[ 1338.031675] R10:  R11: 0004 R12: 8805de7f57a0
[ 1338.039629] R13: 0001 R14: 0001 R15: 8805ec2a5280
[ 1338.047584] FS:  7f4bffc6e700() GS:8806272a()
knlGS:
[ 1338.056600] CS:  0010 DS:  ES:  CR0: 80050033
[ 1338.063003] CR2: ff600400 CR3: 0005e34c3000 CR4: 06e0
[ 1338.070954] DR0:  DR1:  DR2: 
[ 1338.078909] DR3:  DR6: 0ff0 DR7: 0400
[ 1338.086865] Process ceph-osd (pid: 6812, threadinfo
8805e117, task 88060fa81940)
[ 1338.096268] Stack:
[ 1338.098509]  8805e1171d68 8805ec2a5280 88051235b920

[ 1338.106795]  88051235b920 0008 8805e1171e08
a036043c
[ 1338.115082]    
00011000
[ 1338.123367] Call Trace:
[ 1338.126111]  [] btrfs_truncate+0x5bc/0x640 [btrfs]
[ 1338.133213]  [] btrfs_setattr+0xf6/0x1a0 [btrfs]
[ 1338.140105]  [] notify_change+0x18b/0x2b0
[ 1338.146320]  [] ? selinux_inode_permission+0xd1/0x130
[ 1338.153699]  [] do_truncate+0x64/0xa0
[ 1338.159527]  [] ? inode_permission+0x49/0x100
[ 1338.166128]  [] sys_truncate+0x137/0x150
[ 1338.172244]  [] system_call_fastpath+0x16/0x1b
[ 1338.178936] Code: 89 e7 e8 88 7d fe ff eb 89 66 0f 1f 44 00 00 be
a4 08 00 00 48 c7 c7 59 49 3b a0 45 31 ed e8 5c 78 cf e0 45 31 f6 e9
30 ff ff ff <0f> 0b eb fe 55 48 89 e5 48 83 ec 40 48 89 5d d8 4c 89 65
e0 4c
[ 1338.200623] RIP  [] btrfs_orphan_del+0x14c/0x150 [btrfs]
[ 1338.208317]  RSP 
[ 1338.212681] ---[ end trace 86be14f0f863ea79 ]---
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-17 Thread Josef Bacik
On Thu, May 17, 2012 at 05:12:55PM +0200, Martin Mailand wrote:
> Hi Josef,
> no, there was nothing above. Here is another dmesg output.
> 

Hrm ok give this a try and hopefully this is it, still couldn't reproduce.
Thanks,

Josef

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3771b85..559e716 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -57,9 +57,6 @@ struct btrfs_inode {
/* used to order data wrt metadata */
struct btrfs_ordered_inode_tree ordered_tree;
 
-   /* for keeping track of orphaned inodes */
-   struct list_head i_orphan;
-
/* list of all the delalloc inodes in the FS.  There are times we need
 * to write all the delalloc pages to disk, and this list is used
 * to walk them all.
@@ -153,6 +150,7 @@ struct btrfs_inode {
unsigned dummy_inode:1;
unsigned in_defrag:1;
unsigned delalloc_meta_reserved:1;
+   unsigned has_orphan_item:1;
 
/*
 * always compress this one file
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ba8743b..72cdf98 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1375,7 +1375,7 @@ struct btrfs_root {
struct list_head root_list;
 
spinlock_t orphan_lock;
-   struct list_head orphan_list;
+   atomic_t orphan_inodes;
struct btrfs_block_rsv *orphan_block_rsv;
int orphan_item_inserted;
int orphan_cleanup_state;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 19f5b45..25dba7a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1153,7 +1153,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 
sectorsize,
root->orphan_block_rsv = NULL;
 
INIT_LIST_HEAD(&root->dirty_list);
-   INIT_LIST_HEAD(&root->orphan_list);
INIT_LIST_HEAD(&root->root_list);
spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
@@ -1166,6 +1165,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 
sectorsize,
atomic_set(&root->log_commit[0], 0);
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
+   atomic_set(&root->orphan_inodes, 0);
root->log_batch = 0;
root->log_transid = 0;
root->last_log_commit = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 54ae3df..7cc1c96 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2104,12 +2104,12 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle 
*trans,
struct btrfs_block_rsv *block_rsv;
int ret;
 
-   if (!list_empty(&root->orphan_list) ||
+   if (atomic_read(&root->orphan_inodes) ||
root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
return;
 
spin_lock(&root->orphan_lock);
-   if (!list_empty(&root->orphan_list)) {
+   if (atomic_read(&root->orphan_inodes)) {
spin_unlock(&root->orphan_lock);
return;
}
@@ -2166,8 +2166,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, 
struct inode *inode)
block_rsv = NULL;
}
 
-   if (list_empty(&BTRFS_I(inode)->i_orphan)) {
-   list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+   if (!BTRFS_I(inode)->has_orphan_item) {
+   BTRFS_I(inode)->has_orphan_item = 1;
 #if 0
/*
 * For proper ENOSPC handling, we should do orphan
@@ -2180,6 +2180,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, 
struct inode *inode)
insert = 1;
 #endif
insert = 1;
+   atomic_inc(&root->orphan_inodes);
}
 
if (!BTRFS_I(inode)->orphan_meta_reserved) {
@@ -2198,6 +2199,9 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, 
struct inode *inode)
if (insert >= 1) {
ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
if (ret && ret != -EEXIST) {
+   spin_lock(&root->orphan_lock);
+   BTRFS_I(inode)->has_orphan_item = 0;
+   spin_unlock(&root->orphan_lock);
btrfs_abort_transaction(trans, root, ret);
return ret;
}
@@ -2227,13 +2231,21 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, 
struct inode *inode)
int release_rsv = 0;
int ret = 0;
 
+   /*
+* evict_inode gets called without holding the i_mutex so we need to
+* take the orphan lock to make sure we are safe in messing with these.
+*/
spin_lock(&root->orphan_lock);
-   if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-   list_del_init(&BTRFS_I(inode)->i_orphan);
-   delete_item = 1;
+   if (BTRFS_I(inode)->has_orphan_item) {
+   if (trans) {
+   BTRFS_I(inode)->has_orphan_item = 0;
+   delete_item = 1;
+   } else {
+ 

Re: Ceph on btrfs 3.4rc

2012-05-17 Thread Martin Mailand

Hi Josef,
no, there was nothing above. Here is another dmesg output.


Was there anything above those messages?  There should have been a WARN_ON() or
something.  If not that's fine, I just need to know one way or the other so I can
figure out what to do next.  Thanks,

Josef


-martin

[   63.027277] Btrfs loaded
[   63.027485] device fsid 266726e1-439f-4d89-a374-7ef92d355daf devid 1 
transid 4 /dev/sdc

[   63.027750] btrfs: setting nodatacow
[   63.027752] btrfs: enabling auto defrag
[   63.027753] btrfs: disk space caching is enabled
[   63.027754] btrfs flagging fs with big metadata feature
[   63.036347] device fsid 070e2c6c-2ea5-478d-bc07-7ce3a954e2e4 devid 1 
transid 4 /dev/sdd

[   63.036624] btrfs: setting nodatacow
[   63.036626] btrfs: enabling auto defrag
[   63.036627] btrfs: disk space caching is enabled
[   63.036628] btrfs flagging fs with big metadata feature
[   63.045628] device fsid 6f7b82a9-a1b7-40c6-8b00-2c2a44481066 devid 1 
transid 4 /dev/sde

[   63.045910] btrfs: setting nodatacow
[   63.045912] btrfs: enabling auto defrag
[   63.045913] btrfs: disk space caching is enabled
[   63.045914] btrfs flagging fs with big metadata feature
[   63.831278] device fsid 46890b76-45c2-4ea2-96ee-2ea88e29628b devid 1 
transid 4 /dev/sdf

[   63.831577] btrfs: setting nodatacow
[   63.831579] btrfs: enabling auto defrag
[   63.831579] btrfs: disk space caching is enabled
[   63.831580] btrfs flagging fs with big metadata feature
[ 1521.820412] [ cut here ]
[ 1521.820424] kernel BUG at fs/btrfs/inode.c:2220!
[ 1521.820433] invalid opcode:  [#1] SMP
[ 1521.820448] CPU 4
[ 1521.820452] Modules linked in: btrfs zlib_deflate libcrc32c ext2 ses 
enclosure bonding coretemp ghash_clmulni_intel aesni_intel cryptd 
aes_x86_64 psmouse microcode serio_raw sb_edac edac_core mei(C) joydev 
ioatdma mac_hid lp parport isci libsas scsi_transport_sas usbhid hid 
ixgbe igb dca megaraid_sas mdio

[ 1521.820562]
[ 1521.820567] Pid: 3095, comm: ceph-osd Tainted: G C 
3.4.0-rc7+ #10 Supermicro X9SRi/X9SRi
[ 1521.820591] RIP: 0010:[]  [] 
btrfs_orphan_del+0xe2/0xf0 [btrfs]

[ 1521.820616] RSP: 0018:881013da9d18  EFLAGS: 00010282
[ 1521.820626] RAX: fffe RBX: 881013a3b7f0 RCX: 
00395dcf
[ 1521.820640] RDX: 00395dce RSI: 88101df77480 RDI: 
ea004077ddc0
[ 1521.820654] RBP: 881013da9d58 R08: 60ef800010d0 R09: 
a022ac6a
[ 1521.820667] R10:  R11: 010a R12: 
88101e378790
[ 1521.820681] R13: 88101e378400 R14: 0001 R15: 
0001
[ 1521.820695] FS:  7faa45d30700() GS:88107fc8() 
knlGS:

[ 1521.820710] CS:  0010 DS:  ES:  CR0: 80050033
[ 1521.820738] CR2: 7fe0efba6010 CR3: 001016fec000 CR4: 
000407e0
[ 1521.820767] DR0:  DR1:  DR2: 

[ 1521.820796] DR3:  DR6: 0ff0 DR7: 
0400
[ 1521.820825] Process ceph-osd (pid: 3095, threadinfo 881013da8000, 
task 881013da44a0)

[ 1521.820870] Stack:
[ 1521.820889]  0c05 88101df9c230 881013da9d38 
88101df9c230
[ 1521.820939]   88101e378400 881013a3b7f0 
880c6880f840
[ 1521.820988]  881013da9e08 a0257628 881013a3b7f0 


[ 1521.821038] Call Trace:
[ 1521.821066]  [] btrfs_truncate+0x4d8/0x650 [btrfs]
[ 1521.821096]  [] ? path_lookupat+0x6d/0x750
[ 1521.821128]  [] btrfs_setattr+0xc1/0x1b0 [btrfs]
[ 1521.821156]  [] notify_change+0x183/0x320
[ 1521.821183]  [] do_truncate+0x5e/0xa0
[ 1521.821209]  [] sys_truncate+0x144/0x1b0
[ 1521.821237]  [] system_call_fastpath+0x16/0x1b
[ 1521.821265] Code: e8 4c 8b 75 f0 4c 8b 7d f8 c9 c3 66 0f 1f 44 00 00 
80 bb 60 fe ff ff 84 75 b4 eb ae 0f 1f 44 00 00 48 89 df e8 50 73 fe ff 
eb b8 <0f> 0b 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 48 83 ec

[ 1521.821458] RIP  [] btrfs_orphan_del+0xe2/0xf0 [btrfs]
[ 1521.821492]  RSP 
[ 1521.821758] ---[ end trace aee4c5fe92ee2a67 ]---
[ 6888.637508] btrfs: truncated 1 orphans
[ 7641.701736] [ cut here ]
[ 7641.701764] kernel BUG at fs/btrfs/inode.c:2220!
[ 7641.701789] invalid opcode:  [#2] SMP
[ 7641.701816] CPU 3
[ 7641.701819] Modules linked in: btrfs zlib_deflate libcrc32c ext2 ses 
enclosure bonding coretemp ghash_clmulni_intel aesni_intel cryptd 
aes_x86_64 psmouse microcode serio_raw sb_edac edac_core mei(C) joydev 
ioatdma mac_hid lp parport isci libsas scsi_transport_sas usbhid hid 
ixgbe igb dca megaraid_sas mdio

[ 7641.702000]
[ 7641.702030] Pid: 3064, comm: ceph-osd Tainted: G  D  C 
3.4.0-rc7+ #10 Supermicro X9SRi/X9SRi
[ 7641.702081] RIP: 0010:[]  [] 
btrfs_orphan_del+0xe2/0xf0 [btrfs]

[ 7641.702140] RSP: 0018:881013c51d18  EFLAGS: 00010282
[ 7641.702166] RAX: fffe RBX: 881010871130 RCX: 
013df293
[ 7641.702195] RDX: 013df292 RSI: 881

Re: Ceph on btrfs 3.4rc

2012-05-17 Thread Josef Bacik
On Thu, May 17, 2012 at 12:29:32PM +0200, Martin Mailand wrote:
> Hi Josef,
> 
> somehow I still get the kernel Bug messages, I used your patch from
> the 16th against rc7.
> 

Was there anything above those messages?  There should have been a WARN_ON() or
something.  If not that's fine, I just need to know one way or the other so I can
figure out what to do next.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-17 Thread Martin Mailand

Hi Josef,

somehow I still get the kernel Bug messages, I used your patch from the 
16th against rc7.


-martin

Am 16.05.2012 21:20, schrieb Josef Bacik:

Hrm ok so I finally got some time to try and debug it and let the test run a
good long while (5 hours almost) and I couldn't hit either the original bug or
the one you guys were hitting.  So either my extra little bit of locking did the
trick or I get to keep my "Worst reproducer ever" award.  Can you guys give this
one a whirl and if it panics send the entire dmesg since it should spit out a
WARN_ON() to let me know whether what I thought was the problem really was it.  Thanks,


[ 2868.813236] [ cut here ]
[ 2868.813297] kernel BUG at fs/btrfs/inode.c:2220!
[ 2868.813355] invalid opcode:  [#2] SMP
[ 2868.813479] CPU 2
[ 2868.813516] Modules linked in: btrfs zlib_deflate libcrc32c ext2 
bonding coretemp ghash_clmulni_intel aesni_intel cryptd aes_x86_64 
microcode psmouse serio_raw sb_edac edac_core joydev mei(C) ses ioatdma 
enclosure mac_hid lp parport isci libsas scsi_transport_sas usbhid hid 
ixgbe igb megaraid_sas dca mdio

[ 2868.814871]
[ 2868.814925] Pid: 5325, comm: ceph-osd Tainted: G  D  C 
3.4.0-rc7+ #10 Supermicro X9SRi/X9SRi
[ 2868.815108] RIP: 0010:[]  [] 
btrfs_orphan_del+0xe2/0xf0 [btrfs]

[ 2868.815236] RSP: 0018:880296e89d18  EFLAGS: 00010282
[ 2868.815294] RAX: fffe RBX: 88101ef3c390 RCX: 
00562497
[ 2868.815355] RDX: 00562496 RSI: 88101ef1 RDI: 
ea00407bc400
[ 2868.815416] RBP: 880296e89d58 R08: 60ef8fd0 R09: 
a01f8c6a
[ 2868.815476] R10:  R11: 011d R12: 
880fdf602790
[ 2868.815537] R13: 880fdf602400 R14: 0001 R15: 
0001
[ 2868.815598] FS:  7f07d5512700() GS:88107fc4() 
knlGS:

[ 2868.815675] CS:  0010 DS:  ES:  CR0: 80050033
[ 2868.815734] CR2: 0ab16000 CR3: 00082a6b2000 CR4: 
000407e0
[ 2868.815796] DR0:  DR1:  DR2: 

[ 2868.815858] DR3:  DR6: 0ff0 DR7: 
0400
[ 2868.815920] Process ceph-osd (pid: 5325, threadinfo 880296e88000, 
task 8810170616e0)

[ 2868.815997] Stack:
[ 2868.816049]  0c07 88101ef12960 880296e89d38 
88101ef12960
[ 2868.816262]   880fdf602400 88101ef3c390 
880b4ce2f260
[ 2868.816485]  880296e89e08 a0225628 88101ef3c390 


[ 2868.816694] Call Trace:
[ 2868.816755]  [] btrfs_truncate+0x4d8/0x650 [btrfs]
[ 2868.816817]  [] ? path_lookupat+0x6d/0x750
[ 2868.816880]  [] btrfs_setattr+0xc1/0x1b0 [btrfs]
[ 2868.816940]  [] notify_change+0x183/0x320
[ 2868.816998]  [] do_truncate+0x5e/0xa0
[ 2868.817056]  [] sys_truncate+0x144/0x1b0
[ 2868.817115]  [] system_call_fastpath+0x16/0x1b
[ 2868.817173] Code: e8 4c 8b 75 f0 4c 8b 7d f8 c9 c3 66 0f 1f 44 00 00 
80 bb 60 fe ff ff 84 75 b4 eb ae 0f 1f 44 00 00 48 89 df e8 50 73 fe ff 
eb b8 <0f> 0b 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 48 83 ec

[ 2868.819501] RIP  [] btrfs_orphan_del+0xe2/0xf0 [btrfs]
[ 2868.819602]  RSP 
[ 2868.819703] ---[ end trace 94d17b770b376c84 ]---
[ 3249.857453] [ cut here ]
[ 3249.857481] kernel BUG at fs/btrfs/inode.c:2220!
[ 3249.857506] invalid opcode:  [#3] SMP
[ 3249.857534] CPU 0
[ 3249.857538] Modules linked in: btrfs zlib_deflate libcrc32c ext2 
bonding coretemp ghash_clmulni_intel aesni_intel cryptd aes_x86_64 
microcode psmouse serio_raw sb_edac edac_core joydev mei(C) ses ioatdma 
enclosure mac_hid lp parport isci libsas scsi_transport_sas usbhid hid 
ixgbe igb megaraid_sas dca mdio

[ 3249.857721]
[ 3249.857740] Pid: 5384, comm: ceph-osd Tainted: G  D  C 
3.4.0-rc7+ #10 Supermicro X9SRi/X9SRi
[ 3249.857791] RIP: 0010:[]  [] 
btrfs_orphan_del+0xe2/0xf0 [btrfs]

[ 3249.857847] RSP: 0018:880abe8b5d18  EFLAGS: 00010282
[ 3249.857873] RAX: fffe RBX: 8807eb8b6670 RCX: 
0077a084
[ 3249.857902] RDX: 0077a083 RSI: 88101ee497e0 RDI: 
ea00407b9240
[ 3249.857931] RBP: 880abe8b5d58 R08: 60ef8fd0 R09: 
a01f8c6a
[ 3249.857959] R10:  R11: 0153 R12: 
880d56825390
[ 3249.857988] R13: 880d56825000 R14: 0001 R15: 
0001
[ 3249.858017] FS:  7f06bd13b700() GS:88107fc0() 
knlGS:

[ 3249.858062] CS:  0010 DS:  ES:  CR0: 80050033
[ 3249.858088] CR2: 043d2000 CR3: 000e7ebe5000 CR4: 
000407f0
[ 3249.858117] DR0:  DR1:  DR2: 

[ 3249.858146] DR3:  DR6: 0ff0 DR7: 
0400
[ 3249.858175] Process ceph-osd (pid: 5384, threadinfo 880abe8b4000, 
task 880eb7a596e0)

[ 3249.858219] Stack:
[ 3249.858239]  0c02 88101ede4d70 880abe8b5d38 
88101

Re: Ceph on btrfs 3.4rc

2012-05-16 Thread Josef Bacik
On Mon, May 14, 2012 at 10:20:48AM -0400, Josef Bacik wrote:
> On Mon, May 14, 2012 at 04:19:37PM +0200, Martin Mailand wrote:
> > Hi Josef,
> > 
> > Am 11.05.2012 21:16, schrieb Josef Bacik:
> > >Heh duh, sorry, try this one instead.  Thanks,
> > 
> > With this patch I got this Bug:
> 
> Yeah Christian reported the same thing on Friday.  I'm going to work on a 
> patch
> and actually run it here to make sure it doesn't blow up and then send it to 
> the
> list when I think I've got something that works.  Thanks,
> 

Hrm ok so I finally got some time to try and debug it and let the test run a
good long while (5 hours almost) and I couldn't hit either the original bug or
the one you guys were hitting.  So either my extra little bit of locking did the
trick or I get to keep my "Worst reproducer ever" award.  Can you guys give this
one a whirl and if it panics send the entire dmesg since it should spit out a
WARN_ON() to let me know whether what I thought was the problem really was it.  Thanks,

Josef


diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3771b85..559e716 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -57,9 +57,6 @@ struct btrfs_inode {
/* used to order data wrt metadata */
struct btrfs_ordered_inode_tree ordered_tree;
 
-   /* for keeping track of orphaned inodes */
-   struct list_head i_orphan;
-
/* list of all the delalloc inodes in the FS.  There are times we need
 * to write all the delalloc pages to disk, and this list is used
 * to walk them all.
@@ -153,6 +150,7 @@ struct btrfs_inode {
unsigned dummy_inode:1;
unsigned in_defrag:1;
unsigned delalloc_meta_reserved:1;
+   unsigned has_orphan_item:1;
 
/*
 * always compress this one file
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ba8743b..72cdf98 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1375,7 +1375,7 @@ struct btrfs_root {
struct list_head root_list;
 
spinlock_t orphan_lock;
-   struct list_head orphan_list;
+   atomic_t orphan_inodes;
struct btrfs_block_rsv *orphan_block_rsv;
int orphan_item_inserted;
int orphan_cleanup_state;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 19f5b45..25dba7a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1153,7 +1153,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 
sectorsize,
root->orphan_block_rsv = NULL;
 
INIT_LIST_HEAD(&root->dirty_list);
-   INIT_LIST_HEAD(&root->orphan_list);
INIT_LIST_HEAD(&root->root_list);
spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
@@ -1166,6 +1165,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 
sectorsize,
atomic_set(&root->log_commit[0], 0);
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
+   atomic_set(&root->orphan_inodes, 0);
root->log_batch = 0;
root->log_transid = 0;
root->last_log_commit = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 54ae3df..c0cff20 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2104,12 +2104,12 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle 
*trans,
struct btrfs_block_rsv *block_rsv;
int ret;
 
-   if (!list_empty(&root->orphan_list) ||
+   if (atomic_read(&root->orphan_inodes) ||
root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
return;
 
spin_lock(&root->orphan_lock);
-   if (!list_empty(&root->orphan_list)) {
+   if (atomic_read(&root->orphan_inodes)) {
spin_unlock(&root->orphan_lock);
return;
}
@@ -2166,8 +2166,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, 
struct inode *inode)
block_rsv = NULL;
}
 
-   if (list_empty(&BTRFS_I(inode)->i_orphan)) {
-   list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+   if (!BTRFS_I(inode)->has_orphan_item) {
+   BTRFS_I(inode)->has_orphan_item = 1;
 #if 0
/*
 * For proper ENOSPC handling, we should do orphan
@@ -2180,6 +2180,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, 
struct inode *inode)
insert = 1;
 #endif
insert = 1;
+   atomic_inc(&root->orphan_inodes);
}
 
if (!BTRFS_I(inode)->orphan_meta_reserved) {
@@ -2198,6 +2199,9 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, 
struct inode *inode)
if (insert >= 1) {
ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
if (ret && ret != -EEXIST) {
+   spin_lock(&root->orphan_lock);
+   BTRFS_I(inode)->has_orphan_item = 0;
+   spin_unlock(&root->orphan_lock);
btrfs_abort_transaction(trans, root, ret);
  

Re: Ceph on btrfs 3.4rc

2012-05-14 Thread Josef Bacik
On Mon, May 14, 2012 at 04:19:37PM +0200, Martin Mailand wrote:
> Hi Josef,
> 
> Am 11.05.2012 21:16, schrieb Josef Bacik:
> >Heh duh, sorry, try this one instead.  Thanks,
> 
> With this patch I got this Bug:

Yeah Christian reported the same thing on Friday.  I'm going to work on a patch
and actually run it here to make sure it doesn't blow up and then send it to the
list when I think I've got something that works.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-14 Thread Martin Mailand

Hi Josef,

Am 11.05.2012 21:16, schrieb Josef Bacik:

Heh duh, sorry, try this one instead.  Thanks,


With this patch I got this Bug:

[ 8233.828722] [ cut here ]
[ 8233.828737] kernel BUG at fs/btrfs/inode.c:2217!
[ 8233.828746] invalid opcode:  [#1] SMP
[ 8233.828761] CPU 1
[ 8233.828766] Modules linked in: btrfs zlib_deflate libcrc32c ses 
enclosure bonding coretemp ghash_clmulni_intel psmouse aesni_intel 
sb_edac cryptd a es_x86_64 ext2 microcode serio_raw edac_core mei(C) 
joydev ioatdma mac_hid lp parport usbhid hid isci libsas ixgbe 
scsi_transport_sas megaraid_sas igb  dca mdio

[ 8233.828885]
[ 8233.828891] Pid: , comm: ceph-osd Tainted: GWC 
3.4.0-rc6+ #6 Supermicro X9SRi/X9SRi
[ 8233.828915] RIP: 0010:[]  [] 
btrfs_orphan_del+0xe2/0xf0 [btrfs]

[ 8233.828947] RSP: 0018:88101ce53d18  EFLAGS: 00010282
[ 8233.828957] RAX: fffe RBX: 880d194e2c50 RCX: 
00d0a3be
[ 8233.828971] RDX: 00d0a3bd RSI: 88101de2a000 RDI: 
ea0040778a80
[ 8233.828985] RBP: 88101ce53d58 R08: 60ef8f00 R09: 
a0220c6a
[ 8233.828999] R10:  R11: 00f0 R12: 
88071bb1e790
[ 8233.829029] R13: 88071bb1e400 R14: 0001 R15: 
0001
[ 8233.829059] FS:  7fdfa179b700() GS:88107fc2() 
knlGS:

[ 8233.829104] CS:  0010 DS:  ES:  CR0: 80050033
[ 8233.829131] CR2: 0c614000 CR3: 0001df9d2000 CR4: 
000407e0
[ 8233.829160] DR0:  DR1:  DR2: 

[ 8233.829190] DR3:  DR6: 0ff0 DR7: 
0400
[ 8233.829220] Process ceph-osd (pid: , threadinfo 88101ce52000, 
task 88101b7b96e0)

[ 8233.829265] Stack:
[ 8233.829286]  0c02 88101de14cd0 88101ce53d38 
88101de14cd0
[ 8233.829336]   88071bb1e400 880d194e2c50 
881024680620
[ 8233.829386]  88101ce53e08 a024d608 880d194e2c50 


[ 8233.829436] Call Trace:
[ 8233.829472]  [] btrfs_truncate+0x4d8/0x650 [btrfs]
[ 8233.829503]  [] ? path_lookupat+0x6d/0x750
[ 8233.829537]  [] btrfs_setattr+0xc1/0x1b0 [btrfs]
[ 8233.829567]  [] notify_change+0x183/0x320
[ 8233.829595]  [] do_truncate+0x5e/0xa0
[ 8233.829621]  [] sys_truncate+0x144/0x1b0
[ 8233.829649]  [] system_call_fastpath+0x16/0x1b
[ 8233.829676] Code: e8 4c 8b 75 f0 4c 8b 7d f8 c9 c3 66 0f 1f 44 00 00 
80 bb 60 fe ff ff 84 75 b4 eb ae 0f 1f 44 00 00 48 89 df e8 70 73 fe ff 
eb b8  <0f> 0b 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 48 83 ec

[ 8233.829875] RIP  [] btrfs_orphan_del+0xe2/0xf0 [btrfs]
[ 8233.829914]  RSP 
[ 8233.830187] ---[ end trace 46dd4a711bf2979d ]---


-martin

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-11 Thread Josef Bacik
On Fri, May 11, 2012 at 08:33:34PM +0200, Martin Mailand wrote:
> Hi Josef,
> 
> Am 11.05.2012 15:31, schrieb Josef Bacik:
> >That previous patch was against btrfs-next, this patch is against 3.4-rc6 if 
> >you
> >are on mainline.  Thanks,
> 
> I tried your patch against mainline, after a few minutes I hit this bug.
> 

Heh duh, sorry, try this one instead.  Thanks,

Josef

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9b9b15f..54af1fa 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -57,9 +57,6 @@ struct btrfs_inode {
/* used to order data wrt metadata */
struct btrfs_ordered_inode_tree ordered_tree;
 
-   /* for keeping track of orphaned inodes */
-   struct list_head i_orphan;
-
/* list of all the delalloc inodes in the FS.  There are times we need
 * to write all the delalloc pages to disk, and this list is used
 * to walk them all.
@@ -156,6 +153,7 @@ struct btrfs_inode {
unsigned dummy_inode:1;
unsigned in_defrag:1;
unsigned delalloc_meta_reserved:1;
+   unsigned has_orphan_item:1;
 
/*
 * always compress this one file
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8fd7233..aad2600 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1375,7 +1375,7 @@ struct btrfs_root {
struct list_head root_list;
 
spinlock_t orphan_lock;
-   struct list_head orphan_list;
+   atomic_t orphan_inodes;
struct btrfs_block_rsv *orphan_block_rsv;
int orphan_item_inserted;
int orphan_cleanup_state;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a7ffc88..ff3bf4b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1153,7 +1153,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 
sectorsize,
root->orphan_block_rsv = NULL;
 
INIT_LIST_HEAD(&root->dirty_list);
-   INIT_LIST_HEAD(&root->orphan_list);
INIT_LIST_HEAD(&root->root_list);
spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
@@ -1166,6 +1165,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 
sectorsize,
atomic_set(&root->log_commit[0], 0);
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
+   atomic_set(&root->orphan_inodes, 0);
root->log_batch = 0;
root->log_transid = 0;
root->last_log_commit = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 61b16c6..5ba68d0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2072,12 +2072,12 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle 
*trans,
struct btrfs_block_rsv *block_rsv;
int ret;
 
-   if (!list_empty(&root->orphan_list) ||
+   if (atomic_read(&root->orphan_inodes) ||
root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
return;
 
spin_lock(&root->orphan_lock);
-   if (!list_empty(&root->orphan_list)) {
+   if (atomic_read(&root->orphan_inodes)) {
spin_unlock(&root->orphan_lock);
return;
}
@@ -2134,8 +2134,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, 
struct inode *inode)
block_rsv = NULL;
}
 
-   if (list_empty(&BTRFS_I(inode)->i_orphan)) {
-   list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+   if (!BTRFS_I(inode)->has_orphan_item) {
+   BTRFS_I(inode)->has_orphan_item = 1;
 #if 0
/*
 * For proper ENOSPC handling, we should do orphan
@@ -2148,6 +2148,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, 
struct inode *inode)
insert = 1;
 #endif
insert = 1;
+   atomic_inc(&root->orphan_inodes);
}
 
if (!BTRFS_I(inode)->orphan_meta_reserved) {
@@ -2195,9 +2196,13 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, 
struct inode *inode)
int release_rsv = 0;
int ret = 0;
 
+   /*
+* evict_inode gets called without holding the i_mutex so we need to
+* take the orphan lock to make sure we are safe in messing with these.
+*/
spin_lock(&root->orphan_lock);
-   if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-   list_del_init(&BTRFS_I(inode)->i_orphan);
+   if (BTRFS_I(inode)->has_orphan_item) {
+   BTRFS_I(inode)->has_orphan_item = 0;
delete_item = 1;
}
 
@@ -2215,6 +2220,9 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, 
struct inode *inode)
if (release_rsv)
btrfs_orphan_release_metadata(inode);
 
+   if (trans && delete_item)
+   atomic_dec(&root->orphan_inodes);
+
return 0;
 }
 
@@ -2352,9 +2360,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 * add this inode to the orphan list so btrfs_orphan_del does
 * the proper thing when we hit it
  

Re: Ceph on btrfs 3.4rc

2012-05-11 Thread Martin Mailand

Hi Josef,

Am 11.05.2012 15:31, schrieb Josef Bacik:

That previous patch was against btrfs-next, this patch is against 3.4-rc6 if you
are on mainline.  Thanks,


I tried your patch against mainline, after a few minutes I hit this bug.

[ 1078.523655] [ cut here ]
[ 1078.523667] kernel BUG at fs/btrfs/inode.c:2211!
[ 1078.523676] invalid opcode:  [#1] SMP
[ 1078.523692] CPU 5
[ 1078.523696] Modules linked in: btrfs zlib_deflate libcrc32c mlx4_en 
bonding ext2 coretemp ghash_clmulni_intel aesni_intel cryptd aes_x86_64 
microcode psmouse serio_raw sb_edac edac_core mei(C) joydev ses ioatdma 
enclosure mac_hid lp parport isci libsas scsi_transport_sas usbhid hid 
igb megaraid_sas mlx4_core dca

[ 1078.523813]
[ 1078.523818] Pid: 4108, comm: ceph-osd Tainted: G C 
3.4.0-rc6+ #5 Supermicro X9SRi/X9SRi
[ 1078.523841] RIP: 0010:[]  [] 
btrfs_orphan_del+0xb2/0xc0 [btrfs]

[ 1078.523867] RSP: 0018:880ff14a5d38  EFLAGS: 00010282
[ 1078.523877] RAX: fffe RBX: 880ff004d6f0 RCX: 
00117400
[ 1078.523891] RDX: 001173ff RSI: 8810279f6ea0 RDI: 
ea00409e7d80
[ 1078.523905] RBP: 880ff14a5d58 R08: 60ef80001400 R09: 
a0202c6a
[ 1078.523918] R10:  R11: 00ba R12: 
0001
[ 1078.523932] R13: 881017663c00 R14: 0001 R15: 
88101776f5a0
[ 1078.523946] FS:  7f1d2c03c700() GS:88107fca() 
knlGS:

[ 1078.523961] CS:  0010 DS:  ES:  CR0: 80050033
[ 1078.523990] CR2: 050f4000 CR3: 000ff2a57000 CR4: 
000407e0
[ 1078.524019] DR0:  DR1:  DR2: 

[ 1078.524048] DR3:  DR6: 0ff0 DR7: 
0400
[ 1078.524077] Process ceph-osd (pid: 4108, threadinfo 880ff14a4000, 
task 880ff2aa44a0)

[ 1078.524121] Stack:
[ 1078.524141]  8810279f7460  881017663c00 
880ff004d6f0
[ 1078.524190]  880ff14a5e08 a022f5d8 880ff004d6f0 

[ 1078.524240]  880ff14a5e18 81188afd 8000 
80001000

[ 1078.524289] Call Trace:
[ 1078.524317]  [] btrfs_truncate+0x4d8/0x650 [btrfs]
[ 1078.524348]  [] ? path_lookupat+0x6d/0x750
[ 1078.524380]  [] btrfs_setattr+0xc1/0x1b0 [btrfs]
[ 1078.524408]  [] notify_change+0x183/0x320
[ 1078.524435]  [] do_truncate+0x5e/0xa0
[ 1078.524461]  [] sys_truncate+0x144/0x1b0
[ 1078.524489]  [] system_call_fastpath+0x16/0x1b
[ 1078.524516] Code: 8b 65 e8 4c 8b 6d f0 4c 8b 75 f8 c9 c3 0f 1f 40 00 
80 bb 60 fe ff ff 84 75 c1 eb bb 0f 1f 44 00 00 48 89 df e8 a0 73 fe ff 
eb c1 <0f> 0b 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 48 83 ec

[ 1078.524710] RIP  [] btrfs_orphan_del+0xb2/0xc0 [btrfs]
[ 1078.524744]  RSP 
[ 1078.525013] ---[ end trace 88c92720204f7aa4 ]---


That's the drive with the broken btrfs.

[  212.843776] device fsid 28492275-01d3-4e89-9f1c-bd86057194bf devid 1 
transid 4 /dev/sdc

[  212.844630] btrfs: setting nodatacow
[  212.844637] btrfs: enabling auto defrag
[  212.844640] btrfs: disk space caching is enabled
[  212.844643] btrfs flagging fs with big metadata feature



-martin
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-11 Thread Christian Brunner
2012/5/10 Josef Bacik :
> On Fri, Apr 27, 2012 at 01:02:08PM +0200, Christian Brunner wrote:
>> Am 24. April 2012 18:26 schrieb Sage Weil :
>> > On Tue, 24 Apr 2012, Josef Bacik wrote:
>> >> On Fri, Apr 20, 2012 at 05:09:34PM +0200, Christian Brunner wrote:
>> >> > After running ceph on XFS for some time, I decided to try btrfs again.
>> >> > Performance with the current "for-linux-min" branch and big metadata
>> >> > is much better. The only problem (?) I'm still seeing is a warning
>> >> > that seems to occur from time to time:
>> >
>> > Actually, before you do that... we have a new tool,
>> > test_filestore_workloadgen, that generates a ceph-osd-like workload on the
>> > local file system.  It's a subset of what a full OSD might do, but if
>> > we're lucky it will be sufficient to reproduce this issue.  Something like
>> >
>> >  test_filestore_workloadgen --osd-data /foo --osd-journal /bar
>> >
>> > will hopefully do the trick.
>> >
>> > Christian, maybe you can see if that is able to trigger this warning?
>> > You'll need to pull it from the current master branch; it wasn't in the
>> > last release.
>>
>> Trying to reproduce with test_filestore_workloadgen didn't work for
>> me. So here are some instructions on how to reproduce with a minimal
>> ceph setup.
>> [...]
>
> Well I feel like an idiot, I finally get it to reproduce, go look at where I
> want to put my printks and there's the problem staring me right in the face.
> I've looked seriously at this problem 2 or 3 times and have missed this every
> single freaking time.  Here is the patch I'm trying, please try it on yours to
> make sure it fixes the problem.  It takes like 2 hours for it to reproduce for
> me so I won't be able to fully test it until tomorrow, but so far it hasn't
> broken anything so it should be good.  Thanks,

Great! I've put your patch on my testbox and will run a test over the
weekend. I'll report back on monday.

Thanks,
Christian
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-11 Thread Josef Bacik
On Thu, May 10, 2012 at 04:35:23PM -0400, Josef Bacik wrote:
> On Fri, Apr 27, 2012 at 01:02:08PM +0200, Christian Brunner wrote:
> > Am 24. April 2012 18:26 schrieb Sage Weil :
> > > On Tue, 24 Apr 2012, Josef Bacik wrote:
> > >> On Fri, Apr 20, 2012 at 05:09:34PM +0200, Christian Brunner wrote:
> > >> > After running ceph on XFS for some time, I decided to try btrfs again.
> > >> > Performance with the current "for-linux-min" branch and big metadata
> > >> > is much better. The only problem (?) I'm still seeing is a warning
> > >> > that seems to occur from time to time:
> > >
> > > Actually, before you do that... we have a new tool,
> > > test_filestore_workloadgen, that generates a ceph-osd-like workload on the
> > > local file system.  It's a subset of what a full OSD might do, but if
> > > we're lucky it will be sufficient to reproduce this issue.  Something like
> > >
> > >  test_filestore_workloadgen --osd-data /foo --osd-journal /bar
> > >
> > > will hopefully do the trick.
> > >
> > > Christian, maybe you can see if that is able to trigger this warning?
> > > You'll need to pull it from the current master branch; it wasn't in the
> > > last release.
> > 
> > Trying to reproduce with test_filestore_workloadgen didn't work for
> > me. So here are some instructions on how to reproduce with a minimal
> > ceph setup.
> > 
> > You will need a single system with two disks and a bit of memory.
> > 
> > - Compile and install ceph (detailed instructions:
> > http://ceph.newdream.net/docs/master/ops/install/mkcephfs/)
> > 
> > - For the test setup I've used two tmpfs files as journal devices. To
> > create these, do the following:
> > 
> > # mkdir -p /ceph/temp
> > # mount -t tmpfs tmpfs /ceph/temp
> > # dd if=/dev/zero of=/ceph/temp/journal0 count=500 bs=1024k
> > # dd if=/dev/zero of=/ceph/temp/journal1 count=500 bs=1024k
> > 
> > - Now you should create and mount btrfs. Here is what I did:
> > 
> > # mkfs.btrfs -l 64k -n 64k /dev/sda
> > # mkfs.btrfs -l 64k -n 64k /dev/sdb
> > # mkdir /ceph/osd.000
> > # mkdir /ceph/osd.001
> > # mount -o noatime,space_cache,inode_cache,autodefrag /dev/sda /ceph/osd.000
> > # mount -o noatime,space_cache,inode_cache,autodefrag /dev/sdb /ceph/osd.001
> > 
> > - Create /etc/ceph/ceph.conf similar to the attached ceph.conf. You
> > will probably have to change the btrfs devices and the hostname
> > (os39).
> > 
> > - Create the ceph filesystems:
> > 
> > # mkdir /ceph/mon
> > # mkcephfs -a -c /etc/ceph/ceph.conf
> > 
> > - Start ceph (e.g. "service ceph start")
> > 
> > - Now you should be able to use ceph - "ceph -s" will tell you about
> > the state of the ceph cluster.
> > 
> > - "rbd create -size 100 testimg" will create an rbd image on the ceph 
> > cluster.
> > 
> > - Compile my test with "gcc -o rbdtest rbdtest.c -lrbd" and run it
> > with "./rbdtest testimg".
> > 
> > I can see the first btrfs_orphan_commit_root warning after an hour or
> > so... I hope that I've described all necessary steps. If there is a
> > problem just send me a note.
> > 
> 
> Well I feel like an idiot, I finally get it to reproduce, go look at where I
> want to put my printks and there's the problem staring me right in the face.
> I've looked seriously at this problem 2 or 3 times and have missed this every
> single freaking time.  Here is the patch I'm trying, please try it on yours to
> make sure it fixes the problem.  It takes like 2 hours for it to reproduce for
> me so I won't be able to fully test it until tomorrow, but so far it hasn't
> broken anything so it should be good.  Thanks,
> 

That previous patch was against btrfs-next, this patch is against 3.4-rc6 if you
are on mainline.  Thanks,

Josef


diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9b9b15f..54af1fa 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -57,9 +57,6 @@ struct btrfs_inode {
/* used to order data wrt metadata */
struct btrfs_ordered_inode_tree ordered_tree;
 
-   /* for keeping track of orphaned inodes */
-   struct list_head i_orphan;
-
/* list of all the delalloc inodes in the FS.  There are times we need
 * to write all the delalloc pages to disk, and this list is used
 * to walk them all.
@@ -156,6 +153,7 @@ struct btrfs_inode {
unsigned dummy_inode:1;
unsigned in_defrag:1;
unsigned delalloc_meta_reserved:1;
+   unsigned has_orphan_item:1;
 
/*
 * always compress this one file
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8fd7233..aad2600 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1375,7 +1375,7 @@ struct btrfs_root {
struct list_head root_list;
 
spinlock_t orphan_lock;
-   struct list_head orphan_list;
+   atomic_t orphan_inodes;
struct btrfs_block_rsv *orphan_block_rsv;
int orphan_item_inserted;
int orphan_cleanup_state;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a7ffc88..ff3bf4b 100644
--- a/

Re: Ceph on btrfs 3.4rc

2012-05-10 Thread Josef Bacik
On Fri, Apr 27, 2012 at 01:02:08PM +0200, Christian Brunner wrote:
> Am 24. April 2012 18:26 schrieb Sage Weil :
> > On Tue, 24 Apr 2012, Josef Bacik wrote:
> >> On Fri, Apr 20, 2012 at 05:09:34PM +0200, Christian Brunner wrote:
> >> > After running ceph on XFS for some time, I decided to try btrfs again.
> >> > Performance with the current "for-linux-min" branch and big metadata
> >> > is much better. The only problem (?) I'm still seeing is a warning
> >> > that seems to occur from time to time:
> >
> > Actually, before you do that... we have a new tool,
> > test_filestore_workloadgen, that generates a ceph-osd-like workload on the
> > local file system.  It's a subset of what a full OSD might do, but if
> > we're lucky it will be sufficient to reproduce this issue.  Something like
> >
> >  test_filestore_workloadgen --osd-data /foo --osd-journal /bar
> >
> > will hopefully do the trick.
> >
> > Christian, maybe you can see if that is able to trigger this warning?
> > You'll need to pull it from the current master branch; it wasn't in the
> > last release.
> 
> Trying to reproduce with test_filestore_workloadgen didn't work for
> me. So here are some instructions on how to reproduce with a minimal
> ceph setup.
> 
> You will need a single system with two disks and a bit of memory.
> 
> - Compile and install ceph (detailed instructions:
> http://ceph.newdream.net/docs/master/ops/install/mkcephfs/)
> 
> - For the test setup I've used two tmpfs files as journal devices. To
> create these, do the following:
> 
> # mkdir -p /ceph/temp
> # mount -t tmpfs tmpfs /ceph/temp
> # dd if=/dev/zero of=/ceph/temp/journal0 count=500 bs=1024k
> # dd if=/dev/zero of=/ceph/temp/journal1 count=500 bs=1024k
> 
> - Now you should create and mount btrfs. Here is what I did:
> 
> # mkfs.btrfs -l 64k -n 64k /dev/sda
> # mkfs.btrfs -l 64k -n 64k /dev/sdb
> # mkdir /ceph/osd.000
> # mkdir /ceph/osd.001
> # mount -o noatime,space_cache,inode_cache,autodefrag /dev/sda /ceph/osd.000
> # mount -o noatime,space_cache,inode_cache,autodefrag /dev/sdb /ceph/osd.001
> 
> - Create /etc/ceph/ceph.conf similar to the attached ceph.conf. You
> will probably have to change the btrfs devices and the hostname
> (os39).
> 
> - Create the ceph filesystems:
> 
> # mkdir /ceph/mon
> # mkcephfs -a -c /etc/ceph/ceph.conf
> 
> - Start ceph (e.g. "service ceph start")
> 
> - Now you should be able to use ceph - "ceph -s" will tell you about
> the state of the ceph cluster.
> 
> - "rbd create -size 100 testimg" will create an rbd image on the ceph cluster.
> 
> - Compile my test with "gcc -o rbdtest rbdtest.c -lrbd" and run it
> with "./rbdtest testimg".
> 
> I can see the first btrfs_orphan_commit_root warning after an hour or
> so... I hope that I've described all necessary steps. If there is a
> problem just send me a note.
> 

Well I feel like an idiot, I finally get it to reproduce, go look at where I
want to put my printks and there's the problem staring me right in the face.
I've looked seriously at this problem 2 or 3 times and have missed this every
single freaking time.  Here is the patch I'm trying, please try it on yours to
make sure it fixes the problem.  It takes like 2 hours for it to reproduce for
me so I won't be able to fully test it until tomorrow, but so far it hasn't
broken anything so it should be good.  Thanks,

Josef


diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index eefe573..4ad628d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -57,9 +57,6 @@ struct btrfs_inode {
/* used to order data wrt metadata */
struct btrfs_ordered_inode_tree ordered_tree;
 
-   /* for keeping track of orphaned inodes */
-   struct list_head i_orphan;
-
/* list of all the delalloc inodes in the FS.  There are times we need
 * to write all the delalloc pages to disk, and this list is used
 * to walk them all.
@@ -164,6 +161,7 @@ struct btrfs_inode {
unsigned dummy_inode:1;
unsigned in_defrag:1;
unsigned delalloc_meta_reserved:1;
+   unsigned has_orphan_item:1;
 
/*
 * always compress this one file
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8a89888..6dd20f3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1375,7 +1375,7 @@ struct btrfs_root {
struct list_head root_list;
 
spinlock_t orphan_lock;
-   struct list_head orphan_list;
+   atomic_t orphan_inodes;
struct btrfs_block_rsv *orphan_block_rsv;
int orphan_item_inserted;
int orphan_cleanup_state;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7f849b3..8bbe8c4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1148,7 +1148,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 
sectorsize,
root->orphan_block_rsv = NULL;
 
INIT_LIST_HEAD(&root->dirty_list);
-   INIT_LIST_HEAD(&root->orphan_list);
INIT_LIST_HEAD(&root->root_list);
spin_lock_init(&root-

Re: Ceph on btrfs 3.4rc

2012-05-10 Thread Josef Bacik
On Fri, Apr 27, 2012 at 01:02:08PM +0200, Christian Brunner wrote:
> Am 24. April 2012 18:26 schrieb Sage Weil :
> > On Tue, 24 Apr 2012, Josef Bacik wrote:
> >> On Fri, Apr 20, 2012 at 05:09:34PM +0200, Christian Brunner wrote:
> >> > After running ceph on XFS for some time, I decided to try btrfs again.
> >> > Performance with the current "for-linux-min" branch and big metadata
> >> > is much better. The only problem (?) I'm still seeing is a warning
> >> > that seems to occur from time to time:
> >
> > Actually, before you do that... we have a new tool,
> > test_filestore_workloadgen, that generates a ceph-osd-like workload on the
> > local file system.  It's a subset of what a full OSD might do, but if
> > we're lucky it will be sufficient to reproduce this issue.  Something like
> >
> >  test_filestore_workloadgen --osd-data /foo --osd-journal /bar
> >
> > will hopefully do the trick.
> >
> > Christian, maybe you can see if that is able to trigger this warning?
> > You'll need to pull it from the current master branch; it wasn't in the
> > last release.
> 
> Trying to reproduce with test_filestore_workloadgen didn't work for
> me. So here are some instructions on how to reproduce with a minimal
> ceph setup.
> 
> You will need a single system with two disks and a bit of memory.
> 
> - Compile and install ceph (detailed instructions:
> http://ceph.newdream.net/docs/master/ops/install/mkcephfs/)
> 
> - For the test setup I've used two tmpfs files as journal devices. To
> create these, do the following:
> 
> # mkdir -p /ceph/temp
> # mount -t tmpfs tmpfs /ceph/temp
> # dd if=/dev/zero of=/ceph/temp/journal0 count=500 bs=1024k
> # dd if=/dev/zero of=/ceph/temp/journal1 count=500 bs=1024k
> 
> - Now you should create and mount btrfs. Here is what I did:
> 
> # mkfs.btrfs -l 64k -n 64k /dev/sda
> # mkfs.btrfs -l 64k -n 64k /dev/sdb
> # mkdir /ceph/osd.000
> # mkdir /ceph/osd.001
> # mount -o noatime,space_cache,inode_cache,autodefrag /dev/sda /ceph/osd.000
> # mount -o noatime,space_cache,inode_cache,autodefrag /dev/sdb /ceph/osd.001
> 
> - Create /etc/ceph/ceph.conf similar to the attached ceph.conf. You
> will probably have to change the btrfs devices and the hostname
> (os39).
> 
> - Create the ceph filesystems:
> 
> # mkdir /ceph/mon
> # mkcephfs -a -c /etc/ceph/ceph.conf
> 
> - Start ceph (e.g. "service ceph start")
> 
> - Now you should be able to use ceph - "ceph -s" will tell you about
> the state of the ceph cluster.
> 
> - "rbd create -size 100 testimg" will create an rbd image on the ceph cluster.
> 
> - Compile my test with "gcc -o rbdtest rbdtest.c -lrbd" and run it
> with "./rbdtest testimg".
> 
> I can see the first btrfs_orphan_commit_root warning after an hour or
> so... I hope that I've described all necessary steps. If there is a
> problem just send me a note.
> 

Well it's only taken me 2 weeks but I've finally got it all up and running,
hopefully I'll reproduce.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-09 Thread Josef Bacik
On Fri, May 04, 2012 at 10:24:16PM +0200, Christian Brunner wrote:
> 2012/5/3 Josef Bacik :
> > On Thu, May 03, 2012 at 09:38:27AM -0700, Josh Durgin wrote:
> >> On Thu, 3 May 2012 11:20:53 -0400, Josef Bacik 
> >> wrote:
> >> > On Thu, May 03, 2012 at 08:17:43AM -0700, Josh Durgin wrote:
> >> >
> >> > Yeah all that was in the right place, I rebooted and I magically
> >> > stopped getting
> >> > that error, but now I'm getting this
> >> >
> >> > http://fpaste.org/OE92/
> >> >
> >> > with that ping thing repeating over and over.  Thanks,
> >>
> >> That just looks like the osd isn't running. If you restart the
> >> osd with 'debug osd = 20' the osd log should tell us what's going on.
> >
> > Ok that part was my fault, Duh I need to redo the tmpfs and mkcephfs stuff 
> > after
> > reboot.  But now I'm back to my original problem
> >
> > http://fpaste.org/PfwO/
> >
> > I have the osd class dir = /usr/lib64/rados-classes thing set and 
> > libcls_rbd is
> > in there, so I'm not sure what is wrong.  Thanks,
> 
> That's really strange. Do you have the osd logs in /var/log/ceph? If
> so, can you check whether there is anything about "rbd" or "class" loading
> in there?
> 
> Another thing you should check is whether you can access ceph with rados:
> 
> # rados -p rbd ls
> # rados -p rbd -i /proc/cpuinfo put testobj
> # rados -p rbd -o - get testobj
>

Ok weirdly ceph is trying to dlopen /usr/lib64/rados-classes/libcls_rbd.so but
all I had was libcls_rbd.so.1 and libcls_rbd.so.1.0.0.  Symlink fixed that part,
I'll see if I can reproduce now.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-04 Thread Christian Brunner
2012/5/3 Josef Bacik :
> On Thu, May 03, 2012 at 09:38:27AM -0700, Josh Durgin wrote:
>> On Thu, 3 May 2012 11:20:53 -0400, Josef Bacik 
>> wrote:
>> > On Thu, May 03, 2012 at 08:17:43AM -0700, Josh Durgin wrote:
>> >
>> > Yeah all that was in the right place, I rebooted and I magically
>> > stopped getting
>> > that error, but now I'm getting this
>> >
>> > http://fpaste.org/OE92/
>> >
>> > with that ping thing repeating over and over.  Thanks,
>>
>> That just looks like the osd isn't running. If you restart the
>> osd with 'debug osd = 20' the osd log should tell us what's going on.
>
> Ok that part was my fault, Duh I need to redo the tmpfs and mkcephfs stuff 
> after
> reboot.  But now I'm back to my original problem
>
> http://fpaste.org/PfwO/
>
> I have the osd class dir = /usr/lib64/rados-classes thing set and libcls_rbd 
> is
> in there, so I'm not sure what is wrong.  Thanks,

That's really strange. Do you have the osd logs in /var/log/ceph? If
so, can you check whether there is anything about "rbd" or "class" loading
in there?

Another thing you should check is whether you can access ceph with rados:

# rados -p rbd ls
# rados -p rbd -i /proc/cpuinfo put testobj
# rados -p rbd -o - get testobj

Regards,
Christian
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-03 Thread Josef Bacik
On Thu, May 03, 2012 at 09:38:27AM -0700, Josh Durgin wrote:
> On Thu, 3 May 2012 11:20:53 -0400, Josef Bacik 
> wrote:
> > On Thu, May 03, 2012 at 08:17:43AM -0700, Josh Durgin wrote:
> >> On Thu, 3 May 2012 10:13:55 -0400, Josef Bacik 
> >> wrote:
> >> > On Fri, Apr 27, 2012 at 01:02:08PM +0200, Christian Brunner wrote:
> >> >> Am 24. April 2012 18:26 schrieb Sage Weil :
> >> >> > On Tue, 24 Apr 2012, Josef Bacik wrote:
> >> >> >> On Fri, Apr 20, 2012 at 05:09:34PM +0200, Christian Brunner wrote:
> >> >> >> > After running ceph on XFS for some time, I decided to try btrfs 
> >> >> >> > again.
> >> >> >> > Performance with the current "for-linux-min" branch and big 
> >> >> >> > metadata
> >> >> >> > is much better. The only problem (?) I'm still seeing is a warning
> >> >> >> > that seems to occur from time to time:
> >> >> >
> >> >> > Actually, before you do that... we have a new tool,
> >> >> > test_filestore_workloadgen, that generates a ceph-osd-like workload 
> >> >> > on the
> >> >> > local file system.  It's a subset of what a full OSD might do, but if
> >> >> > we're lucky it will be sufficient to reproduce this issue.  Something 
> >> >> > like
> >> >> >
> >> >> >  test_filestore_workloadgen --osd-data /foo --osd-journal /bar
> >> >> >
> >> >> > will hopefully do the trick.
> >> >> >
> >> >> > Christian, maybe you can see if that is able to trigger this warning?
> >> >> > You'll need to pull it from the current master branch; it wasn't in 
> >> >> > the
> >> >> > last release.
> >> >>
> >> >> Trying to reproduce with test_filestore_workloadgen didn't work for
> >> >> me. So here are some instructions on how to reproduce with a minimal
> >> >> ceph setup.
> >> >>
> >> >> You will need a single system with two disks and a bit of memory.
> >> >>
> >> >> - Compile and install ceph (detailed instructions:
> >> >> http://ceph.newdream.net/docs/master/ops/install/mkcephfs/)
> >> >>
> >> >> - For the test setup I've used two tmpfs files as journal devices. To
> >> >> create these, do the following:
> >> >>
> >> >> # mkdir -p /ceph/temp
> >> >> # mount -t tmpfs tmpfs /ceph/temp
> >> >> # dd if=/dev/zero of=/ceph/temp/journal0 count=500 bs=1024k
> >> >> # dd if=/dev/zero of=/ceph/temp/journal1 count=500 bs=1024k
> >> >>
> >> >> - Now you should create and mount btrfs. Here is what I did:
> >> >>
> >> >> # mkfs.btrfs -l 64k -n 64k /dev/sda
> >> >> # mkfs.btrfs -l 64k -n 64k /dev/sdb
> >> >> # mkdir /ceph/osd.000
> >> >> # mkdir /ceph/osd.001
> >> >> # mount -o noatime,space_cache,inode_cache,autodefrag /dev/sda 
> >> >> /ceph/osd.000
> >> >> # mount -o noatime,space_cache,inode_cache,autodefrag /dev/sdb 
> >> >> /ceph/osd.001
> >> >>
> >> >> - Create /etc/ceph/ceph.conf similar to the attached ceph.conf. You
> >> >> will probably have to change the btrfs devices and the hostname
> >> >> (os39).
> >> >>
> >> >> - Create the ceph filesystems:
> >> >>
> >> >> # mkdir /ceph/mon
> >> >> # mkcephfs -a -c /etc/ceph/ceph.conf
> >> >>
> >> >> - Start ceph (e.g. "service ceph start")
> >> >>
> >> >> - Now you should be able to use ceph - "ceph -s" will tell you about
> >> >> the state of the ceph cluster.
> >> >>
> >> >> - "rbd create -size 100 testimg" will create an rbd image on the ceph 
> >> >> cluster.
> >> >>
> >> >
> >> > It's failing here
> >> >
> >> > http://fpaste.org/e3BG/
> >>
> >> 2012-05-03 10:11:28.818308 7fcb5a0ee700 -- 127.0.0.1:0/1003269 <==
> >> osd.1 127.0.0.1:6803/2379 3  osd_op_reply(3 rbd_info [call] = -5
> >> (Input/output error)) v4  107+0+0 (3948821281 0 0) 0x7fcb380009a0
> >> con 0x1cad3e0
> >>
> >> This is probably because the osd isn't finding the rbd class.
> >> Do you have 'rbd_cls.so' in /usr/lib64/rados-classes? Wherever
> >> rbd_cls.so is,
> >> try adding 'osd class dir = /path/to/rados-classes' to the [osd]
> >> section
> >> in your ceph.conf, and restarting the osds.
> >>
> >> If you set 'debug osd = 10' you should see '_load_class rbd' in the osd
> >> log
> >> when you try to create an rbd image.
> >>
> >> Autotools should be setting the default location correctly, but if
> >> you're
> >> running the osds in a chroot or something the path would be wrong.
> >>
> > 
> > Yeah all that was in the right place, I rebooted and I magically
> > stopped getting
> > that error, but now I'm getting this
> > 
> > http://fpaste.org/OE92/
> > 
> > with that ping thing repeating over and over.  Thanks,
> 
> That just looks like the osd isn't running. If you restart the
> osd with 'debug osd = 20' the osd log should tell us what's going on.

OK, that part was my fault. Duh, I need to redo the tmpfs and mkcephfs stuff after
a reboot.  But now I'm back to my original problem:

http://fpaste.org/PfwO/

I have the osd class dir = /usr/lib64/rados-classes thing set and libcls_rbd is
in there, so I'm not sure what is wrong.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Ceph on btrfs 3.4rc

2012-05-03 Thread Josh Durgin
On Thu, 3 May 2012 10:13:55 -0400, Josef Bacik 
wrote:
> On Fri, Apr 27, 2012 at 01:02:08PM +0200, Christian Brunner wrote:
>> Am 24. April 2012 18:26 schrieb Sage Weil :
>> > On Tue, 24 Apr 2012, Josef Bacik wrote:
>> >> On Fri, Apr 20, 2012 at 05:09:34PM +0200, Christian Brunner wrote:
>> >> > After running ceph on XFS for some time, I decided to try btrfs again.
>> >> > Performance with the current "for-linux-min" branch and big metadata
>> >> > is much better. The only problem (?) I'm still seeing is a warning
>> >> > that seems to occur from time to time:
>> >
>> > Actually, before you do that... we have a new tool,
>> > test_filestore_workloadgen, that generates a ceph-osd-like workload on the
>> > local file system.  It's a subset of what a full OSD might do, but if
>> > we're lucky it will be sufficient to reproduce this issue.  Something like
>> >
>> >  test_filestore_workloadgen --osd-data /foo --osd-journal /bar
>> >
>> > will hopefully do the trick.
>> >
>> > Christian, maybe you can see if that is able to trigger this warning?
>> > You'll need to pull it from the current master branch; it wasn't in the
>> > last release.
>>
>> Trying to reproduce with test_filestore_workloadgen didn't work for
>> me. So here are some instructions on how to reproduce with a minimal
>> ceph setup.
>>
>> You will need a single system with two disks and a bit of memory.
>>
>> - Compile and install ceph (detailed instructions:
>> http://ceph.newdream.net/docs/master/ops/install/mkcephfs/)
>>
>> - For the test setup I've used two tmpfs files as journal devices. To
>> create these, do the following:
>>
>> # mkdir -p /ceph/temp
>> # mount -t tmpfs tmpfs /ceph/temp
>> # dd if=/dev/zero of=/ceph/temp/journal0 count=500 bs=1024k
>> # dd if=/dev/zero of=/ceph/temp/journal1 count=500 bs=1024k
>>
>> - Now you should create and mount btrfs. Here is what I did:
>>
>> # mkfs.btrfs -l 64k -n 64k /dev/sda
>> # mkfs.btrfs -l 64k -n 64k /dev/sdb
>> # mkdir /ceph/osd.000
>> # mkdir /ceph/osd.001
>> # mount -o noatime,space_cache,inode_cache,autodefrag /dev/sda /ceph/osd.000
>> # mount -o noatime,space_cache,inode_cache,autodefrag /dev/sdb /ceph/osd.001
>>
>> - Create /etc/ceph/ceph.conf similar to the attached ceph.conf. You
>> will probably have to change the btrfs devices and the hostname
>> (os39).
>>
>> - Create the ceph filesystems:
>>
>> # mkdir /ceph/mon
>> # mkcephfs -a -c /etc/ceph/ceph.conf
>>
>> - Start ceph (e.g. "service ceph start")
>>
>> - Now you should be able to use ceph - "ceph -s" will tell you about
>> the state of the ceph cluster.
>>
>> - "rbd create -size 100 testimg" will create an rbd image on the ceph 
>> cluster.
>>
> 
> It's failing here
> 
> http://fpaste.org/e3BG/

2012-05-03 10:11:28.818308 7fcb5a0ee700 -- 127.0.0.1:0/1003269 <==
osd.1 127.0.0.1:6803/2379 3  osd_op_reply(3 rbd_info [call] = -5
(Input/output error)) v4  107+0+0 (3948821281 0 0) 0x7fcb380009a0
con 0x1cad3e0

This is probably because the osd isn't finding the rbd class.
Do you have 'rbd_cls.so' in /usr/lib64/rados-classes? Wherever
rbd_cls.so is,
try adding 'osd class dir = /path/to/rados-classes' to the [osd]
section
in your ceph.conf, and restarting the osds.

If you set 'debug osd = 10' you should see '_load_class rbd' in the osd
log
when you try to create an rbd image.

Autotools should be setting the default location correctly, but if
you're
running the osds in a chroot or something the path would be wrong.

Josh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-03 Thread Josh Durgin
On Thu, 3 May 2012 11:20:53 -0400, Josef Bacik 
wrote:
> On Thu, May 03, 2012 at 08:17:43AM -0700, Josh Durgin wrote:
>> On Thu, 3 May 2012 10:13:55 -0400, Josef Bacik 
>> wrote:
>> > On Fri, Apr 27, 2012 at 01:02:08PM +0200, Christian Brunner wrote:
>> >> Am 24. April 2012 18:26 schrieb Sage Weil :
>> >> > On Tue, 24 Apr 2012, Josef Bacik wrote:
>> >> >> On Fri, Apr 20, 2012 at 05:09:34PM +0200, Christian Brunner wrote:
>> >> >> > After running ceph on XFS for some time, I decided to try btrfs 
>> >> >> > again.
>> >> >> > Performance with the current "for-linux-min" branch and big metadata
>> >> >> > is much better. The only problem (?) I'm still seeing is a warning
>> >> >> > that seems to occur from time to time:
>> >> >
>> >> > Actually, before you do that... we have a new tool,
>> >> > test_filestore_workloadgen, that generates a ceph-osd-like workload on 
>> >> > the
>> >> > local file system.  It's a subset of what a full OSD might do, but if
>> >> > we're lucky it will be sufficient to reproduce this issue.  Something 
>> >> > like
>> >> >
>> >> >  test_filestore_workloadgen --osd-data /foo --osd-journal /bar
>> >> >
>> >> > will hopefully do the trick.
>> >> >
>> >> > Christian, maybe you can see if that is able to trigger this warning?
>> >> > You'll need to pull it from the current master branch; it wasn't in the
>> >> > last release.
>> >>
>> >> Trying to reproduce with test_filestore_workloadgen didn't work for
>> >> me. So here are some instructions on how to reproduce with a minimal
>> >> ceph setup.
>> >>
>> >> You will need a single system with two disks and a bit of memory.
>> >>
>> >> - Compile and install ceph (detailed instructions:
>> >> http://ceph.newdream.net/docs/master/ops/install/mkcephfs/)
>> >>
>> >> - For the test setup I've used two tmpfs files as journal devices. To
>> >> create these, do the following:
>> >>
>> >> # mkdir -p /ceph/temp
>> >> # mount -t tmpfs tmpfs /ceph/temp
>> >> # dd if=/dev/zero of=/ceph/temp/journal0 count=500 bs=1024k
>> >> # dd if=/dev/zero of=/ceph/temp/journal1 count=500 bs=1024k
>> >>
>> >> - Now you should create and mount btrfs. Here is what I did:
>> >>
>> >> # mkfs.btrfs -l 64k -n 64k /dev/sda
>> >> # mkfs.btrfs -l 64k -n 64k /dev/sdb
>> >> # mkdir /ceph/osd.000
>> >> # mkdir /ceph/osd.001
>> >> # mount -o noatime,space_cache,inode_cache,autodefrag /dev/sda 
>> >> /ceph/osd.000
>> >> # mount -o noatime,space_cache,inode_cache,autodefrag /dev/sdb 
>> >> /ceph/osd.001
>> >>
>> >> - Create /etc/ceph/ceph.conf similar to the attached ceph.conf. You
>> >> will probably have to change the btrfs devices and the hostname
>> >> (os39).
>> >>
>> >> - Create the ceph filesystems:
>> >>
>> >> # mkdir /ceph/mon
>> >> # mkcephfs -a -c /etc/ceph/ceph.conf
>> >>
>> >> - Start ceph (e.g. "service ceph start")
>> >>
>> >> - Now you should be able to use ceph - "ceph -s" will tell you about
>> >> the state of the ceph cluster.
>> >>
>> >> - "rbd create -size 100 testimg" will create an rbd image on the ceph 
>> >> cluster.
>> >>
>> >
>> > It's failing here
>> >
>> > http://fpaste.org/e3BG/
>>
>> 2012-05-03 10:11:28.818308 7fcb5a0ee700 -- 127.0.0.1:0/1003269 <==
>> osd.1 127.0.0.1:6803/2379 3  osd_op_reply(3 rbd_info [call] = -5
>> (Input/output error)) v4  107+0+0 (3948821281 0 0) 0x7fcb380009a0
>> con 0x1cad3e0
>>
>> This is probably because the osd isn't finding the rbd class.
>> Do you have 'rbd_cls.so' in /usr/lib64/rados-classes? Wherever
>> rbd_cls.so is,
>> try adding 'osd class dir = /path/to/rados-classes' to the [osd]
>> section
>> in your ceph.conf, and restarting the osds.
>>
>> If you set 'debug osd = 10' you should see '_load_class rbd' in the osd
>> log
>> when you try to create an rbd image.
>>
>> Autotools should be setting the default location correctly, but if
>> you're
>> running the osds in a chroot or something the path would be wrong.
>>
> 
> Yeah all that was in the right place, I rebooted and I magically
> stopped getting
> that error, but now I'm getting this
> 
> http://fpaste.org/OE92/
> 
> with that ping thing repeating over and over.  Thanks,

That just looks like the osd isn't running. If you restart the
osd with 'debug osd = 20' the osd log should tell us what's going on.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-03 Thread Josef Bacik
On Thu, May 03, 2012 at 08:17:43AM -0700, Josh Durgin wrote:
> On Thu, 3 May 2012 10:13:55 -0400, Josef Bacik 
> wrote:
> > On Fri, Apr 27, 2012 at 01:02:08PM +0200, Christian Brunner wrote:
> >> Am 24. April 2012 18:26 schrieb Sage Weil :
> >> > On Tue, 24 Apr 2012, Josef Bacik wrote:
> >> >> On Fri, Apr 20, 2012 at 05:09:34PM +0200, Christian Brunner wrote:
> >> >> > After running ceph on XFS for some time, I decided to try btrfs again.
> >> >> > Performance with the current "for-linux-min" branch and big metadata
> >> >> > is much better. The only problem (?) I'm still seeing is a warning
> >> >> > that seems to occur from time to time:
> >> >
> >> > Actually, before you do that... we have a new tool,
> >> > test_filestore_workloadgen, that generates a ceph-osd-like workload on 
> >> > the
> >> > local file system.  It's a subset of what a full OSD might do, but if
> >> > we're lucky it will be sufficient to reproduce this issue.  Something 
> >> > like
> >> >
> >> >  test_filestore_workloadgen --osd-data /foo --osd-journal /bar
> >> >
> >> > will hopefully do the trick.
> >> >
> >> > Christian, maybe you can see if that is able to trigger this warning?
> >> > You'll need to pull it from the current master branch; it wasn't in the
> >> > last release.
> >>
> >> Trying to reproduce with test_filestore_workloadgen didn't work for
> >> me. So here are some instructions on how to reproduce with a minimal
> >> ceph setup.
> >>
> >> You will need a single system with two disks and a bit of memory.
> >>
> >> - Compile and install ceph (detailed instructions:
> >> http://ceph.newdream.net/docs/master/ops/install/mkcephfs/)
> >>
> >> - For the test setup I've used two tmpfs files as journal devices. To
> >> create these, do the following:
> >>
> >> # mkdir -p /ceph/temp
> >> # mount -t tmpfs tmpfs /ceph/temp
> >> # dd if=/dev/zero of=/ceph/temp/journal0 count=500 bs=1024k
> >> # dd if=/dev/zero of=/ceph/temp/journal1 count=500 bs=1024k
> >>
> >> - Now you should create and mount btrfs. Here is what I did:
> >>
> >> # mkfs.btrfs -l 64k -n 64k /dev/sda
> >> # mkfs.btrfs -l 64k -n 64k /dev/sdb
> >> # mkdir /ceph/osd.000
> >> # mkdir /ceph/osd.001
> >> # mount -o noatime,space_cache,inode_cache,autodefrag /dev/sda 
> >> /ceph/osd.000
> >> # mount -o noatime,space_cache,inode_cache,autodefrag /dev/sdb 
> >> /ceph/osd.001
> >>
> >> - Create /etc/ceph/ceph.conf similar to the attached ceph.conf. You
> >> will probably have to change the btrfs devices and the hostname
> >> (os39).
> >>
> >> - Create the ceph filesystems:
> >>
> >> # mkdir /ceph/mon
> >> # mkcephfs -a -c /etc/ceph/ceph.conf
> >>
> >> - Start ceph (e.g. "service ceph start")
> >>
> >> - Now you should be able to use ceph - "ceph -s" will tell you about
> >> the state of the ceph cluster.
> >>
> >> - "rbd create -size 100 testimg" will create an rbd image on the ceph 
> >> cluster.
> >>
> > 
> > It's failing here
> > 
> > http://fpaste.org/e3BG/
> 
> 2012-05-03 10:11:28.818308 7fcb5a0ee700 -- 127.0.0.1:0/1003269 <==
> osd.1 127.0.0.1:6803/2379 3  osd_op_reply(3 rbd_info [call] = -5
> (Input/output error)) v4  107+0+0 (3948821281 0 0) 0x7fcb380009a0
> con 0x1cad3e0
> 
> This is probably because the osd isn't finding the rbd class.
> Do you have 'rbd_cls.so' in /usr/lib64/rados-classes? Wherever
> rbd_cls.so is,
> try adding 'osd class dir = /path/to/rados-classes' to the [osd]
> section
> in your ceph.conf, and restarting the osds.
> 
> If you set 'debug osd = 10' you should see '_load_class rbd' in the osd
> log
> when you try to create an rbd image.
> 
> Autotools should be setting the default location correctly, but if
> you're
> running the osds in a chroot or something the path would be wrong.
> 

Yeah all that was in the right place, I rebooted and I magically stopped getting
that error, but now I'm getting this

http://fpaste.org/OE92/

with that ping thing repeating over and over.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-05-03 Thread Josef Bacik
On Fri, Apr 27, 2012 at 01:02:08PM +0200, Christian Brunner wrote:
> Am 24. April 2012 18:26 schrieb Sage Weil :
> > On Tue, 24 Apr 2012, Josef Bacik wrote:
> >> On Fri, Apr 20, 2012 at 05:09:34PM +0200, Christian Brunner wrote:
> >> > After running ceph on XFS for some time, I decided to try btrfs again.
> >> > Performance with the current "for-linux-min" branch and big metadata
> >> > is much better. The only problem (?) I'm still seeing is a warning
> >> > that seems to occur from time to time:
> >
> > Actually, before you do that... we have a new tool,
> > test_filestore_workloadgen, that generates a ceph-osd-like workload on the
> > local file system.  It's a subset of what a full OSD might do, but if
> > we're lucky it will be sufficient to reproduce this issue.  Something like
> >
> >  test_filestore_workloadgen --osd-data /foo --osd-journal /bar
> >
> > will hopefully do the trick.
> >
> > Christian, maybe you can see if that is able to trigger this warning?
> > You'll need to pull it from the current master branch; it wasn't in the
> > last release.
> 
> Trying to reproduce with test_filestore_workloadgen didn't work for
> me. So here are some instructions on how to reproduce with a minimal
> ceph setup.
> 
> You will need a single system with two disks and a bit of memory.
> 
> - Compile and install ceph (detailed instructions:
> http://ceph.newdream.net/docs/master/ops/install/mkcephfs/)
> 
> - For the test setup I've used two tmpfs files as journal devices. To
> create these, do the following:
> 
> # mkdir -p /ceph/temp
> # mount -t tmpfs tmpfs /ceph/temp
> # dd if=/dev/zero of=/ceph/temp/journal0 count=500 bs=1024k
> # dd if=/dev/zero of=/ceph/temp/journal1 count=500 bs=1024k
> 
> - Now you should create and mount btrfs. Here is what I did:
> 
> # mkfs.btrfs -l 64k -n 64k /dev/sda
> # mkfs.btrfs -l 64k -n 64k /dev/sdb
> # mkdir /ceph/osd.000
> # mkdir /ceph/osd.001
> # mount -o noatime,space_cache,inode_cache,autodefrag /dev/sda /ceph/osd.000
> # mount -o noatime,space_cache,inode_cache,autodefrag /dev/sdb /ceph/osd.001
> 
> - Create /etc/ceph/ceph.conf similar to the attached ceph.conf. You
> will probably have to change the btrfs devices and the hostname
> (os39).
> 
> - Create the ceph filesystems:
> 
> # mkdir /ceph/mon
> # mkcephfs -a -c /etc/ceph/ceph.conf
> 
> - Start ceph (e.g. "service ceph start")
> 
> - Now you should be able to use ceph - "ceph -s" will tell you about
> the state of the ceph cluster.
> 
> - "rbd create -size 100 testimg" will create an rbd image on the ceph cluster.
> 

It's failing here

http://fpaste.org/e3BG/

Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-04-30 Thread Christian Brunner
2012/4/29 tsuna :
> On Fri, Apr 20, 2012 at 8:09 AM, Christian Brunner
>  wrote:
>> After running ceph on XFS for some time, I decided to try btrfs again.
>> Performance with the current "for-linux-min" branch and big metadata
>> is much better.
>
> I've heard that although performance from btrfs is better at first, it
> degrades over time due to metadata fragmentation, whereas XFS'
> performance starts off a little worse, but remains stable even after
> weeks of heavy utilization.  Would be curious to hear your (or
> others') feedback on that topic.

Metadata fragmentation was a big problem (for us) in the past. With
the "big metadata feature" (mkfs.btrfs -l 64k -n 64k) these problems
seem to be solved. We do not use it in production yet, but my stress
test didn't show any degradation. The only remaining issues I've seen
are these warnings.

Regards,
Christian
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-04-29 Thread tsuna
On Fri, Apr 20, 2012 at 8:09 AM, Christian Brunner
 wrote:
> After running ceph on XFS for some time, I decided to try btrfs again.
> Performance with the current "for-linux-min" branch and big metadata
> is much better.

I've heard that although performance from btrfs is better at first, it
degrades over time due to metadata fragmentation, whereas XFS'
performance starts off a little worse, but remains stable even after
weeks of heavy utilization.  Would be curious to hear your (or
others') feedback on that topic.

-- 
Benoit "tsuna" Sigoure
Software Engineer @ www.StumbleUpon.com
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-04-27 Thread Christian Brunner
Am 24. April 2012 18:26 schrieb Sage Weil :
> On Tue, 24 Apr 2012, Josef Bacik wrote:
>> On Fri, Apr 20, 2012 at 05:09:34PM +0200, Christian Brunner wrote:
>> > After running ceph on XFS for some time, I decided to try btrfs again.
>> > Performance with the current "for-linux-min" branch and big metadata
>> > is much better. The only problem (?) I'm still seeing is a warning
>> > that seems to occur from time to time:
>
> Actually, before you do that... we have a new tool,
> test_filestore_workloadgen, that generates a ceph-osd-like workload on the
> local file system.  It's a subset of what a full OSD might do, but if
> we're lucky it will be sufficient to reproduce this issue.  Something like
>
>  test_filestore_workloadgen --osd-data /foo --osd-journal /bar
>
> will hopefully do the trick.
>
> Christian, maybe you can see if that is able to trigger this warning?
> You'll need to pull it from the current master branch; it wasn't in the
> last release.

Trying to reproduce with test_filestore_workloadgen didn't work for
me. So here are some instructions on how to reproduce with a minimal
ceph setup.

You will need a single system with two disks and a bit of memory.

- Compile and install ceph (detailed instructions:
http://ceph.newdream.net/docs/master/ops/install/mkcephfs/)

- For the test setup I've used two tmpfs files as journal devices. To
create these, do the following:

# mkdir -p /ceph/temp
# mount -t tmpfs tmpfs /ceph/temp
# dd if=/dev/zero of=/ceph/temp/journal0 count=500 bs=1024k
# dd if=/dev/zero of=/ceph/temp/journal1 count=500 bs=1024k

- Now you should create and mount btrfs. Here is what I did:

# mkfs.btrfs -l 64k -n 64k /dev/sda
# mkfs.btrfs -l 64k -n 64k /dev/sdb
# mkdir /ceph/osd.000
# mkdir /ceph/osd.001
# mount -o noatime,space_cache,inode_cache,autodefrag /dev/sda /ceph/osd.000
# mount -o noatime,space_cache,inode_cache,autodefrag /dev/sdb /ceph/osd.001

- Create /etc/ceph/ceph.conf similar to the attached ceph.conf. You
will probably have to change the btrfs devices and the hostname
(os39).

- Create the ceph filesystems:

# mkdir /ceph/mon
# mkcephfs -a -c /etc/ceph/ceph.conf

- Start ceph (e.g. "service ceph start")

- Now you should be able to use ceph - "ceph -s" will tell you about
the state of the ceph cluster.

- "rbd create -size 100 testimg" will create an rbd image on the ceph cluster.

- Compile my test with "gcc -o rbdtest rbdtest.c -lrbd" and run it
with "./rbdtest testimg".

I can see the first btrfs_orphan_commit_root warning after an hour or
so... I hope that I've described all necessary steps. If there is a
problem just send me a note.

Thanks,
Christian


ceph.conf
Description: Binary data


Re: Ceph on btrfs 3.4rc

2012-04-24 Thread Neil Horman
On Tue, Apr 24, 2012 at 01:33:44PM -0400, Josef Bacik wrote:
> On Tue, Apr 24, 2012 at 09:26:15AM -0700, Sage Weil wrote:
> > On Tue, 24 Apr 2012, Josef Bacik wrote:
> > > On Fri, Apr 20, 2012 at 05:09:34PM +0200, Christian Brunner wrote:
> > > > After running ceph on XFS for some time, I decided to try btrfs again.
> > > > Performance with the current "for-linux-min" branch and big metadata
> > > > is much better. The only problem (?) I'm still seeing is a warning
> > > > that seems to occur from time to time:
> > 
> > Actually, before you do that... we have a new tool, 
> > test_filestore_workloadgen, that generates a ceph-osd-like workload on the 
> > local file system.  It's a subset of what a full OSD might do, but if 
> > we're lucky it will be sufficient to reproduce this issue.  Something like
> > 
> >  test_filestore_workloadgen --osd-data /foo --osd-journal /bar
> > 
> > will hopefully do the trick.
> > 
> > Christian, maybe you can see if that is able to trigger this warning?  
> > You'll need to pull it from the current master branch; it wasn't in the 
> > last release.
> > 
> 
> Keep up the good work Sage, at this rate I'll never have to setup ceph for
> myself :),
> 
You can set up another OSD on daedalus if you're looking for something to do,
Josef :)
Neil

> Josef
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-04-24 Thread Josef Bacik
On Tue, Apr 24, 2012 at 09:26:15AM -0700, Sage Weil wrote:
> On Tue, 24 Apr 2012, Josef Bacik wrote:
> > On Fri, Apr 20, 2012 at 05:09:34PM +0200, Christian Brunner wrote:
> > > After running ceph on XFS for some time, I decided to try btrfs again.
> > > Performance with the current "for-linux-min" branch and big metadata
> > > is much better. The only problem (?) I'm still seeing is a warning
> > > that seems to occur from time to time:
> 
> Actually, before you do that... we have a new tool, 
> test_filestore_workloadgen, that generates a ceph-osd-like workload on the 
> local file system.  It's a subset of what a full OSD might do, but if 
> we're lucky it will be sufficient to reproduce this issue.  Something like
> 
>  test_filestore_workloadgen --osd-data /foo --osd-journal /bar
> 
> will hopefully do the trick.
> 
> Christian, maybe you can see if that is able to trigger this warning?  
> You'll need to pull it from the current master branch; it wasn't in the 
> last release.
> 

Keep up the good work, Sage; at this rate I'll never have to set up ceph for
myself :),

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-04-24 Thread Sage Weil
On Tue, 24 Apr 2012, Josef Bacik wrote:
> On Fri, Apr 20, 2012 at 05:09:34PM +0200, Christian Brunner wrote:
> > After running ceph on XFS for some time, I decided to try btrfs again.
> > Performance with the current "for-linux-min" branch and big metadata
> > is much better. The only problem (?) I'm still seeing is a warning
> > that seems to occur from time to time:

Actually, before you do that... we have a new tool, 
test_filestore_workloadgen, that generates a ceph-osd-like workload on the 
local file system.  It's a subset of what a full OSD might do, but if 
we're lucky it will be sufficient to reproduce this issue.  Something like

 test_filestore_workloadgen --osd-data /foo --osd-journal /bar

will hopefully do the trick.

Christian, maybe you can see if that is able to trigger this warning?  
You'll need to pull it from the current master branch; it wasn't in the 
last release.

Thanks!
sage


> > 
> > [87703.784552] [ cut here ]
> > [87703.789759] WARNING: at fs/btrfs/inode.c:2103
> > btrfs_orphan_commit_root+0xf6/0x100 [btrfs]()
> > [87703.799070] Hardware name: ProLiant DL180 G6
> > [87703.804024] Modules linked in: btrfs zlib_deflate libcrc32c xfs
> > exportfs sunrpc bonding ipv6 sg serio_raw pcspkr iTCO_wdt
> > iTCO_vendor_support i7core_edac edac_core ixgbe dca mdio
> > iomemory_vsl(PO) hpsa squashfs [last unloaded: scsi_wait_scan]
> > [87703.828166] Pid: 929, comm: kworker/1:2 Tainted: P   O
> > 3.3.2-1.fits.1.el6.x86_64 #1
> > [87703.837513] Call Trace:
> > [87703.840280]  [] warn_slowpath_common+0x7f/0xc0
> > [87703.847016]  [] warn_slowpath_null+0x1a/0x20
> > [87703.853533]  [] btrfs_orphan_commit_root+0xf6/0x100 
> > [btrfs]
> > [87703.861541]  [] commit_fs_roots+0xc6/0x1c0 [btrfs]
> > [87703.868674]  []
> > btrfs_commit_transaction+0x5db/0xa50 [btrfs]
> > [87703.876745]  [] ? __switch_to+0x153/0x440
> > [87703.882966]  [] ? wake_up_bit+0x40/0x40
> > [87703.888997]  [] ?
> > btrfs_commit_transaction+0xa50/0xa50 [btrfs]
> > [87703.897271]  [] do_async_commit+0x1f/0x30 [btrfs]
> > [87703.904262]  [] process_one_work+0x129/0x450
> > [87703.910777]  [] worker_thread+0x17b/0x3c0
> > [87703.916991]  [] ? manage_workers+0x220/0x220
> > [87703.923504]  [] kthread+0x9e/0xb0
> > [87703.928952]  [] kernel_thread_helper+0x4/0x10
> > [87703.93]  [] ? 
> > kthread_freezable_should_stop+0x70/0x70
> > [87703.943323]  [] ? gs_change+0x13/0x13
> > [87703.949149] ---[ end trace b8c31966cca731fa ]---
> > [91128.812399] [ cut here ]
> > [91128.817576] WARNING: at fs/btrfs/inode.c:2103
> > btrfs_orphan_commit_root+0xf6/0x100 [btrfs]()
> > [91128.826930] Hardware name: ProLiant DL180 G6
> > [91128.831897] Modules linked in: btrfs zlib_deflate libcrc32c xfs
> > exportfs sunrpc bonding ipv6 sg serio_raw pcspkr iTCO_wdt
> > iTCO_vendor_support i7core_edac edac_core ixgbe dca mdio
> > iomemory_vsl(PO) hpsa squashfs [last unloaded: scsi_wait_scan]
> > [91128.856086] Pid: 6806, comm: btrfs-transacti Tainted: PW  O
> > 3.3.2-1.fits.1.el6.x86_64 #1
> > [91128.865912] Call Trace:
> > [91128.868670]  [] warn_slowpath_common+0x7f/0xc0
> > [91128.875379]  [] warn_slowpath_null+0x1a/0x20
> > [91128.881900]  [] btrfs_orphan_commit_root+0xf6/0x100 
> > [btrfs]
> > [91128.889894]  [] commit_fs_roots+0xc6/0x1c0 [btrfs]
> > [91128.897019]  [] ?
> > btrfs_run_delayed_items+0xf1/0x160 [btrfs]
> > [91128.905075]  []
> > btrfs_commit_transaction+0x5db/0xa50 [btrfs]
> > [91128.913156]  [] ? start_transaction+0x92/0x310 [btrfs]
> > [91128.920643]  [] ? wake_up_bit+0x40/0x40
> > [91128.926667]  [] transaction_kthread+0x26b/0x2e0 [btrfs]
> > [91128.934254]  [] ?
> > btrfs_destroy_marked_extents.clone.0+0x1f0/0x1f0 [btrfs]
> > [91128.943671]  [] ?
> > btrfs_destroy_marked_extents.clone.0+0x1f0/0x1f0 [btrfs]
> > [91128.953079]  [] kthread+0x9e/0xb0
> > [91128.958532]  [] kernel_thread_helper+0x4/0x10
> > [91128.965133]  [] ? 
> > kthread_freezable_should_stop+0x70/0x70
> > [91128.972913]  [] ? gs_change+0x13/0x13
> > [91128.978826] ---[ end trace b8c31966cca731fb ]---
> > 
> > I'm able to reproduce this with ceph on a single server with 4 disks
> > (4 filesystems/osds) and a small test program based on librbd. It is
> > simply writing random bytes on a rbd volume (see attachment).
> > 
> > > Is this something I should care about? Any hints on solving this
> > > would be appreciated.
> > 
> 
> Can you send me a config or some basic steps for me to setup ceph on my box 
> so I
> can run this program and finally track down this problem?  Thanks,
> 
> Josef
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-04-24 Thread Josef Bacik
On Fri, Apr 20, 2012 at 05:09:34PM +0200, Christian Brunner wrote:
> After running ceph on XFS for some time, I decided to try btrfs again.
> Performance with the current "for-linux-min" branch and big metadata
> is much better. The only problem (?) I'm still seeing is a warning
> that seems to occur from time to time:
> 
> [87703.784552] [ cut here ]
> [87703.789759] WARNING: at fs/btrfs/inode.c:2103
> btrfs_orphan_commit_root+0xf6/0x100 [btrfs]()
> [87703.799070] Hardware name: ProLiant DL180 G6
> [87703.804024] Modules linked in: btrfs zlib_deflate libcrc32c xfs
> exportfs sunrpc bonding ipv6 sg serio_raw pcspkr iTCO_wdt
> iTCO_vendor_support i7core_edac edac_core ixgbe dca mdio
> iomemory_vsl(PO) hpsa squashfs [last unloaded: scsi_wait_scan]
> [87703.828166] Pid: 929, comm: kworker/1:2 Tainted: P   O
> 3.3.2-1.fits.1.el6.x86_64 #1
> [87703.837513] Call Trace:
> [87703.840280]  [] warn_slowpath_common+0x7f/0xc0
> [87703.847016]  [] warn_slowpath_null+0x1a/0x20
> [87703.853533]  [] btrfs_orphan_commit_root+0xf6/0x100 
> [btrfs]
> [87703.861541]  [] commit_fs_roots+0xc6/0x1c0 [btrfs]
> [87703.868674]  []
> btrfs_commit_transaction+0x5db/0xa50 [btrfs]
> [87703.876745]  [] ? __switch_to+0x153/0x440
> [87703.882966]  [] ? wake_up_bit+0x40/0x40
> [87703.888997]  [] ?
> btrfs_commit_transaction+0xa50/0xa50 [btrfs]
> [87703.897271]  [] do_async_commit+0x1f/0x30 [btrfs]
> [87703.904262]  [] process_one_work+0x129/0x450
> [87703.910777]  [] worker_thread+0x17b/0x3c0
> [87703.916991]  [] ? manage_workers+0x220/0x220
> [87703.923504]  [] kthread+0x9e/0xb0
> [87703.928952]  [] kernel_thread_helper+0x4/0x10
> [87703.93]  [] ? kthread_freezable_should_stop+0x70/0x70
> [87703.943323]  [] ? gs_change+0x13/0x13
> [87703.949149] ---[ end trace b8c31966cca731fa ]---
> [91128.812399] [ cut here ]
> [91128.817576] WARNING: at fs/btrfs/inode.c:2103
> btrfs_orphan_commit_root+0xf6/0x100 [btrfs]()
> [91128.826930] Hardware name: ProLiant DL180 G6
> [91128.831897] Modules linked in: btrfs zlib_deflate libcrc32c xfs
> exportfs sunrpc bonding ipv6 sg serio_raw pcspkr iTCO_wdt
> iTCO_vendor_support i7core_edac edac_core ixgbe dca mdio
> iomemory_vsl(PO) hpsa squashfs [last unloaded: scsi_wait_scan]
> [91128.856086] Pid: 6806, comm: btrfs-transacti Tainted: PW  O
> 3.3.2-1.fits.1.el6.x86_64 #1
> [91128.865912] Call Trace:
> [91128.868670]  [] warn_slowpath_common+0x7f/0xc0
> [91128.875379]  [] warn_slowpath_null+0x1a/0x20
> [91128.881900]  [] btrfs_orphan_commit_root+0xf6/0x100 
> [btrfs]
> [91128.889894]  [] commit_fs_roots+0xc6/0x1c0 [btrfs]
> [91128.897019]  [] ?
> btrfs_run_delayed_items+0xf1/0x160 [btrfs]
> [91128.905075]  []
> btrfs_commit_transaction+0x5db/0xa50 [btrfs]
> [91128.913156]  [] ? start_transaction+0x92/0x310 [btrfs]
> [91128.920643]  [] ? wake_up_bit+0x40/0x40
> [91128.926667]  [] transaction_kthread+0x26b/0x2e0 [btrfs]
> [91128.934254]  [] ?
> btrfs_destroy_marked_extents.clone.0+0x1f0/0x1f0 [btrfs]
> [91128.943671]  [] ?
> btrfs_destroy_marked_extents.clone.0+0x1f0/0x1f0 [btrfs]
> [91128.953079]  [] kthread+0x9e/0xb0
> [91128.958532]  [] kernel_thread_helper+0x4/0x10
> [91128.965133]  [] ? kthread_freezable_should_stop+0x70/0x70
> [91128.972913]  [] ? gs_change+0x13/0x13
> [91128.978826] ---[ end trace b8c31966cca731fb ]---
> 
> I'm able to reproduce this with ceph on a single server with 4 disks
> (4 filesystems/osds) and a small test program based on librbd. It is
> simply writing random bytes on a rbd volume (see attachment).
> 
> Is this something I should care about? Any hint's on solving this
> would be appreciated.
> 

Can you send me a config or some basic steps for me to set up ceph on my box so I
can run this program and finally track down this problem?  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Ceph on btrfs 3.4rc

2012-04-23 Thread Christian Brunner
I decided to run the test over the weekend. The good news is that the
system is still running without performance degradation. But in the
meantime I've got over 5000 WARNINGs of this kind:

[330700.043557] btrfs: block rsv returned -28
[330700.043559] ------------[ cut here ]------------
[330700.048898] WARNING: at fs/btrfs/extent-tree.c:6220
btrfs_alloc_free_block+0x357/0x370 [btrfs]()
[330700.058880] Hardware name: ProLiant DL180 G6
[330700.064044] Modules linked in: btrfs zlib_deflate libcrc32c xfs
exportfs sunrpc bonding ipv6 sg serio_raw pcspkr iTCO_wdt
iTCO_vendor_support i7core_edac edac_core ixgbe dca mdio
iomemory_vsl(PO) hpsa squashfs [last unloaded: scsi_wait_scan]
[330700.090361] Pid: 7954, comm: btrfs-endio-wri Tainted: PW
O 3.3.2-1.fits.1.el6.x86_64 #1
[330700.100393] Call Trace:
[330700.103263]  [] warn_slowpath_common+0x7f/0xc0
[330700.110201]  [] warn_slowpath_null+0x1a/0x20
[330700.116905]  [] btrfs_alloc_free_block+0x357/0x370 [btrfs]
[330700.124988]  [] ? __btrfs_cow_block+0x330/0x530 [btrfs]
[330700.132787]  [] ?
btrfs_add_delayed_data_ref+0x64/0x1c0 [btrfs]
[330700.141369]  [] ? read_extent_buffer+0xbb/0x120 [btrfs]
[330700.149194]  [] ?
btrfs_token_item_offset+0x5d/0xe0 [btrfs]
[330700.157373]  [] __btrfs_cow_block+0x133/0x530 [btrfs]
[330700.165023]  [] ?
read_block_for_search+0x14d/0x3d0 [btrfs]
[330700.173183]  [] btrfs_cow_block+0xf4/0x1f0 [btrfs]
[330700.180552]  [] btrfs_search_slot+0x3e8/0x8e0 [btrfs]
[330700.188128]  [] btrfs_lookup_csum+0x74/0x170 [btrfs]
[330700.195634]  [] ? kmem_cache_alloc+0x105/0x130
[330700.202551]  [] btrfs_csum_file_blocks+0xd0/0x6d0 [btrfs]
[330700.210542]  [] ? clear_extent_bit+0x161/0x420 [btrfs]
[330700.218237]  [] add_pending_csums+0x49/0x70 [btrfs]
[330700.225706]  []
btrfs_finish_ordered_io+0x276/0x3d0 [btrfs]
[330700.233940]  []
btrfs_writepage_end_io_hook+0x4c/0xa0 [btrfs]
[330700.242345]  [] end_extent_writepage+0x69/0x100 [btrfs]
[330700.250192]  [] end_bio_extent_writepage+0x66/0xa0 [btrfs]
[330700.258327]  [] bio_endio+0x1d/0x40
[330700.264214]  [] end_workqueue_fn+0x45/0x50 [btrfs]
[330700.271612]  [] worker_loop+0x14f/0x5a0 [btrfs]
[330700.278672]  [] ? btrfs_queue_worker+0x300/0x300 [btrfs]
[330700.286582]  [] ? btrfs_queue_worker+0x300/0x300 [btrfs]
[330700.294535]  [] kthread+0x9e/0xb0
[330700.300244]  [] kernel_thread_helper+0x4/0x10
[330700.307031]  [] ? kthread_freezable_should_stop+0x70/0x70
[330700.315061]  [] ? gs_change+0x13/0x13
[330700.321167] ---[ end trace b8c31966cca74ca0 ]---

The filesystems have plenty of free space:

/dev/sda  1.9T   16G  1.8T   1% /ceph/osd.000
/dev/sdb  1.9T   15G  1.8T   1% /ceph/osd.001
/dev/sdc  1.9T   13G  1.8T   1% /ceph/osd.002
/dev/sdd  1.9T   14G  1.8T   1% /ceph/osd.003

# btrfs fi df /ceph/osd.000
Data: total=38.01GB, used=15.53GB
System, DUP: total=8.00MB, used=64.00KB
System: total=4.00MB, used=0.00
Metadata, DUP: total=37.50GB, used=82.19MB
Metadata: total=8.00MB, used=0.00

A few more btrfs_orphan_commit_root WARNINGS are present too. If
needed I could upload the messages file.

Regards,
Christian

Am 20. April 2012 17:09 schrieb Christian Brunner :
> After running ceph on XFS for some time, I decided to try btrfs again.
> Performance with the current "for-linux-min" branch and big metadata
> is much better. The only problem (?) I'm still seeing is a warning
> that seems to occur from time to time:
>
> [87703.784552] [ cut here ]
> [87703.789759] WARNING: at fs/btrfs/inode.c:2103
> btrfs_orphan_commit_root+0xf6/0x100 [btrfs]()
> [87703.799070] Hardware name: ProLiant DL180 G6
> [87703.804024] Modules linked in: btrfs zlib_deflate libcrc32c xfs
> exportfs sunrpc bonding ipv6 sg serio_raw pcspkr iTCO_wdt
> iTCO_vendor_support i7core_edac edac_core ixgbe dca mdio
> iomemory_vsl(PO) hpsa squashfs [last unloaded: scsi_wait_scan]
> [87703.828166] Pid: 929, comm: kworker/1:2 Tainted: P           O
> 3.3.2-1.fits.1.el6.x86_64 #1
> [87703.837513] Call Trace:
> [87703.840280]  [] warn_slowpath_common+0x7f/0xc0
> [87703.847016]  [] warn_slowpath_null+0x1a/0x20
> [87703.853533]  [] btrfs_orphan_commit_root+0xf6/0x100 
> [btrfs]
> [87703.861541]  [] commit_fs_roots+0xc6/0x1c0 [btrfs]
> [87703.868674]  []
> btrfs_commit_transaction+0x5db/0xa50 [btrfs]
> [87703.876745]  [] ? __switch_to+0x153/0x440
> [87703.882966]  [] ? wake_up_bit+0x40/0x40
> [87703.888997]  [] ?
> btrfs_commit_transaction+0xa50/0xa50 [btrfs]
> [87703.897271]  [] do_async_commit+0x1f/0x30 [btrfs]
> [87703.904262]  [] process_one_work+0x129/0x450
> [87703.910777]  [] worker_thread+0x17b/0x3c0
> [87703.916991]  [] ? manage_workers+0x220/0x220
> [87703.923504]  [] kthread+0x9e/0xb0
> [87703.928952]  [] kernel_thread_helper+0x4/0x10
> [87703.93]  [] ? kthread_freezable_should_stop+0x70/0x70
> [87703.943323]  [] ? gs_change+0x13/0x13
> [87703.949149] ---[ end trace b8c31966cca731fa ]---
> [91128.812399] [ cut here ]
> [91128.817

Ceph on btrfs 3.4rc

2012-04-20 Thread Christian Brunner
After running ceph on XFS for some time, I decided to try btrfs again.
Performance with the current "for-linux-min" branch and big metadata
is much better. The only problem (?) I'm still seeing is a warning
that seems to occur from time to time:

[87703.784552] ------------[ cut here ]------------
[87703.789759] WARNING: at fs/btrfs/inode.c:2103
btrfs_orphan_commit_root+0xf6/0x100 [btrfs]()
[87703.799070] Hardware name: ProLiant DL180 G6
[87703.804024] Modules linked in: btrfs zlib_deflate libcrc32c xfs
exportfs sunrpc bonding ipv6 sg serio_raw pcspkr iTCO_wdt
iTCO_vendor_support i7core_edac edac_core ixgbe dca mdio
iomemory_vsl(PO) hpsa squashfs [last unloaded: scsi_wait_scan]
[87703.828166] Pid: 929, comm: kworker/1:2 Tainted: P   O
3.3.2-1.fits.1.el6.x86_64 #1
[87703.837513] Call Trace:
[87703.840280]  [] warn_slowpath_common+0x7f/0xc0
[87703.847016]  [] warn_slowpath_null+0x1a/0x20
[87703.853533]  [] btrfs_orphan_commit_root+0xf6/0x100 [btrfs]
[87703.861541]  [] commit_fs_roots+0xc6/0x1c0 [btrfs]
[87703.868674]  []
btrfs_commit_transaction+0x5db/0xa50 [btrfs]
[87703.876745]  [] ? __switch_to+0x153/0x440
[87703.882966]  [] ? wake_up_bit+0x40/0x40
[87703.888997]  [] ?
btrfs_commit_transaction+0xa50/0xa50 [btrfs]
[87703.897271]  [] do_async_commit+0x1f/0x30 [btrfs]
[87703.904262]  [] process_one_work+0x129/0x450
[87703.910777]  [] worker_thread+0x17b/0x3c0
[87703.916991]  [] ? manage_workers+0x220/0x220
[87703.923504]  [] kthread+0x9e/0xb0
[87703.928952]  [] kernel_thread_helper+0x4/0x10
[87703.93]  [] ? kthread_freezable_should_stop+0x70/0x70
[87703.943323]  [] ? gs_change+0x13/0x13
[87703.949149] ---[ end trace b8c31966cca731fa ]---
[91128.812399] ------------[ cut here ]------------
[91128.817576] WARNING: at fs/btrfs/inode.c:2103
btrfs_orphan_commit_root+0xf6/0x100 [btrfs]()
[91128.826930] Hardware name: ProLiant DL180 G6
[91128.831897] Modules linked in: btrfs zlib_deflate libcrc32c xfs
exportfs sunrpc bonding ipv6 sg serio_raw pcspkr iTCO_wdt
iTCO_vendor_support i7core_edac edac_core ixgbe dca mdio
iomemory_vsl(PO) hpsa squashfs [last unloaded: scsi_wait_scan]
[91128.856086] Pid: 6806, comm: btrfs-transacti Tainted: PW  O
3.3.2-1.fits.1.el6.x86_64 #1
[91128.865912] Call Trace:
[91128.868670]  [] warn_slowpath_common+0x7f/0xc0
[91128.875379]  [] warn_slowpath_null+0x1a/0x20
[91128.881900]  [] btrfs_orphan_commit_root+0xf6/0x100 [btrfs]
[91128.889894]  [] commit_fs_roots+0xc6/0x1c0 [btrfs]
[91128.897019]  [] ?
btrfs_run_delayed_items+0xf1/0x160 [btrfs]
[91128.905075]  []
btrfs_commit_transaction+0x5db/0xa50 [btrfs]
[91128.913156]  [] ? start_transaction+0x92/0x310 [btrfs]
[91128.920643]  [] ? wake_up_bit+0x40/0x40
[91128.926667]  [] transaction_kthread+0x26b/0x2e0 [btrfs]
[91128.934254]  [] ?
btrfs_destroy_marked_extents.clone.0+0x1f0/0x1f0 [btrfs]
[91128.943671]  [] ?
btrfs_destroy_marked_extents.clone.0+0x1f0/0x1f0 [btrfs]
[91128.953079]  [] kthread+0x9e/0xb0
[91128.958532]  [] kernel_thread_helper+0x4/0x10
[91128.965133]  [] ? kthread_freezable_should_stop+0x70/0x70
[91128.972913]  [] ? gs_change+0x13/0x13
[91128.978826] ---[ end trace b8c31966cca731fb ]---

I'm able to reproduce this with ceph on a single server with 4 disks
(4 filesystems/osds) and a small test program based on librbd. It is
simply writing random bytes on a rbd volume (see attachment).

Is this something I should care about? Any hints on solving this
would be appreciated.

Thanks,
Christian
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include <rados/librados.h>
#include <rbd/librbd.h>

/*
 * Number of writes counted by main() since the last report.
 * It is incremented in main() and read/reset in alarm_handler(), so it is
 * declared volatile sig_atomic_t: the only object type the C standard lets a
 * signal handler and the interrupted code share safely.
 */
volatile sig_atomic_t nr_writes=0;

/*
 * SIGALRM handler: prints "Writes/sec: <nr_writes/10>\n" to stderr, resets
 * the counter and re-arms a 10 second alarm.
 *
 * The line is formatted by hand and emitted with write(2) because fprintf()
 * is not async-signal-safe; write() and alarm() are.  errno is saved and
 * restored so the interrupted code never sees a value clobbered in here.
 *
 * sig is the delivered signal number; it is not used.
 */
void
alarm_handler(int sig) {
	static const char prefix[] = "Writes/sec: ";
	/* prefix + optional sign + up to 20 digits + newline fits comfortably */
	char line[sizeof prefix + 24];
	const size_t end = sizeof line;
	size_t pos = end;
	size_t i;
	int saved_errno = errno;
	/* the report covers the 10 seconds since the previous alarm */
	long rate = (long)nr_writes / 10;
	unsigned long digits = rate < 0 ? 0UL - (unsigned long)rate
					: (unsigned long)rate;

	(void)sig;

	/* build the line back to front: newline, digits, sign, prefix */
	line[--pos] = '\n';
	do {
		line[--pos] = (char)('0' + (int)(digits % 10));
		digits /= 10;
	} while (digits != 0);
	if (rate < 0)
		line[--pos] = '-';
	for (i = sizeof prefix - 1; i > 0; i--)
		line[--pos] = prefix[i - 1];

	if (write(STDERR_FILENO, line + pos, end - pos) < 0) {
		/* nothing async-signal-safe to do about a failed report */
	}

	nr_writes = 0;
	alarm(10);
	errno = saved_errno;
}


/*
 * Write-load generator for an RBD image.
 *
 * Connects to the cluster using the default configuration, opens the image
 * named by argv[1] in pool "rbd" and writes the single byte "a" at random
 * offsets until a write fails.  Every write that succeeds bumps nr_writes;
 * alarm_handler() reports and resets that counter every 10 seconds.
 *
 * Returns 1 on a usage error, on any setup failure, or once a write fails.
 * The write loop has no successful exit.
 */
int main(int argc, char *argv[]) {
	/*
	 * Writes land in [0, MAX_OFFSET), i.e. the first 10 MiB of the image.
	 * The original redefined RAND_MAX to this value, which does not change
	 * what rand() returns; the bound has to be applied explicitly.
	 * NOTE(review): assumes the image is at least 10 MiB - confirm.
	 */
	enum { MAX_OFFSET = 10485760 };
	rados_t cluster;
	rados_ioctx_t io_ctx;
	rbd_image_t image;
	const char *pool = "rbd";
	const char *imgname;
	int r;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <image-name>\n",
			argc > 0 ? argv[0] : "rbd-write-test");
		return 1;
	}
	imgname = argv[1];

	if (rados_create(&cluster, NULL) < 0) {
		fprintf(stderr, "error initializing");
		return 1;
	}

	/*
	 * The result is ignored, as in the original.
	 * NOTE(review): presumably a missing config shows up as a connect
	 * failure below - confirm.
	 */
	rados_conf_read_file(cluster, NULL);

	if (rados_connect(cluster) < 0) {
		fprintf(stderr, "error connecting");
		goto out_shutdown;
	}

	if (rados_ioctx_create(cluster, pool, &io_ctx) < 0) {
		fprintf(stderr, "error opening pool %s", pool);
		goto out_shutdown;
	}

	r = rbd_open(io_ctx, imgname, &image, NULL);
	if (r < 0) {
		fprintf(stderr, "error reading header from %s", imgname);
		goto out_ioctx;
	}

	/* install the handler before arming the timer so SIGALRM can never
	 * arrive while the default (terminate) disposition is still active */
	if (signal(SIGALRM, alarm_handler) == SIG_ERR) {
		fprintf(stderr, "error installing SIGALRM handler\n");
		goto out_image;
	}
	alarm(10);

	for (;;) {
		int start = rand() % MAX_OFFSET;
		long written = (long)rbd_write(image, start, 1, "a");

		if (written < 0) {
			/* stop instead of counting failed writes as throughput */
			fprintf(stderr, "error writing to %s: %ld\n",
				imgname, written);
			break;
		}
		nr_writes++;
	}

	/* cancel the pending report before tearing the connection down */
	alarm(0);

out_image:
	rbd_close(image);
out_ioctx:
	rados_ioctx_destroy(io_ctx);
out_shutdown:
	rados_shutdown(cluster);
	return 1;
}