Re: Ceph on btrfs 3.4rc

2012-05-20 Thread Miao Xie
Hi Josef,

On fri, 18 May 2012 15:01:05 -0400, Josef Bacik wrote:
> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> index 9b9b15f..492c74f 100644
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -57,9 +57,6 @@ struct btrfs_inode {
>   /* used to order data wrt metadata */
>   struct btrfs_ordered_inode_tree ordered_tree;
>  
> - /* for keeping track of orphaned inodes */
> - struct list_head i_orphan;
> -
>   /* list of all the delalloc inodes in the FS.  There are times we need
>* to write all the delalloc pages to disk, and this list is used
>* to walk them all.
> @@ -156,6 +153,8 @@ struct btrfs_inode {
>   unsigned dummy_inode:1;
>   unsigned in_defrag:1;
>   unsigned delalloc_meta_reserved:1;
> + unsigned has_orphan_item:1;
> + unsigned doing_truncate:1;

I think the problem is we should not use the different lock to protect the bit 
fields which
are stored in the same machine word. Or some bit fields may be covered by the 
others when
someone change those fields. Could you try to declare ->delalloc_meta_reserved 
and ->has_orphan_item
as a integer?

Thanks
Miao

>  
>   /*
>* always compress this one file
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 8fd7233..aad2600 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1375,7 +1375,7 @@ struct btrfs_root {
>   struct list_head root_list;
>  
>   spinlock_t orphan_lock;
> - struct list_head orphan_list;
> + atomic_t orphan_inodes;
>   struct btrfs_block_rsv *orphan_block_rsv;
>   int orphan_item_inserted;
>   int orphan_cleanup_state;
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index a7ffc88..ff3bf4b 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -1153,7 +1153,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, 
> u32 sectorsize,
>   root->orphan_block_rsv = NULL;
>  
>   INIT_LIST_HEAD(&root->dirty_list);
> - INIT_LIST_HEAD(&root->orphan_list);
>   INIT_LIST_HEAD(&root->root_list);
>   spin_lock_init(&root->orphan_lock);
>   spin_lock_init(&root->inode_lock);
> @@ -1166,6 +1165,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, 
> u32 sectorsize,
>   atomic_set(&root->log_commit[0], 0);
>   atomic_set(&root->log_commit[1], 0);
>   atomic_set(&root->log_writers, 0);
> + atomic_set(&root->orphan_inodes, 0);
>   root->log_batch = 0;
>   root->log_transid = 0;
>   root->last_log_commit = 0;
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 61b16c6..572da13 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -2072,12 +2072,12 @@ void btrfs_orphan_commit_root(struct 
> btrfs_trans_handle *trans,
>   struct btrfs_block_rsv *block_rsv;
>   int ret;
>  
> - if (!list_empty(&root->orphan_list) ||
> + if (atomic_read(&root->orphan_inodes) ||
>   root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
>   return;
>  
>   spin_lock(&root->orphan_lock);
> - if (!list_empty(&root->orphan_list)) {
> + if (atomic_read(&root->orphan_inodes)) {
>   spin_unlock(&root->orphan_lock);
>   return;
>   }
> @@ -2134,8 +2134,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, 
> struct inode *inode)
>   block_rsv = NULL;
>   }
>  
> - if (list_empty(&BTRFS_I(inode)->i_orphan)) {
> - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
> + if (!BTRFS_I(inode)->has_orphan_item) {
> + BTRFS_I(inode)->has_orphan_item = 1;
>  #if 0
>   /*
>* For proper ENOSPC handling, we should do orphan
> @@ -2148,6 +2148,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, 
> struct inode *inode)
>   insert = 1;
>  #endif
>   insert = 1;
> + atomic_inc(&root->orphan_inodes);
>   }
>  
>   if (!BTRFS_I(inode)->orphan_meta_reserved) {
> @@ -2166,6 +2167,9 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, 
> struct inode *inode)
>   if (insert >= 1) {
>   ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
>   if (ret && ret != -EEXIST) {
> + spin_lock(&root->orphan_lock);
> + BTRFS_I(inode)->has_orphan_item = 0;
> + spin_unlock(&root->orphan_lock);
>   btrfs_abort_transaction(trans, root, ret);
>   return ret;
>   }
> @@ -2195,13 +2199,21 @@ int btrfs_orphan_del(struct btrfs_trans_handle 
> *trans, struct inode *inode)
>   int release_rsv = 0;
>   int ret = 0;
>  
> + /*
> +  * evict_inode gets called without holding the i_mutex so we need to
> +  * take the orphan lock to make sure we are safe in messing with these.
> +  */
>   spin_lock(&root->orphan_lock);
> - if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
> - list_del_

Re: problem with ceph and btrfs patch: set journal_info in async trans commit worker

2012-11-14 Thread Miao Xie
Hi, Stefan

On wed, 14 Nov 2012 14:42:07 +0100, Stefan Priebe - Profihost AG wrote:
> Hello list,
> 
> i wanted to try out ceph with latest vanilla kernel 3.7-rc5. I was seeing a 
> massive performance degration. I see around 22x btrfs-endio-write processes 
> every 10-20 seconds and they run a long time while consuming a massive amount 
> of CPU.
> 
> So my performance of 23.000 iops drops to an up and down of 23.000 iops to 0 
> - avg is now 2500 iops instead of 23.000.
> 
> Git bisect shows me commit: e209db7ace281ca347b1ac699bf1fb222eac03fe "Btrfs: 
> set journal_info in async trans commit worker" as the problematic patch.
> 
> When i revert this one everything is fine again.
> 
> Is this known?

Could you try the following patch?

http://marc.info/?l=linux-btrfs&m=135175512030453&w=2

I think the patch

  Btrfs: set journal_info in async trans commit worker

is not the real reason that caused the regression.

I guess it is caused by the bug of the reservation. When we join the
same transaction handle more than 2 times, the pointer of the reservation
in the transaction handle would be lost, and the statistical data in the
reservation would be corrupted. And then we would trigger the space flush,
which may block your tasks.

Thanks
Miao

> 
> Greets,
> Stefan
> -- 
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 3.0-rcX BUG at fs/btrfs/ioctl.c:432 - bisected

2011-06-09 Thread Miao Xie
Hi,

On Thu, 9 Jun 2011 15:52:43 -0600, Jim Schutt wrote:
> Jun  9 15:14:50 an1 [  299.446615] [ cut here ]
> Jun  9 15:14:50 an1 [  299.447357] kernel BUG at fs/btrfs/ioctl.c:432!
> Jun  9 15:14:50 an1 [  299.447357] invalid opcode:  [#1] SMP
> Jun  9 15:14:50 an1 [  299.447357] last sysfs file: 
> /sys/devices/system/cpu/cpu3/cache/index2/shared_cpu_map
> Jun  9 15:14:50 an1 [  299.447357] CPU 2
> Jun  9 15:14:50 an1 [  299.447357] Modules linked in: btrfs zlib_deflate 
> lzo_compress ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 
> nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter 
> ip_tables x_tables bridge stp i2c_dev i2c_core ext3 jbd scsi_transport_iscsi 
> rds ib_ipoib rdma_ucm rdma_cm ib_ucm ib_uverbs ib_umad ib_cm iw_cm ib_addr 
> ipv6 ib_sa dm_mirror dm_region_hash dm_log dm_multipath scsi_dh dm_mod video 
> sbs sbshc pci_slot battery acpi_pad ac kvm sg ses sd_mod enclosure ide_cd_mod 
> cdrom megaraid_sas qla2xxx ib_mthca scsi_transport_fc scsi_tgt ib_mad ib_core 
> button serio_raw ata_piix libata scsi_mod tpm_tis tpm dcdbas tpm_bios 
> iTCO_wdt ioatdma iTCO_vendor_support ehci_hcd i5k_amb dca uhci_hcd hwmon 
> i5000_edac edac_core pcspkr rtc nfs nfs_acl auth_rpcgss fscache lockd sunrpc 
> tg3 bnx2 e1000 [last unloaded: freq_table]
> Jun  9 15:14:50 an1 [  299.447357]
> Jun  9 15:14:50 an1 [  299.447357] Pid: 6047, comm: cosd Not tainted 
> 2.6.39-1-g16cdcec #24 Dell Inc. PowerEdge 1950/0DT097
> Jun  9 15:14:50 an1 [  299.447357] RIP: 0010:[]  
> [] create_subvol+0x36a/0x440 [btrfs]
> Jun  9 15:14:50 an1 [  299.447357] RSP: 0018:88021b163c48  EFLAGS: 
> 00010206
> Jun  9 15:14:50 an1 [  299.447357] RAX: 19b201a0 RBX: 
> 88022402d800 RCX: 88019206c028
> Jun  9 15:14:50 an1 [  299.447357] RDX: 88021884f618 RSI: 
> fff4 RDI: 88019206c048
> Jun  9 15:14:50 an1 [  299.447357] RBP: 88021b163de8 R08: 
> 88021b163bb8 R09: 88019206c048
> Jun  9 15:14:50 an1 [  299.447357] R10: a06c7ed7 R11: 
> 88021b163bb8 R12: 88021b163d88
> Jun  9 15:14:50 an1 [  299.447357] R13:  R14: 
> 880222fac000 R15: 8801e8b6e280
> Jun  9 15:14:50 an1 [  299.447357] FS:  7fa5aa04d710() 
> GS:88022fc8() knlGS:
> Jun  9 15:14:50 an1 [  299.447357] CS:  0010 DS:  ES:  CR0: 
> 80050033
> Jun  9 15:14:50 an1 [  299.447357] CR2: 01d45000 CR3: 
> 00021bd42000 CR4: 06e0
> Jun  9 15:14:50 an1 [  299.447357] DR0:  DR1: 
>  DR2: 
> Jun  9 15:14:50 an1 [  299.447357] DR3:  DR6: 
> 0ff0 DR7: 0400
> Jun  9 15:14:50 an1 [  299.447357] Process cosd (pid: 6047, threadinfo 
> 88021b162000, task 8801dee243e0)
> Jun  9 15:14:50 an1 [  299.447357] Stack:
> Jun  9 15:14:50 an1 [  299.447357]  88020002 0003 
>  8801e8b6d000
> Jun  9 15:14:50 an1 [  299.447357]   00071b163c80 
> 8801d6af9008 880192075000
> Jun  9 15:14:50 an1 [  299.447357]  8801e898eb40 880219b201a0 
> 0001 
> Jun  9 15:14:50 an1 [  299.447357] Call Trace:
> Jun  9 15:14:50 an1 [  299.447357]  [] ? 
> need_resched+0x23/0x2d
> Jun  9 15:14:50 an1 [  299.447357]  [] 
> btrfs_mksubvol+0x10e/0x167 [btrfs]
> Jun  9 15:14:50 an1 [  299.447357]  [] 
> btrfs_ioctl_snap_create_transid+0x9c/0x121 [btrfs]
> Jun  9 15:14:50 an1 [  299.447357]  [] 
> btrfs_ioctl_snap_create+0x50/0x67 [btrfs]
> Jun  9 15:14:50 an1 [  299.447357]  [] 
> btrfs_ioctl+0x1d0/0x2c6 [btrfs]
> Jun  9 15:14:50 an1 [  299.447357]  [] vfs_ioctl+0x1d/0x34
> Jun  9 15:14:50 an1 [  299.447357]  [] 
> do_vfs_ioctl+0x171/0x17a
> Jun  9 15:14:50 an1 [  299.447357]  [] ? 
> fget_light+0x69/0x81
> Jun  9 15:14:50 an1 [  299.447357]  [] sys_ioctl+0x5c/0x7c
> Jun  9 15:14:50 an1 [  299.447357]  [] ? putname+0x33/0x37
> Jun  9 15:14:50 an1 [  299.447357]  [] 
> system_call_fastpath+0x16/0x1b
> Jun  9 f6 4c 89 ff 01 c0 48 98 48 03 82 c8 00 00 00 48 89 82 c8 00 00 00 48 
> 89 42 c0 48 8b 95 a8 fe ff ff e8 5e d5 fd ff 85 c0 74 04 <0f> 0b eb fe 48 8b 
> 85 a8 fe ff ff 49 8b 8e 17 01 00 00 4c 89 ff
> Jun  9 15:14:50 an1 [  299.447357] RIP  [] 
> create_subvol+0x36a/0x440 [btrfs]
> Jun  9 15:14:50 an1 [  299.447357]  RSP 
> Jun  9 15:14:50 an1 [  299.767860] ---[ end trace 10d1f43d69984a37 ]---
> Jun  9 15:15:23 an1 [  333.037241] [ cut here ]
> 
> 
> It bisects to the following commit:
> 
> 16cdcec736cd214350cdb591bf1091f8beedefa0 is the first bad commit
> commit 16cdcec736cd214350cdb591bf1091f8beedefa0
> Author: Miao Xi

Re: kernel BUG at fs/btrfs/delayed-inode.c:1301!

2011-06-19 Thread Miao Xie
On fri, 17 Jun 2011 10:10:31 -0600, Jim Schutt wrote:
> I've hit this delayed-inode BUG several times.  I'm using btrfs
> as the data store for Ceph OSDs, and testing a heavy write load.
> The kernel I'm running is a recent commit (f8f44f09eaa) from
> Linus' tree with the for-chris branch (commit ed0ca14021e5) of
>   git://git.kernel.org/pub/scm/linux/kernel/git/josef/btrfs-work.git
> merged in.
> 
> Please let me know what I can do to help resolve this.
> 
> [ 5447.554187] err add delayed dir index item(name: pglog_0.965_0) into the 
> insertion tree of the delayed node(root id: 262, inode id: 258, errno: -17)
> [ 5447.569766] [ cut here ]
> [ 5447.575361] kernel BUG at fs/btrfs/delayed-inode.c:1301!
> [ 5447.580672] invalid opcode:  [#1] SMP
> [ 5447.584806] CPU 2
> [ 5447.586646] Modules linked in: loop btrfs zlib_deflate lzo_compress 
> ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state 
> nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge 
> stp i2c_dev i2c_core ext3 jbd scsi_transport_iscsi rds ib_ipoib rdma_ucm 
> rdma_cm ib_ucm ib_uverbs ib_umad ib_cm iw_cm ib_addr ipv6 ib_sa dm_mirror 
> dm_region_hash dm_log dm_multipath scsi_dh dm_mod video sbs sbshc pci_slot 
> battery acpi_pad ac kvm sg ses enclosure sd_mod megaraid_sas ide_cd_mod cdrom 
> qla2xxx ib_mthca scsi_transport_fc ib_mad scsi_tgt ib_core button serio_raw 
> ata_piix i5k_amb libata hwmon i5000_edac scsi_mod tpm_tis edac_core ehci_hcd 
> pcspkr iTCO_wdt tpm dcdbas tpm_bios iTCO_vendor_support uhci_hcd rtc nfs 
> nfs_acl auth_rpcgss fscache lockd sunrpc tg3 bnx2 e1000 [last unloaded: 
> freq_table]
> [ 5447.660248]
> [ 5447.661744] Pid: 7622, comm: cosd Not tainted 3.0.0-rc3-00178-gbfc8ccb #34 
> Dell Inc. PowerEdge 1950/0DT097
> [ 5447.671421] RIP: 0010:[]  [] 
> btrfs_insert_delayed_dir_index+0x124/0x14c [btrfs]
> [ 5447.681922] RSP: 0018:88021c0edaf8  EFLAGS: 00010292
> [ 5447.687351] RAX: 009e RBX: 880085bf0480 RCX: 
> 00012e0f
> [ 5447.694487] RDX:  RSI: 0046 RDI: 
> 819aed98
> [ 5447.701631] RBP: 88021c0edb48 R08: 88021c0ed908 R09: 
> 8189ef98
> [ 5447.708783] R10:  R11: 0006 R12: 
> ffef
> [ 5447.715923] R13: 880072e64240 R14: 880072e64288 R15: 
> 000d
> [ 5447.723065] FS:  7fefc66a9940() GS:88022fc8() 
> knlGS:
> [ 5447.731178] CS:  0010 DS:  ES:  CR0: 8005003b
> [ 5447.736934] CR2: 004042b0 CR3: 0001ca4ef000 CR4: 
> 06e0
> [ 5447.744087] DR0:  DR1:  DR2: 
> 
> [ 5447.751218] DR3:  DR6: 0ff0 DR7: 
> 0400
> [ 5447.758343] Process cosd (pid: 7622, threadinfo 88021c0ec000, task 
> 8801d92996b0)
> [ 5447.766422] Stack:
> [ 5447.768429]  0001d2da3700 88021c0edb88 8802238d50f8 
> 880225627000
> [ 5447.775866]  8801e32f5e58 8801d2da3700  
> 8801d9aec510
> [ 5447.783291]  000d 8802238d50f8 88021c0edbf8 
> a0641c4e
> [ 5447.790721] Call Trace:
> [ 5447.793191]  [] btrfs_insert_dir_item+0x189/0x1bb [btrfs]
> [ 5447.800156]  [] btrfs_add_link+0x12b/0x191 [btrfs]
> [ 5447.806517]  [] btrfs_add_nondir+0x31/0x58 [btrfs]
> [ 5447.812876]  [] btrfs_create+0xf9/0x197 [btrfs]
> [ 5447.818961]  [] vfs_create+0x72/0x92
> [ 5447.824090]  [] do_last+0x22c/0x40b
> [ 5447.829133]  [] path_openat+0xc0/0x2ef
> [ 5447.834438]  [] ? __perf_event_task_sched_out+0x24/0x44
> [ 5447.841216]  [] ? perf_event_task_sched_out+0x59/0x67
> [ 5447.847846]  [] do_filp_open+0x3d/0x87
> [ 5447.853156]  [] ? strncpy_from_user+0x43/0x4d
> [ 5447.859072]  [] ? getname_flags+0x2e/0x80
> [ 5447.864636]  [] ? do_getname+0x14b/0x173
> [ 5447.870112]  [] ? audit_getname+0x16/0x26
> [ 5447.875682]  [] ? spin_lock+0xe/0x10
> [ 5447.880882]  [] do_sys_open+0x69/0xae
> [ 5447.886153]  [] sys_open+0x20/0x22
> [ 5447.891114]  [] system_call_fastpath+0x16/0x1b
> [ 5447.897124] Code: 85 c0 41 89 c4 74 28 49 8b 45 10 49 8b 4d 00 45 89 e0 48 
> 8b 75 c0 48 c7 c7 bb 43 69 a0 48 8b 90 e8 02 00 00 31 c0 e8 ce fb 9b e0 <0f> 
> 0b eb fe 4c 89 f7 e8 81 7b d2 e0 4c 89 ef e8 d4 e4 ff ff 48
> [ 5447.916562] RIP  [] 
> btrfs_insert_delayed_dir_index+0x124/0x14c [btrfs]
> [ 5447.924683]  RSP 
> [ 5447.928514] ---[ end trace 461a7f9887994fe0 ]---

Thanks for your report. I will deal with it.

Miao

> 
> -- Jim
> 
> 
> -- 
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: kernel BUG at fs/btrfs/delayed-inode.c:1301!

2011-06-20 Thread Miao Xie
 
> -- Jim
> 
> 
> -- 
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

>From 457f39393b2e3d475fbba029b90b6a4e17b94d43 Mon Sep 17 00:00:00 2001
From: Miao Xie 
Date: Mon, 20 Jun 2011 17:21:51 +0800
Subject: [PATCH] btrfs: fix inconsonant inode information

When iputting the inode, We may leave the delayed nodes if they have some
delayed items that have not been dealt with. So when the inode is read again,
we must look up the relative delayed node, and use the information in it to
initialize the inode. Or we will get inconsonant inode information.

Signed-off-by: Miao Xie 
---
 fs/btrfs/delayed-inode.c |  104 +++---
 fs/btrfs/delayed-inode.h |1 +
 fs/btrfs/inode.c |   12 -
 3 files changed, 91 insertions(+), 26 deletions(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index f1cbd02..280755e 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -82,19 +82,16 @@ static inline struct btrfs_delayed_root 
*btrfs_get_delayed_root(
return root->fs_info->delayed_root;
 }
 
-static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
-   struct inode *inode)
+static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode)
 {
-   struct btrfs_delayed_node *node;
struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
struct btrfs_root *root = btrfs_inode->root;
u64 ino = btrfs_ino(inode);
-   int ret;
+   struct btrfs_delayed_node *node;
 
-again:
node = ACCESS_ONCE(btrfs_inode->delayed_node);
if (node) {
-   atomic_inc(&node->refs);/* can be accessed */
+   atomic_inc(&node->refs);
return node;
}
 
@@ -103,7 +100,9 @@ again:
if (node) {
if (btrfs_inode->delayed_node) {
spin_unlock(&root->inode_lock);
-   goto again;
+   BUG_ON(btrfs_inode->delayed_node != node);
+   atomic_inc(&node->refs);/* can be accessed */
+   return node;
}
btrfs_inode->delayed_node = node;
atomic_inc(&node->refs);/* can be accessed */
@@ -113,6 +112,23 @@ again:
}
spin_unlock(&root->inode_lock);
 
+   return NULL;
+}
+
+static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
+   struct inode *inode)
+{
+   struct btrfs_delayed_node *node;
+   struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
+   struct btrfs_root *root = btrfs_inode->root;
+   u64 ino = btrfs_ino(inode);
+   int ret;
+
+again:
+   node = btrfs_get_delayed_node(inode);
+   if (node)
+   return node;
+
node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS);
if (!node)
return ERR_PTR(-ENOMEM);
@@ -548,19 +564,6 @@ struct btrfs_delayed_item *__btrfs_next_delayed_item(
return next;
 }
 
-static inline struct btrfs_delayed_node *btrfs_get_delayed_node(
-   struct inode *inode)
-{
-   struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
-   struct btrfs_delayed_node *delayed_node;
-
-   delayed_node = btrfs_inode->delayed_node;
-   if (delayed_node)
-   atomic_inc(&delayed_node->refs);
-
-   return delayed_node;
-}
-
 static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root,
   u64 root_id)
 {
@@ -1404,8 +1407,7 @@ end:
 
 int btrfs_inode_delayed_dir_index_count(struct inode *inode)
 {
-   struct btrfs_delayed_node *delayed_node = BTRFS_I(inode)->delayed_node;
-   int ret = 0;
+   struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
 
if (!delayed_node)
return -ENOENT;
@@ -1415,11 +1417,14 @@ int btrfs_inode_delayed_dir_index_count(struct inode 
*inode)
 * a new directory index is added into the delayed node and index_cnt
 * is updated now. So we needn't lock the delayed node.
 */
-   if (!delayed_node->index_cnt)
+   if (!delayed_node->index_cnt) {
+   btrfs_release_delayed_node(delayed_node);
return -EINVAL;
+   }
 
BTRFS_I(inode)->index_cnt = delayed_node->index_cnt;
-   return ret;
+   btrfs_release_delayed_node(delayed_node);
+   return 0;
 }
 
 void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
@@ -1613,6 +1618,57 @@ static void fill_stack_inode_item(struct 
btrfs_

Re: kernel BUG at fs/btrfs/delayed-inode.c:1301!

2011-06-21 Thread Miao Xie
On Tue, 21 Jun 2011 02:08:54 +0200, David Sterba wrote:
> On Mon, Jun 20, 2011 at 06:12:10PM +0800, Miao Xie wrote:
>> >From 457f39393b2e3d475fbba029b90b6a4e17b94d43 Mon Sep 17 00:00:00 2001
>> From: Miao Xie 
>> Date: Mon, 20 Jun 2011 17:21:51 +0800
>> Subject: [PATCH] btrfs: fix inconsonant inode information
>>
>> When iputting the inode, We may leave the delayed nodes if they have some
>> delayed items that have not been dealt with. So when the inode is read again,
>> we must look up the relative delayed node, and use the information in it to
>> initialize the inode. Or we will get inconsonant inode information.
>>
>> Signed-off-by: Miao Xie 
>> ---
>>  fs/btrfs/delayed-inode.c |  104 
>> +++---
>>  fs/btrfs/delayed-inode.h |1 +
>>  fs/btrfs/inode.c |   12 -
>>  3 files changed, 91 insertions(+), 26 deletions(-)
>>
>> diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
>> index f1cbd02..280755e 100644
>> --- a/fs/btrfs/delayed-inode.c
>> +++ b/fs/btrfs/delayed-inode.c
>> @@ -82,19 +82,16 @@ static inline struct btrfs_delayed_root 
>> *btrfs_get_delayed_root(
>>  return root->fs_info->delayed_root;
>>  }
>>  
>> -static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
>> -struct inode *inode)
>> +static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode 
>> *inode)
>>  {
>> -struct btrfs_delayed_node *node;
>>  struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
>>  struct btrfs_root *root = btrfs_inode->root;
>>  u64 ino = btrfs_ino(inode);
>> -int ret;
>> +struct btrfs_delayed_node *node;
>>  
>> -again:
>>  node = ACCESS_ONCE(btrfs_inode->delayed_node);
> 
> do you still need the volatile access here, after the again: label has
> been removed? it does not break things if it's there, but it raises
> questions ...

The rule of ACCESS_ONCE said by Linus is:

  if you access unlocked values, you use ACCESS_ONCE().

(See: http://yarchive.net/comp/linux/ACCESS_ONCE.html)
So I think it is still needed.
> 
>>  if (node) {
>> -atomic_inc(&node->refs);/* can be accessed */
>> +atomic_inc(&node->refs);
>>  return node;
>>  }
>>  
>> @@ -103,7 +100,9 @@ again:
>>  if (node) {
>>  if (btrfs_inode->delayed_node) {
>>  spin_unlock(&root->inode_lock);
>> -goto again;
>> +BUG_ON(btrfs_inode->delayed_node != node);
>> +atomic_inc(&node->refs);/* can be accessed */
>> +return node;
>>  }
>>  btrfs_inode->delayed_node = node;
>>  atomic_inc(&node->refs);/* can be accessed */
>> @@ -113,6 +112,23 @@ again:
>>  }
>>  spin_unlock(&root->inode_lock);
>>  
>> +return NULL;
>> +}
>> +
>> +static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
>> +struct inode *inode)
>> +{
>> +struct btrfs_delayed_node *node;
>> +struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
>> +struct btrfs_root *root = btrfs_inode->root;
>> +u64 ino = btrfs_ino(inode);
>> +int ret;
>> +
>> +again:
>> +node = btrfs_get_delayed_node(inode);
> 
> ... aha, it's been somehow moved here, which copies the original logic.
> Now reading inode->delayed_node is inside a function and I do not think
> that compiler could optimize reading value of btrfs_inode->delayed_node
> that it would require ACCESS_ONCE.

The reason that I rewrote btrfs_get_delayed_node() is:
The old btrfs_get_delayed_node() may ignore the old delayed node which holds the
up-to-date information (such as: new directory indexes, up-to-date inode 
information),
considers there is no relative delayed node. And then btrfs will use the max 
index number
in the fs tree to initialize ->index_cnt, but in fact, this number is not right,
perhaps there are some directory indexes in the delayed node, the index number 
of them
is greater than the one in the fs tree. In this way, the same index number may 
be allocated
twice, and hit EEXIST error.

> 
> And there is another ACCESS_ONCE in btrfs_remove_delayed_node. I wonder
> what's the reason for that. Sorry to abuse this thread, but I'd like to
> be sure about protection of the ->