Re: [Ocfs2-devel] [DRAFT 2/2] ocfs2: fix deadlock caused by recursive cluster locking
Hi, On 10/31/2016 06:55 PM, piaojun wrote: > Hi Eric, > > On 2016-10-19 13:19, Eric Ren wrote: >> The deadlock issue happens when running discontiguous block >> group testing on multiple nodes. The easier way to reproduce >> is to do "chmod -R 777 /mnt/ocfs2" things like this on multiple >> nodes at the same time by pssh. >> >> This is indeed another deadlock caused by: commit 743b5f1434f5 >> ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()"). The reason >> had been explained well by Tariq Saeed in this thread: >> >> https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html >> >> For this case, the ocfs2_inode_lock() is misused recursively as below: >> >> do_sys_open >> do_filp_open >>path_openat >> may_open >> inode_permission >> __inode_permission >>ocfs2_permission <== ocfs2_inode_lock() >> generic_permission >> get_acl >> ocfs2_iop_get_acl <== ocfs2_inode_lock() >>ocfs2_inode_lock_full_nested <= deadlock if a remote EX >> request > Do you mean another node wants to get ex of the inode? or another process? Remote EX request means "another node wants to get ex of the inode";-) Eric >> comes between two ocfs2_inode_lock() >> >> Fix by checking if the cluster lock has been acquired aready in the >> call-chain >> path. >> >> Fixes: commit 743b5f1434f5 ("ocfs2: take inode lock in >> ocfs2_iop_set/get_acl()") >> Signed-off-by: Eric Ren>> --- >> fs/ocfs2/acl.c | 39 +++ >> 1 file changed, 27 insertions(+), 12 deletions(-) >> >> diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c >> index bed1fcb..7e3544e 100644 >> --- a/fs/ocfs2/acl.c >> +++ b/fs/ocfs2/acl.c >> @@ -283,16 +283,24 @@ int ocfs2_set_acl(handle_t *handle, >> int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) >> { >> struct buffer_head *bh = NULL; >> +struct ocfs2_holder *oh; >> +struct ocfs2_lock_res *lockres = _I(inode)->ip_inode_lockres; >> int status = 0; >> >> -status = ocfs2_inode_lock(inode, , 1); >> -if (status < 0) { >> -if (status != -ENOENT) >> -mlog_errno(status); >> -return status; >> +oh = ocfs2_is_locked_by_me(lockres); >> +if (!oh) { >> +status = ocfs2_inode_lock(inode, , 1); >> +if (status < 0) { >> +if (status != -ENOENT) >> +mlog_errno(status); >> +return status; >> +} >> } >> + >> status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); >> -ocfs2_inode_unlock(inode, 1); >> + >> +if (!oh) >> +ocfs2_inode_unlock(inode, 1); >> brelse(bh); >> return status; >> } >> @@ -302,21 +310,28 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode >> *inode, int type) >> struct ocfs2_super *osb; >> struct buffer_head *di_bh = NULL; >> struct posix_acl *acl; >> +struct ocfs2_holder *oh; >> +struct ocfs2_lock_res *lockres = _I(inode)->ip_inode_lockres; >> int ret; >> >> osb = OCFS2_SB(inode->i_sb); >> if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) >> return NULL; >> -ret = ocfs2_inode_lock(inode, _bh, 0); >> -if (ret < 0) { >> -if (ret != -ENOENT) >> -mlog_errno(ret); >> -return ERR_PTR(ret); >> + >> +oh = ocfs2_is_locked_by_me(lockres); >> +if (!oh) { >> +ret = ocfs2_inode_lock(inode, _bh, 0); >> +if (ret < 0) { >> +if (ret != -ENOENT) >> +mlog_errno(ret); >> +return ERR_PTR(ret); >> +} >> } >> >> acl = ocfs2_get_acl_nolock(inode, type, di_bh); >> >> -ocfs2_inode_unlock(inode, 0); >> +if (!oh) >> +ocfs2_inode_unlock(inode, 0); >> brelse(di_bh); >> return acl; >> } >> > ___ Ocfs2-devel mailing list Ocfs2-devel@oss.oracle.com https://oss.oracle.com/mailman/listinfo/ocfs2-devel
Re: [Ocfs2-devel] [PATCH] ocfs2/dlm: clean up deadcode in dlm_master_request_handler()
On 2016/10/31 21:52, piaojun wrote: > when 'dispatch_assert' is set, 'response' must be DLM_MASTER_RESP_YES, > and 'res' won't be null, so execution can't reach these two branch. > > Signed-off-by: Jun PiaoLooks good to me. Reviewed-by: Joseph Qi > --- > fs/ocfs2/dlm/dlmmaster.c | 6 -- > 1 file changed, 6 deletions(-) > > diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c > index 3f828a1..9a72dd8 100644 > --- a/fs/ocfs2/dlm/dlmmaster.c > +++ b/fs/ocfs2/dlm/dlmmaster.c > @@ -1644,12 +1644,6 @@ int dlm_master_request_handler(struct o2net_msg *msg, > u32 len, void *data, >* dlm_assert_master_worker() isn't called, we drop it here. >*/ > if (dispatch_assert) { > - if (response != DLM_MASTER_RESP_YES) > - mlog(ML_ERROR, "invalid response %d\n", response); > - if (!res) { > - mlog(ML_ERROR, "bad lockres while trying to assert!\n"); > - BUG(); > - } > mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", >dlm->node_num, res->lockname.len, > res->lockname.name); > spin_lock(>spinlock); > ___ Ocfs2-devel mailing list Ocfs2-devel@oss.oracle.com https://oss.oracle.com/mailman/listinfo/ocfs2-devel
[Ocfs2-devel] [PATCH] ocfs2/dlm: clean up deadcode in dlm_master_request_handler()
when 'dispatch_assert' is set, 'response' must be DLM_MASTER_RESP_YES, and 'res' won't be null, so execution can't reach these two branch. Signed-off-by: Jun Piao--- fs/ocfs2/dlm/dlmmaster.c | 6 -- 1 file changed, 6 deletions(-) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 3f828a1..9a72dd8 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1644,12 +1644,6 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, * dlm_assert_master_worker() isn't called, we drop it here. */ if (dispatch_assert) { - if (response != DLM_MASTER_RESP_YES) - mlog(ML_ERROR, "invalid response %d\n", response); - if (!res) { - mlog(ML_ERROR, "bad lockres while trying to assert!\n"); - BUG(); - } mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", dlm->node_num, res->lockname.len, res->lockname.name); spin_lock(>spinlock); -- 1.9.5.msysgit.1 ___ Ocfs2-devel mailing list Ocfs2-devel@oss.oracle.com https://oss.oracle.com/mailman/listinfo/ocfs2-devel
Re: [Ocfs2-devel] [DRAFT 2/2] ocfs2: fix deadlock caused by recursive cluster locking
Hi Eric, On 2016-10-19 13:19, Eric Ren wrote: > The deadlock issue happens when running discontiguous block > group testing on multiple nodes. The easier way to reproduce > is to do "chmod -R 777 /mnt/ocfs2" things like this on multiple > nodes at the same time by pssh. > > This is indeed another deadlock caused by: commit 743b5f1434f5 > ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()"). The reason > had been explained well by Tariq Saeed in this thread: > > https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html > > For this case, the ocfs2_inode_lock() is misused recursively as below: > > do_sys_open > do_filp_open > path_openat >may_open > inode_permission > __inode_permission > ocfs2_permission <== ocfs2_inode_lock() >generic_permission > get_acl > ocfs2_iop_get_acl <== ocfs2_inode_lock() > ocfs2_inode_lock_full_nested <= deadlock if a remote EX request Do you mean another node wants to get ex of the inode? or another process? > comes between two ocfs2_inode_lock() > > Fix by checking if the cluster lock has been acquired aready in the call-chain > path. > > Fixes: commit 743b5f1434f5 ("ocfs2: take inode lock in > ocfs2_iop_set/get_acl()") > Signed-off-by: Eric Ren> --- > fs/ocfs2/acl.c | 39 +++ > 1 file changed, 27 insertions(+), 12 deletions(-) > > diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c > index bed1fcb..7e3544e 100644 > --- a/fs/ocfs2/acl.c > +++ b/fs/ocfs2/acl.c > @@ -283,16 +283,24 @@ int ocfs2_set_acl(handle_t *handle, > int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) > { > struct buffer_head *bh = NULL; > + struct ocfs2_holder *oh; > + struct ocfs2_lock_res *lockres = _I(inode)->ip_inode_lockres; > int status = 0; > > - status = ocfs2_inode_lock(inode, , 1); > - if (status < 0) { > - if (status != -ENOENT) > - mlog_errno(status); > - return status; > + oh = ocfs2_is_locked_by_me(lockres); > + if (!oh) { > + status = ocfs2_inode_lock(inode, , 1); > + if (status < 0) { > + if (status != -ENOENT) > + mlog_errno(status); > + return status; > + } > } > + > status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); > - ocfs2_inode_unlock(inode, 1); > + > + if (!oh) > + ocfs2_inode_unlock(inode, 1); > brelse(bh); > return status; > } > @@ -302,21 +310,28 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode > *inode, int type) > struct ocfs2_super *osb; > struct buffer_head *di_bh = NULL; > struct posix_acl *acl; > + struct ocfs2_holder *oh; > + struct ocfs2_lock_res *lockres = _I(inode)->ip_inode_lockres; > int ret; > > osb = OCFS2_SB(inode->i_sb); > if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) > return NULL; > - ret = ocfs2_inode_lock(inode, _bh, 0); > - if (ret < 0) { > - if (ret != -ENOENT) > - mlog_errno(ret); > - return ERR_PTR(ret); > + > + oh = ocfs2_is_locked_by_me(lockres); > + if (!oh) { > + ret = ocfs2_inode_lock(inode, _bh, 0); > + if (ret < 0) { > + if (ret != -ENOENT) > + mlog_errno(ret); > + return ERR_PTR(ret); > + } > } > > acl = ocfs2_get_acl_nolock(inode, type, di_bh); > > - ocfs2_inode_unlock(inode, 0); > + if (!oh) > + ocfs2_inode_unlock(inode, 0); > brelse(di_bh); > return acl; > } > ___ Ocfs2-devel mailing list Ocfs2-devel@oss.oracle.com https://oss.oracle.com/mailman/listinfo/ocfs2-devel