Re: [Ocfs2-devel] [DRAFT 2/2] ocfs2: fix deadlock caused by recursive cluster locking

2016-10-31 Thread Eric Ren
Hi,

On 10/31/2016 06:55 PM, piaojun wrote:
> Hi Eric,
>
> On 2016-10-19 13:19, Eric Ren wrote:
>> The deadlock issue happens when running discontiguous block
>> group testing on multiple nodes. The easier way to reproduce
>> is to do "chmod -R 777 /mnt/ocfs2" things like this on multiple
>> nodes at the same time by pssh.
>>
>> This is indeed another deadlock caused by: commit 743b5f1434f5
>> ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()"). The reason
>> had been explained well by Tariq Saeed in this thread:
>>
>> https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html
>>
>> For this case, the ocfs2_inode_lock() is misused recursively as below:
>>
>> do_sys_open
>>   do_filp_open
>>path_openat
>> may_open
>>  inode_permission
>>   __inode_permission
>>ocfs2_permission  <== ocfs2_inode_lock()
>> generic_permission
>>  get_acl
>>   ocfs2_iop_get_acl  <== ocfs2_inode_lock()
>>ocfs2_inode_lock_full_nested <= deadlock if a remote EX 
>> request
> Do you mean another node wants to get ex of the inode? or another process?
Remote EX request means "another node wants to get ex of the inode";-)

Eric
>> comes between two ocfs2_inode_lock()
>>
>> Fix by checking if the cluster lock has been acquired aready in the 
>> call-chain
>> path.
>>
>> Fixes: commit 743b5f1434f5 ("ocfs2: take inode lock in 
>> ocfs2_iop_set/get_acl()")
>> Signed-off-by: Eric Ren 
>> ---
>>   fs/ocfs2/acl.c | 39 +++
>>   1 file changed, 27 insertions(+), 12 deletions(-)
>>
>> diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
>> index bed1fcb..7e3544e 100644
>> --- a/fs/ocfs2/acl.c
>> +++ b/fs/ocfs2/acl.c
>> @@ -283,16 +283,24 @@ int ocfs2_set_acl(handle_t *handle,
>>   int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
>>   {
>>  struct buffer_head *bh = NULL;
>> +struct ocfs2_holder *oh;
>> +struct ocfs2_lock_res *lockres = _I(inode)->ip_inode_lockres;
>>  int status = 0;
>>   
>> -status = ocfs2_inode_lock(inode, , 1);
>> -if (status < 0) {
>> -if (status != -ENOENT)
>> -mlog_errno(status);
>> -return status;
>> +oh = ocfs2_is_locked_by_me(lockres);
>> +if (!oh) {
>> +status = ocfs2_inode_lock(inode, , 1);
>> +if (status < 0) {
>> +if (status != -ENOENT)
>> +mlog_errno(status);
>> +return status;
>> +}
>>  }
>> +
>>  status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
>> -ocfs2_inode_unlock(inode, 1);
>> +
>> +if (!oh)
>> +ocfs2_inode_unlock(inode, 1);
>>  brelse(bh);
>>  return status;
>>   }
>> @@ -302,21 +310,28 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode 
>> *inode, int type)
>>  struct ocfs2_super *osb;
>>  struct buffer_head *di_bh = NULL;
>>  struct posix_acl *acl;
>> +struct ocfs2_holder *oh;
>> +struct ocfs2_lock_res *lockres = _I(inode)->ip_inode_lockres;
>>  int ret;
>>   
>>  osb = OCFS2_SB(inode->i_sb);
>>  if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
>>  return NULL;
>> -ret = ocfs2_inode_lock(inode, _bh, 0);
>> -if (ret < 0) {
>> -if (ret != -ENOENT)
>> -mlog_errno(ret);
>> -return ERR_PTR(ret);
>> +
>> +oh = ocfs2_is_locked_by_me(lockres);
>> +if (!oh) {
>> +ret = ocfs2_inode_lock(inode, _bh, 0);
>> +if (ret < 0) {
>> +if (ret != -ENOENT)
>> +mlog_errno(ret);
>> +return ERR_PTR(ret);
>> +}
>>  }
>>   
>>  acl = ocfs2_get_acl_nolock(inode, type, di_bh);
>>   
>> -ocfs2_inode_unlock(inode, 0);
>> +if (!oh)
>> +ocfs2_inode_unlock(inode, 0);
>>  brelse(di_bh);
>>  return acl;
>>   }
>>
>


___
Ocfs2-devel mailing list
Ocfs2-devel@oss.oracle.com
https://oss.oracle.com/mailman/listinfo/ocfs2-devel


Re: [Ocfs2-devel] [PATCH] ocfs2/dlm: clean up deadcode in dlm_master_request_handler()

2016-10-31 Thread Joseph Qi
On 2016/10/31 21:52, piaojun wrote:
> when 'dispatch_assert' is set, 'response' must be DLM_MASTER_RESP_YES,
> and 'res' won't be null, so execution can't reach these two branch.
> 
> Signed-off-by: Jun Piao 

Looks good to me.
Reviewed-by: Joseph Qi 
> ---
>  fs/ocfs2/dlm/dlmmaster.c | 6 --
>  1 file changed, 6 deletions(-)
> 
> diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
> index 3f828a1..9a72dd8 100644
> --- a/fs/ocfs2/dlm/dlmmaster.c
> +++ b/fs/ocfs2/dlm/dlmmaster.c
> @@ -1644,12 +1644,6 @@ int dlm_master_request_handler(struct o2net_msg *msg, 
> u32 len, void *data,
>* dlm_assert_master_worker() isn't called, we drop it here.
>*/
>   if (dispatch_assert) {
> - if (response != DLM_MASTER_RESP_YES)
> - mlog(ML_ERROR, "invalid response %d\n", response);
> - if (!res) {
> - mlog(ML_ERROR, "bad lockres while trying to assert!\n");
> - BUG();
> - }
>   mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
>dlm->node_num, res->lockname.len, 
> res->lockname.name);
>   spin_lock(>spinlock);
> 



___
Ocfs2-devel mailing list
Ocfs2-devel@oss.oracle.com
https://oss.oracle.com/mailman/listinfo/ocfs2-devel


[Ocfs2-devel] [PATCH] ocfs2/dlm: clean up deadcode in dlm_master_request_handler()

2016-10-31 Thread piaojun
when 'dispatch_assert' is set, 'response' must be DLM_MASTER_RESP_YES,
and 'res' won't be null, so execution can't reach these two branch.

Signed-off-by: Jun Piao 
---
 fs/ocfs2/dlm/dlmmaster.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 3f828a1..9a72dd8 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1644,12 +1644,6 @@ int dlm_master_request_handler(struct o2net_msg *msg, 
u32 len, void *data,
 * dlm_assert_master_worker() isn't called, we drop it here.
 */
if (dispatch_assert) {
-   if (response != DLM_MASTER_RESP_YES)
-   mlog(ML_ERROR, "invalid response %d\n", response);
-   if (!res) {
-   mlog(ML_ERROR, "bad lockres while trying to assert!\n");
-   BUG();
-   }
mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
 dlm->node_num, res->lockname.len, 
res->lockname.name);
spin_lock(>spinlock);
-- 
1.9.5.msysgit.1


___
Ocfs2-devel mailing list
Ocfs2-devel@oss.oracle.com
https://oss.oracle.com/mailman/listinfo/ocfs2-devel


Re: [Ocfs2-devel] [DRAFT 2/2] ocfs2: fix deadlock caused by recursive cluster locking

2016-10-31 Thread piaojun
Hi Eric,

On 2016-10-19 13:19, Eric Ren wrote:
> The deadlock issue happens when running discontiguous block
> group testing on multiple nodes. The easier way to reproduce
> is to do "chmod -R 777 /mnt/ocfs2" things like this on multiple
> nodes at the same time by pssh.
> 
> This is indeed another deadlock caused by: commit 743b5f1434f5
> ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()"). The reason
> had been explained well by Tariq Saeed in this thread:
> 
> https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html
> 
> For this case, the ocfs2_inode_lock() is misused recursively as below:
> 
> do_sys_open
>  do_filp_open
>   path_openat
>may_open
> inode_permission
>  __inode_permission
>   ocfs2_permission  <== ocfs2_inode_lock()
>generic_permission
> get_acl
>  ocfs2_iop_get_acl  <== ocfs2_inode_lock()
>   ocfs2_inode_lock_full_nested <= deadlock if a remote EX request
Do you mean another node wants to get ex of the inode? or another process?
> comes between two ocfs2_inode_lock()
> 
> Fix by checking if the cluster lock has been acquired aready in the call-chain
> path.
> 
> Fixes: commit 743b5f1434f5 ("ocfs2: take inode lock in 
> ocfs2_iop_set/get_acl()")
> Signed-off-by: Eric Ren 
> ---
>  fs/ocfs2/acl.c | 39 +++
>  1 file changed, 27 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
> index bed1fcb..7e3544e 100644
> --- a/fs/ocfs2/acl.c
> +++ b/fs/ocfs2/acl.c
> @@ -283,16 +283,24 @@ int ocfs2_set_acl(handle_t *handle,
>  int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
>  {
>   struct buffer_head *bh = NULL;
> + struct ocfs2_holder *oh;
> + struct ocfs2_lock_res *lockres = _I(inode)->ip_inode_lockres;
>   int status = 0;
>  
> - status = ocfs2_inode_lock(inode, , 1);
> - if (status < 0) {
> - if (status != -ENOENT)
> - mlog_errno(status);
> - return status;
> + oh = ocfs2_is_locked_by_me(lockres);
> + if (!oh) {
> + status = ocfs2_inode_lock(inode, , 1);
> + if (status < 0) {
> + if (status != -ENOENT)
> + mlog_errno(status);
> + return status;
> + }
>   }
> +
>   status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
> - ocfs2_inode_unlock(inode, 1);
> +
> + if (!oh)
> + ocfs2_inode_unlock(inode, 1);
>   brelse(bh);
>   return status;
>  }
> @@ -302,21 +310,28 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode 
> *inode, int type)
>   struct ocfs2_super *osb;
>   struct buffer_head *di_bh = NULL;
>   struct posix_acl *acl;
> + struct ocfs2_holder *oh;
> + struct ocfs2_lock_res *lockres = _I(inode)->ip_inode_lockres;
>   int ret;
>  
>   osb = OCFS2_SB(inode->i_sb);
>   if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
>   return NULL;
> - ret = ocfs2_inode_lock(inode, _bh, 0);
> - if (ret < 0) {
> - if (ret != -ENOENT)
> - mlog_errno(ret);
> - return ERR_PTR(ret);
> +
> + oh = ocfs2_is_locked_by_me(lockres);
> + if (!oh) {
> + ret = ocfs2_inode_lock(inode, _bh, 0);
> + if (ret < 0) {
> + if (ret != -ENOENT)
> + mlog_errno(ret);
> + return ERR_PTR(ret);
> + }
>   }
>  
>   acl = ocfs2_get_acl_nolock(inode, type, di_bh);
>  
> - ocfs2_inode_unlock(inode, 0);
> + if (!oh)
> + ocfs2_inode_unlock(inode, 0);
>   brelse(di_bh);
>   return acl;
>  }
> 


___
Ocfs2-devel mailing list
Ocfs2-devel@oss.oracle.com
https://oss.oracle.com/mailman/listinfo/ocfs2-devel