[Devel] [PATCH] ext4: release leaked posix acl in ext4_acl_chmod

2018-02-07 Thread Stanislav Kinsburskiy
Note: only rh7-3.10.0-693.17.1.el7-based kernels are affected.
I.e. starting from rh7-3.10.0-693.17.1.vz7.43.1.

Posix acl is used to convert of an extended attribute, provided by user to
ext4 attributes. In particular to i_mode in case of ACL_TYPE_ACCESS
request.
IOW, this object is allocated, used for convertion, not stored anywhere
and
must be freed.
However posix_acl_update_mode() can zerofy the pointer to support
ext4_set_acl() logic, but then the object is leaked.
So, fix it by releasing new temporary pointer with the same value instead
of
acl pointer.

In scope of https://jira.sw.ru/browse/PSBM-81384

RHEL bug URL: https://bugzilla.redhat.com/show_bug.cgi?id=1543020

Signed-off-by: Stanislav Kinsburskiy 
---
 fs/ext4/acl.c |6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index f8a38a2..046b338 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -297,7 +297,7 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct 
inode *dir)
 int
 ext4_acl_chmod(struct inode *inode)
 {
-   struct posix_acl *acl;
+   struct posix_acl *acl, *real_acl;
handle_t *handle;
int retries = 0;
int error;
@@ -315,6 +315,8 @@ ext4_acl_chmod(struct inode *inode)
error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
if (error)
return error;
+
+   real_acl = acl;
 retry:
handle = ext4_journal_start(inode, EXT4_HT_XATTR,
ext4_jbd2_credits_xattr(inode));
@@ -341,7 +343,7 @@ ext4_acl_chmod(struct inode *inode)
ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
 out:
-   posix_acl_release(acl);
+   posix_acl_release(real_acl);
return error;
 }
 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH v2] ext4: release leaked posix acl in ext4_xattr_set_acl

2018-02-07 Thread Stanislav Kinsburskiy
Note: only rh7-3.10.0-693.17.1.el7-based kernels are affcted.
I.e. starting from rh7-3.10.0-693.17.1.vz7.43.1.

Posix acl is used to convert of an extended attribute, provided by user to
ext4 attributes. In particular to i_mode in case of ACL_TYPE_ACCESS request.
IOW, this object is allocated, used for convertion, not stored anywhere and
must be freed.
However posix_acl_update_mode() can zerofy the pointer to support
ext4_set_acl() logic, but then the object is leaked.
So, fix it by releasing new temporary pointer with the same value instead of
acl pointer.

https://jira.sw.ru/browse/PSBM-81384

RHEL bug URL: https://bugzilla.redhat.com/show_bug.cgi?id=1543020

v2: Added affected kernel version + RHEL bug URL

Signed-off-by: Stanislav Kinsburskiy 
---
 fs/ext4/acl.c |8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 917e819..f8a38a2 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -403,7 +403,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, 
const void *value,
 {
struct inode *inode = dentry->d_inode;
handle_t *handle;
-   struct posix_acl *acl;
+   struct posix_acl *acl, *real_acl;
int error, retries = 0;
int update_mode = 0;
umode_t mode = inode->i_mode;
@@ -416,7 +416,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, 
const void *value,
return -EPERM;
 
if (value) {
-   acl = posix_acl_from_xattr(&init_user_ns, value, size);
+   acl = real_acl = posix_acl_from_xattr(&init_user_ns, value, 
size);
if (IS_ERR(acl))
return PTR_ERR(acl);
else if (acl) {
@@ -425,7 +425,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, 
const void *value,
goto release_and_out;
}
} else
-   acl = NULL;
+   acl = real_acl = NULL;
 
 retry:
handle = ext4_journal_start(inode, EXT4_HT_XATTR,
@@ -452,7 +452,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, 
const void *value,
goto retry;
 
 release_and_out:
-   posix_acl_release(acl);
+   posix_acl_release(real_acl);
return error;
 }
 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] ext4: release leaked posix acl in ext4_xattr_set_acl

2018-02-07 Thread Stanislav Kinsburskiy


07.02.2018 14:51, Dmitry Monakhov пишет:
> Stanislav Kinsburskiy  writes:
> 
>> Posix acl is used to convert of an extended attribute, provided by user to
>> ext4 attributes. In particular to I-mode in case of ACL_TYPE_ACCESS request.
>> IOW, this object is allocated, used for convertion, not stored anywhere and
>> must be freed.
>> However posix_acl_update_mode() can zerofy the pointer to support
>> ext4_set_acl() logic, but then the object is leaked.
>> So, fix it by releasing new temporary pointer with the same value instead of
>> acl pointer.
> So you are telling me that:
> ext4_xattr_set_acl
> L1 acl = posix_acl_from_xattr 
> L2 -> ext4_set_acl(handle, inode, type, acl)
> L3->posix_acl_update_mode(inode, &inode->i_mode, &acl)
>   *acl = NULL;
>   You are saying that instruction above can affect value at L1?
>   HOW? acl passed to ext4_set_acl() by value, so
>   posix_acl_update_mode() can affect value only in L2 and L3 but not L1. 
> 
> Stas, have you drunk a lousy beer today?

OMG, Dmitry... Looks like storage work clouded your mind and you are looking to 
some other sources.
There is no "L3" in the bug.
Because posix_acl_update_mode() in called from ext4_xattr_set_acl() (i.e on 
"L2").
Thus pointer is set to NULL on "L2" and released on "L1".
IOW, pointer is leaked.


>>
>> https://jira.sw.ru/browse/PSBM-81384
>>
>> Signed-off-by: Stanislav Kinsburskiy 
>> ---
>>  fs/ext4/acl.c |8 
>>  1 file changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
>> index 917e819..2640d7b 100644
>> --- a/fs/ext4/acl.c
>> +++ b/fs/ext4/acl.c
>> @@ -403,7 +403,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char 
>> *name, const void *value,
>>  {
>>  struct inode *inode = dentry->d_inode;
>>  handle_t *handle;
>> -struct posix_acl *acl;
>> +struct posix_acl *acl, *tmp;
>>  int error, retries = 0;
>>  int update_mode = 0;
>>  umode_t mode = inode->i_mode;
>> @@ -416,7 +416,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char 
>> *name, const void *value,
>>  return -EPERM;
>>  
>>  if (value) {
>> -acl = posix_acl_from_xattr(&init_user_ns, value, size);
>> +acl = tmp = posix_acl_from_xattr(&init_user_ns, value, size);
>>  if (IS_ERR(acl))
>>  return PTR_ERR(acl);
>>  else if (acl) {
>> @@ -425,7 +425,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char 
>> *name, const void *value,
>>  goto release_and_out;
>>  }
>>  } else
>> -acl = NULL;
>> +acl = tmp = NULL;
>>  
>>  retry:
>>  handle = ext4_journal_start(inode, EXT4_HT_XATTR,
>> @@ -452,7 +452,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char 
>> *name, const void *value,
>>  goto retry;
>>  
>>  release_and_out:
>> -posix_acl_release(acl);
>> +posix_acl_release(tmp);
>>  return error;
>>  }
>>  
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] ext4: release leaked posix acl in ext4_xattr_set_acl

2018-02-07 Thread Stanislav Kinsburskiy
Posix acl is used to convert of an extended attribute, provided by user to
ext4 attributes. In particular to I-mode in case of ACL_TYPE_ACCESS request.
IOW, this object is allocated, used for convertion, not stored anywhere and
must be freed.
However posix_acl_update_mode() can zerofy the pointer to support
ext4_set_acl() logic, but then the object is leaked.
So, fix it by releasing new temporary pointer with the same value instead of
acl pointer.

https://jira.sw.ru/browse/PSBM-81384

Signed-off-by: Stanislav Kinsburskiy 
---
 fs/ext4/acl.c |8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 917e819..2640d7b 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -403,7 +403,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, 
const void *value,
 {
struct inode *inode = dentry->d_inode;
handle_t *handle;
-   struct posix_acl *acl;
+   struct posix_acl *acl, *tmp;
int error, retries = 0;
int update_mode = 0;
umode_t mode = inode->i_mode;
@@ -416,7 +416,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, 
const void *value,
return -EPERM;
 
if (value) {
-   acl = posix_acl_from_xattr(&init_user_ns, value, size);
+   acl = tmp = posix_acl_from_xattr(&init_user_ns, value, size);
if (IS_ERR(acl))
return PTR_ERR(acl);
else if (acl) {
@@ -425,7 +425,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, 
const void *value,
goto release_and_out;
}
} else
-   acl = NULL;
+   acl = tmp = NULL;
 
 retry:
handle = ext4_journal_start(inode, EXT4_HT_XATTR,
@@ -452,7 +452,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, 
const void *value,
goto retry;
 
 release_and_out:
-   posix_acl_release(acl);
+   posix_acl_release(tmp);
return error;
 }
 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH v3 1/2] spfs: Handle non-zero exit status in cleanup_spfs_mount()

2018-01-23 Thread Stanislav Kinsburskiy
Looks good to me

Acked-by: Stanislav Kinsburskiy 


23.01.2018 11:43, Kirill Tkhai пишет:
> Consider non-zero exit status of spfs similar to killed
> status and do the same cleanups.
> 
> v3: New
> 
> Signed-off-by: Kirill Tkhai 
> ---
>  manager/context.c |8 
>  manager/spfs.c|8 
>  2 files changed, 8 insertions(+), 8 deletions(-)
> 
> diff --git a/manager/context.c b/manager/context.c
> index 1eb37c9..73d5ada 100644
> --- a/manager/context.c
> +++ b/manager/context.c
> @@ -45,12 +45,12 @@ const char *mgr_ovz_id(void)
>  static void cleanup_spfs_mount(struct spfs_manager_context_s *ctx,
>  struct spfs_info_s *info, int status)
>  {
> - bool killed = WIFSIGNALED(status);
> + bool failed = WIFSIGNALED(status) || !!WEXITSTATUS(status);
>  
>   pr_debug("removing info %s from the list\n", info->mnt.id);
>  
> - if (killed)
> - /* SPFS master was killed. We need to release the reference */
> + if (failed)
> + /* SPFS master was failed. We need to release the reference */
>   spfs_release_mnt(info);
>  
>   info->dead = true;
> @@ -59,7 +59,7 @@ static void cleanup_spfs_mount(struct 
> spfs_manager_context_s *ctx,
>   if (unlink(info->socket_path))
>   pr_perror("failed to unlink %s", info->socket_path);
>  
> - spfs_cleanup_env(info, killed);
> + spfs_cleanup_env(info, failed);
>  
>   close_namespaces(info->ns_fds);
>  }
> diff --git a/manager/spfs.c b/manager/spfs.c
> index 3e0f667..99845b1 100644
> --- a/manager/spfs.c
> +++ b/manager/spfs.c
> @@ -409,9 +409,9 @@ int spfs_prepare_env(struct spfs_info_s *info, const char 
> *proxy_dir)
>   return err ? err : res;
>  }
>  
> -static int __spfs_cleanup_env(struct spfs_info_s *info, bool killed)
> +static int __spfs_cleanup_env(struct spfs_info_s *info, bool failed)
>  {
> - if (killed && umount(info->work_dir)) {
> + if (failed && umount(info->work_dir)) {
>   pr_perror("failed to umount %s", info->work_dir);
>   return -errno;
>   }
> @@ -423,7 +423,7 @@ static int __spfs_cleanup_env(struct spfs_info_s *info, 
> bool killed)
>   return 0;
>  }
>  
> -int spfs_cleanup_env(struct spfs_info_s *info, bool killed)
> +int spfs_cleanup_env(struct spfs_info_s *info, bool failed)
>  {
>   int err, res;
>   unsigned orig_ns_mask;
> @@ -432,7 +432,7 @@ int spfs_cleanup_env(struct spfs_info_s *info, bool 
> killed)
>   if (res)
>   return res;
>  
> - err = __spfs_cleanup_env(info, killed);
> + err = __spfs_cleanup_env(info, failed);
>  
>   res = leave_spfs_context(info, orig_ns_mask);
>  
> 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] spfs: Main process wakes and kills its children on exit

2018-01-23 Thread Stanislav Kinsburskiy


23.01.2018 11:19, Kirill Tkhai пишет:
> On 23.01.2018 13:18, Stanislav Kinsburskiy wrote:
>>
>>
>> 23.01.2018 11:17, Kirill Tkhai пишет:
>>> On 23.01.2018 13:14, Stanislav Kinsburskiy wrote:
>>>>
>>>>
>>>> 23.01.2018 11:09, Kirill Tkhai пишет:
>>>>> On 23.01.2018 13:07, Stanislav Kinsburskiy wrote:
>>>>>>
>>>>>>
>>>>>> 23.01.2018 10:51, Kirill Tkhai пишет:
>>>>>>> On 23.01.2018 12:48, Stanislav Kinsburskiy wrote:
>>>>>>>> Please, see a couple of nits below
>>>>>>>>
>>>>>>>> 23.01.2018 10:41, Kirill Tkhai пишет:
>>>>>>>>> Stanislav Kinsburskiy says:
>>>>>>>>>
>>>>>>>>> "SPFS manager has a special "--exit-with-spfs" options, which is used 
>>>>>>>>> by CRIU.
>>>>>>>>>  The idea of the option is simple: force SPFS manager to exit, when 
>>>>>>>>> it has some
>>>>>>>>>  SPFS processes among its children (i.e. spfs was mounted at least 
>>>>>>>>> once),
>>>>>>>>>  but all these processes have exited for whatever reason (which 
>>>>>>>>> usually happens
>>>>>>>>>  when restore has failed and spfs mounts where unmounted).
>>>>>>>>>  Although it works in overall (main SPFS manager process exits), its 
>>>>>>>>> children
>>>>>>>>>  (responsible to SPFS replacement) may wait on FUTEX for "release" 
>>>>>>>>> command
>>>>>>>>>  for corresponding SPFS mount and thus never stop until they are 
>>>>>>>>> killed".
>>>>>>>>>
>>>>>>>>> 1 spfs-manager
>>>>>>>>> 2   \_ spfs
>>>>>>>>> 3   \_ spfs-manager
>>>>>>>>> 4   \_ spfs
>>>>>>>>> 5   \_ spfs-manager
>>>>>>>>>
>>>>>>>>> 2 and 3 are pair of a mount, and 4 and 5 are pair of another mount.
>>>>>>>>> The patch makes spfs-manager 1 kill 3 in case of 2 exited.
>>>>>>>>>
>>>>>>>>> https://jira.sw.ru/browse/PSBM-80055
>>>>>>>>>
>>>>>>>>> Signed-off-by: Kirill Tkhai 
>>>>>>>>> ---
>>>>>>>>>  manager/context.c |4 
>>>>>>>>>  manager/spfs.c|1 +
>>>>>>>>>  2 files changed, 5 insertions(+)
>>>>>>>>>
>>>>>>>>> diff --git a/manager/context.c b/manager/context.c
>>>>>>>>> index 1eb37c9..4464a23 100644
>>>>>>>>> --- a/manager/context.c
>>>>>>>>> +++ b/manager/context.c
>>>>>>>>> @@ -53,6 +53,9 @@ static void cleanup_spfs_mount(struct 
>>>>>>>>> spfs_manager_context_s *ctx,
>>>>>>>>>   /* SPFS master was killed. We need to release the 
>>>>>>>>> reference */
>>>>>>>>>   spfs_release_mnt(info);
>>>>>>>>>  
>>>>>>>>> + if (killed || WEXITSTATUS(status))
>>>>>>>>> + kill(info->replacer, SIGKILL);
>>>>>>>>> +
>>>>>>>>
>>>>>>>> There is "if (killed)" check above.
>>>>>>>> Could you please move this hunk there?
>>>>>>>
>>>>>>> There is logical OR (||) in the hunk. How should I move it to 
>>>>>>> unconditional check?
>>>>>>>
>>>>>>
>>>>>> Ah, ok. Then let the check be as it is.
>>>>>> Could you please add warning message for this kill with the process pid 
>>>>>> being killed and the reason why (spfs was killed of exited with error)?
>>>>>
>>>>> Maybe we should call spfs_release_mnt() in case of exit status != 0 too?
>>>>>
>>>>
>>>> Well, this makes sense.
>>>> If spfs exited with non-zero result (either it was killed or exited due to 
>>>> some error), then there is no need in r

Re: [Devel] [PATCH] spfs: Main process wakes and kills its children on exit

2018-01-23 Thread Stanislav Kinsburskiy


23.01.2018 11:17, Kirill Tkhai пишет:
> On 23.01.2018 13:14, Stanislav Kinsburskiy wrote:
>>
>>
>> 23.01.2018 11:09, Kirill Tkhai пишет:
>>> On 23.01.2018 13:07, Stanislav Kinsburskiy wrote:
>>>>
>>>>
>>>> 23.01.2018 10:51, Kirill Tkhai пишет:
>>>>> On 23.01.2018 12:48, Stanislav Kinsburskiy wrote:
>>>>>> Please, see a couple of nits below
>>>>>>
>>>>>> 23.01.2018 10:41, Kirill Tkhai пишет:
>>>>>>> Stanislav Kinsburskiy says:
>>>>>>>
>>>>>>> "SPFS manager has a special "--exit-with-spfs" options, which is used 
>>>>>>> by CRIU.
>>>>>>>  The idea of the option is simple: force SPFS manager to exit, when it 
>>>>>>> has some
>>>>>>>  SPFS processes among its children (i.e. spfs was mounted at least 
>>>>>>> once),
>>>>>>>  but all these processes have exited for whatever reason (which usually 
>>>>>>> happens
>>>>>>>  when restore has failed and spfs mounts where unmounted).
>>>>>>>  Although it works in overall (main SPFS manager process exits), its 
>>>>>>> children
>>>>>>>  (responsible to SPFS replacement) may wait on FUTEX for "release" 
>>>>>>> command
>>>>>>>  for corresponding SPFS mount and thus never stop until they are 
>>>>>>> killed".
>>>>>>>
>>>>>>> 1 spfs-manager
>>>>>>> 2   \_ spfs
>>>>>>> 3   \_ spfs-manager
>>>>>>> 4   \_ spfs
>>>>>>> 5   \_ spfs-manager
>>>>>>>
>>>>>>> 2 and 3 are pair of a mount, and 4 and 5 are pair of another mount.
>>>>>>> The patch makes spfs-manager 1 kill 3 in case of 2 exited.
>>>>>>>
>>>>>>> https://jira.sw.ru/browse/PSBM-80055
>>>>>>>
>>>>>>> Signed-off-by: Kirill Tkhai 
>>>>>>> ---
>>>>>>>  manager/context.c |4 
>>>>>>>  manager/spfs.c|1 +
>>>>>>>  2 files changed, 5 insertions(+)
>>>>>>>
>>>>>>> diff --git a/manager/context.c b/manager/context.c
>>>>>>> index 1eb37c9..4464a23 100644
>>>>>>> --- a/manager/context.c
>>>>>>> +++ b/manager/context.c
>>>>>>> @@ -53,6 +53,9 @@ static void cleanup_spfs_mount(struct 
>>>>>>> spfs_manager_context_s *ctx,
>>>>>>> /* SPFS master was killed. We need to release the 
>>>>>>> reference */
>>>>>>> spfs_release_mnt(info);
>>>>>>>  
>>>>>>> +   if (killed || WEXITSTATUS(status))
>>>>>>> +   kill(info->replacer, SIGKILL);
>>>>>>> +
>>>>>>
>>>>>> There is "if (killed)" check above.
>>>>>> Could you please move this hunk there?
>>>>>
>>>>> There is logical OR (||) in the hunk. How should I move it to 
>>>>> unconditional check?
>>>>>
>>>>
>>>> Ah, ok. Then let the check be as it is.
>>>> Could you please add warning message for this kill with the process pid 
>>>> being killed and the reason why (spfs was killed of exited with error)?
>>>
>>> Maybe we should call spfs_release_mnt() in case of exit status != 0 too?
>>>
>>
>> Well, this makes sense.
>> If spfs exited with non-zero result (either it was killed or exited due to 
>> some error), then there is no need in replacer. And there is no need in the 
>> reference.
>> So, probably this check you proposed should be used for both 
>> spfs_release_mnt(info) and kill(info->replacer, SIGKILL).
> 
> Are you OK with this change sent in the only patch, or should we use one more 
> patch?
> 

I think one patch is OK. Only mention the change explicitly in the patch 
description, please.
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] spfs: Main process wakes and kills its children on exit

2018-01-23 Thread Stanislav Kinsburskiy


23.01.2018 11:09, Kirill Tkhai пишет:
> On 23.01.2018 13:07, Stanislav Kinsburskiy wrote:
>>
>>
>> 23.01.2018 10:51, Kirill Tkhai пишет:
>>> On 23.01.2018 12:48, Stanislav Kinsburskiy wrote:
>>>> Please, see a couple of nits below
>>>>
>>>> 23.01.2018 10:41, Kirill Tkhai пишет:
>>>>> Stanislav Kinsburskiy says:
>>>>>
>>>>> "SPFS manager has a special "--exit-with-spfs" options, which is used by 
>>>>> CRIU.
>>>>>  The idea of the option is simple: force SPFS manager to exit, when it 
>>>>> has some
>>>>>  SPFS processes among its children (i.e. spfs was mounted at least once),
>>>>>  but all these processes have exited for whatever reason (which usually 
>>>>> happens
>>>>>  when restore has failed and spfs mounts where unmounted).
>>>>>  Although it works in overall (main SPFS manager process exits), its 
>>>>> children
>>>>>  (responsible to SPFS replacement) may wait on FUTEX for "release" command
>>>>>  for corresponding SPFS mount and thus never stop until they are killed".
>>>>>
>>>>> 1 spfs-manager
>>>>> 2   \_ spfs
>>>>> 3   \_ spfs-manager
>>>>> 4   \_ spfs
>>>>> 5   \_ spfs-manager
>>>>>
>>>>> 2 and 3 are pair of a mount, and 4 and 5 are pair of another mount.
>>>>> The patch makes spfs-manager 1 kill 3 in case of 2 exited.
>>>>>
>>>>> https://jira.sw.ru/browse/PSBM-80055
>>>>>
>>>>> Signed-off-by: Kirill Tkhai 
>>>>> ---
>>>>>  manager/context.c |4 
>>>>>  manager/spfs.c|1 +
>>>>>  2 files changed, 5 insertions(+)
>>>>>
>>>>> diff --git a/manager/context.c b/manager/context.c
>>>>> index 1eb37c9..4464a23 100644
>>>>> --- a/manager/context.c
>>>>> +++ b/manager/context.c
>>>>> @@ -53,6 +53,9 @@ static void cleanup_spfs_mount(struct 
>>>>> spfs_manager_context_s *ctx,
>>>>>   /* SPFS master was killed. We need to release the reference */
>>>>>   spfs_release_mnt(info);
>>>>>  
>>>>> + if (killed || WEXITSTATUS(status))
>>>>> + kill(info->replacer, SIGKILL);
>>>>> +
>>>>
>>>> There is "if (killed)" check above.
>>>> Could you please move this hunk there?
>>>
>>> There is logical OR (||) in the hunk. How should I move it to unconditional 
>>> check?
>>>
>>
>> Ah, ok. Then let the check be as it is.
>> Could you please add warning message for this kill with the process pid 
>> being killed and the reason why (spfs was killed of exited with error)?
> 
> Maybe we should call spfs_release_mnt() in case of exit status != 0 too?
> 

Well, this makes sense.
If spfs exited with non-zero result (either it was killed or exited due to some 
error), then there is no need in replacer. And there is no need in the 
reference.
So, probably this check you proposed should be used for both 
spfs_release_mnt(info) and kill(info->replacer, SIGKILL).



___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] spfs: Main process wakes and kills its children on exit

2018-01-23 Thread Stanislav Kinsburskiy


23.01.2018 10:51, Kirill Tkhai пишет:
> On 23.01.2018 12:48, Stanislav Kinsburskiy wrote:
>> Please, see a couple of nits below
>>
>> 23.01.2018 10:41, Kirill Tkhai пишет:
>>> Stanislav Kinsburskiy says:
>>>
>>> "SPFS manager has a special "--exit-with-spfs" options, which is used by 
>>> CRIU.
>>>  The idea of the option is simple: force SPFS manager to exit, when it has 
>>> some
>>>  SPFS processes among its children (i.e. spfs was mounted at least once),
>>>  but all these processes have exited for whatever reason (which usually 
>>> happens
>>>  when restore has failed and spfs mounts where unmounted).
>>>  Although it works in overall (main SPFS manager process exits), its 
>>> children
>>>  (responsible to SPFS replacement) may wait on FUTEX for "release" command
>>>  for corresponding SPFS mount and thus never stop until they are killed".
>>>
>>> 1 spfs-manager
>>> 2   \_ spfs
>>> 3   \_ spfs-manager
>>> 4   \_ spfs
>>> 5   \_ spfs-manager
>>>
>>> 2 and 3 are pair of a mount, and 4 and 5 are pair of another mount.
>>> The patch makes spfs-manager 1 kill 3 in case of 2 exited.
>>>
>>> https://jira.sw.ru/browse/PSBM-80055
>>>
>>> Signed-off-by: Kirill Tkhai 
>>> ---
>>>  manager/context.c |4 
>>>  manager/spfs.c|1 +
>>>  2 files changed, 5 insertions(+)
>>>
>>> diff --git a/manager/context.c b/manager/context.c
>>> index 1eb37c9..4464a23 100644
>>> --- a/manager/context.c
>>> +++ b/manager/context.c
>>> @@ -53,6 +53,9 @@ static void cleanup_spfs_mount(struct 
>>> spfs_manager_context_s *ctx,
>>> /* SPFS master was killed. We need to release the reference */
>>> spfs_release_mnt(info);
>>>  
>>> +   if (killed || WEXITSTATUS(status))
>>> +   kill(info->replacer, SIGKILL);
>>> +
>>
>> There is "if (killed)" check above.
>> Could you please move this hunk there?
> 
> There is logical OR (||) in the hunk. How should I move it to unconditional 
> check?
> 

Ah, ok. Then let the check be as it is.
Could you please add warning message for this kill with the process pid being 
killed and the reason why (spfs was killed of exited with error)?
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] spfs: Main process wakes and kills its children on exit

2018-01-23 Thread Stanislav Kinsburskiy
Please, see a couple of nits below

23.01.2018 10:41, Kirill Tkhai пишет:
> Stanislav Kinsburskiy says:
> 
> "SPFS manager has a special "--exit-with-spfs" options, which is used by CRIU.
>  The idea of the option is simple: force SPFS manager to exit, when it has 
> some
>  SPFS processes among its children (i.e. spfs was mounted at least once),
>  but all these processes have exited for whatever reason (which usually 
> happens
>  when restore has failed and spfs mounts where unmounted).
>  Although it works in overall (main SPFS manager process exits), its children
>  (responsible to SPFS replacement) may wait on FUTEX for "release" command
>  for corresponding SPFS mount and thus never stop until they are killed".
> 
> 1 spfs-manager
> 2   \_ spfs
> 3   \_ spfs-manager
> 4   \_ spfs
> 5   \_ spfs-manager
> 
> 2 and 3 are pair of a mount, and 4 and 5 are pair of another mount.
> The patch makes spfs-manager 1 kill 3 in case of 2 exited.
> 
> https://jira.sw.ru/browse/PSBM-80055
> 
> Signed-off-by: Kirill Tkhai 
> ---
>  manager/context.c |4 
>  manager/spfs.c|1 +
>  2 files changed, 5 insertions(+)
> 
> diff --git a/manager/context.c b/manager/context.c
> index 1eb37c9..4464a23 100644
> --- a/manager/context.c
> +++ b/manager/context.c
> @@ -53,6 +53,9 @@ static void cleanup_spfs_mount(struct 
> spfs_manager_context_s *ctx,
>   /* SPFS master was killed. We need to release the reference */
>   spfs_release_mnt(info);
>  
> + if (killed || WEXITSTATUS(status))
> + kill(info->replacer, SIGKILL);
> +

There is "if (killed)" check above.
Could you please move this hunk there?

>   info->dead = true;
>   del_spfs_info(ctx->spfs_mounts, info);
>  
> @@ -88,6 +91,7 @@ static void sigchld_handler(int signal, siginfo_t *siginfo, 
> void *data)
>* corresponding fd.
>*/
>   spfs_release_mnt(info);
> + info->replacer = -1;
>   } else {
>   info = find_spfs_by_pid(ctx->spfs_mounts, pid);
>   if (info) {
> diff --git a/manager/spfs.c b/manager/spfs.c
> index 3e0f667..0b94587 100644
> --- a/manager/spfs.c
> +++ b/manager/spfs.c
> @@ -39,6 +39,7 @@ int create_spfs_info(const char *id,
>   pr_err("failed to allocate\n");
>   return -ENOMEM;
>   }
> + info->replacer = -1;
>  

Could you, please, move it down to the end of the function where unconditional 
actions take place?

>   err = init_mount_info(&info->mnt, id, mountpoint, ns_mountpoint);
>   if (err)
> 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH 2/3] libvzctl-4.14: join namespaces in explicitly provided order

2018-01-10 Thread Stanislav Kinsburskiy
This is needed to make sure, that mnt ns is the last (otherwise join other
namespaces after mnt ns will fail).

Signed-off-by: Stanislav Kinsburskiy 
---
 lib/env_nsops.c |   39 +++
 1 file changed, 11 insertions(+), 28 deletions(-)

diff --git a/lib/env_nsops.c b/lib/env_nsops.c
index 0771eb2..d885d1c 100644
--- a/lib/env_nsops.c
+++ b/lib/env_nsops.c
@@ -885,7 +885,7 @@ static int ns_is_env_run(struct vzctl_env_handle *h)
return cg_env_get_ve_state(EID(h));
 }
 
-int set_ns(pid_t pid, const char *name, int flags)
+static int set_ns(pid_t pid, const char *name, int flags)
 {
int ret, fd;
char path[PATH_MAX];
@@ -924,11 +924,10 @@ int enter_net_ns(struct vzctl_env_handle *h, pid_t 
*ct_pid)
 
 static int ns_env_enter(struct vzctl_env_handle *h, int flags)
 {
-   DIR *dp;
-   struct dirent *ep;
pid_t pid;
-   char path[PATH_MAX];
-   int ret;
+   int ret, i;
+   const char *ns[] = {"cgroup", "ipc", "net", "uts", "pid",
+   "pid_for_children", "user", "mnt"};
 
ret = reset_loginuid();
if (ret)
@@ -939,37 +938,21 @@ static int ns_env_enter(struct vzctl_env_handle *h, int 
flags)
 
logger(10, 0, "* Attach by pid %d", pid);
 
-   snprintf(path, sizeof(path), "/proc/%d/ns", pid);
-   dp = opendir(path);
-   if (dp == NULL)
-   return vzctl_err(-1, errno, "Unable to open dir %s", path);
-
ret = cg_attach_task(EID(h), getpid(), NULL, NULL);
if (ret)
-   goto err;
-
-   while ((ep = readdir (dp))) {
-   if (!strcmp(ep->d_name, ".") ||
-   !strcmp(ep->d_name, ".."))
-   continue;
+   return ret;
 
-   ret = set_ns(pid, ep->d_name, 0);
+   for (i = 0; i < sizeof(ns) / sizeof(ns[0]); ++i) {
+   ret = set_ns(pid, ns[i], 0);
if (ret)
-   goto err;
+   return ret;
}
 
/* Clear supplementary group IDs */
-   if (setgroups(0, NULL)) {
-   ret = vzctl_err(-1, errno, "ns_env_enter: setgroups()");
-   goto err;
-   }
-   
-   ret = set_personality32();
+   if (setgroups(0, NULL))
+   return vzctl_err(-1, errno, "ns_env_enter: setgroups()");
 
-err:
-   closedir(dp);
-
-   return ret;
+   return set_personality32();
 }
 
 static int ns_env_exec(struct vzctl_env_handle *h, struct exec_param *param,

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH 1/3] libvzctl-4.14: clone CT in new cgroup namespace

2018-01-10 Thread Stanislav Kinsburskiy
We are going to use it along with all the other namespaces.

Signed-off-by: Stanislav Kinsburskiy 
---
 lib/env_nsops.c |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/env_nsops.c b/lib/env_nsops.c
index 13e65b0..0771eb2 100644
--- a/lib/env_nsops.c
+++ b/lib/env_nsops.c
@@ -807,8 +807,10 @@ static int do_env_create(struct vzctl_env_handle *h, 
struct start_param *param)
}
param->init_p = init_p;
 
+#define CLONE_NEWCGROUP 0x0200  /* New cgroup namespace */
+
clone_flags |= CLONE_NEWUTS|CLONE_NEWPID|CLONE_NEWIPC|
-   CLONE_NEWNET|CLONE_NEWNS|CLONE_NEWUSER;
+   CLONE_NEWNET|CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWCGROUP;
pid = clone(real_ns_env_create,
child_stack + sizeof(child_stack),
clone_flags|SIGCHLD , (void *) param);

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH 0/3] libvzctl: a set a patches to make CT running on 4.14 kernel

2018-01-10 Thread Stanislav Kinsburskiy
This series (plus patch, removing beacounters usage) allows to start container
on 4.14 kernel, which is available here:

Repo  : ssh://g...@git.sw.ru:7999/vzs/vzkernel.git
Branch: stable-4.14

---

Stanislav Kinsburskiy (3):
  libvzctl-4.14: clone CT in new cgroup namespace
  libvzctl-4.14: join namespaces in explicitly provided order
  libvzctl-4.14: do not mount sysfs in container


 lib/env.c   |3 ---
 lib/env_nsops.c |   43 ++-
 2 files changed, 14 insertions(+), 32 deletions(-)
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH 3/3] libvzctl-4.14: do not mount sysfs in container

2018-01-10 Thread Stanislav Kinsburskiy
Our current approach is to use one sysfs mount for all the containers, but set
limited visibility to sysfs dentries in a container.

Signed-off-by: Stanislav Kinsburskiy 
---
 lib/env.c |3 ---
 1 file changed, 3 deletions(-)

diff --git a/lib/env.c b/lib/env.c
index 1b03ce4..a284ec9 100644
--- a/lib/env.c
+++ b/lib/env.c
@@ -772,9 +772,6 @@ int pre_setup_env(const struct start_param *param)
if (setup_devtmpfs())
return VZCTL_E_SYSTEM;
 
-   if (stat_file("/sys"))
-   mount("sysfs", "/sys", "sysfs", 0, 0);
-
if (env->features->mask & VE_FEATURE_NFSD) {
mount("nfsd", "/proc/fs/nfsd", "nfsd", 0, 0);
make_dir("/var/lib/nfs/rpc_pipefs", 1);

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] 4.14: remove beancounter cgroup usage

2018-01-09 Thread Stanislav Kinsburskiy
This might be a temporary patch (since we don't have beaconters in 4.14 yet),
but it's needed to proceed with testing.

Signed-off-by: Stanislav Kinsburskiy 
---
 lib/cgroup.c|   30 +-
 lib/cgroup.h|1 -
 lib/env_nsops.c |   38 ++
 3 files changed, 3 insertions(+), 66 deletions(-)

diff --git a/lib/cgroup.c b/lib/cgroup.c
index cd71064..52dc013 100644
--- a/lib/cgroup.c
+++ b/lib/cgroup.c
@@ -56,7 +56,6 @@ static struct cg_ctl cg_ctl_map[] = {
{CG_DEVICES},
{CG_BLKIO},
{CG_FREEZER},
-   {CG_UB, 1},
{CG_VE, 1},
{CG_PERF_EVENT},
{CG_HUGETLB},
@@ -741,39 +740,12 @@ int cg_env_set_memory(const char *ctid, const char *name, 
unsigned long value)
 
 int cg_env_set_ub(const char *ctid, const char *name, unsigned long b, 
unsigned long l)
 {
-   int rc;
-   char _name[STR_SIZE];
-
-   snprintf(_name, sizeof(_name), "beancounter.%s.barrier", name);
-   rc = cg_set_ul(ctid, CG_UB, _name, b);
-   if (rc)
-   return rc;
-
-   snprintf(_name, sizeof(_name), "beancounter.%s.limit", name);
-   return cg_set_ul(ctid, CG_UB, _name, l);
+   return 0;
 }
 
 static int cg_env_set_io(const char *ctid, const char *name, unsigned int 
speed,
unsigned int burst, unsigned int latency)
 {
-   int ret;
-   char buf[STR_SIZE];
-
-   snprintf(buf, sizeof(buf), CG_UB".%s.speed", name);
-   ret = cg_set_ul(ctid, CG_UB, buf, speed);
-   if (ret)
-   return ret;
-
-   snprintf(buf, sizeof(buf), CG_UB".%s.burst", name);
-   ret = cg_set_ul(ctid, CG_UB, buf, burst);
-   if (ret)
-   return ret;
-
-   snprintf(buf, sizeof(buf), CG_UB".%s.latency", name);
-   ret = cg_set_ul(ctid, CG_UB, buf, latency);
-   if (ret)
-   return ret;
-
return 0;
 }
 
diff --git a/lib/cgroup.h b/lib/cgroup.h
index 53429e9..c0b308d 100644
--- a/lib/cgroup.h
+++ b/lib/cgroup.h
@@ -29,7 +29,6 @@
 #define CG_MEMORY  "memory"
 #define CG_NET_CLS "net_cls"
 #define CG_VE  "ve"
-#define CG_UB  "beancounter"
 #define CG_BLKIO   "blkio"
 #define CG_FREEZER "freezer"
 #define CG_PERF_EVENT  "perf_event"
diff --git a/lib/env_nsops.c b/lib/env_nsops.c
index a30eb6a..13e65b0 100644
--- a/lib/env_nsops.c
+++ b/lib/env_nsops.c
@@ -566,24 +566,12 @@ static int init_env_cgroup(struct vzctl_env_handle *h, 
int flags)
"cpuset.cpus",
"cpuset.mems"
};
-   char *bc[] = {
-   "beancounter.memory",
-   "beancounter.blkio"
-   };
 
logger(10, 0, "* init Container cgroup");
if (h->veid && cg_set_veid(EID(h), h->veid) == -1)
return vzctl_err(VZCTL_E_RESOURCE, 0,
"Failed to set VEID=%u", h->veid);
 
-   /* Bind beancounter with blkio/memory cgroups */
-   for (i = 0; i < sizeof(bc)/sizeof(bc[0]); i++) {
-   snprintf(buf, sizeof(buf), "/%s/%s", cg_get_slice_name(), 
EID(h));
-   ret = cg_set_param(EID(h), CG_UB, bc[i], buf);
-   if (ret)
-   return ret;
-   }
-
ret = cg_env_set_memory(h->ctid, "memory.use_hierarchy", 1);
if (ret)
return ret;
@@ -1674,14 +1662,7 @@ static int ns_set_iolimit(struct vzctl_env_handle *h, 
unsigned int speed)
 
 static int ns_get_iolimit(struct vzctl_env_handle *h, unsigned int *speed)
 {
-   int ret;
-   unsigned long n;
-
-   ret = cg_get_ul(EID(h), CG_UB, CG_UB".iolimit.speed", &n);
-
-   *speed = (unsigned int) n;
-
-   return ret;
+   return 0;
 }
 
 static int ns_set_ioprio(struct vzctl_env_handle *h, int prio)
@@ -1712,13 +1693,7 @@ static int ns_set_iopslimit(struct vzctl_env_handle *h, 
unsigned int speed)
 
 static int ns_get_iopslimit(struct vzctl_env_handle *h, unsigned int *speed)
 {
-   int ret;
-   unsigned long n;
-
-   ret = cg_get_ul(EID(h), CG_UB, CG_UB".iopslimit.speed", &n);
-   *speed = (unsigned int) n;
-
-   return ret;
+   return 0;
 }
 
 static int ns_get_runtime_param(struct vzctl_env_handle *h, int flags)
@@ -1761,11 +1736,6 @@ int vzctl2_set_limits(struct vzctl_env_handle *h, int 
release)
 {
int ret;
 
-   if (release) {
-   cg_set_ul("", CG_UB, "tasks", getpid());
-   return cg_destroy_cgroup(EID(h));
-   }
-
if (EMPTY_CTID(h->ctid))
vzctl2_generate_ctid(EID(h));
 
@@ -1786,10 +1756,6 @@ int vzctl2_set_limits(struct vzctl_env_handle *h, int 
release)
if (ret)

[Devel] [PATCH] criu: return original error code from bc_failcnt_check() if failed

2018-01-04 Thread Stanislav Kinsburskiy
Currently this helper doesn't see "/proc/bc/%s/resources" (another mount ns?).
But this leads to a situation, when error is swallowed by the helper.
I.e. it returns 0 even if there was an error during restore.
So, let's return original error code instead.

https://jira.sw.ru/browse/PSBM-80056

Signed-off-by: Stanislav Kinsburskiy 
---
 criu/cr-restore.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index bd36523..6337c0f 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -2034,7 +2034,7 @@ static int bc_failcnt_check(int ret)
snprintf(buf, sizeof(buf), "/proc/bc/%s/resources", veid);
f = fopen(buf, "r");
if (!f) {
-   return 0;
+   return ret;
}

while (fgets(buf, sizeof(buf), f)) {

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] spfs: start SPFS manager in containers network namespace

2017-12-22 Thread Stanislav Kinsburskiy
This is needed because in case of killing of a container with SPFS manager
inside (due to any restore error) all the network namespaces of container
processes will be marked to drop SUNRPC packets (libvzctl does it on fast stop).

This in turn happens, because we want to be able to kill container with
blocked network and NFS mount inside.

Thus all the processes, belonging to VE cgroup have to have containers
network namespaces otherwise SUNRPC trafic is dropped in init network
namespace, like it happens now.

https://jira.sw.ru/browse/PSBM-79733

Signed-off-by: Stanislav Kinsburskiy 
---
 criu/spfs.c |   21 -
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/criu/spfs.c b/criu/spfs.c
index 6ce2ac8..d46fe11 100644
--- a/criu/spfs.c
+++ b/criu/spfs.c
@@ -21,6 +21,7 @@
 #include "spfs.h"
 #include "proc_parse.h"
 #include "cgroup.h"
+#include "net.h"
 
 #define SPFS_MANAGER_WORK_DIR  "/run/spfs-manager/%d"
 #define VE_SPFS_MANAGER_WORK_DIR   "/vz/private/%s/dump/spfs-manager/%d"
@@ -121,7 +122,7 @@ static char *spfs_manager_log_dir(void)
return work_dir;
 }
 
-static int start_spfs_manager(void)
+static int __start_spfs_manager(void)
 {
char *spfs_manager = "spfs-manager";
char *socket_path = spfs_manager_socket_path();
@@ -159,6 +160,24 @@ static int start_spfs_manager(void)
return sock;
 }
 
+static int start_spfs_manager(void)
+{
+   int old_net_ns, sock;
+
+   if (switch_ns(root_item->pid->real, &net_ns_desc, &old_net_ns)) {
+   pr_err("failed to switch to containers network namespace\n");
+   return -1;
+   }
+
+   sock = __start_spfs_manager();
+
+   if (restore_ns(old_net_ns, &net_ns_desc)) {
+   pr_err("failed to restore original usernsd network 
namespace\n");
+   return -1;
+   }
+   return sock;
+}
+
 static int get_spfs_mngr_sock(void *start, int fd, pid_t pid)
 {
int sock;

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH v2 6/7] spfs: return duplicated socket from usernsd

2017-12-21 Thread Stanislav Kinsburskiy
Usernsd closes socket when sent.

https://jira.sw.ru/browse/PSBM-79462

v2: duplicate only service descriptor

Signed-off-by: Stanislav Kinsburskiy 
---
 criu/spfs.c |7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/criu/spfs.c b/criu/spfs.c
index 7ce30e7..6ce2ac8 100644
--- a/criu/spfs.c
+++ b/criu/spfs.c
@@ -164,8 +164,13 @@ static int get_spfs_mngr_sock(void *start, int fd, pid_t 
pid)
int sock;
 
sock = get_service_fd(SPFS_MNGR_SK);
-   if (sock < 0 && start)
+   if (sock != -1) {
+   sock = dup(sock);
+   if (sock < 0)
+   pr_perror("failed to duplicate SPFS manager socket\n");
+   } else if (start)
sock = start_spfs_manager();
+
return sock;
 }
 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH v2 7/7] spfs: switch mounts mode to STUB after root yard depopulation

2017-12-21 Thread Stanislav Kinsburskiy
Otherwise CRIU stuck on SPFS mounts

Signed-off-by: Stanislav Kinsburskiy 
---
 criu/cr-restore.c |   12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index eb9f50d..bd36523 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -2253,12 +2253,6 @@ static int restore_root_task(struct pstree_item *init)
if (ret < 0)
goto out_kill;
 
-   if (spfs_is_running) {
-   ret = spfs_set_mode(spfs_sock, SPFS_MODE_STUB);
-   if (ret < 0)
-   goto out_kill;
-   }
-
if (fault_injected(FI_POST_RESTORE))
goto out_kill;
 
@@ -2279,6 +2273,12 @@ static int restore_root_task(struct pstree_item *init)
 
close_safe(&mnt_ns_fd);
 
+   if (spfs_is_running) {
+   ret = spfs_set_mode(spfs_sock, SPFS_MODE_STUB);
+   if (ret < 0)
+   goto out_kill;
+   }
+
if (write_restored_pid())
goto out_kill;
 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH v2 4/7] spfs: improve SPFS manager start debug and error output

2017-12-21 Thread Stanislav Kinsburskiy
Signed-off-by: Stanislav Kinsburskiy 
---
 criu/spfs.c |   12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/criu/spfs.c b/criu/spfs.c
index 223bf93..89fb9f9 100644
--- a/criu/spfs.c
+++ b/criu/spfs.c
@@ -127,6 +127,8 @@ static int start_spfs_manager(void)
char *socket_path = spfs_manager_socket_path();
int err = -ENOMEM, sock;
 
+   pr_info("Starting SPFS manager\n");
+
err = cr_system(-1, -1, -1, spfs_manager,
(char *[]){ "spfs-manager", "-",
 "-d",
@@ -135,13 +137,17 @@ static int start_spfs_manager(void)
 "--log-dir", spfs_manager_log_dir(),
 "--exit-with-spfs", NULL },
0);
-   pr_info("%s: spfs manager start result: %d\n", __func__, err);
-   if (err)
+   if (err) {
+   pr_err("failed to start SPFS manager binary: %d\n", err);
return err;
+   }
 
sock = sock_seqpacket_connect(socket_path);
-   if (sock < 0)
+   if (sock < 0) {
+   pr_err("failed to connect to SPFS manager via %s: %d\n",
+   socket_path, err);
return sock;
+   }
 
err = install_service_fd(SPFS_MNGR_SK, sock);
if (err < 0) {

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH v2 2/7] spfs: improve error and debug output for spfs_mount()

2017-12-21 Thread Stanislav Kinsburskiy
Use request_spfs_mngr_sock() for both start and socket request.

Signed-off-by: Stanislav Kinsburskiy 
---
 criu/spfs.c |   15 ++-
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/criu/spfs.c b/criu/spfs.c
index 0d44b72..1ead01b 100644
--- a/criu/spfs.c
+++ b/criu/spfs.c
@@ -292,18 +292,23 @@ int spfs_mount(struct mount_info *mi, const char *source,
 
sock = start_spfs_mngr();
if (sock < 0) {
-   pr_err("failed to mount NFS to path %s\n", mi->mountpoint);
-   return sock;
+   pr_err("failed to connect to SPFS manager: %d\n", sock);
+   ret = sock;
+   goto err;
}
-
ret = spfs_request_mount(sock, mi, source, filesystemtype, mountflags);
close(sock);
if (ret) {
-   pr_err("mount of %s (%s) failed: %d\n", source, filesystemtype, 
ret);
-   return ret;
+   pr_err("mount request for %s (%s) failed: %d\n",
+   source, filesystemtype, ret);
+   goto err;
}
 
return 0;
+
+err:
+   pr_err("failed to mount NFS to path %s\n", mi->mountpoint);
+   return ret;
 }
 
 int spfs_set_env(void)

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH v2 5/7] spfs: improve prints in spfs_set_mode() and spfs_release_replace()

2017-12-21 Thread Stanislav Kinsburskiy
To make them more grepable.

Signed-off-by: Stanislav Kinsburskiy 
---
 criu/spfs.c |8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/criu/spfs.c b/criu/spfs.c
index 89fb9f9..7ce30e7 100644
--- a/criu/spfs.c
+++ b/criu/spfs.c
@@ -379,9 +379,9 @@ int spfs_set_mode(int sock, const char *mode)
 
ret = spfs_send_request(sock, req, strlen(req) + 1);
if (ret)
-   pr_err("set mode to %s request failed: %d\n", mode, ret);
+   pr_err("set SPFS mode to %s request failed: %d\n", mode, ret);
else
-   pr_debug("Set mode to %s request succeeded\n", mode);
+   pr_debug("Set SPFS mode to %s request succeeded\n", mode);
 
free(req);
return 0;
@@ -394,9 +394,9 @@ int spfs_release_replace(int sock)
 
ret = spfs_send_request(sock, req, strlen(req) + 1);
if (ret)
-   pr_err("release replace request failed: %d\n", ret);
+   pr_err("SPFS release replace request failed: %d\n", ret);
else
-   pr_debug("Release replace request succeeded\n");
+   pr_debug("SPFS \"release replace\" request succeeded\n");
 
return 0;
 }

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH v2 1/7] spfs: introduce request_spfs_mngr_sock)() helper

2017-12-21 Thread Stanislav Kinsburskiy
This generic helper will be responsible for both SPFS manager start via
usernsd and socket request.

v2:
Do not use variable, but explicitly provide fake void pointer ad boolean value
to request_spfs_mngr_sock()

Signed-off-by: Stanislav Kinsburskiy 
---
 criu/spfs.c |   43 ---
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/criu/spfs.c b/criu/spfs.c
index a5f6031..0d44b72 100644
--- a/criu/spfs.c
+++ b/criu/spfs.c
@@ -153,16 +153,36 @@ static int start_spfs_manager(void)
return sock;
 }
 
-static int get_spfs_mngr_sock(void *arg, int fd, pid_t pid)
+static int get_spfs_mngr_sock(void *start, int fd, pid_t pid)
 {
int sock;
 
sock = get_service_fd(SPFS_MNGR_SK);
-   if (sock < 0)
+   if (sock < 0 && start)
sock = start_spfs_manager();
return sock;
 }
 
+static int request_spfs_mngr_sock(bool *start_mngr)
+{
+   int ns_fd;
+   int sock;
+
+   ns_fd = open_proc(PROC_SELF, "ns");
+   if (ns_fd < 0)
+   return ns_fd;
+
+   sock = userns_call(get_spfs_mngr_sock, UNS_FDOUT, start_mngr, 0, ns_fd);
+
+   close(ns_fd);
+   return sock;
+}
+
+static int start_spfs_mngr(void)
+{
+   return request_spfs_mngr_sock((void *)1);
+}
+
 static int spfs_request_mount(int sock, struct mount_info *mi, const char 
*source,
  const char *type, unsigned long mountflags)
 {
@@ -268,15 +288,9 @@ int spfs_mount(struct mount_info *mi, const char *source,
   const char *filesystemtype, unsigned long mountflags)
 {
int ret;
-   int ns_fd;
int sock;
 
-   ns_fd = open_proc(PROC_SELF, "ns");
-   if (ns_fd < 0)
-   return ns_fd;
-
-   sock = userns_call(get_spfs_mngr_sock, UNS_FDOUT, NULL, 0, ns_fd);
-   close(ns_fd);
+   sock = start_spfs_mngr();
if (sock < 0) {
pr_err("failed to mount NFS to path %s\n", mi->mountpoint);
return sock;
@@ -345,16 +359,7 @@ int spfs_mngr_status(bool *active)
 
 int spfs_mngr_sock(void)
 {
-   int ns_fd, fd;
-
-   ns_fd = open_proc(PROC_SELF, "ns");
-   if (ns_fd < 0)
-   return ns_fd;
-
-   fd = userns_call(spfs_service_fd, UNS_FDOUT, NULL, 0, ns_fd);
-
-   close(ns_fd);
-   return fd;
+   return request_spfs_mngr_sock((void *)0);
 }
 
 int spfs_set_mode(int sock, const char *mode)

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH v2 0/7] spfs: duplicate socket before sending it from usernsd

2017-12-21 Thread Stanislav Kinsburskiy
Usernds closes socket once it was sent.
So, id should be duplicated before sending, if the socket is expected to be
send multiple times.

https://jira.sw.ru/browse/PSBM-79462

The following series implements...

---

Stanislav Kinsburskiy (7):
  spfs: introduce request_spfs_mngr_sock)() helper
  spfs: improve error and debug output for spfs_mount()
  spfs: remove redundant spfs_service_fd() helper
  spfs: improve SPFS manager start debug and error output
  spfs: improve prints in spfs_set_mode() and spfs_release_replace()
  spfs: return duplicated socket from usernsd
  spfs: switch mounts mode to STUB after root yard depopulation


 criu/cr-restore.c |   12 +++
 criu/spfs.c   |   92 +++--
 2 files changed, 59 insertions(+), 45 deletions(-)
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH v2 3/7] spfs: remove redundant spfs_service_fd() helper

2017-12-21 Thread Stanislav Kinsburskiy
Signed-off-by: Stanislav Kinsburskiy 
---
 criu/spfs.c |9 +
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/criu/spfs.c b/criu/spfs.c
index 1ead01b..223bf93 100644
--- a/criu/spfs.c
+++ b/criu/spfs.c
@@ -336,16 +336,9 @@ int spfs_set_env(void)
return 0;
 }
 
-static int spfs_service_fd(void *arg, int fd, pid_t pid)
-{
-   return get_service_fd(SPFS_MNGR_SK);
-}
-
 static int spfs_service_is_active(void *arg, int fd, pid_t pid)
 {
-   if (spfs_service_fd(arg, fd, pid) < 0)
-   return 0;
-   return 1;
+   return get_service_fd(SPFS_MNGR_SK) < 0 ? 0 : 1;
 }
 
 int spfs_mngr_status(bool *active)

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH 7/7] spfs: switch mountsmode to STUB after root yard depopulation

2017-12-20 Thread Stanislav Kinsburskiy
Otherwise CRIU stuck on SPFS mounts

Signed-off-by: Stanislav Kinsburskiy 
---
 criu/cr-restore.c |   12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index eb9f50d..bd36523 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -2253,12 +2253,6 @@ static int restore_root_task(struct pstree_item *init)
if (ret < 0)
goto out_kill;
 
-   if (spfs_is_running) {
-   ret = spfs_set_mode(spfs_sock, SPFS_MODE_STUB);
-   if (ret < 0)
-   goto out_kill;
-   }
-
if (fault_injected(FI_POST_RESTORE))
goto out_kill;
 
@@ -2279,6 +2273,12 @@ static int restore_root_task(struct pstree_item *init)
 
close_safe(&mnt_ns_fd);
 
+   if (spfs_is_running) {
+   ret = spfs_set_mode(spfs_sock, SPFS_MODE_STUB);
+   if (ret < 0)
+   goto out_kill;
+   }
+
if (write_restored_pid())
goto out_kill;
 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH 6/7] spfs: return duplicated socket from usernsd

2017-12-20 Thread Stanislav Kinsburskiy
Usernsd closes socket when sent.

https://jira.sw.ru/browse/PSBM-79462

Signed-off-by: Stanislav Kinsburskiy 
---
 criu/spfs.c |   10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/criu/spfs.c b/criu/spfs.c
index da19179..857a167 100644
--- a/criu/spfs.c
+++ b/criu/spfs.c
@@ -162,11 +162,19 @@ static int start_spfs_manager(void)
 static int get_spfs_mngr_sock(void *start, int fd, pid_t pid)
 {
int sock;
+   int sfd;
 
sock = get_service_fd(SPFS_MNGR_SK);
if (sock < 0 && start)
sock = start_spfs_manager();
-   return sock;
+   if (sock < 0)
+   return sock;
+
+   sfd = dup(sock);
+   if (sfd < 0)
+   pr_perror("failed to duplicate socket %d", sock);
+
+   return sfd;
 }
 
 static int request_spfs_mngr_sock(bool *start_mngr)

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH 3/7] spfs: remove redundant spfs_service_fd() helper

2017-12-20 Thread Stanislav Kinsburskiy
Signed-off-by: Stanislav Kinsburskiy 
---
 criu/spfs.c |9 +
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/criu/spfs.c b/criu/spfs.c
index c202b14..be3bca4 100644
--- a/criu/spfs.c
+++ b/criu/spfs.c
@@ -338,16 +338,9 @@ int spfs_set_env(void)
return 0;
 }
 
-static int spfs_service_fd(void *arg, int fd, pid_t pid)
-{
-   return get_service_fd(SPFS_MNGR_SK);
-}
-
 static int spfs_service_is_active(void *arg, int fd, pid_t pid)
 {
-   if (spfs_service_fd(arg, fd, pid) < 0)
-   return 0;
-   return 1;
+   return get_service_fd(SPFS_MNGR_SK) < 0 ? 0 : 1;
 }
 
 int spfs_mngr_status(bool *active)

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH 1/7] spfs: introduce request_spfs_mngr_sock)() helper

2017-12-20 Thread Stanislav Kinsburskiy
This generic helper will be responsible for both SPFS manager start via
usernsd and socket request.

Signed-off-by: Stanislav Kinsburskiy 
---
 criu/spfs.c |   45 ++---
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/criu/spfs.c b/criu/spfs.c
index a5f6031..fff7b9f 100644
--- a/criu/spfs.c
+++ b/criu/spfs.c
@@ -153,16 +153,38 @@ static int start_spfs_manager(void)
return sock;
 }
 
-static int get_spfs_mngr_sock(void *arg, int fd, pid_t pid)
+static int get_spfs_mngr_sock(void *start, int fd, pid_t pid)
 {
int sock;
 
sock = get_service_fd(SPFS_MNGR_SK);
-   if (sock < 0)
+   if (sock < 0 && start)
sock = start_spfs_manager();
return sock;
 }
 
+static int request_spfs_mngr_sock(bool *start_mngr)
+{
+   int ns_fd;
+   int sock;
+
+   ns_fd = open_proc(PROC_SELF, "ns");
+   if (ns_fd < 0)
+   return ns_fd;
+
+   sock = userns_call(get_spfs_mngr_sock, UNS_FDOUT, start_mngr, 0, ns_fd);
+
+   close(ns_fd);
+   return sock;
+}
+
+static int start_spfs_mngr(void)
+{
+   bool start;
+
+   return request_spfs_mngr_sock(&start);
+}
+
 static int spfs_request_mount(int sock, struct mount_info *mi, const char 
*source,
  const char *type, unsigned long mountflags)
 {
@@ -268,15 +290,9 @@ int spfs_mount(struct mount_info *mi, const char *source,
   const char *filesystemtype, unsigned long mountflags)
 {
int ret;
-   int ns_fd;
int sock;
 
-   ns_fd = open_proc(PROC_SELF, "ns");
-   if (ns_fd < 0)
-   return ns_fd;
-
-   sock = userns_call(get_spfs_mngr_sock, UNS_FDOUT, NULL, 0, ns_fd);
-   close(ns_fd);
+   sock = start_spfs_mngr();
if (sock < 0) {
pr_err("failed to mount NFS to path %s\n", mi->mountpoint);
return sock;
@@ -345,16 +361,7 @@ int spfs_mngr_status(bool *active)
 
 int spfs_mngr_sock(void)
 {
-   int ns_fd, fd;
-
-   ns_fd = open_proc(PROC_SELF, "ns");
-   if (ns_fd < 0)
-   return ns_fd;
-
-   fd = userns_call(spfs_service_fd, UNS_FDOUT, NULL, 0, ns_fd);
-
-   close(ns_fd);
-   return fd;
+   return request_spfs_mngr_sock(NULL);
 }
 
 int spfs_set_mode(int sock, const char *mode)

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH 2/7] spfs: improve error and debug output for spfs_mount()

2017-12-20 Thread Stanislav Kinsburskiy
Use request_spfs_mngr_sock() for both start and socket request.

Signed-off-by: Stanislav Kinsburskiy 
---
 criu/spfs.c |   15 ++-
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/criu/spfs.c b/criu/spfs.c
index fff7b9f..c202b14 100644
--- a/criu/spfs.c
+++ b/criu/spfs.c
@@ -294,18 +294,23 @@ int spfs_mount(struct mount_info *mi, const char *source,
 
sock = start_spfs_mngr();
if (sock < 0) {
-   pr_err("failed to mount NFS to path %s\n", mi->mountpoint);
-   return sock;
+   pr_err("failed to connect to SPFS manager: %d\n", sock);
+   ret = sock;
+   goto err;
}
-
ret = spfs_request_mount(sock, mi, source, filesystemtype, mountflags);
close(sock);
if (ret) {
-   pr_err("mount of %s (%s) failed: %d\n", source, filesystemtype, 
ret);
-   return ret;
+   pr_err("mount request for %s (%s) failed: %d\n",
+   source, filesystemtype, ret);
+   goto err;
}
 
return 0;
+
+err:
+   pr_err("failed to mount NFS to path %s\n", mi->mountpoint);
+   return ret;
 }
 
 int spfs_set_env(void)

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH 4/7] spfs: improve SPFS manager start debug and error output

2017-12-20 Thread Stanislav Kinsburskiy
Signed-off-by: Stanislav Kinsburskiy 
---
 criu/spfs.c |   12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/criu/spfs.c b/criu/spfs.c
index be3bca4..d3391e2 100644
--- a/criu/spfs.c
+++ b/criu/spfs.c
@@ -127,6 +127,8 @@ static int start_spfs_manager(void)
char *socket_path = spfs_manager_socket_path();
int err = -ENOMEM, sock;
 
+   pr_info("Starting SPFS manager\n");
+
err = cr_system(-1, -1, -1, spfs_manager,
(char *[]){ "spfs-manager", "-",
 "-d",
@@ -135,13 +137,17 @@ static int start_spfs_manager(void)
 "--log-dir", spfs_manager_log_dir(),
 "--exit-with-spfs", NULL },
0);
-   pr_info("%s: spfs manager start result: %d\n", __func__, err);
-   if (err)
+   if (err) {
+   pr_err("failed to start SPFS manager binary: %d\n", err);
return err;
+   }
 
sock = sock_seqpacket_connect(socket_path);
-   if (sock < 0)
+   if (sock < 0) {
+   pr_err("failed to connect to SPFS manager via %s: %d\n",
+   socket_path, err);
return sock;
+   }
 
err = install_service_fd(SPFS_MNGR_SK, sock);
if (err < 0) {

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH 5/7] spfs: improve prints in spfs_set_mode() and spfs_release_replace()

2017-12-20 Thread Stanislav Kinsburskiy
To make them more grepable.

Signed-off-by: Stanislav Kinsburskiy 
---
 criu/spfs.c |8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/criu/spfs.c b/criu/spfs.c
index d3391e2..da19179 100644
--- a/criu/spfs.c
+++ b/criu/spfs.c
@@ -381,9 +381,9 @@ int spfs_set_mode(int sock, const char *mode)
 
ret = spfs_send_request(sock, req, strlen(req) + 1);
if (ret)
-   pr_err("set mode to %s request failed: %d\n", mode, ret);
+   pr_err("set SPFS mode to %s request failed: %d\n", mode, ret);
else
-   pr_debug("Set mode to %s request succeeded\n", mode);
+   pr_debug("Set SPFS mode to %s request succeeded\n", mode);
 
free(req);
return 0;
@@ -396,9 +396,9 @@ int spfs_release_replace(int sock)
 
ret = spfs_send_request(sock, req, strlen(req) + 1);
if (ret)
-   pr_err("release replace request failed: %d\n", ret);
+   pr_err("SPFS release replace request failed: %d\n", ret);
else
-   pr_debug("Release replace request succeeded\n");
+   pr_debug("SPFS \"release replace\" request succeeded\n");
 
return 0;
 }

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH 0/7] spfs: duplicate socket before sending it from usernsd

2017-12-20 Thread Stanislav Kinsburskiy
Usernds closes socket once it's sent.
So, id should be duplicated before sending, if the socket is expected to be
send multiple times.

https://jira.sw.ru/browse/PSBM-79462

The following series implements...

---

Stanislav Kinsburskiy (7):
  spfs: introduce request_spfs_mngr_sock)() helper
  spfs: improve error and debug output for spfs_mount()
  spfs: remove redundant spfs_service_fd() helper
  spfs: improve SPFS manager start debug and error output
  spfs: improve prints in spfs_set_mode() and spfs_release_replace()
  spfs: return duplicated socket from usernsd
  spfs: switch mountsmode to STUB after root yard depopulation


 criu/cr-restore.c |   12 +++
 criu/spfs.c   |   97 -
 2 files changed, 64 insertions(+), 45 deletions(-)
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH v2] cgroup/cpuset: emulate cgroup in container

2017-12-13 Thread Stanislav Kinsburskiy
Any changes to this cgroup are skipped in container, but success code is
returned.
The idea is to fool Docker/Kubernetes.

https://jira.sw.ru/browse/PSBM-58423

This patch obsoletes "ve/proc/cpuset: do not show cpuset in CT"

v2:
Do not attach tasks in cpuset_change_cpumask on cpuset set change, it
requested from non-super VE.
This is a second part of the logic.
The first was to not change cpuset for newly added task. This one - to not
set new cpuset for all the tasks in cgroup.

Signed-off-by: Stanislav Kinsburskiy 
---
 kernel/cpuset.c |   12 
 1 file changed, 12 insertions(+)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 26d88eb..43b1410 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -848,6 +848,9 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
 static void cpuset_change_cpumask(struct task_struct *tsk,
  struct cgroup_scanner *scan)
 {
+   if (!ve_is_super(get_exec_env()))
+   return;
+
set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
 }
 
@@ -1441,6 +1444,9 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct 
cgroup_taskset *tset)
struct task_struct *task;
int ret;
 
+   if (!ve_is_super(get_exec_env()))
+   return 0;
+
mutex_lock(&cpuset_mutex);
 
ret = -ENOSPC;
@@ -1470,6 +1476,9 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct 
cgroup_taskset *tset)
 static void cpuset_cancel_attach(struct cgroup *cgrp,
 struct cgroup_taskset *tset)
 {
+   if (!ve_is_super(get_exec_env()))
+   return;
+
mutex_lock(&cpuset_mutex);
cgroup_cs(cgrp)->attach_in_progress--;
mutex_unlock(&cpuset_mutex);
@@ -1494,6 +1503,9 @@ static void cpuset_attach(struct cgroup *cgrp, struct 
cgroup_taskset *tset)
struct cpuset *cs = cgroup_cs(cgrp);
struct cpuset *oldcs = cgroup_cs(oldcgrp);
 
+   if (!ve_is_super(get_exec_env()))
+   return;
+
mutex_lock(&cpuset_mutex);
 
/* prepare for attach */

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] cgroup/cpuset: emulate cgroup in container

2017-12-13 Thread Stanislav Kinsburskiy
Nice catch, thanks!
I'll first try to tweak this gently, so it will look like cpuset cgroup works, 
but it won't.

13.12.2017 16:18, Pavel Tikhomirov пишет:
> So change to cpuset.cpus is actually applied to the task, that's what I mean. 
> Correct me if I'm wrong.
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] cgroup/cpuset: emulate cgroup in container

2017-12-13 Thread Stanislav Kinsburskiy
Hi Pavel, please, see my comments/question below

13.12.2017 13:43, Pavel Tikhomirov пишет:
> Personally I don't like these as we still have no unswer to "If cpusets are 
> optional for docker, why k8s can't work without them?" it seem there is not 
> enough explanation in VZAP-31.
> 

Well... I have to admit that I don't like it either.
I (frankly) don't like the whole idea of putting kuber into container.
And the reason is so simple: CT is a cheating technique. Those, who like 
"Russian dolls" should use VMs with nested virtualization.
But the truth is that we do care about our customers (not sure why, since they 
pay us less and less).
And because of this we've put so much various sh*t into our kernel already, 
that I got tired to trow it away on each major rebase a long time ago.
But this all is lyrics.
There is a task - and that's the fix.

> We also need to revert the patch below to show cpuset in CT:
> commit 5160bd34c9bd ("ve/proc/cpuset: do not show cpuset in CT")
> 

Sure! It's mentioned in the patch description.

> It seem I can still attach a process to a nested cgroup in CT with these 
> patch:
> 
> CT-6ecd9be1 /# cat /proc/cgroups | grep cpuset
> cpuset    16    1    1
> CT-6ecd9be1 /# ls /sys/fs/cgroup/cpuset/cpuset.cpus
> /sys/fs/cgroup/cpuset/cpuset.cpus
> CT-6ecd9be1 /# mkdir /sys/fs/cgroup/cpuset/test
> CT-6ecd9be1 /# sleep 1000 &
> [1] 678
> CT-6ecd9be1 /# echo  678 > /sys/fs/cgroup/cpuset/test/tasks
> CT-6ecd9be1 /# cat /sys/fs/cgroup/cpuset/test/tasks
> 678
> 

Isn't it wonderful? :)
Poor me, I have to admit, that I didn't know, that this task will be even 
visible in the nest cgroup... :(
I thought, that emulation would be less effective.

Nevertheless, if you have some better way to solve the issue, please, share.

> On 12/13/2017 01:37 PM, Stanislav Kinsburskiy wrote:
>> Any changes to this cgroup are skipped in container, but success code is
>> returned.
>> The idea is to fool Docker/Kubernetes.
>>
>> https://jira.sw.ru/browse/PSBM-58423
>>
>> This patch obsoletes "ve/proc/cpuset: do not show cpuset in CT"
>>
>> Signed-off-by: Stanislav Kinsburskiy 
>> ---
>>   kernel/cpuset.c |    9 +
>>   1 file changed, 9 insertions(+)
>>
>> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
>> index 26d88eb..dfac505 100644
>> --- a/kernel/cpuset.c
>> +++ b/kernel/cpuset.c
>> @@ -1441,6 +1441,9 @@ static int cpuset_can_attach(struct cgroup *cgrp, 
>> struct cgroup_taskset *tset)
>>   struct task_struct *task;
>>   int ret;
>>   +    if (!ve_is_super(get_exec_env()))
>> +    return 0;
>> +
>>   mutex_lock(&cpuset_mutex);
>>     ret = -ENOSPC;
>> @@ -1470,6 +1473,9 @@ static int cpuset_can_attach(struct cgroup *cgrp, 
>> struct cgroup_taskset *tset)
>>   static void cpuset_cancel_attach(struct cgroup *cgrp,
>>    struct cgroup_taskset *tset)
>>   {
>> +    if (!ve_is_super(get_exec_env()))
>> +    return;
>> +
>>   mutex_lock(&cpuset_mutex);
>>   cgroup_cs(cgrp)->attach_in_progress--;
>>   mutex_unlock(&cpuset_mutex);
>> @@ -1494,6 +1500,9 @@ static void cpuset_attach(struct cgroup *cgrp, struct 
>> cgroup_taskset *tset)
>>   struct cpuset *cs = cgroup_cs(cgrp);
>>   struct cpuset *oldcs = cgroup_cs(oldcgrp);
>>   +    if (!ve_is_super(get_exec_env()))
>> +    return;
>> +
>>   mutex_lock(&cpuset_mutex);
>>     /* prepare for attach */
>>
> 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] cgroup/cpuset: emulate cgroup in container

2017-12-13 Thread Stanislav Kinsburskiy
Any changes to this cgroup are skipped in container, but success code is
returned.
The idea is to fool Docker/Kubernetes.

https://jira.sw.ru/browse/PSBM-58423

This patch obsoletes "ve/proc/cpuset: do not show cpuset in CT"

Signed-off-by: Stanislav Kinsburskiy 
---
 kernel/cpuset.c |9 +
 1 file changed, 9 insertions(+)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 26d88eb..dfac505 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1441,6 +1441,9 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct 
cgroup_taskset *tset)
struct task_struct *task;
int ret;
 
+   if (!ve_is_super(get_exec_env()))
+   return 0;
+
mutex_lock(&cpuset_mutex);
 
ret = -ENOSPC;
@@ -1470,6 +1473,9 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct 
cgroup_taskset *tset)
 static void cpuset_cancel_attach(struct cgroup *cgrp,
 struct cgroup_taskset *tset)
 {
+   if (!ve_is_super(get_exec_env()))
+   return;
+
mutex_lock(&cpuset_mutex);
cgroup_cs(cgrp)->attach_in_progress--;
mutex_unlock(&cpuset_mutex);
@@ -1494,6 +1500,9 @@ static void cpuset_attach(struct cgroup *cgrp, struct 
cgroup_taskset *tset)
struct cpuset *cs = cgroup_cs(cgrp);
struct cpuset *oldcs = cgroup_cs(oldcgrp);
 
+   if (!ve_is_super(get_exec_env()))
+   return;
+
mutex_lock(&cpuset_mutex);
 
/* prepare for attach */

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] ve: fix container stopped state check

2017-12-01 Thread Stanislav Kinsburskiy
Checking for empty cgroup is not correct, because init process leaves cgroup
early in do_exit.
This leads to a situation, when container is treated as stopped but its
resources (VEIP for instance) are not yet released.
Which in turn leads to container restart failure due to non-releases VEIP
address.

https://jira.sw.ru/browse/PSBM-78078

Signed-off-by: Stanislav Kinsburskiy 
---
 kernel/ve/ve.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index b0188c3..c628516 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -846,7 +846,7 @@ static int ve_state_read(struct cgroup *cg, struct cftype 
*cft,
 
if (ve->is_running)
seq_puts(m, "RUNNING");
-   else if (!nr_threads_ve(ve))
+   else if (!ve->init_task)
seq_puts(m, "STOPPED");
else if (ve->ve_ns)
seq_puts(m, "STOPPING");

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] venet: remove obsolete tgt_veip variable

2017-11-27 Thread Stanislav Kinsburskiy
This was used for "vzredir" feature in vz6 and not used anymore.

Signed-off-by: Stanislav Kinsburskiy 
---
 drivers/net/venetdev.c |   12 +++-
 include/linux/venet.h  |1 -
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/drivers/net/venetdev.c b/drivers/net/venetdev.c
index 11f4a66..dffebdc 100644
--- a/drivers/net/venetdev.c
+++ b/drivers/net/venetdev.c
@@ -256,8 +256,7 @@ static void __veip_stop(struct ve_struct *ve)
ptr = list_entry(p, struct ip_entry_struct, ve_list);
ptr->active_env = NULL;
 
-   if (ptr->tgt_veip == NULL)
-   ip_entry_unhash(ptr);
+   ip_entry_unhash(ptr);
}
 
veip_pool_ops->veip_release(ve);
@@ -277,8 +276,6 @@ static int veip_entry_conflict(struct ip_entry_struct 
*entry, struct ve_struct *
 {
if (entry->active_env != NULL)
return -EADDRINUSE;
-   if (entry->tgt_veip && entry->tgt_veip->veid != ve->veid)
-   return -EADDRNOTAVAIL;
 
entry->active_env = ve;
return 0;
@@ -340,8 +337,7 @@ static int veip_entry_del(struct ve_struct *ve, struct 
ve_addr_struct *addr)
err = 0;
found->active_env = NULL;
 
-   if (found->tgt_veip == NULL)
-   ip_entry_unhash(found);
+   ip_entry_unhash(found);
 out:
spin_unlock(&veip_lock);
return err;
@@ -891,7 +887,6 @@ static int veip_seq_show(struct seq_file *m, void *v)
 {
struct hlist_node *p;
struct ip_entry_struct *entry;
-   struct veip_struct *veip;
char s[40];
 
if (v == SEQ_START_TOKEN) {
@@ -902,8 +897,7 @@ static int veip_seq_show(struct seq_file *m, void *v)
p = (struct hlist_node *)v;
entry = hlist_entry(p, struct ip_entry_struct, ip_hash);
veaddr_print(s, sizeof(s), &entry->addr);
-   veip = ACCESS_ONCE(entry->tgt_veip);
-   seq_printf(m, "%39s %10u\n", s, veip == NULL ? 0 : veip->veid);
+   seq_printf(m, "%39s 0\n", s);
return 0;
 }
 
diff --git a/include/linux/venet.h b/include/linux/venet.h
index 7562996..08d89bc 100644
--- a/include/linux/venet.h
+++ b/include/linux/venet.h
@@ -28,7 +28,6 @@ struct ip_entry_struct
 {
struct ve_addr_struct   addr;
struct ve_struct*active_env;
-   struct veip_struct  *tgt_veip;
struct hlist_node   ip_hash;
union {
struct list_headve_list;

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] venet: VEIP release debug patch

2017-11-27 Thread Stanislav Kinsburskiy
Sorry, ignore

27.11.2017 15:48, Stanislav Kinsburskiy пишет:
> Needed to investigate VEIP release - CT stop race.
> 
> https://jira.sw.ru/browse/PSBM-78078
> 
> Signed-off-by: Stanislav Kinsburskiy 
> ---
>  drivers/net/venetdev.c |6 +-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/venetdev.c b/drivers/net/venetdev.c
> index 11f4a66..dcdb51d 100644
> --- a/drivers/net/venetdev.c
> +++ b/drivers/net/venetdev.c
> @@ -256,8 +256,12 @@ static void __veip_stop(struct ve_struct *ve)
>   ptr = list_entry(p, struct ip_entry_struct, ve_list);
>   ptr->active_env = NULL;
>  
> - if (ptr->tgt_veip == NULL)
> + if (ptr->tgt_veip == NULL) {
> + printk("%s: removing IP for ve %d\n", __func__,
> + ptr->tgt_veip->veid);
> + dump_stack();
>   ip_entry_unhash(ptr);
> + }
>   }
>  
>   veip_pool_ops->veip_release(ve);
> 
> ___
> Devel mailing list
> Devel@openvz.org
> https://lists.openvz.org/mailman/listinfo/devel
> 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] venet: VEIP release debug patch

2017-11-27 Thread Stanislav Kinsburskiy
Needed to investigate VEIP release - CT stop race.

https://jira.sw.ru/browse/PSBM-78078

Signed-off-by: Stanislav Kinsburskiy 
---
 drivers/net/venetdev.c |6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/venetdev.c b/drivers/net/venetdev.c
index 11f4a66..dcdb51d 100644
--- a/drivers/net/venetdev.c
+++ b/drivers/net/venetdev.c
@@ -256,8 +256,12 @@ static void __veip_stop(struct ve_struct *ve)
ptr = list_entry(p, struct ip_entry_struct, ve_list);
ptr->active_env = NULL;
 
-   if (ptr->tgt_veip == NULL)
+   if (ptr->tgt_veip == NULL) {
+   printk("%s: removing IP for ve %d\n", __func__,
+   ptr->tgt_veip->veid);
+   dump_stack();
ip_entry_unhash(ptr);
+   }
}
 
veip_pool_ops->veip_release(ve);

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] netfilter: kill the fake untracked conntrack objects

2017-11-24 Thread Stanislav Kinsburskiy
resurrect an old patch from Pablo Neira to remove the untracked objects.

Currently, there are four possible states of an skb wrt. conntrack.

1. No conntrack attached, ct is NULL.
2. Normal (kmem cache allocated) ct attached.
3. a template (kmalloc'd), not in any hash tables at any point in time
4. the 'untracked' conntrack, a percpu nf_conn object, tagged via
IPS_UNTRACKED_BIT in ct->status.

Untracked is supposed to be identical to case 1.  It exists only
so users can check

-m conntrack --ctstate UNTRACKED vs.
-m conntrack --ctstate INVALID

e.g. attempts to set connmark on INVALID or UNTRACKED conntracks is
supposed to be a no-op.

Thus currently we need to check
ct == NULL || nf_ct_is_untracked(ct)

in a lot of places in order to avoid altering untracked objects.

The other consequence of the percpu untracked object is that all
-j NOTRACK (and, later, kfree_skb of such skbs) result in an atomic op
(inc/dec the untracked conntracks refcount).

This adds a new kernel-private ctinfo state, IP_CT_UNTRACKED, to
make the distinction instead.

The (few) places that care about packet invalid (ct is NULL) vs.
packet untracked now need to test ct == NULL vs. ctinfo ==
IP_CT_UNTRACKED,
but all other places can omit the nf_ct_is_untracked() check.

https://jira.sw.ru/browse/PSBM-71153

Signed-off-by: Florian Westphal 
Signed-off-by: Pablo Neira Ayuso 
Signed-off-by: Stanislav Kinsburskiy 
---
 include/net/ip_vs.h|5 +-
 include/net/netfilter/nf_conntrack.h   |   10 -
 include/uapi/linux/netfilter/nf_conntrack_common.h |6 ++-
 net/ipv4/netfilter/nf_dup_ipv4.c   |5 +-
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |5 +-
 net/ipv6/netfilter/nf_dup_ipv6.c   |5 +-
 net/netfilter/nf_conntrack_core.c  |   42 ++--
 net/netfilter/nf_nat_core.c|3 -
 net/netfilter/xt_CT.c  |   21 +-
 net/netfilter/xt_conntrack.c   |   13 +++---
 net/netfilter/xt_state.c   |   13 +++---
 11 files changed, 41 insertions(+), 87 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index e1599ad..69d2a9a 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1540,9 +1540,8 @@ static inline void ip_vs_notrack(struct sk_buff *skb)
 
if (!ct || !nf_ct_is_untracked(ct)) {
nf_conntrack_put(skb->nfct);
-   skb->nfct = &nf_ct_untracked_get()->ct_general;
-   skb->nfctinfo = IP_CT_NEW;
-   nf_conntrack_get(skb->nfct);
+   skb->nfct = NULL;
+   skb->nfctinfo = IP_CT_UNTRACKED;
}
 #endif
 }
diff --git a/include/net/netfilter/nf_conntrack.h 
b/include/net/netfilter/nf_conntrack.h
index edb8911..b278973 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -232,14 +232,6 @@ extern s32 (*nf_ct_nat_offset)(const struct nf_conn *ct,
   enum ip_conntrack_dir dir,
   u32 seq);
 
-/* Fake conntrack entry for untracked connections */
-DECLARE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
-static inline struct nf_conn *nf_ct_untracked_get(void)
-{
-   return &__raw_get_cpu_var(nf_conntrack_untracked);
-}
-void nf_ct_untracked_status_or(unsigned long bits);
-
 /* Iterate over all conntracks: if iter returns true, it's deleted. */
 void nf_ct_iterate_cleanup(struct net *net,
   int (*iter)(struct nf_conn *i, void *data),
@@ -272,7 +264,7 @@ static inline int nf_ct_is_dying(struct nf_conn *ct)
 
 static inline int nf_ct_is_untracked(const struct nf_conn *ct)
 {
-   return test_bit(IPS_UNTRACKED_BIT, &ct->status);
+   return false;
 }
 
 /* Packet is received from loopback */
diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h 
b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 6d074d1..6c18960 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -28,12 +28,14 @@ enum ip_conntrack_info {
/* only for userspace compatibility */
 #ifndef __KERNEL__
IP_CT_NEW_REPLY = IP_CT_NUMBER,
+#else
+   IP_CT_UNTRACKED = 7,
 #endif
 };
 
 #define NF_CT_STATE_INVALID_BIT(1 << 0)
 #define NF_CT_STATE_BIT(ctinfo)(1 << ((ctinfo) % 
IP_CT_IS_REPLY + 1))
-#define NF_CT_STATE_UNTRACKED_BIT  (1 << (IP_CT_NUMBER + 1))
+#define NF_CT_STATE_UNTRACKED_BIT  (1 << (IP_CT_UNTRACKED + 1))
 
 /* Bitset representing status of connection. */
 enum ip_conntrack_status {
@@ -90,7 +92,7 @@ enum ip_conntrack_status {
IPS_TEMPLATE_BIT = 11,
IPS_TEMPLATE = (1 << IPS_TEMPLATE_BIT),
 
-   /* Conntrack is a fake untracke

[Devel] [PATCH v2] nfs: abort delegation in dying VE

2017-11-15 Thread Stanislav Kinsburskiy
Don't queue delegation request, if ve init is exiting.

https://jira.sw.ru/browse/PSBM-77061

v2: check ve is dying under rcu_lock()

Inspired-by: Kirill Tkhai 
Signed-off-by: Stanislav Kinsburskiy 
---
 fs/nfs/delegation.c |   18 +-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 66af497..1d1500c 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -189,15 +189,31 @@ void nfs_inode_reclaim_delegation(struct inode *inode, 
struct rpc_cred *cred,
nfs_inode_set_delegation(inode, cred, res);
 }
 
+static bool ve_abort_delegation(struct inode *inode)
+{
+   struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+   struct rpc_xprt *xprt;
+   bool abort;
+
+   rcu_read_lock();
+   xprt = rcu_dereference(clp->cl_rpcclient->cl_xprt);
+   abort = xprt->xprt_net->owner_ve->ve_netns == NULL;
+   rcu_read_unlock();
+
+   return abort;
+}
+
 static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation 
*delegation, int issync)
 {
int res = 0;
 
-   if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+   if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) &&
+   !ve_abort_delegation(inode)) {
res = nfs4_proc_delegreturn(inode,
delegation->cred,
&delegation->stateid,
issync);
+   }
nfs_free_delegation(delegation);
return res;
 }

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] nfs: abort delegation in dying VE

2017-11-15 Thread Stanislav Kinsburskiy
Don't queue delegation request, if ve init is exiting.

https://jira.sw.ru/browse/PSBM-77061

Inspired-by: Kirill Tkhai 
Signed-off-by: Stanislav Kinsburskiy 
---
 fs/nfs/delegation.c |   16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 66af497..2422754 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -189,15 +189,29 @@ void nfs_inode_reclaim_delegation(struct inode *inode, 
struct rpc_cred *cred,
nfs_inode_set_delegation(inode, cred, res);
 }
 
+static bool ve_abort_delegation(struct inode *inode)
+{
+   struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+   struct rpc_xprt *xprt;
+
+   rcu_read_lock();
+   xprt = rcu_dereference(clp->cl_rpcclient->cl_xprt);
+   rcu_read_unlock();
+
+   return xprt->xprt_net->owner_ve->ve_netns == NULL;
+}
+
 static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation 
*delegation, int issync)
 {
int res = 0;
 
-   if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+   if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) &&
+   !ve_abort_delegation(inode)) {
res = nfs4_proc_delegreturn(inode,
delegation->cred,
&delegation->stateid,
issync);
+   }
nfs_free_delegation(delegation);
return res;
 }

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH v6 0/2] nfs: fix race between callback shutdown and execution

2017-11-13 Thread Stanislav Kinsburskiy
The idea is to use mutex for protecting callback execution agains per-net
callback shutdown and destroying all the net-related backchannel requests
before transports destruction.

---

Stanislav Kinsburskiy (2):
  sunrpc: bc_svc_flush_queue_net() helper introduced
  nfs: protect callback execution against per-net callback thread shutdown


 fs/nfs/callback.c  |   20 
 include/linux/sunrpc/svc.h |3 +++
 net/sunrpc/svc.c   |   15 +++
 3 files changed, 38 insertions(+)

--
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH v6 2/2] nfs: protect callback execution against per-net callback thread shutdown

2017-11-13 Thread Stanislav Kinsburskiy
From: Stanislav Kinsburskiy 

Here is the race:

CPU #0  CPU#1

cleanup_mnt nfs41_callback_svc (get xprt from the list)
nfs_callback_down   ...
... ...
svc_close_net   ...
... ...
svc_xprt_free   ...
svc_bc_sock_freebc_svc_process
kfree(xprt) svc_process_common
rqstp->rq_xprt->xpt_ops (use after free)

The problem is that per-net SUNRPC transports shutdown is done regardless
current callback execution. This is a race leading to transport use-after-free
in callback handler.
This patch fixes it in stright-forward way. I.e. it protects callback
execution with the same mutex used for per-net data creation and destruction.
Hopefully, it won't slow down NFS client significantly.

https://jira.sw.ru/browse/PSBM-75751

v6: destroy all per-net backchannel requests only for NFSv4.1

v5: destroy all per-net backchannel requests before transports on in
nfs_callback_down_net

v4: use another mutex to protect callback execution agains per-net transports
shutdown.
This guarantees, that transports won't be destroyed by shutdown callback while
execution is in progress and vice versa.

v3: Fix mutex deadlock, when shutdown callback waits for thread to exit (with
mutex taken), while thread wait for the mutex to take.
The idea is to simply check if thread has to exit, if mutex lock has failed.
This is a busy loop, but it shouldn't happend often and for long.

Signed-off-by: Stanislav Kinsburskiy 
---
 fs/nfs/callback.c  |   20 
 include/linux/sunrpc/svc.h |1 +
 2 files changed, 21 insertions(+)

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 0beb275..feffccf 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -99,6 +99,8 @@ nfs4_callback_up(struct svc_serv *serv)
 }
 
 #if defined(CONFIG_NFS_V4_1)
+static DEFINE_MUTEX(nfs41_callback_mutex);
+
 /*
  * The callback service for NFSv4.1 callbacks
  */
@@ -117,6 +119,12 @@ nfs41_callback_svc(void *vrqstp)
if (try_to_freeze())
continue;
 
+   mutex_lock(&nfs41_callback_mutex);
+   if (kthread_should_stop()) {
+   mutex_unlock(&nfs41_callback_mutex);
+   return 0;
+   }
+
prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
spin_lock_bh(&serv->sv_cb_lock);
if (!list_empty(&serv->sv_cb_list)) {
@@ -129,8 +137,10 @@ nfs41_callback_svc(void *vrqstp)
error = bc_svc_process(serv, req, rqstp);
dprintk("bc_svc_process() returned w/ error code= %d\n",
error);
+   mutex_unlock(&nfs41_callback_mutex);
} else {
spin_unlock_bh(&serv->sv_cb_lock);
+   mutex_unlock(&nfs41_callback_mutex);
schedule();
finish_wait(&serv->sv_cb_waitq, &wq);
}
@@ -139,6 +149,13 @@ nfs41_callback_svc(void *vrqstp)
return 0;
 }
 
+static void nfs41_callback_down_net(struct svc_serv *serv, struct net *net)
+{
+   mutex_lock(&nfs41_callback_mutex);
+   bc_svc_flush_queue_net(serv, net);
+   mutex_unlock(&nfs41_callback_mutex);
+}
+
 /*
  * Bring up the NFSv4.1 callback service
  */
@@ -150,6 +167,7 @@ nfs41_callback_up(struct svc_serv *serv)
INIT_LIST_HEAD(&serv->sv_cb_list);
spin_lock_init(&serv->sv_cb_lock);
init_waitqueue_head(&serv->sv_cb_waitq);
+   serv->svc_cb_down_net = nfs41_callback_down_net;
rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp));
return rqstp;
@@ -242,6 +260,8 @@ static void nfs_callback_down_net(u32 minorversion, struct 
svc_serv *serv, struc
return;
 
dprintk("NFS: destroy per-net callback data; net=%p\n", net);
+   if (serv->svc_cb_down_net)
+   serv->svc_cb_down_net(serv, net);
svc_shutdown_net(serv, net);
 }
 
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index fe70ff0..c04ef80 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -108,6 +108,7 @@ struct svc_serv {
wait_queue_head_t   sv_cb_waitq;/* sleep here if there are no
 * entries in the svc_cb_list */
struct svc_xprt *sv_bc_xprt;/* callback on fore channel */
+   void(*svc_cb_down_net)(struct svc_serv *serv, 
struct net *net);
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 };
 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH v6 1/2] sunrpc: bc_svc_flush_queue_net() helper introduced

2017-11-13 Thread Stanislav Kinsburskiy
From: Stanislav Kinsburskiy 

This helper can be used to remove backchannel requests from callback queue on
per-net basis.

Signed-off-by: Stanislav Kinsburskiy 
---
 include/linux/sunrpc/svc.h |2 ++
 net/sunrpc/svc.c   |   15 +++
 2 files changed, 17 insertions(+)

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 2b30868..fe70ff0 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -484,6 +484,8 @@ void   svc_reserve(struct svc_rqst *rqstp, 
int space);
 struct svc_pool *  svc_pool_for_cpu(struct svc_serv *serv, int cpu);
 char *svc_print_addr(struct svc_rqst *, char *, size_t);
 
+void bc_svc_flush_queue_net(struct svc_serv *serv, struct net *net);
+
 #defineRPC_MAX_ADDRBUFLEN  (63U)
 
 /*
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index de8cded..2ca4ff7 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1338,6 +1338,21 @@ svc_process(struct svc_rqst *rqstp)
 EXPORT_SYMBOL_GPL(svc_process);
 
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
+void bc_svc_flush_queue_net(struct svc_serv *serv, struct net *net)
+{
+   struct rpc_rqst *req, *tmp;
+
+   spin_lock_bh(&serv->sv_cb_lock);
+   list_for_each_entry_safe(req, tmp, &serv->sv_cb_list, rq_bc_list) {
+   if (req->rq_xprt->xprt_net == net) {
+   list_del(&req->rq_bc_list);
+   xprt_free_bc_request(req);
+   }
+   }
+   spin_unlock_bh(&serv->sv_cb_lock);
+}
+EXPORT_SYMBOL_GPL(bc_svc_flush_queue_net);
+
 /*
  * Process a backchannel RPC request that arrived over an existing
  * outbound connection

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH v5 2/2] nfs: protect callback execution against per-net callback thread shutdown

2017-11-09 Thread Stanislav Kinsburskiy


09.11.2017 10:33, Kirill Tkhai пишет:
> 
> Looks good for me, except there is one more call of nfs_callback_down_net() 
> in nfs_callback_up().
> It's not clear for me, we shouldn't protect it via the mutex too. Should we?
> 

Looks like no, mount is not ready yet and there is no connection to server, 
thus no request from server can
come to the moment and we don't need any additional protection here.
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH v5 2/2] nfs: protect callback execution against per-net callback thread shutdown

2017-11-08 Thread Stanislav Kinsburskiy
From: Stanislav Kinsburskiy 

Here is the race:

CPU #0  CPU#1

cleanup_mnt nfs41_callback_svc (get xprt from the list)
nfs_callback_down   ...
... ...
svc_close_net   ...
... ...
svc_xprt_free   ...
svc_bc_sock_freebc_svc_process
kfree(xprt) svc_process_common
rqstp->rq_xprt->xpt_ops (use after free)

The problem is that per-net SUNRPC transports shutdown is done regardless
current callback execution. This is a race leading to transport use-after-free
in callback handler.
This patch fixes it in stright-forward way. I.e. it protects callback
execution with the same mutex used for per-net data creation and destruction.
Hopefully, it won't slow down NFS client significantly.

https://jira.sw.ru/browse/PSBM-75751

v5: destroy all per-net backchannel requests before transports on in
nfs_callback_down_net

v4: use another mutex to protect callback execution agains per-net transports
shutdown.
This guarantees, that transports won't be destroyed by shutdown callback while
execution is in progress and vice versa.

v3: Fix mutex deadlock, when shutdown callback waits for thread to exit (with
mutex taken), while thread wait for the mutex to take.
The idea is to simply check if thread has to exit, if mutex lock has failed.
This is a busy loop, but it shouldn't happend often and for long.

Signed-off-by: Stanislav Kinsburskiy 
---
 fs/nfs/callback.c |   17 +
 1 file changed, 17 insertions(+)

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 0beb275..e18d774 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -99,6 +99,8 @@ nfs4_callback_up(struct svc_serv *serv)
 }
 
 #if defined(CONFIG_NFS_V4_1)
+static DEFINE_MUTEX(nfs41_callback_mutex);
+
 /*
  * The callback service for NFSv4.1 callbacks
  */
@@ -117,6 +119,12 @@ nfs41_callback_svc(void *vrqstp)
if (try_to_freeze())
continue;
 
+   mutex_lock(&nfs41_callback_mutex);
+   if (kthread_should_stop()) {
+   mutex_unlock(&nfs41_callback_mutex);
+   return 0;
+   }
+
prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
spin_lock_bh(&serv->sv_cb_lock);
if (!list_empty(&serv->sv_cb_list)) {
@@ -129,8 +137,10 @@ nfs41_callback_svc(void *vrqstp)
error = bc_svc_process(serv, req, rqstp);
dprintk("bc_svc_process() returned w/ error code= %d\n",
error);
+   mutex_unlock(&nfs41_callback_mutex);
} else {
spin_unlock_bh(&serv->sv_cb_lock);
+   mutex_unlock(&nfs41_callback_mutex);
schedule();
finish_wait(&serv->sv_cb_waitq, &wq);
}
@@ -242,6 +252,7 @@ static void nfs_callback_down_net(u32 minorversion, struct 
svc_serv *serv, struc
return;
 
dprintk("NFS: destroy per-net callback data; net=%p\n", net);
+   bc_svc_flush_queue_net(serv, net);
svc_shutdown_net(serv, net);
 }
 
@@ -377,7 +388,13 @@ void nfs_callback_down(int minorversion, struct net *net)
struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
 
mutex_lock(&nfs_callback_mutex);
+#if defined(CONFIG_NFS_V4_1)
+   mutex_lock(&nfs41_callback_mutex);
+   nfs_callback_down_net(minorversion, cb_info->serv, net);
+   mutex_unlock(&nfs41_callback_mutex);
+#else
nfs_callback_down_net(minorversion, cb_info->serv, net);
+#endif
cb_info->users--;
if (cb_info->users == 0 && cb_info->task != NULL) {
kthread_stop(cb_info->task);

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH v5 0/2] nfs: fix race between callback shutdown and execution

2017-11-08 Thread Stanislav Kinsburskiy
The idea is to use mutex for protecting callback execution agains per-net
callback shutdown and destroying all the net-related backchannel requests
before transports destruction.


---

Stanislav Kinsburskiy (2):
  sunrpc: bc_svc_flush_queue_net() helper introduced
  nfs: protect callback execution against per-net callback thread shutdown


 fs/nfs/callback.c  |   17 +
 include/linux/sunrpc/svc.h |2 ++
 net/sunrpc/svc.c   |   15 +++
 3 files changed, 34 insertions(+)

--
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH v5 1/2] sunrpc: bc_svc_flush_queue_net() helper introduced

2017-11-08 Thread Stanislav Kinsburskiy
From: Stanislav Kinsburskiy 

This helper can be used to remove backchannel requests from callback queue on
per-net basis.

Signed-off-by: Stanislav Kinsburskiy 
---
 include/linux/sunrpc/svc.h |2 ++
 net/sunrpc/svc.c   |   15 +++
 2 files changed, 17 insertions(+)

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 2b30868..fe70ff0 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -484,6 +484,8 @@ void   svc_reserve(struct svc_rqst *rqstp, 
int space);
 struct svc_pool *  svc_pool_for_cpu(struct svc_serv *serv, int cpu);
 char *svc_print_addr(struct svc_rqst *, char *, size_t);
 
+void bc_svc_flush_queue_net(struct svc_serv *serv, struct net *net);
+
 #defineRPC_MAX_ADDRBUFLEN  (63U)
 
 /*
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index de8cded..2ca4ff7 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1338,6 +1338,21 @@ svc_process(struct svc_rqst *rqstp)
 EXPORT_SYMBOL_GPL(svc_process);
 
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
+void bc_svc_flush_queue_net(struct svc_serv *serv, struct net *net)
+{
+   struct rpc_rqst *req, *tmp;
+
+   spin_lock_bh(&serv->sv_cb_lock);
+   list_for_each_entry_safe(req, tmp, &serv->sv_cb_list, rq_bc_list) {
+   if (req->rq_xprt->xprt_net == net) {
+   list_del(&req->rq_bc_list);
+   xprt_free_bc_request(req);
+   }
+   }
+   spin_unlock_bh(&serv->sv_cb_lock);
+}
+EXPORT_SYMBOL_GPL(bc_svc_flush_queue_net);
+
 /*
  * Process a backchannel RPC request that arrived over an existing
  * outbound connection

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] nfs: protect callback execution against per-net callback thread shutdown

2017-11-08 Thread Stanislav Kinsburskiy
Please, ignore

08.11.2017 15:07, Stanislav Kinsburskiy пишет:
> From: Stanislav Kinsburskiy 
> 
> Here is the race:
> 
> CPU #0CPU#1
> 
> cleanup_mnt   nfs41_callback_svc (get xprt from the list)
> nfs_callback_down ...
> ...   ...
> svc_close_net ...
> ...   ...
> svc_xprt_free ...
> svc_bc_sock_free  bc_svc_process
> kfree(xprt)   svc_process_common
>   rqstp->rq_xprt->xpt_ops (use after free)
> 
> The problem is that per-net SUNRPC transports shutdown is done regardless
> current callback execution. This is a race leading to transport use-after-free
> in callback handler.
> This patch fixes it in stright-forward way. I.e. it protects callback
> execution with the same mutex used for per-net data creation and destruction.
> Hopefully, it won't slow down NFS client significantly.
> 
> https://jira.sw.ru/browse/PSBM-75751
> 
> v4: use another mutex to protect callback execution agains per-net transports
> shutdown.
> This guarantees, that transports won't be destroyed by shutdown callback while
> execution is in progress and vice versa.
> 
> v3: Fix mutex deadlock, when shutdown callback waits for thread to exit (with
> mutex taken), while thread wait for the mutex to take.
> The idea is to simply check if thread has to exit, if mutex lock has failed.
> This is a busy loop, but it shouldn't happend often and for long.
> 
> Signed-off-by: Stanislav Kinsburskiy 
> ---
>  fs/nfs/callback.c |   16 
>  1 file changed, 16 insertions(+)
> 
> diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
> index 0beb275..4d7979b 100644
> --- a/fs/nfs/callback.c
> +++ b/fs/nfs/callback.c
> @@ -99,6 +99,8 @@ nfs4_callback_up(struct svc_serv *serv)
>  }
>  
>  #if defined(CONFIG_NFS_V4_1)
> +static DEFINE_MUTEX(nfs41_callback_mutex);
> +
>  /*
>   * The callback service for NFSv4.1 callbacks
>   */
> @@ -117,6 +119,12 @@ nfs41_callback_svc(void *vrqstp)
>   if (try_to_freeze())
>   continue;
>  
> + mutex_lock(&nfs41_callback_mutex);
> + if (kthread_should_stop()) {
> + mutex_unlock(&nfs41_callback_mutex);
> + return 0;
> + }
> +
>   prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
>   spin_lock_bh(&serv->sv_cb_lock);
>   if (!list_empty(&serv->sv_cb_list)) {
> @@ -129,8 +137,10 @@ nfs41_callback_svc(void *vrqstp)
>   error = bc_svc_process(serv, req, rqstp);
>   dprintk("bc_svc_process() returned w/ error code= %d\n",
>   error);
> + mutex_unlock(&nfs41_callback_mutex);
>   } else {
>   spin_unlock_bh(&serv->sv_cb_lock);
> + mutex_unlock(&nfs41_callback_mutex);
>   schedule();
>   finish_wait(&serv->sv_cb_waitq, &wq);
>   }
> @@ -377,7 +387,13 @@ void nfs_callback_down(int minorversion, struct net *net)
>   struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
>  
>   mutex_lock(&nfs_callback_mutex);
> +#if defined(CONFIG_NFS_V4_1)
> + mutex_lock(&nfs41_callback_mutex);
> + nfs_callback_down_net(minorversion, cb_info->serv, net);
> + mutex_unlock(&nfs41_callback_mutex);
> +#else
>   nfs_callback_down_net(minorversion, cb_info->serv, net);
> +#endif
>   cb_info->users--;
>   if (cb_info->users == 0 && cb_info->task != NULL) {
>   kthread_stop(cb_info->task);
> 
> ___
> Devel mailing list
> Devel@openvz.org
> https://lists.openvz.org/mailman/listinfo/devel
> 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] nfs: protect callback execution against per-net callback thread shutdown

2017-11-08 Thread Stanislav Kinsburskiy
From: Stanislav Kinsburskiy 

Here is the race:

CPU #0  CPU#1

cleanup_mnt nfs41_callback_svc (get xprt from the list)
nfs_callback_down   ...
... ...
svc_close_net   ...
... ...
svc_xprt_free   ...
svc_bc_sock_freebc_svc_process
kfree(xprt) svc_process_common
rqstp->rq_xprt->xpt_ops (use after free)

The problem is that per-net SUNRPC transports shutdown is done regardless
current callback execution. This is a race leading to transport use-after-free
in callback handler.
This patch fixes it in stright-forward way. I.e. it protects callback
execution with the same mutex used for per-net data creation and destruction.
Hopefully, it won't slow down NFS client significantly.

https://jira.sw.ru/browse/PSBM-75751

v4: use another mutex to protect callback execution agains per-net transports
shutdown.
This guarantees, that transports won't be destroyed by shutdown callback while
execution is in progress and vice versa.

v3: Fix mutex deadlock, when shutdown callback waits for thread to exit (with
mutex taken), while thread wait for the mutex to take.
The idea is to simply check if thread has to exit, if mutex lock has failed.
This is a busy loop, but it shouldn't happend often and for long.

Signed-off-by: Stanislav Kinsburskiy 
---
 fs/nfs/callback.c |   16 
 1 file changed, 16 insertions(+)

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 0beb275..4d7979b 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -99,6 +99,8 @@ nfs4_callback_up(struct svc_serv *serv)
 }
 
 #if defined(CONFIG_NFS_V4_1)
+static DEFINE_MUTEX(nfs41_callback_mutex);
+
 /*
  * The callback service for NFSv4.1 callbacks
  */
@@ -117,6 +119,12 @@ nfs41_callback_svc(void *vrqstp)
if (try_to_freeze())
continue;
 
+   mutex_lock(&nfs41_callback_mutex);
+   if (kthread_should_stop()) {
+   mutex_unlock(&nfs41_callback_mutex);
+   return 0;
+   }
+
prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
spin_lock_bh(&serv->sv_cb_lock);
if (!list_empty(&serv->sv_cb_list)) {
@@ -129,8 +137,10 @@ nfs41_callback_svc(void *vrqstp)
error = bc_svc_process(serv, req, rqstp);
dprintk("bc_svc_process() returned w/ error code= %d\n",
error);
+   mutex_unlock(&nfs41_callback_mutex);
} else {
spin_unlock_bh(&serv->sv_cb_lock);
+   mutex_unlock(&nfs41_callback_mutex);
schedule();
finish_wait(&serv->sv_cb_waitq, &wq);
}
@@ -377,7 +387,13 @@ void nfs_callback_down(int minorversion, struct net *net)
struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
 
mutex_lock(&nfs_callback_mutex);
+#if defined(CONFIG_NFS_V4_1)
+   mutex_lock(&nfs41_callback_mutex);
+   nfs_callback_down_net(minorversion, cb_info->serv, net);
+   mutex_unlock(&nfs41_callback_mutex);
+#else
nfs_callback_down_net(minorversion, cb_info->serv, net);
+#endif
cb_info->users--;
if (cb_info->users == 0 && cb_info->task != NULL) {
kthread_stop(cb_info->task);

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH v4] nfs: protect callback execution against per-net callback thread shutdown

2017-11-08 Thread Stanislav Kinsburskiy
From: Stanislav Kinsburskiy 

Here is the race:

CPU #0  CPU#1

cleanup_mnt nfs41_callback_svc (get xprt from the list)
nfs_callback_down   ...
... ...
svc_close_net   ...
... ...
svc_xprt_free   ...
svc_bc_sock_freebc_svc_process
kfree(xprt) svc_process_common
rqstp->rq_xprt->xpt_ops (use after free)

The problem is that per-net SUNRPC transports shutdown is done regardless
current callback execution. This is a race leading to transport use-after-free
in callback handler.
This patch fixes it in stright-forward way. I.e. it protects callback
execution with the same mutex used for per-net data creation and destruction.
Hopefully, it won't slow down NFS client significantly.

https://jira.sw.ru/browse/PSBM-75751

v4: use another mutex to protect callback execution agains per-net transports
shutdown.
This guarantees, that transports won't be destroyed by shutdown callback while
execution is in progress and vice versa.

v3: Fix mutex deadlock, when shutdown callback waits for thread to exit (with
mutex taken), while thread wait for the mutex to take.
The idea is to simply check if thread has to exit, if mutex lock has failed.
This is a busy loop, but it shouldn't happend often and for long.

Signed-off-by: Stanislav Kinsburskiy 
---
 fs/nfs/callback.c |   16 
 1 file changed, 16 insertions(+)

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 0beb275..4d7979b 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -99,6 +99,8 @@ nfs4_callback_up(struct svc_serv *serv)
 }
 
 #if defined(CONFIG_NFS_V4_1)
+static DEFINE_MUTEX(nfs41_callback_mutex);
+
 /*
  * The callback service for NFSv4.1 callbacks
  */
@@ -117,6 +119,12 @@ nfs41_callback_svc(void *vrqstp)
if (try_to_freeze())
continue;
 
+   mutex_lock(&nfs41_callback_mutex);
+   if (kthread_should_stop()) {
+   mutex_unlock(&nfs41_callback_mutex);
+   return 0;
+   }
+
prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
spin_lock_bh(&serv->sv_cb_lock);
if (!list_empty(&serv->sv_cb_list)) {
@@ -129,8 +137,10 @@ nfs41_callback_svc(void *vrqstp)
error = bc_svc_process(serv, req, rqstp);
dprintk("bc_svc_process() returned w/ error code= %d\n",
error);
+   mutex_unlock(&nfs41_callback_mutex);
} else {
spin_unlock_bh(&serv->sv_cb_lock);
+   mutex_unlock(&nfs41_callback_mutex);
schedule();
finish_wait(&serv->sv_cb_waitq, &wq);
}
@@ -377,7 +387,13 @@ void nfs_callback_down(int minorversion, struct net *net)
struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
 
mutex_lock(&nfs_callback_mutex);
+#if defined(CONFIG_NFS_V4_1)
+   mutex_lock(&nfs41_callback_mutex);
+   nfs_callback_down_net(minorversion, cb_info->serv, net);
+   mutex_unlock(&nfs41_callback_mutex);
+#else
nfs_callback_down_net(minorversion, cb_info->serv, net);
+#endif
cb_info->users--;
if (cb_info->users == 0 && cb_info->task != NULL) {
kthread_stop(cb_info->task);

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH v3] nfs: protect callback execution against per-net callback thread shutdown

2017-11-07 Thread Stanislav Kinsburskiy
Please, see my comments below.

07.11.2017 12:30, Kirill Tkhai пишет:
> On 07.11.2017 13:39, Stanislav Kinsburskiy wrote:
>> From: Stanislav Kinsburskiy 
>>
>> Here is the race:
>>
>> CPU #0   CPU#1
>>
>> cleanup_mnt  nfs41_callback_svc (get xprt from the list)
>> nfs_callback_down...
>> ...  ...
>> svc_close_net...
>> ...  ...
>> svc_xprt_free...
>> svc_bc_sock_free bc_svc_process
>> kfree(xprt)  svc_process_common
>>  rqstp->rq_xprt->xpt_ops (use after free)
>>
>> The problem is that per-net SUNRPC transports shutdown is done regardless
>> current callback execution. This is a race leading to transport 
>> use-after-free
>> in callback handler.
>> This patch fixes it in stright-forward way. I.e. it protects callback
>> execution with the same mutex used for per-net data creation and destruction.
>> Hopefully, it won't slow down NFS client significantly.
>>
>> https://jira.sw.ru/browse/PSBM-75751
>>
>> v3: Fix mutex deadlock, when shutdown callback waits for thread to exit (with
>> mutex taken), while thread wait for the mutex to take.
>> The idea is to simply check if thread has to exit, if mutex lock has failed.
>> This is a busy loop, but it shouldn't happend often and for long.
>>
>> Signed-off-by: Stanislav Kinsburskiy 
>> ---
>>  fs/nfs/callback.c |6 ++
>>  1 file changed, 6 insertions(+)
>>
>> diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
>> index 0beb275..bbb07e4 100644
>> --- a/fs/nfs/callback.c
>> +++ b/fs/nfs/callback.c
>> @@ -117,7 +117,11 @@ nfs41_callback_svc(void *vrqstp)
>>  if (try_to_freeze())
>>  continue;
>>  
>> +if (!mutex_trylock(&nfs_callback_mutex))
>> +   continue;
> 
> This looks like a busy loop (that especially bad, because mutex-owner may 
> sleep
> at the moment we are looping).
> 

Well, yes. It a busy loop. And that's not good. But I have to mention, that 
this busy loop can happen only on umount request.

> It seems the solution may be to change nfs_callback_down() function.
> Can't we flush pending request in nfs_callback_down()?

Well, this already happens. The problem is in race between transport usage 
(unconditional) and its destruction (also unconditional).
IOW, there should be either reference counting or some critical section.
Looks like the former will need a way more code and thus more error-prone, the 
latter is uglier, but code should be simpler at first sight.


> Or just delete it from sv_cb_list without handling (does nfs proto allow 
> that)?
> 

Well, this looks like a promising idea. But yet again, there should be some 
protection against transport usage in shutdown helper.
And it's not yet clear, how to implement it without significant code change...

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH v3] nfs: protect callback execution against per-net callback thread shutdown

2017-11-07 Thread Stanislav Kinsburskiy
From: Stanislav Kinsburskiy 

Here is the race:

CPU #0  CPU#1

cleanup_mnt nfs41_callback_svc (get xprt from the list)
nfs_callback_down   ...
... ...
svc_close_net   ...
... ...
svc_xprt_free   ...
svc_bc_sock_freebc_svc_process
kfree(xprt) svc_process_common
rqstp->rq_xprt->xpt_ops (use after free)

The problem is that per-net SUNRPC transports shutdown is done regardless
current callback execution. This is a race leading to transport use-after-free
in callback handler.
This patch fixes it in stright-forward way. I.e. it protects callback
execution with the same mutex used for per-net data creation and destruction.
Hopefully, it won't slow down NFS client significantly.

https://jira.sw.ru/browse/PSBM-75751

v3: Fix mutex deadlock, when shutdown callback waits for thread to exit (with
mutex taken), while thread wait for the mutex to take.
The idea is to simply check if thread has to exit, if mutex lock has failed.
This is a busy loop, but it shouldn't happend often and for long.

Signed-off-by: Stanislav Kinsburskiy 
---
 fs/nfs/callback.c |6 ++
 1 file changed, 6 insertions(+)

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 0beb275..bbb07e4 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -117,7 +117,11 @@ nfs41_callback_svc(void *vrqstp)
if (try_to_freeze())
continue;
 
+   if (!mutex_trylock(&nfs_callback_mutex))
+  continue;
+
prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
+
spin_lock_bh(&serv->sv_cb_lock);
if (!list_empty(&serv->sv_cb_list)) {
req = list_first_entry(&serv->sv_cb_list,
@@ -129,8 +133,10 @@ nfs41_callback_svc(void *vrqstp)
error = bc_svc_process(serv, req, rqstp);
dprintk("bc_svc_process() returned w/ error code= %d\n",
error);
+   mutex_unlock(&nfs_callback_mutex);
} else {
spin_unlock_bh(&serv->sv_cb_lock);
+   mutex_unlock(&nfs_callback_mutex);
schedule();
finish_wait(&serv->sv_cb_waitq, &wq);
}

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH v2] nfs: protect callback execution against per-net callback thread shutdown

2017-11-07 Thread Stanislav Kinsburskiy
Sorry, this one is bad as well.

07.11.2017 11:28, Stanislav Kinsburskiy пишет:
> Here is the race:
> 
> CPU #0CPU#1
> 
> cleanup_mnt   nfs41_callback_svc (get xprt from the list)
> nfs_callback_down ...
> ...   ...
> svc_close_net ...
> ...   ...
> svc_xprt_free ...
> svc_bc_sock_free  bc_svc_process
> kfree(xprt)   svc_process_common
>   rqstp->rq_xprt->xpt_ops (use after free)
> 
> The problem is that per-net SUNRPC transports shutdown is done regardless
> current callback execution. This is a race leading to transport use-after-free
> in callback handler.
> This patch fixes it in stright-forward way. I.e. it protects callback
> execution with the same mutex used for per-net data creation and destruction.
> Hopefully, it won't slow down NFS client significantly.
> 
> https://jira.sw.ru/browse/PSBM-75751
> 
> v2: Fix mutex deadlock, when shutdown callback waits for thread to exit (with
> mutex taken), while thread wait for the mutex to take.
> 
> Signed-off-by: Stanislav Kinsburskiy 
> ---
>  fs/nfs/callback.c |7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
> index 0beb275..eed861a 100644
> --- a/fs/nfs/callback.c
> +++ b/fs/nfs/callback.c
> @@ -118,6 +118,11 @@ nfs41_callback_svc(void *vrqstp)
>   continue;
>  
>   prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
> +
> + if (!mutex_trylock(&nfs_callback_mutex) &&
> + kthread_should_stop())
> + return 0;
> +
>   spin_lock_bh(&serv->sv_cb_lock);
>   if (!list_empty(&serv->sv_cb_list)) {
>   req = list_first_entry(&serv->sv_cb_list,
> @@ -129,8 +134,10 @@ nfs41_callback_svc(void *vrqstp)
>   error = bc_svc_process(serv, req, rqstp);
>   dprintk("bc_svc_process() returned w/ error code= %d\n",
>   error);
> + mutex_unlock(&nfs_callback_mutex);
>   } else {
>   spin_unlock_bh(&serv->sv_cb_lock);
> + mutex_unlock(&nfs_callback_mutex);
>   schedule();
>   finish_wait(&serv->sv_cb_waitq, &wq);
>   }
> 
> ___
> Devel mailing list
> Devel@openvz.org
> https://lists.openvz.org/mailman/listinfo/devel
> 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH v2] nfs: protect callback execution against per-net callback thread shutdown

2017-11-07 Thread Stanislav Kinsburskiy
Here is the race:

CPU #0  CPU#1

cleanup_mnt nfs41_callback_svc (get xprt from the list)
nfs_callback_down   ...
... ...
svc_close_net   ...
... ...
svc_xprt_free   ...
svc_bc_sock_freebc_svc_process
kfree(xprt) svc_process_common
rqstp->rq_xprt->xpt_ops (use after free)

The problem is that per-net SUNRPC transports shutdown is done regardless
current callback execution. This is a race leading to transport use-after-free
in callback handler.
This patch fixes it in stright-forward way. I.e. it protects callback
execution with the same mutex used for per-net data creation and destruction.
Hopefully, it won't slow down NFS client significantly.

https://jira.sw.ru/browse/PSBM-75751

v2: Fix mutex deadlock, when shutdown callback waits for thread to exit (with
mutex taken), while thread wait for the mutex to take.

Signed-off-by: Stanislav Kinsburskiy 
---
 fs/nfs/callback.c |7 +++
 1 file changed, 7 insertions(+)

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 0beb275..eed861a 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -118,6 +118,11 @@ nfs41_callback_svc(void *vrqstp)
continue;
 
prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
+
+   if (!mutex_trylock(&nfs_callback_mutex) &&
+   kthread_should_stop())
+   return 0;
+
spin_lock_bh(&serv->sv_cb_lock);
if (!list_empty(&serv->sv_cb_list)) {
req = list_first_entry(&serv->sv_cb_list,
@@ -129,8 +134,10 @@ nfs41_callback_svc(void *vrqstp)
error = bc_svc_process(serv, req, rqstp);
dprintk("bc_svc_process() returned w/ error code= %d\n",
error);
+   mutex_unlock(&nfs_callback_mutex);
} else {
spin_unlock_bh(&serv->sv_cb_lock);
+   mutex_unlock(&nfs_callback_mutex);
schedule();
finish_wait(&serv->sv_cb_waitq, &wq);
}

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] nfs: protect callback execution against per-net callback thread shutdown

2017-11-07 Thread Stanislav Kinsburskiy
07.11.2017 11:03, Kirill Tkhai пишет:
>>> Couldn't this change introduce a deadlock like below?
>>>   [thread]
>>> nfs_callback_down()   
>>> nfs41_callback_svc()
>>>mutex_lock(&nfs_callback_mutex);   
>>>kthread_stop(cb_info->task);   
>>>   wake_up_process();  
>>>   wait_for_completion(&kthread->exited);  
> 
> And what about above one?
> 

Good catch. Need to think more about it then.
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] nfs: protect callback execution against per-net callback thread shutdown

2017-11-07 Thread Stanislav Kinsburskiy
Sure, here it is:

CPU #0  CPU#1

cleanup_mnt nfs41_callback_svc
... (get transport from the list)
nfs_callback_down   ...
... ...
svc_close_net   ...
... ...
svc_xprt_free   ...
svc_bc_sock_freebc_svc_process
kfree(xprt) svc_process_common
rqstp->rq_xprt->xpt_ops (use 
after free)



07.11.2017 10:49, Kirill Tkhai пишет:
> On 03.11.2017 19:47, Stanislav Kinsburskiy wrote:
>> From: Stanislav Kinsburskiy 
>>
>> The problem is that per-net SUNRPC transports shutdown is done regardless
>> current callback execution. This is a race leading to transport 
>> use-after-free
>> in callback handler.
> 
> Could you please draw the race to show the interaction between functions?
> 
>> This patch fixes it in stright-forward way. I.e. it protects callback
>> execution with the same mutex used for per-net data creation and destruction.
>> Hopefully, it won't slow down NFS client significantly.
>>
>> https://jira.sw.ru/browse/PSBM-75751
>>
>> Signed-off-by: Stanislav Kinsburskiy 
>> ---
>>  fs/nfs/callback.c |3 +++
>>  1 file changed, 3 insertions(+)
>>
>> diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
>> index 0beb275..82e8ed1 100644
>> --- a/fs/nfs/callback.c
>> +++ b/fs/nfs/callback.c
>> @@ -118,6 +118,7 @@ nfs41_callback_svc(void *vrqstp)
>>  continue;
>>  
>>  prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
>> +mutex_lock(&nfs_callback_mutex);
>>  spin_lock_bh(&serv->sv_cb_lock);
>>  if (!list_empty(&serv->sv_cb_list)) {
>>  req = list_first_entry(&serv->sv_cb_list,
>> @@ -129,8 +130,10 @@ nfs41_callback_svc(void *vrqstp)
>>  error = bc_svc_process(serv, req, rqstp);
>>  dprintk("bc_svc_process() returned w/ error code= %d\n",
>>  error);
>> +mutex_unlock(&nfs_callback_mutex);
>>  } else {
>>  spin_unlock_bh(&serv->sv_cb_lock);
>> +mutex_unlock(&nfs_callback_mutex);
>>  schedule();
>>  finish_wait(&serv->sv_cb_waitq, &wq);
>>  }
> 
> Couldn't this change introduce a deadlock like below?
>   [thread]
> nfs_callback_down()   nfs41_callback_svc()
>mutex_lock(&nfs_callback_mutex);   
>kthread_stop(cb_info->task);   
>   wake_up_process();  
>   wait_for_completion(&kthread->exited);  
> 
>   
> mutex_lock(&nfs_callback_mutex); 
> 
> 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] nfs: protect callback execution against per-net callback thread shutdown

2017-11-07 Thread Stanislav Kinsburskiy
Well... Maybe.
Let's check, how it works with our kernel.

07.11.2017 10:16, Konstantin Khorenko пишет:
> Going to send it to mainstream as well?
> 
> -- 
> Best regards,
> 
> Konstantin Khorenko,
> Virtuozzo Linux Kernel Team
> 
> On 11/03/2017 07:47 PM, Stanislav Kinsburskiy wrote:
>> From: Stanislav Kinsburskiy 
>>
>> The problem is that per-net SUNRPC transports shutdown is done regardless
>> current callback execution. This is a race leading to transport 
>> use-after-free
>> in callback handler.
>> This patch fixes it in stright-forward way. I.e. it protects callback
>> execution with the same mutex used for per-net data creation and destruction.
>> Hopefully, it won't slow down NFS client significantly.
>>
>> https://jira.sw.ru/browse/PSBM-75751
>>
>> Signed-off-by: Stanislav Kinsburskiy 
>> ---
>>  fs/nfs/callback.c |    3 +++
>>  1 file changed, 3 insertions(+)
>>
>> diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
>> index 0beb275..82e8ed1 100644
>> --- a/fs/nfs/callback.c
>> +++ b/fs/nfs/callback.c
>> @@ -118,6 +118,7 @@ nfs41_callback_svc(void *vrqstp)
>>  continue;
>>
>>  prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
>> +    mutex_lock(&nfs_callback_mutex);
>>  spin_lock_bh(&serv->sv_cb_lock);
>>  if (!list_empty(&serv->sv_cb_list)) {
>>  req = list_first_entry(&serv->sv_cb_list,
>> @@ -129,8 +130,10 @@ nfs41_callback_svc(void *vrqstp)
>>  error = bc_svc_process(serv, req, rqstp);
>>  dprintk("bc_svc_process() returned w/ error code= %d\n",
>>  error);
>> +    mutex_unlock(&nfs_callback_mutex);
>>  } else {
>>  spin_unlock_bh(&serv->sv_cb_lock);
>> +    mutex_unlock(&nfs_callback_mutex);
>>  schedule();
>>  finish_wait(&serv->sv_cb_waitq, &wq);
>>  }
>>
>> ___
>> Devel mailing list
>> Devel@openvz.org
>> https://lists.openvz.org/mailman/listinfo/devel
>> .
>>
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] nfs: protect callback execution against per-net callback thread shutdown

2017-11-03 Thread Stanislav Kinsburskiy
From: Stanislav Kinsburskiy 

The problem is that per-net SUNRPC transports shutdown is done regardless
current callback execution. This is a race leading to transport use-after-free
in callback handler.
This patch fixes it in stright-forward way. I.e. it protects callback
execution with the same mutex used for per-net data creation and destruction.
Hopefully, it won't slow down NFS client significantly.

https://jira.sw.ru/browse/PSBM-75751

Signed-off-by: Stanislav Kinsburskiy 
---
 fs/nfs/callback.c |3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 0beb275..82e8ed1 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -118,6 +118,7 @@ nfs41_callback_svc(void *vrqstp)
continue;
 
prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
+   mutex_lock(&nfs_callback_mutex);
spin_lock_bh(&serv->sv_cb_lock);
if (!list_empty(&serv->sv_cb_list)) {
req = list_first_entry(&serv->sv_cb_list,
@@ -129,8 +130,10 @@ nfs41_callback_svc(void *vrqstp)
error = bc_svc_process(serv, req, rqstp);
dprintk("bc_svc_process() returned w/ error code= %d\n",
error);
+   mutex_unlock(&nfs_callback_mutex);
} else {
spin_unlock_bh(&serv->sv_cb_lock);
+   mutex_unlock(&nfs_callback_mutex);
schedule();
finish_wait(&serv->sv_cb_waitq, &wq);
}

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH rh7] netfilter: Allow xt_owner in any user namespace

2017-10-17 Thread Stanislav Kinsburskiy


17.10.2017 08:53, Andrei Vagin пишет:
> On Mon, Oct 16, 2017 at 05:50:38PM +0200, Stanislav Kinsburskiy wrote:
>> Well, patch looks ok.
>> But shouldn't all the ve_init_user_ns() replaced by the par->net?
> 
> This patch does this.
> 

Yes, but not everywhere.
Say, there are owner_mt_ve0 and owner_mt6_ve0.
Shouldn't there functions also patched?

>>
>> 14.10.2017 01:20, Andrei Vagin пишет:
>>> From: "Eric W. Biederman" 
>>>
>>> ML: 9847371a84b0be330f4bc4aaa98904101ee8573d
>>> https://jira.sw.ru/browse/PSBM-69409?
>>>
>>> Making this work is a little tricky as it really isn't kosher to
>>> change the xt_owner_match_info in a check function.
>>>
>>> Without changing xt_owner_match_info we need to know the user
>>> namespace the uids and gids are specified in.  In the common case
>>> net->user_ns == current_user_ns().  Verify net->user_ns ==
>>> current_user_ns() in owner_check so we can later assume it in
>>> owner_mt.
>>>
>>> In owner_check also verify that all of the uids and gids specified are
>>> in net->user_ns and that the expected min/max relationship exists
>>> between the uids and gids in xt_owner_match_info.
>>>
>>> In owner_mt get the network namespace from the outgoing socket, as this
>>> must be the same network namespace as the netfilter rules, and use that
>>> network namespace to find the user namespace the uids and gids in
>>> xt_match_owner_info are encoded in.  Then convert from their encoded
>>> from into the kernel internal format for uids and gids and perform the
>>> owner match.
>>>
>>> Similar to ping_group_range, this code does not try to detect
>>> noncontiguous UID/GID ranges.
>>>
>>> Signed-off-by: "Eric W. Biederman" 
>>> Signed-off-by: Kevin Cernekee 
>>> Signed-off-by: Pablo Neira Ayuso 
>>> Signed-off-by: Andrei Vagin 
>>> ---
>>>  net/netfilter/xt_owner.c | 41 +++--
>>>  1 file changed, 35 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
>>> index 31dec4a..1744f78 100644
>>> --- a/net/netfilter/xt_owner.c
>>> +++ b/net/netfilter/xt_owner.c
>>> @@ -80,11 +80,39 @@ owner_mt6_v0(const struct sk_buff *skb, struct 
>>> xt_action_param *par)
>>>  static int owner_check(const struct xt_mtchk_param *par)
>>>  {
>>> struct xt_owner_match_info *info = par->matchinfo;
>>> +   struct net *net = par->net;
>>>  
>>> -   /* For now only allow adding matches from the initial user namespace */
>>> +   /* Only allow the common case where the userns of the writer
>>> +* matches the userns of the network namespace.
>>> +*/
>>> if ((info->match & (XT_OWNER_UID|XT_OWNER_GID)) &&
>>> -   !current_user_ns_initial())
>>> +   (current_user_ns() != net->user_ns))
>>> return -EINVAL;
>>> +
>>> +   /* Ensure the uids are valid */
>>> +   if (info->match & XT_OWNER_UID) {
>>> +   kuid_t uid_min = make_kuid(net->user_ns, info->uid_min);
>>> +   kuid_t uid_max = make_kuid(net->user_ns, info->uid_max);
>>> +
>>> +   if (!uid_valid(uid_min) || !uid_valid(uid_max) ||
>>> +   (info->uid_max < info->uid_min) ||
>>> +   uid_lt(uid_max, uid_min)) {
>>> +   return -EINVAL;
>>> +   }
>>> +   }
>>> +
>>> +   /* Ensure the gids are valid */
>>> +   if (info->match & XT_OWNER_GID) {
>>> +   kgid_t gid_min = make_kgid(net->user_ns, info->gid_min);
>>> +   kgid_t gid_max = make_kgid(net->user_ns, info->gid_max);
>>> +
>>> +   if (!gid_valid(gid_min) || !gid_valid(gid_max) ||
>>> +   (info->gid_max < info->gid_min) ||
>>> +   gid_lt(gid_max, gid_min)) {
>>> +   return -EINVAL;
>>> +   }
>>> +   }
>>> +
>>> return 0;
>>>  }
>>>  
>>> @@ -93,6 +121,7 @@ owner_mt(const struct sk_buff *skb, struct 
>>> xt_action_param *par)
>>>  {
>>> const struct xt_owner_match_info *info = par->matchinfo;
>>> const struct file *filp;
>>> +   struct net *net = dev_net(par->in ? par->in : par->out);
&

Re: [Devel] [PATCH rh7] netfilter: Allow xt_owner in any user namespace

2017-10-16 Thread Stanislav Kinsburskiy
Well, patch looks ok.
But shouldn't all the ve_init_user_ns() replaced by the par->net?

14.10.2017 01:20, Andrei Vagin пишет:
> From: "Eric W. Biederman" 
> 
> ML: 9847371a84b0be330f4bc4aaa98904101ee8573d
> https://jira.sw.ru/browse/PSBM-69409?
> 
> Making this work is a little tricky as it really isn't kosher to
> change the xt_owner_match_info in a check function.
> 
> Without changing xt_owner_match_info we need to know the user
> namespace the uids and gids are specified in.  In the common case
> net->user_ns == current_user_ns().  Verify net->user_ns ==
> current_user_ns() in owner_check so we can later assume it in
> owner_mt.
> 
> In owner_check also verify that all of the uids and gids specified are
> in net->user_ns and that the expected min/max relationship exists
> between the uids and gids in xt_owner_match_info.
> 
> In owner_mt get the network namespace from the outgoing socket, as this
> must be the same network namespace as the netfilter rules, and use that
> network namespace to find the user namespace the uids and gids in
> xt_match_owner_info are encoded in.  Then convert from their encoded
> from into the kernel internal format for uids and gids and perform the
> owner match.
> 
> Similar to ping_group_range, this code does not try to detect
> noncontiguous UID/GID ranges.
> 
> Signed-off-by: "Eric W. Biederman" 
> Signed-off-by: Kevin Cernekee 
> Signed-off-by: Pablo Neira Ayuso 
> Signed-off-by: Andrei Vagin 
> ---
>  net/netfilter/xt_owner.c | 41 +++--
>  1 file changed, 35 insertions(+), 6 deletions(-)
> 
> diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
> index 31dec4a..1744f78 100644
> --- a/net/netfilter/xt_owner.c
> +++ b/net/netfilter/xt_owner.c
> @@ -80,11 +80,39 @@ owner_mt6_v0(const struct sk_buff *skb, struct 
> xt_action_param *par)
>  static int owner_check(const struct xt_mtchk_param *par)
>  {
>   struct xt_owner_match_info *info = par->matchinfo;
> + struct net *net = par->net;
>  
> - /* For now only allow adding matches from the initial user namespace */
> + /* Only allow the common case where the userns of the writer
> +  * matches the userns of the network namespace.
> +  */
>   if ((info->match & (XT_OWNER_UID|XT_OWNER_GID)) &&
> - !current_user_ns_initial())
> + (current_user_ns() != net->user_ns))
>   return -EINVAL;
> +
> + /* Ensure the uids are valid */
> + if (info->match & XT_OWNER_UID) {
> + kuid_t uid_min = make_kuid(net->user_ns, info->uid_min);
> + kuid_t uid_max = make_kuid(net->user_ns, info->uid_max);
> +
> + if (!uid_valid(uid_min) || !uid_valid(uid_max) ||
> + (info->uid_max < info->uid_min) ||
> + uid_lt(uid_max, uid_min)) {
> + return -EINVAL;
> + }
> + }
> +
> + /* Ensure the gids are valid */
> + if (info->match & XT_OWNER_GID) {
> + kgid_t gid_min = make_kgid(net->user_ns, info->gid_min);
> + kgid_t gid_max = make_kgid(net->user_ns, info->gid_max);
> +
> + if (!gid_valid(gid_min) || !gid_valid(gid_max) ||
> + (info->gid_max < info->gid_min) ||
> + gid_lt(gid_max, gid_min)) {
> + return -EINVAL;
> + }
> + }
> +
>   return 0;
>  }
>  
> @@ -93,6 +121,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param 
> *par)
>  {
>   const struct xt_owner_match_info *info = par->matchinfo;
>   const struct file *filp;
> + struct net *net = dev_net(par->in ? par->in : par->out);
>  
>   if (skb->sk == NULL || skb->sk->sk_socket == NULL)
>   return (info->match ^ info->invert) == 0;
> @@ -109,8 +138,8 @@ owner_mt(const struct sk_buff *skb, struct 
> xt_action_param *par)
>  (XT_OWNER_UID | XT_OWNER_GID)) == 0;
>  
>   if (info->match & XT_OWNER_UID) {
> - kuid_t uid_min = make_kuid(ve_init_user_ns(), info->uid_min);
> - kuid_t uid_max = make_kuid(ve_init_user_ns(), info->uid_max);
> + kuid_t uid_min = make_kuid(net->user_ns, info->uid_min);
> + kuid_t uid_max = make_kuid(net->user_ns, info->uid_max);
>   if ((uid_gte(filp->f_cred->fsuid, uid_min) &&
>uid_lte(filp->f_cred->fsuid, uid_max)) ^
>   !(info->invert & XT_OWNER_UID))
> @@ -118,8 +147,8 @@ owner_mt(const struct sk_buff *skb, struct 
> xt_action_param *par)
>   }
>  
>   if (info->match & XT_OWNER_GID) {
> - kgid_t gid_min = make_kgid(ve_init_user_ns(), info->gid_min);
> - kgid_t gid_max = make_kgid(ve_init_user_ns(), info->gid_max);
> + kgid_t gid_min = make_kgid(net->user_ns, info->gid_min);
> + kgid_t gid_max = make_kgid(net->user_ns, info->gid_max);
>   if ((gid_gte(filp->f_cred->fsgid, gid_min) &&
>gid_lte(filp->f_cred->fsgid

[Devel] [PATCH] venet: destroy VE IP on venet destruction in NFS is enabled

2017-10-13 Thread Stanislav Kinsburskiy
We skip VE IP destruion in shutdown hook, if NFS is enabled in CT (to allow
NFS mounts to disappear.
Thus we have to destroy it with venet device.

https://jira.sw.ru/browse/PSBM-75120

Signed-off-by: Stanislav Kinsburskiy 
---
 drivers/net/venetdev.c |7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/venetdev.c b/drivers/net/venetdev.c
index 7a546cc..fad232b 100644
--- a/drivers/net/venetdev.c
+++ b/drivers/net/venetdev.c
@@ -759,9 +759,12 @@ static void venet_dellink(struct net_device *dev, struct 
list_head *head)
struct ve_struct *env = dev->nd_net->owner_ve;
 
/* We check ve_netns to avoid races with veip SHUTDOWN hook, called from
-* ve_exit_ns()
+* ve_exit_ns().
+* Also, in veip SHUTDOWN hook we skip veip destructionif, if container
+* has VE_FEATURE_NFS enabled. Thus here we have to destroy veip in
+* this case.
 */
-   if (env->ve_netns)
+   if (env->ve_netns || (env->features & VE_FEATURE_NFS))
veip_shutdown(env);
 
env->_venet_dev = NULL;

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] venet: do not destroy VE IP on shutdown hook if NFS is allowed

2017-10-05 Thread Stanislav Kinsburskiy
Mounts are destroyed asynchroniously and thus can race with VE IP destruction
leading to dead lock in kernel: NFS can'n be unmounted.
Fix it by skipping VE IP desctruction, is NFS feature is enabled in the
container.

https://jira.sw.ru/browse/PSBM-73614

Signed-off-by: Stanislav Kinsburskiy 
---
 drivers/net/venetdev.c |   16 
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/net/venetdev.c b/drivers/net/venetdev.c
index 5710792..7a546cc 100644
--- a/drivers/net/venetdev.c
+++ b/drivers/net/venetdev.c
@@ -744,10 +744,8 @@ static void venet_setup(struct net_device *dev)
SET_ETHTOOL_OPS(dev, &venet_ethtool_ops);
 }
 
-static void veip_shutdown(void *data)
+static void veip_shutdown(struct ve_struct *ve)
 {
-   struct ve_struct *ve = data;
-
spin_lock(&veip_lock);
if (ve->veip) {
__venet_ext_clean(ve);
@@ -1178,8 +1176,18 @@ static struct rtnl_link_ops venet_link_ops = {
.maxtype= VENET_INFO_MAX,
 };
 
+static void veip_shutdown_fini(void *data)
+{
+   struct ve_struct *ve = data;
+
+   if (ve->features & VE_FEATURE_NFS)
+   return;
+
+   veip_shutdown(ve);
+}
+
 static struct ve_hook veip_shutdown_hook = {
-   .fini   = veip_shutdown,
+   .fini   = veip_shutdown_fini,
.priority   = HOOK_PRIO_FINISHING,
.owner  = THIS_MODULE,
 };

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] ve: don't call shutdown hook for now

2017-10-02 Thread Stanislav Kinsburskiy
It was me.
The reason is explained in the patch description (you can find more details in 
description of the commit, introduced the hook).
I'm waiting for khorenko@ to convince him, that the idea to "podkovat' blohy" 
was wrong by design and we should drop the hook.
Once I succeed, I'll sent appropriate series to get rid of it completely.
So this patch is temporary one, because too many tests have failed because of 
this race.


2 окт. 2017 г. 8:05 PM пользователь Andrey Vagin  написал:
On Mon, Oct 02, 2017 at 06:47:19PM +0400, Stanislav Kinsburskiy wrote:
> This hook is needed only for releaseing venet IP address early (thus allowing
> to restart container with the same IP faster).
> But mount are destroyed asynchroniosly, and thus NFS mount can be destroyed
> after IP address is dropped.
> Let's fir this race it the way how all the world does things: release IP with
> network namespace.

Why don't you remove this code? Who added this hook? What was a reason?

>
> https://jira.sw.ru/browse/PSBM-73193
>
> Signed-off-by: Stanislav Kinsburskiy 
> ---
>  kernel/ve/ve.c |3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index b0188c3..4efb9d9 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -451,7 +451,8 @@ static void ve_drop_context(struct ve_struct *ve)
>synchronize_rcu();
>put_nsproxy(ve_ns);
>
> - ve_hook_iterate_fini(VE_SHUTDOWN_CHAIN, ve);
> + /* This have to be revisited */
> +//   ve_hook_iterate_fini(VE_SHUTDOWN_CHAIN, ve);
>
>put_cred(ve->init_cred);
>ve->init_cred = NULL;
>
> ___
> Devel mailing list
> Devel@openvz.org
> https://lists.openvz.org/mailman/listinfo/devel

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] ve: don't call shutdown hook for now

2017-10-02 Thread Stanislav Kinsburskiy
This hook is needed only for releaseing venet IP address early (thus allowing
to restart container with the same IP faster).
But mount are destroyed asynchroniosly, and thus NFS mount can be destroyed
after IP address is dropped.
Let's fir this race it the way how all the world does things: release IP with
network namespace.

https://jira.sw.ru/browse/PSBM-73193

Signed-off-by: Stanislav Kinsburskiy 
---
 kernel/ve/ve.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index b0188c3..4efb9d9 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -451,7 +451,8 @@ static void ve_drop_context(struct ve_struct *ve)
synchronize_rcu();
put_nsproxy(ve_ns);
 
-   ve_hook_iterate_fini(VE_SHUTDOWN_CHAIN, ve);
+   /* This have to be revisited */
+// ve_hook_iterate_fini(VE_SHUTDOWN_CHAIN, ve);
 
put_cred(ve->init_cred);
ve->init_cred = NULL;

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] scripts: add "-w" to iptables command

2017-09-28 Thread Stanislav Kinsburskiy
How old should it be?
I checked with v1.4.21

28.09.2017 12:55, Kirill Tkhai пишет:
> Could you please to say will it work on old iptables?
> 
> On 28.09.2017 13:03, Stanislav Kinsburskiy wrote:
>> What a brilliant idea it was to ignore unknown keys.
>> Should take it into account.
>>
>> 28.09.2017 10:26, Vasily Averin пишет:
>>> kthai@ explained that old version of iptables ignores unknown keys, so 
>>> adding -w is safe.
>>>
>>> On 2017-09-28 10:40, Pavel Tikhomirov wrote:
>>>> Can we have these script running with older iptables version which does 
>>>> not have "-w"?
>>>>
>>>> On 09/27/2017 02:11 PM, Stanislav Kinsburskiy wrote:
>>>>> Neede to support new versions of iptables.
>>>>>
>>>>> https://jira.sw.ru/browse/PSBM-73153
>>>>>
>>>>> Signed-off-by: Stanislav Kinsburskiy 
>>>>> ---
>>>>>   scripts/nfs-ports-allow.sh |   16 
>>>>>   1 file changed, 8 insertions(+), 8 deletions(-)
>>>>>
>>>>> diff --git a/scripts/nfs-ports-allow.sh b/scripts/nfs-ports-allow.sh
>>>>> index 97541dc..ac5cf5f 100644
>>>>> --- a/scripts/nfs-ports-allow.sh
>>>>> +++ b/scripts/nfs-ports-allow.sh
>>>>> @@ -36,10 +36,10 @@ function add_accept_rules {
>>>>>   local server=$1
>>>>>   local port=$2
>>>>>   -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s 
>>>>> $server --sport $port -j ACCEPT &&
>>>>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d 
>>>>> $server --dport $port -j ACCEPT &&
>>>>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s 
>>>>> $server --sport $port -j ACCEPT &&
>>>>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d 
>>>>> $server --dport $port -j ACCEPT
>>>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s 
>>>>> $server --sport $port -j ACCEPT &&
>>>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d 
>>>>> $server --dport $port -j ACCEPT &&
>>>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s 
>>>>> $server --sport $port -j ACCEPT &&
>>>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d 
>>>>> $server --dport $port -j ACCEPT
>>>>>   }
>>>>> function iptables_allow_nfs_ports {
>>>>> @@ -63,10 +63,10 @@ function allow_portmapper_port {
>>>>>   local server=$1
>>>>>   local port=111
>>>>>   -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s 
>>>>> $server --sport $port -j ACCEPT &&
>>>>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d 
>>>>> $server --dport $port -j ACCEPT &&
>>>>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s 
>>>>> $server --sport $port -j ACCEPT &&
>>>>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d 
>>>>> $server --dport $port -j ACCEPT
>>>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s 
>>>>> $server --sport $port -j ACCEPT &&
>>>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d 
>>>>> $server --dport $port -j ACCEPT &&
>>>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s 
>>>>> $server --sport $port -j ACCEPT &&
>>>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d 
>>>>> $server --dport $port -j ACCEPT
>>>>>   }
>>>>> for s in $servers; do
>>>>>
>>>>> ___
>>>>> Devel mailing list
>>>>> Devel@openvz.org
>>>>> https://lists.openvz.org/mailman/listinfo/devel
>>>>>
>>>>
>>> ___
>>> Devel mailing list
>>> Devel@openvz.org
>>> https://lists.openvz.org/mailman/listinfo/devel
>>>
>> ___
>> Devel mailing list
>> Devel@openvz.org
>> https://lists.openvz.org/mailman/listinfo/devel
>>
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] scripts: add "-w" to iptables command

2017-09-28 Thread Stanislav Kinsburskiy
What a brilliant idea it was to ignore unknown keys.
Should take it into account.

28.09.2017 10:26, Vasily Averin пишет:
> kthai@ explained that old version of iptables ignores unknown keys, so adding 
> -w is safe.
> 
> On 2017-09-28 10:40, Pavel Tikhomirov wrote:
>> Can we have these script running with older iptables version which does not 
>> have "-w"?
>>
>> On 09/27/2017 02:11 PM, Stanislav Kinsburskiy wrote:
>>> Neede to support new versions of iptables.
>>>
>>> https://jira.sw.ru/browse/PSBM-73153
>>>
>>> Signed-off-by: Stanislav Kinsburskiy 
>>> ---
>>>   scripts/nfs-ports-allow.sh |   16 
>>>   1 file changed, 8 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/scripts/nfs-ports-allow.sh b/scripts/nfs-ports-allow.sh
>>> index 97541dc..ac5cf5f 100644
>>> --- a/scripts/nfs-ports-allow.sh
>>> +++ b/scripts/nfs-ports-allow.sh
>>> @@ -36,10 +36,10 @@ function add_accept_rules {
>>>   local server=$1
>>>   local port=$2
>>>   -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s 
>>> $server --sport $port -j ACCEPT &&
>>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d $server 
>>> --dport $port -j ACCEPT &&
>>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s $server 
>>> --sport $port -j ACCEPT &&
>>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d $server 
>>> --dport $port -j ACCEPT
>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s 
>>> $server --sport $port -j ACCEPT &&
>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d 
>>> $server --dport $port -j ACCEPT &&
>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s 
>>> $server --sport $port -j ACCEPT &&
>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d 
>>> $server --dport $port -j ACCEPT
>>>   }
>>> function iptables_allow_nfs_ports {
>>> @@ -63,10 +63,10 @@ function allow_portmapper_port {
>>>   local server=$1
>>>   local port=111
>>>   -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s 
>>> $server --sport $port -j ACCEPT &&
>>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d $server 
>>> --dport $port -j ACCEPT &&
>>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s $server 
>>> --sport $port -j ACCEPT &&
>>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d $server 
>>> --dport $port -j ACCEPT
>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s 
>>> $server --sport $port -j ACCEPT &&
>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d 
>>> $server --dport $port -j ACCEPT &&
>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s 
>>> $server --sport $port -j ACCEPT &&
>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d 
>>> $server --dport $port -j ACCEPT
>>>   }
>>> for s in $servers; do
>>>
>>> ___
>>> Devel mailing list
>>> Devel@openvz.org
>>> https://lists.openvz.org/mailman/listinfo/devel
>>>
>>
> ___
> Devel mailing list
> Devel@openvz.org
> https://lists.openvz.org/mailman/listinfo/devel
> 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] scripts: add "-w" to iptables command

2017-09-27 Thread Stanislav Kinsburskiy
Neede to support new versions of iptables.

https://jira.sw.ru/browse/PSBM-73153

Signed-off-by: Stanislav Kinsburskiy 
---
 scripts/nfs-ports-allow.sh |   16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/scripts/nfs-ports-allow.sh b/scripts/nfs-ports-allow.sh
index 97541dc..ac5cf5f 100644
--- a/scripts/nfs-ports-allow.sh
+++ b/scripts/nfs-ports-allow.sh
@@ -36,10 +36,10 @@ function add_accept_rules {
local server=$1
local port=$2
 
-   ${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s $server 
--sport $port -j ACCEPT &&
-   ${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d $server 
--dport $port -j ACCEPT &&
-   ${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s $server 
--sport $port -j ACCEPT &&
-   ${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d $server 
--dport $port -j ACCEPT 
+   ${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s 
$server --sport $port -j ACCEPT &&
+   ${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d 
$server --dport $port -j ACCEPT &&
+   ${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s 
$server --sport $port -j ACCEPT &&
+   ${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d 
$server --dport $port -j ACCEPT 
 }
 
 function iptables_allow_nfs_ports {
@@ -63,10 +63,10 @@ function allow_portmapper_port {
local server=$1
local port=111
 
-   ${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s $server 
--sport $port -j ACCEPT &&
-   ${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d $server 
--dport $port -j ACCEPT &&
-   ${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s $server 
--sport $port -j ACCEPT &&
-   ${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d $server 
--dport $port -j ACCEPT 
+   ${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s 
$server --sport $port -j ACCEPT &&
+   ${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d 
$server --dport $port -j ACCEPT &&
+   ${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s 
$server --sport $port -j ACCEPT &&
+   ${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d 
$server --dport $port -j ACCEPT 
 }
 
 for s in $servers; do

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] connector: bump skb->users before callback invocation

2017-09-20 Thread Stanislav Kinsburskiy
From: Florian Westphal 

Backport of commit 55285bf09427c5abf43ee1d54e892f352092b1f1.

Dmitry reports memleak with syskaller program.
Problem is that connector bumps skb usecount but might not invoke callback.

So move skb_get to where we invoke the callback.

https://jira.sw.ru/browse/PSBM-71904

Reported-by: Dmitry Vyukov 
Signed-off-by: Florian Westphal 
Signed-off-by: David S. Miller 
Signed-off-by: Stanislav Kinsburskiy 
---
 drivers/connector/connector.c |   11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c
index 752c692..57285ba 100644
--- a/drivers/connector/connector.c
+++ b/drivers/connector/connector.c
@@ -161,26 +161,21 @@ static int cn_call_callback(struct sk_buff *skb)
  *
  * It checks skb, netlink header and msg sizes, and calls callback helper.
  */
-static void cn_rx_skb(struct sk_buff *__skb)
+static void cn_rx_skb(struct sk_buff *skb)
 {
struct nlmsghdr *nlh;
-   struct sk_buff *skb;
int len, err;
 
-   skb = skb_get(__skb);
-
if (skb->len >= NLMSG_HDRLEN) {
nlh = nlmsg_hdr(skb);
len = nlmsg_len(nlh);
 
if (len < (int)sizeof(struct cn_msg) ||
skb->len < nlh->nlmsg_len ||
-   len > CONNECTOR_MAX_MSG_SIZE) {
-   kfree_skb(skb);
+   len > CONNECTOR_MAX_MSG_SIZE)
return;
-   }
 
-   err = cn_call_callback(skb);
+   err = cn_call_callback(skb_get(skb));
if (err < 0)
kfree_skb(skb);
}

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [RFC PATCH 1/2] autofs: set compat flag on sbi when daemon uses 32bit addressation

2017-09-14 Thread Stanislav Kinsburskiy


14.09.2017 13:45, Ian Kent пишет:
> On 14/09/17 19:39, Stanislav Kinsburskiy wrote:
>>
>>
>> 14.09.2017 13:29, Ian Kent пишет:
>>> On 14/09/17 17:24, Stanislav Kinsburskiy wrote:
>>>>
>>>>
>>>> 14.09.2017 02:38, Ian Kent пишет:
>>>>> On 01/09/17 19:21, Stanislav Kinsburskiy wrote:
>>>>>> Signed-off-by: Stanislav Kinsburskiy 
>>>>>> ---
>>>>>>  fs/autofs4/autofs_i.h  |3 +++
>>>>>>  fs/autofs4/dev-ioctl.c |3 +++
>>>>>>  fs/autofs4/inode.c |4 +++-
>>>>>>  3 files changed, 9 insertions(+), 1 deletion(-)
>>>>>>
>>>>>> diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
>>>>>> index 4737615..3da105f 100644
>>>>>> --- a/fs/autofs4/autofs_i.h
>>>>>> +++ b/fs/autofs4/autofs_i.h
>>>>>> @@ -120,6 +120,9 @@ struct autofs_sb_info {
>>>>>>  struct list_head active_list;
>>>>>>  struct list_head expiring_list;
>>>>>>  struct rcu_head rcu;
>>>>>> +#ifdef CONFIG_COMPAT
>>>>>> +unsigned is32bit:1;
>>>>>> +#endif
>>>>>>  };
>>>>>>  
>>>>>>  static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb)
>>>>>> diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
>>>>>> index b7c816f..467d6c4 100644
>>>>>> --- a/fs/autofs4/dev-ioctl.c
>>>>>> +++ b/fs/autofs4/dev-ioctl.c
>>>>>> @@ -397,6 +397,9 @@ static int autofs_dev_ioctl_setpipefd(struct file 
>>>>>> *fp,
>>>>>>  sbi->pipefd = pipefd;
>>>>>>  sbi->pipe = pipe;
>>>>>>  sbi->catatonic = 0;
>>>>>> +#ifdef CONFIG_COMPAT
>>>>>> +sbi->is32bit = is_compat_task();
>>>>>> +#endif
>>>>>>  }
>>>>>>  out:
>>>>>>  put_pid(new_pid);
>>>>>> diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
>>>>>> index 09e7d68..21d3c0b 100644
>>>>>> --- a/fs/autofs4/inode.c
>>>>>> +++ b/fs/autofs4/inode.c
>>>>>> @@ -301,7 +301,9 @@ int autofs4_fill_super(struct super_block *s, void 
>>>>>> *data, int silent)
>>>>>>  } else {
>>>>>>  sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID);
>>>>>>  }
>>>>>> -
>>>>>> +#ifdef CONFIG_COMPAT
>>>>>> +sbi->is32bit = is_compat_task();
>>>>>> +#endif
>>>>>>  if (autofs_type_trigger(sbi->type))
>>>>>>  __managed_dentry_set_managed(root);
>>>>>>  
>>>>>>
>>>>>
>>>>> Not sure about this.
>>>>>
>>>>> Don't you think it would be better to avoid the in code #ifdefs by doing 
>>>>> some
>>>>> checks and defines in the header file and defining what's need to just use
>>>>> is_compat_task().
>>>>>
>>>>
>>>> Yes, might be...
>>>>
>>>>> Not sure 2 patches are needed for this either ..
>>>>>
>>>>
>>>> Well, I found this issue occasionally.
>>>
>>> I'm wondering what the symptoms are?
>>>
>>
>> Size of struct autofs_v5_packet is 300 bytes for x86 and 304 bytes for 
>> x86_64.
>> Which means, that 32bit task can read more than size of autofs_v5_packet on 
>> 64bit kernel.
> 
> Are you sure?
> 
> Shouldn't that be a short read on the x86 side of a 4 bytes longer
> structure on the x86_64 side.
> 
> I didn't think you could have a 64 bit client on a 32 bit kernel
> so the converse (the read past end of struct) doesn't apply.
> 

Sorry for the confusion, I had to add brackets like this:

> Which means, that 32bit task can read more than size of autofs_v5_packet (on 
> 64bit kernel).

IOW, 32bit task expects to read 300 bytes (size of struct autofs_v5_packet) 
while there are 304 bytes available on the "wire" from the 64bit kernel.


> Ian
> 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [RFC PATCH 1/2] autofs: set compat flag on sbi when daemon uses 32bit addressation

2017-09-14 Thread Stanislav Kinsburskiy


14.09.2017 13:29, Ian Kent пишет:
> On 14/09/17 17:24, Stanislav Kinsburskiy wrote:
>>
>>
>> 14.09.2017 02:38, Ian Kent пишет:
>>> On 01/09/17 19:21, Stanislav Kinsburskiy wrote:
>>>> Signed-off-by: Stanislav Kinsburskiy 
>>>> ---
>>>>  fs/autofs4/autofs_i.h  |3 +++
>>>>  fs/autofs4/dev-ioctl.c |3 +++
>>>>  fs/autofs4/inode.c |4 +++-
>>>>  3 files changed, 9 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
>>>> index 4737615..3da105f 100644
>>>> --- a/fs/autofs4/autofs_i.h
>>>> +++ b/fs/autofs4/autofs_i.h
>>>> @@ -120,6 +120,9 @@ struct autofs_sb_info {
>>>>struct list_head active_list;
>>>>struct list_head expiring_list;
>>>>struct rcu_head rcu;
>>>> +#ifdef CONFIG_COMPAT
>>>> +  unsigned is32bit:1;
>>>> +#endif
>>>>  };
>>>>  
>>>>  static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb)
>>>> diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
>>>> index b7c816f..467d6c4 100644
>>>> --- a/fs/autofs4/dev-ioctl.c
>>>> +++ b/fs/autofs4/dev-ioctl.c
>>>> @@ -397,6 +397,9 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
>>>>sbi->pipefd = pipefd;
>>>>sbi->pipe = pipe;
>>>>sbi->catatonic = 0;
>>>> +#ifdef CONFIG_COMPAT
>>>> +  sbi->is32bit = is_compat_task();
>>>> +#endif
>>>>}
>>>>  out:
>>>>put_pid(new_pid);
>>>> diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
>>>> index 09e7d68..21d3c0b 100644
>>>> --- a/fs/autofs4/inode.c
>>>> +++ b/fs/autofs4/inode.c
>>>> @@ -301,7 +301,9 @@ int autofs4_fill_super(struct super_block *s, void 
>>>> *data, int silent)
>>>>} else {
>>>>sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID);
>>>>}
>>>> -
>>>> +#ifdef CONFIG_COMPAT
>>>> +  sbi->is32bit = is_compat_task();
>>>> +#endif
>>>>if (autofs_type_trigger(sbi->type))
>>>>__managed_dentry_set_managed(root);
>>>>  
>>>>
>>>
>>> Not sure about this.
>>>
>>> Don't you think it would be better to avoid the in code #ifdefs by doing 
>>> some
>>> checks and defines in the header file and defining what's need to just use
>>> is_compat_task().
>>>
>>
>> Yes, might be...
>>
>>> Not sure 2 patches are needed for this either ..
>>>
>>
>> Well, I found this issue occasionally.
> 
> I'm wondering what the symptoms are?
> 

Size of struct autofs_v5_packet is 300 bytes for x86 and 304 bytes for x86_64.
Which means, that 32bit task can read more than size of autofs_v5_packet on 
64bit kernel.

>> And, frankly speaking, it's not clear to me, whether this issue is important 
>> at all, so I wanted to clarify this first.
>> Thanks to O_DIRECT, the only way to catch the issue is to try to read more, 
>> than expected, in compat task (that's how I found it).
> 
> Right, the O_DIRECT patch from Linus was expected to fix the structure
> alignment problem. The stuct field offsets are ok aren't they?
> 

Yes, they are ok.

>> I don't see any other flaw so far. And if so, that, probably, we shouldn't 
>> care about the issue at all.
>> What do you think?
> 
> If we are seeing hangs, incorrect struct fields or similar something
> should be done about it but if all is actually working ok then the
> O_DIRECT fix is doing it's job and further changes aren't necessary.
> 

Well, yes. O_DIRECT fix covers the issue.
Ok then.
Thanks for the clarification!

> Ian
> 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [RFC PATCH 1/2] autofs: set compat flag on sbi when daemon uses 32bit addressation

2017-09-14 Thread Stanislav Kinsburskiy


14.09.2017 02:38, Ian Kent пишет:
> On 01/09/17 19:21, Stanislav Kinsburskiy wrote:
>> Signed-off-by: Stanislav Kinsburskiy 
>> ---
>>  fs/autofs4/autofs_i.h  |3 +++
>>  fs/autofs4/dev-ioctl.c |3 +++
>>  fs/autofs4/inode.c |4 +++-
>>  3 files changed, 9 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
>> index 4737615..3da105f 100644
>> --- a/fs/autofs4/autofs_i.h
>> +++ b/fs/autofs4/autofs_i.h
>> @@ -120,6 +120,9 @@ struct autofs_sb_info {
>>  struct list_head active_list;
>>  struct list_head expiring_list;
>>  struct rcu_head rcu;
>> +#ifdef CONFIG_COMPAT
>> +unsigned is32bit:1;
>> +#endif
>>  };
>>  
>>  static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb)
>> diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
>> index b7c816f..467d6c4 100644
>> --- a/fs/autofs4/dev-ioctl.c
>> +++ b/fs/autofs4/dev-ioctl.c
>> @@ -397,6 +397,9 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
>>  sbi->pipefd = pipefd;
>>  sbi->pipe = pipe;
>>  sbi->catatonic = 0;
>> +#ifdef CONFIG_COMPAT
>> +sbi->is32bit = is_compat_task();
>> +#endif
>>  }
>>  out:
>>  put_pid(new_pid);
>> diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
>> index 09e7d68..21d3c0b 100644
>> --- a/fs/autofs4/inode.c
>> +++ b/fs/autofs4/inode.c
>> @@ -301,7 +301,9 @@ int autofs4_fill_super(struct super_block *s, void 
>> *data, int silent)
>>  } else {
>>  sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID);
>>  }
>> -
>> +#ifdef CONFIG_COMPAT
>> +sbi->is32bit = is_compat_task();
>> +#endif
>>  if (autofs_type_trigger(sbi->type))
>>  __managed_dentry_set_managed(root);
>>  
>>
> 
> Not sure about this.
> 
> Don't you think it would be better to avoid the in code #ifdefs by doing some
> checks and defines in the header file and defining what's need to just use
> is_compat_task().
> 

Yes, might be...

> Not sure 2 patches are needed for this either ..
> 

Well, I found this issue occasionally.
And, frankly speaking, it's not clear to me, whether this issue is important at 
all, so I wanted to clarify this first.
Thanks to O_DIRECT, the only way to catch the issue is to try to read more, 
than expected, in compat task (that's how I found it).
I don't see any other flaw so far. And if so, that, probably, we shouldn't care 
about the issue at all.
What do you think?


> Ian
> 


 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] sysfs: don't use VE namespace for "dev/char" directory

2017-09-12 Thread Stanislav Kinsburskiy
Looks like we don't have devices, using it. Or lost some patches.
Anyway, currently it doesn't work. To make this VE namespace work, "namespace"
callback has to be added to ve_kobj_type. And this callback has to get VE
structure from kobject somehow. IOW, device has to be allocated per-ve.
Another issue here is that it's unclear, how to mark generic devices as
allocated in VE#0 thus making them visible in VE#0.
Looks like the whole idea was wrong...

https://jira.sw.ru/browse/PSBM-71811

Signed-off-by: Stanislav Kinsburskiy 
---
 drivers/base/core.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 80cf3eb..86540d9 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1538,7 +1538,7 @@ int __init devices_init(void)
sysfs_dev_block_kobj = kobject_create_and_add("block", dev_kobj);
if (!sysfs_dev_block_kobj)
goto block_kobj_err;
-   sysfs_dev_char_kobj = kobject_create_and_add_ve("char", dev_kobj);
+   sysfs_dev_char_kobj = kobject_create_and_add("char", dev_kobj);
if (!sysfs_dev_char_kobj)
goto char_kobj_err;
 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] zdtm: fix package memory allocation in autofs.c

2017-09-12 Thread Stanislav Kinsburskiy
No, we don't.
But this one could be applied: "tests: do not try to read more than packet in 
AutoFS test"

09.09.2017 02:01, Andrei Vagin пишет:
> On Thu, Aug 31, 2017 at 12:10:40PM +0300, Stanislav Kinsburskiy wrote:
>> Plus some cleanup.
> 
> Do we need this patch for the upstream criu? Could you send it to
> the criu mailing list?
> 
>>
>> https://jira.sw.ru/browse/PSBM-71078
>>
>> Signed-off-by: Stanislav Kinsburskiy 
>> ---
>>  test/zdtm/static/autofs.c |   18 +-
>>  1 file changed, 9 insertions(+), 9 deletions(-)
>>
>> diff --git a/test/zdtm/static/autofs.c b/test/zdtm/static/autofs.c
>> index 8d917ee..882289f 100644
>> --- a/test/zdtm/static/autofs.c
>> +++ b/test/zdtm/static/autofs.c
>> @@ -460,10 +460,10 @@ static int automountd_loop(int pipe, const char 
>> *mountpoint, struct autofs_param
>>  {
>>  union autofs_v5_packet_union *packet;
>>  ssize_t bytes;
>> -size_t psize = sizeof(*packet) * 2;
>> +size_t psize = sizeof(*packet);
>>  int err = 0;
>>  
>> -packet = malloc(psize);
>> +packet = malloc(psize * 2);
>>  if (!packet) {
>>  pr_err("failed to allocate autofs packet\n");
>>  return -ENOMEM;
>> @@ -473,7 +473,7 @@ static int automountd_loop(int pipe, const char 
>> *mountpoint, struct autofs_param
>>  siginterrupt(SIGUSR2, 1);
>>  
>>  while (!stop && !err) {
>> -memset(packet, 0, sizeof(*packet));
>> +memset(packet, 0, psize * 2);
>>  
>>  bytes = read(pipe, packet, psize);
>>  if (bytes < 0) {
>> @@ -483,12 +483,12 @@ static int automountd_loop(int pipe, const char 
>> *mountpoint, struct autofs_param
>>  }
>>  continue;
>>  }
>> -if (bytes > psize) {
>> -pr_err("read more that expected: %zd > %zd\n", bytes, 
>> psize);
>> -return -EINVAL;
>> -}
>> -if (bytes != sizeof(*packet)) {
>> -pr_err("read less than expected: %zd\n", bytes);
>> +if (bytes != psize) {
>> +pr_err("read %s that expected: %zd %s %zd\n",
>> +(bytes > psize) ? "more" : "less",
>> +bytes,
>> +(bytes > psize) ? ">" : "<",
>> +psize);
>>  return -EINVAL;
>>  }
>>  err = automountd_serve(mountpoint, param, packet);
>>
>> ___
>> Devel mailing list
>> Devel@openvz.org
>> https://lists.openvz.org/mailman/listinfo/devel
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] tcp: fix 0 divide in __tcp_select_window()

2017-09-04 Thread Stanislav Kinsburskiy
From: Eric Dumazet 

syszkaller fuzzer was able to trigger a divide by zero, when
TCP window scaling is not enabled.

SO_RCVBUF can be used not only to increase sk_rcvbuf, also
to decrease it below current receive buffers utilization.

If mss is negative or 0, just return a zero TCP window.

https://jira.sw.ru/browse/PSBM-71285

Back-port of ML commit: 06425c308b92eaf60767bc71d359f4cbc7a561f8

Signed-off-by: Eric Dumazet 
Reported-by: Dmitry Vyukov  
Acked-by: Neal Cardwell 
Signed-off-by: David S. Miller 
Signed-off-by: Stanislav Kinsburskiy 
---
 net/ipv4/tcp_output.c |6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f395a09..3d24481 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2295,9 +2295,11 @@ u32 __tcp_select_window(struct sock *sk)
int full_space = min_t(int, tp->window_clamp, allowed_space);
int window;
 
-   if (mss > full_space)
+   if (unlikely(mss > full_space)) {
mss = full_space;
-
+   if (mss <= 0)
+   return 0;
+   }
if (free_space < (full_space >> 1)) {
icsk->icsk_ack.quick = 0;
 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [RFC PATCH 2/2] autofs: sent 32-bit sized packet for 32-bit process

2017-09-03 Thread Stanislav Kinsburskiy


03.09.2017 21:23, Dmitry V. Levin P?P8QP5Q:
> On Sat, Sep 02, 2017 at 02:26:26PM +0300, Stanislav Kinsburskiy wrote:
>> 01.09.2017 21:23, Dmitry V. Levin P?P8QP5Q:
>>> On Fri, Sep 01, 2017 at 05:02:45PM +0300, Stanislav Kinsburskiy wrote:
>>>> 01.09.2017 16:53, Dmitry V. Levin P?P8QP5Q:
>>>>> On Fri, Sep 01, 2017 at 12:15:17PM +0300, Stanislav Kinsburskiy wrote:
>>>>>> 31.08.2017 20:22, Dmitry V. Levin P?P8QP5Q:
>>>>>>> On Thu, Aug 31, 2017 at 05:57:11PM +0400, Stanislav Kinsburskiy wrote:
>>>>>>>> The structure autofs_v5_packet (except name) is not aligned by 8 
>>>>>>>> bytes, which
>>>>>>>> lead to different sizes in 32 and 64-bit architectures.
>>>>>>>> Let's form 32-bit compatible packet when daemon has 32-bit 
>>>>>>>> addressation.
>>>>>>>>
>>>>>>>> Signed-off-by: Stanislav Kinsburskiy 
>>>>>>>> ---
>>>>>>>>  fs/autofs4/waitq.c |   11 +--
>>>>>>>>  1 file changed, 9 insertions(+), 2 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
>>>>>>>> index 309ca6b..484cf2e 100644
>>>>>>>> --- a/fs/autofs4/waitq.c
>>>>>>>> +++ b/fs/autofs4/waitq.c
>>>>>>>> @@ -153,12 +153,19 @@ static void autofs4_notify_daemon(struct 
>>>>>>>> autofs_sb_info *sbi,
>>>>>>>>{
>>>>>>>>struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
>>>>>>>>struct user_namespace *user_ns = 
>>>>>>>> sbi->pipe->f_cred->user_ns;
>>>>>>>> +  size_t name_offset;
>>>>>>>>  
>>>>>>>> -  pktsz = sizeof(*packet);
>>>>>>>> +  if (sbi->is32bit)
>>>>>>>> +  name_offset = offsetof(struct autofs_v5_packet, 
>>>>>>>> len) +
>>>>>>>> +sizeof(packet->len);
>>>>>>>> +  else
>>>>>>>> +  name_offset = offsetof(struct autofs_v5_packet, 
>>>>>>>> name);
>>>>>>>
>>>>>>> This doesn't help at all because the offset of struct 
>>>>>>> autofs_v5_packet.name
>>>>>>> does not change.
>>>>>>>
>>>>>>>> +  pktsz = name_offset + sizeof(packet->name);
>>>>>>>
>>>>>>> What changes is pktsz: it's either sizeof(struct autofs_v5_packet)
>>>>>>> or 4 bytes less, depending on the architecture.
>>>>>>
>>>>>> Indeed. Thanks!
>>>>>>
>>>>>>> For example,
>>>>>>>
>>>>>>> #ifdef CONFIG_COMPAT
>>>>>>> if (__alignof__(compat_u64) < __alignof__(u64) && sbi->is32bit)
>>>>>>
>>>>>> Unfortunately I don't get this "alignof" checks.
>>>>>> The intention of "is32bit" was to define this compat case completely.
>>>>>> Why are they required?
>>>>>
>>>>> On some 32-bit architectures like arm, u64 is 64-bit aligned, on others
>>>>> like x86 it is not.  This alignof check ensures that compat 32-bit
>>>>> architectures with 64-bit alignment are not going to be broken
>>>>> by the change.
>>>>
>>>> Thanks for the explanation!
>>>> But looks like the issue is hidden so deep (thanks to O_DIRECT), that 
>>>> becomes unimportant.
>>>
>>> I'm not quite sure how O_DIRECT makes this unimportant.
>>> For example, automount_dispatch_io from systemd tries to read exactly
>>> sizeof(union autofs_v5_packet_union) bytes and fails in case of short read.
>>
>> That's weird. Can you reproduce it?
>> The problem is that kernel always send arch packet (i.e. 300 bytes for x86, 
>> or 304 for x86_64).
> 
> Well, if sending a lengthy packet is not a problem, what is the problem
> you were trying to solve by patching struct autofs_v5_packet?
> 

Well, nothing anymore. I took some time to realize, that this issue doesn't 
hurt anyone.
So I fixed our test instead.

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [RFC PATCH 2/2] autofs: sent 32-bit sized packet for 32-bit process

2017-09-02 Thread Stanislav Kinsburskiy


01.09.2017 21:23, Dmitry V. Levin P?P8QP5Q:
> On Fri, Sep 01, 2017 at 05:02:45PM +0300, Stanislav Kinsburskiy wrote:
>>
>>
>> 01.09.2017 16:53, Dmitry V. Levin P?P8QP5Q:
>>> On Fri, Sep 01, 2017 at 12:15:17PM +0300, Stanislav Kinsburskiy wrote:
>>>> 31.08.2017 20:22, Dmitry V. Levin P?P8QP5Q:
>>>>> On Thu, Aug 31, 2017 at 05:57:11PM +0400, Stanislav Kinsburskiy wrote:
>>>>>> The structure autofs_v5_packet (except name) is not aligned by 8 bytes, 
>>>>>> which
>>>>>> lead to different sizes in 32 and 64-bit architectures.
>>>>>> Let's form 32-bit compatible packet when daemon has 32-bit addressation.
>>>>>>
>>>>>> Signed-off-by: Stanislav Kinsburskiy 
>>>>>> ---
>>>>>>  fs/autofs4/waitq.c |   11 +--
>>>>>>  1 file changed, 9 insertions(+), 2 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
>>>>>> index 309ca6b..484cf2e 100644
>>>>>> --- a/fs/autofs4/waitq.c
>>>>>> +++ b/fs/autofs4/waitq.c
>>>>>> @@ -153,12 +153,19 @@ static void autofs4_notify_daemon(struct 
>>>>>> autofs_sb_info *sbi,
>>>>>>  {
>>>>>>  struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
>>>>>>  struct user_namespace *user_ns = 
>>>>>> sbi->pipe->f_cred->user_ns;
>>>>>> +size_t name_offset;
>>>>>>  
>>>>>> -pktsz = sizeof(*packet);
>>>>>> +if (sbi->is32bit)
>>>>>> +name_offset = offsetof(struct autofs_v5_packet, 
>>>>>> len) +
>>>>>> +  sizeof(packet->len);
>>>>>> +else
>>>>>> +name_offset = offsetof(struct autofs_v5_packet, 
>>>>>> name);
>>>>>
>>>>> This doesn't help at all because the offset of struct 
>>>>> autofs_v5_packet.name
>>>>> does not change.
>>>>>
>>>>>> +pktsz = name_offset + sizeof(packet->name);
>>>>>
>>>>> What changes is pktsz: it's either sizeof(struct autofs_v5_packet)
>>>>> or 4 bytes less, depending on the architecture.
>>>>
>>>> Indeed. Thanks!
>>>>
>>>>> For example,
>>>>>
>>>>> #ifdef CONFIG_COMPAT
>>>>>   if (__alignof__(compat_u64) < __alignof__(u64) && sbi->is32bit)
>>>>
>>>> Unfortunately I don't get this "alignof" checks.
>>>> The intention of "is32bit" was to define this compat case completely.
>>>> Why are they required?
>>>
>>> On some 32-bit architectures like arm, u64 is 64-bit aligned, on others
>>> like x86 it is not.  This alignof check ensures that compat 32-bit
>>> architectures with 64-bit alignment are not going to be broken
>>> by the change.
>>
>> Thanks for the explanation!
>> But looks like the issue is hidden so deep (thanks to O_DIRECT), that 
>> becomes unimportant.
> 
> I'm not quite sure how O_DIRECT makes this unimportant.
> For example, automount_dispatch_io from systemd tries to read exactly
> sizeof(union autofs_v5_packet_union) bytes and fails in case of short read.
> 

That's weird. Can you reproduce it?
The problem is that kernel always send arch packet (i.e. 300 bytes for x86, or 
304 for x86_64).
So, how can systemd get less? It can get more, but only if it'll try to read 
more.
Otherwise, as "man pipe(2)" says:

*  If a read(2) specifies a buffer size that is smaller than the next packet, 
then the requested number of bytes are read, and the excess bytes in the packet 
are discarded.

> 
> 
> 
> ___
> Devel mailing list
> Devel@openvz.org
> https://lists.openvz.org/mailman/listinfo/devel
> 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [RFC PATCH 2/2] autofs: sent 32-bit sized packet for 32-bit process

2017-09-01 Thread Stanislav Kinsburskiy


01.09.2017 16:53, Dmitry V. Levin P?P8QP5Q:
> On Fri, Sep 01, 2017 at 12:15:17PM +0300, Stanislav Kinsburskiy wrote:
>> 31.08.2017 20:22, Dmitry V. Levin P?P8QP5Q:
>>> On Thu, Aug 31, 2017 at 05:57:11PM +0400, Stanislav Kinsburskiy wrote:
>>>> The structure autofs_v5_packet (except name) is not aligned by 8 bytes, 
>>>> which
>>>> lead to different sizes in 32 and 64-bit architectures.
>>>> Let's form 32-bit compatible packet when daemon has 32-bit addressation.
>>>>
>>>> Signed-off-by: Stanislav Kinsburskiy 
>>>> ---
>>>>  fs/autofs4/waitq.c |   11 +--
>>>>  1 file changed, 9 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
>>>> index 309ca6b..484cf2e 100644
>>>> --- a/fs/autofs4/waitq.c
>>>> +++ b/fs/autofs4/waitq.c
>>>> @@ -153,12 +153,19 @@ static void autofs4_notify_daemon(struct 
>>>> autofs_sb_info *sbi,
>>>>{
>>>>struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
>>>>struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns;
>>>> +  size_t name_offset;
>>>>  
>>>> -  pktsz = sizeof(*packet);
>>>> +  if (sbi->is32bit)
>>>> +  name_offset = offsetof(struct autofs_v5_packet, len) +
>>>> +sizeof(packet->len);
>>>> +  else
>>>> +  name_offset = offsetof(struct autofs_v5_packet, name);
>>>
>>> This doesn't help at all because the offset of struct autofs_v5_packet.name
>>> does not change.
>>>
>>>> +  pktsz = name_offset + sizeof(packet->name);
>>>
>>> What changes is pktsz: it's either sizeof(struct autofs_v5_packet)
>>> or 4 bytes less, depending on the architecture.
>>
>> Indeed. Thanks!
>>
>>> For example,
>>>
>>> #ifdef CONFIG_COMPAT
>>> if (__alignof__(compat_u64) < __alignof__(u64) && sbi->is32bit)
>>
>> Unfortunately I don't get this "alignof" checks.
>> The intention of "is32bit" was to define this compat case completely.
>> Why are they required?
> 
> On some 32-bit architectures like arm, u64 is 64-bit aligned, on others
> like x86 it is not.  This alignof check ensures that compat 32-bit
> architectures with 64-bit alignment are not going to be broken
> by the change.
> 

Thanks for the explanation!
But looks like the issue is hidden so deep (thanks to O_DIRECT), that becomes 
unimportant.

> 
> 
> 
> ___
> Devel mailing list
> Devel@openvz.org
> https://lists.openvz.org/mailman/listinfo/devel
> 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] zdtm: fix autofs to work with autofs_v5_packet_union issue

2017-09-01 Thread Stanislav Kinsburskiy
The only patch in the series discards the following CRIU commits:

27ce0613e7319c97cc7bfc0a240e3f5f61f3d912
e90b5ed57af866010bc8f5c7c9730aa68995cb77
2fc59ffe3ead2eff44542a415f16b1559e2c8140

The following series implements...

---

Stanislav Kinsburskiy (1):
  tests: do not try to read more than packet in AutoFS test


 test/zdtm/static/autofs.c |   13 +
 1 file changed, 5 insertions(+), 8 deletions(-)

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] tests: do not try to read more than packet in AutoFS test

2017-09-01 Thread Stanislav Kinsburskiy
The intention was to make sure, that only one packet is sent at a time.
And thus read has to return exactly the size of one packet.
But it doesnt' work as expected, because size of autofs_v5_packet_union
differs on 32 bit and 64 bit architectures.
This is a bug, but it's hidden so deeply, that no one really cares by the
following 2 aspects:
1) Autofs pipe has O_DIRECT flag, which means excess bytes will be discarded
upon read.
2) No one tries to read more than one packet at a time.

So let's fix the test instead and do not try to read more bytes, than
expected.

https://jira.sw.ru/browse/PSBM-71078

Signed-off-by: Stanislav Kinsburskiy 
---
 test/zdtm/static/autofs.c |   13 +
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/test/zdtm/static/autofs.c b/test/zdtm/static/autofs.c
index 747ab69..ae78538 100644
--- a/test/zdtm/static/autofs.c
+++ b/test/zdtm/static/autofs.c
@@ -460,7 +460,7 @@ static int automountd_loop(int pipe, const char 
*mountpoint, struct autofs_param
 {
union autofs_v5_packet_union *packet;
ssize_t bytes;
-   size_t psize = sizeof(*packet) + 1;
+   size_t psize = sizeof(*packet);
int err = 0;
 
packet = malloc(psize);
@@ -473,7 +473,7 @@ static int automountd_loop(int pipe, const char 
*mountpoint, struct autofs_param
siginterrupt(SIGUSR2, 1);
 
while (!stop && !err) {
-   memset(packet, 0, sizeof(*packet));
+   memset(packet, 0, psize);
 
bytes = read(pipe, packet, psize);
if (bytes < 0) {
@@ -483,12 +483,9 @@ static int automountd_loop(int pipe, const char 
*mountpoint, struct autofs_param
}
continue;
}
-   if (bytes == psize) {
-   pr_err("read more that expected\n");
-   return -EINVAL;
-   }
-   if (bytes != sizeof(*packet)) {
-   pr_err("read less than expected: %zd\n", bytes);
+   if (bytes != psize) {
+   pr_err("read less than expected: %zd < %zd\n",
+   bytes, psize);
return -EINVAL;
}
err = automountd_serve(mountpoint, param, packet);

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [RFC PATCH 2/2] autofs: sent 32-bit sized packet for 32-bit process

2017-09-01 Thread Stanislav Kinsburskiy
The structure autofs_v5_packet (except name) is not aligned by 8 bytes, which
leads to different sizes in 32 and 64-bit architectures.
Let's form 32-bit compatible packet when daemon has 32-bit addressation.

Suggested-by: Dmitry V. Levin 
Signed-off-by: Stanislav Kinsburskiy 
---
 fs/autofs4/waitq.c |5 +
 1 file changed, 5 insertions(+)

diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 24a58bf..1f9b7d8 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -151,6 +151,11 @@ static void autofs4_notify_daemon(struct autofs_sb_info 
*sbi,
struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns;
 
+#ifdef CONFIG_COMPAT
+   if (sbi->is32bit)
+   pktsz = offsetofend(struct autofs_v5_packet, name);
+   else
+#endif
pktsz = sizeof(*packet);
 
packet->wait_queue_token = wq->wait_queue_token;

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [RFC PATCH 1/2] autofs: set compat flag on sbi when daemon uses 32bit addressation

2017-09-01 Thread Stanislav Kinsburskiy
Signed-off-by: Stanislav Kinsburskiy 
---
 fs/autofs4/autofs_i.h  |3 +++
 fs/autofs4/dev-ioctl.c |3 +++
 fs/autofs4/inode.c |4 +++-
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 4737615..3da105f 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -120,6 +120,9 @@ struct autofs_sb_info {
struct list_head active_list;
struct list_head expiring_list;
struct rcu_head rcu;
+#ifdef CONFIG_COMPAT
+   unsigned is32bit:1;
+#endif
 };
 
 static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb)
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index b7c816f..467d6c4 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -397,6 +397,9 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
sbi->pipefd = pipefd;
sbi->pipe = pipe;
sbi->catatonic = 0;
+#ifdef CONFIG_COMPAT
+   sbi->is32bit = is_compat_task();
+#endif
}
 out:
put_pid(new_pid);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 09e7d68..21d3c0b 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -301,7 +301,9 @@ int autofs4_fill_super(struct super_block *s, void *data, 
int silent)
} else {
sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID);
}
-
+#ifdef CONFIG_COMPAT
+   sbi->is32bit = is_compat_task();
+#endif
if (autofs_type_trigger(sbi->type))
__managed_dentry_set_managed(root);
 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [RFC PATCH 0/2] autofs: fix autofs_v5_packet dlivery in compat mode

2017-09-01 Thread Stanislav Kinsburskiy
The problem is that in compat mode struct autofs_v5_packet has to have 
different size
(i.e. 4 bytes less).

This is RFC because:
1) This issue is hidden, because autofs pipe has O_DIRECT and the rest of the
epacket is truncated when read.
2) X86 arch doesn't have is_compat_task() helper
3) It's now clear, what to do if "pgrp" option is specified.

The following series implements...

---

Stanislav Kinsburskiy (2):
  autofs: set compat flag on sbi when daemon uses 32bit addressation
  autofs: sent 32-bit sized packet for 32-bit process


 fs/autofs4/autofs_i.h  |3 +++
 fs/autofs4/dev-ioctl.c |3 +++
 fs/autofs4/inode.c |4 +++-
 fs/autofs4/waitq.c |5 +
 4 files changed, 14 insertions(+), 1 deletion(-)
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] autofs: fix leaked pid on error path in autofs4_fill_super

2017-09-01 Thread Stanislav Kinsburskiy
Check for protocol happens after pid get.

Signed-off-by: Stanislav Kinsburskiy 
---
 fs/autofs4/inode.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 0ba9c02..2cd4e7e 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -298,7 +298,7 @@ int autofs4_fill_super(struct super_block *s, void *data, 
int silent)
   "daemon (%d, %d) kernel (%d, %d)\n",
sbi->min_proto, sbi->max_proto,
AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
-   goto fail_dput;
+   goto fail_put_pid;
}
 
/* Establish highest kernel protocol version */

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] autofs: fix double pid put in error path

2017-09-01 Thread Stanislav Kinsburskiy
Signed-off-by: Stanislav Kinsburskiy 
---
 fs/autofs4/inode.c |1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index b23cf2a..0ba9c02 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -343,7 +343,6 @@ int autofs4_fill_super(struct super_block *s, void *data, 
int silent)
 fail_ino:
kfree(ino);
 fail_free:
-   put_pid(sbi->oz_pgrp);
kfree(sbi);
s->s_fs_info = NULL;
return ret;

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [RFC PATCH 2/2] autofs: sent 32-bit sized packet for 32-bit process

2017-09-01 Thread Stanislav Kinsburskiy


31.08.2017 20:22, Dmitry V. Levin пишет:
> On Thu, Aug 31, 2017 at 05:57:11PM +0400, Stanislav Kinsburskiy wrote:
>> The structure autofs_v5_packet (except name) is not aligned by 8 bytes, which
>> lead to different sizes in 32 and 64-bit architectures.
>> Let's form 32-bit compatible packet when daemon has 32-bit addressation.
>>
>> Signed-off-by: Stanislav Kinsburskiy 
>> ---
>>  fs/autofs4/waitq.c |   11 +--
>>  1 file changed, 9 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
>> index 309ca6b..484cf2e 100644
>> --- a/fs/autofs4/waitq.c
>> +++ b/fs/autofs4/waitq.c
>> @@ -153,12 +153,19 @@ static void autofs4_notify_daemon(struct 
>> autofs_sb_info *sbi,
>>  {
>>  struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
>>  struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns;
>> +size_t name_offset;
>>  
>> -pktsz = sizeof(*packet);
>> +if (sbi->is32bit)
>> +name_offset = offsetof(struct autofs_v5_packet, len) +
>> +  sizeof(packet->len);
>> +else
>> +name_offset = offsetof(struct autofs_v5_packet, name);
> 
> This doesn't help at all because the offset of struct autofs_v5_packet.name
> does not change.
> 
>> +pktsz = name_offset + sizeof(packet->name);
> 
> What changes is pktsz: it's either sizeof(struct autofs_v5_packet)
> or 4 bytes less, depending on the architecture.

Indeed. Thanks!

> For example,
> 
> #ifdef CONFIG_COMPAT
>   if (__alignof__(compat_u64) < __alignof__(u64) && sbi->is32bit)

Unfortunately I don't get this "alignof" checks.
The intention of "is32bit" was to define this compat case completely.
Why are they required?

>   pktsz = offsetofend(struct autofs_v5_packet, name);
>   else
> #endif
>   pktsz = sizeof(*packet);
> 
> 



___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [RFC PATCH 2/2] autofs: sent 32-bit sized packet for 32-bit process

2017-08-31 Thread Stanislav Kinsburskiy
The structure autofs_v5_packet (except name) is not aligned by 8 bytes, which
lead to different sizes in 32 and 64-bit architectures.
Let's form 32-bit compatible packet when daemon has 32-bit addressation.

Signed-off-by: Stanislav Kinsburskiy 
---
 fs/autofs4/waitq.c |   11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 309ca6b..484cf2e 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -153,12 +153,19 @@ static void autofs4_notify_daemon(struct autofs_sb_info 
*sbi,
{
struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns;
+   size_t name_offset;
 
-   pktsz = sizeof(*packet);
+   if (sbi->is32bit)
+   name_offset = offsetof(struct autofs_v5_packet, len) +
+ sizeof(packet->len);
+   else
+   name_offset = offsetof(struct autofs_v5_packet, name);
+
+   pktsz = name_offset + sizeof(packet->name);
 
packet->wait_queue_token = wq->wait_queue_token;
packet->len = wq->name.len;
-   memcpy(packet->name, wq->name.name, wq->name.len);
+   memcpy(packet + name_offset, wq->name.name, wq->name.len);
packet->name[wq->name.len] = '\0';
packet->dev = wq->dev;
packet->ino = wq->ino;

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [RFC PATCH 1/2] autofs: set compat flag on sbi when daemon uses 32bit addressation

2017-08-31 Thread Stanislav Kinsburskiy
Signed-off-by: Stanislav Kinsburskiy 
---
 fs/autofs4/inode.c |   16 
 1 file changed, 16 insertions(+)

diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index b23cf2a..989ac38 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -217,6 +217,7 @@ int autofs4_fill_super(struct super_block *s, void *data, 
int silent)
int pgrp;
bool pgrp_set = false;
int ret = -EINVAL;
+   struct task_struct *tsk;
 
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
@@ -281,10 +282,25 @@ int autofs4_fill_super(struct super_block *s, void *data, 
int silent)
pgrp);
goto fail_dput;
}
+   tsk = get_pid_task(sbi->oz_pgrp, PIDTYPE_PGID);
+   if (!tsk) {
+   pr_warn("autofs: could not find process group leader 
%d\n",
+   pgrp);
+   goto fail_put_pid;
+   }
} else {
sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID);
+   get_task_struct(current);
+   tsk = current;
}
 
+   if (test_tsk_thread_flag(tsk, TIF_ADDR32))
+   sbi->is32bit = 1;
+   else
+   sbi->is32bit = 0;
+
+   put_task_struct(tsk);
+
if (autofs_type_trigger(sbi->type))
__managed_dentry_set_managed(root);
 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [RFC PATCH 0/2] autofs: add "compat" support

2017-08-31 Thread Stanislav Kinsburskiy
The idea is simple: reduce autofs_v5_packet for 32bit damon on 64bit
architectures.

---

Stanislav Kinsburskiy (2):
  autofs: set compat flag on sbi when daemon uses 32bit addressation
  autofs: sent 32-bit sized packet for 32-bit process


 fs/autofs4/inode.c |   16 
 fs/autofs4/waitq.c |   11 +--
 2 files changed, 25 insertions(+), 2 deletions(-)

--
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] autofs: fix autofs_v5_packet structure for compat mode

2017-08-31 Thread Stanislav Kinsburskiy


31.08.2017 15:05, Dmitry V. Levin пишет:
> On Thu, Aug 31, 2017 at 02:40:23PM +0300, Dmitry V. Levin wrote:
>> On Thu, Aug 31, 2017 at 01:48:27PM +0300, Stanislav Kinsburskiy wrote:
>>>
>>>
>>> 31.08.2017 13:38, Dmitry V. Levin пишет:
>>>> On Thu, Aug 31, 2017 at 02:11:34PM +0400, Stanislav Kinsburskiy wrote:
>>>>> Due to integer variables alignment size of struct autofs_v5_packet in 300
>>>>> bytes in 32-bit architectures (instead of 304 bytes in 64-bits 
>>>>> architectures).
>>>>>
>>>>> This may lead to memory corruption (64 bits kernel always send 304 bytes,
>>>>> while 32-bit userspace application expects for 300).
>>>>>
>>>>> https://jira.sw.ru/browse/PSBM-71078
>>>>>
>>>>> Signed-off-by: Stanislav Kinsburskiy 
>>>>> ---
>>>>>  include/uapi/linux/auto_fs4.h |2 ++
>>>>>  1 file changed, 2 insertions(+)
>>>>>
>>>>> diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h
>>>>> index e02982f..8729a47 100644
>>>>> --- a/include/uapi/linux/auto_fs4.h
>>>>> +++ b/include/uapi/linux/auto_fs4.h
>>>>> @@ -137,6 +137,8 @@ struct autofs_v5_packet {
>>>>>   __u32 pid;
>>>>>   __u32 tgid;
>>>>>   __u32 len;
>>>>> + __u32 blob; /* This is needed to align structure up to 8
>>>>> +bytes for ALL archs including 32-bit */
>>>>>   char name[NAME_MAX+1];
>>>>>  };
>>>>
>>>> This change breaks ABI because it changes offsetof(struct 
>>>> autofs_v5_packet, name).
>>>> If you need to fix the alignment, use  __attribute__((aligned(8))).
>>>>
>>>
>>> Nice to know you're watching.
>>> Yes, attribute is better.
>>> But how ABI is broken? On x86_64 this alignment is implied, so nothing is 
>>> changed.
>>
>> Your change increases offsetof(struct autofs_v5_packet, name) by 4 on all
>> architectures.  On architectures where the structure is 32-bit aligned
>> this also leads to increase of its size by 4.
>>
>>>> An alignment change would also be an ABI breakage on 32-bit architectures,
>>>> though.
>>>>
>>>
>>> True.
>>> But from my POW better have it working on 64bit archs for 32bit apps.
>>> But anyway, upstream guys will device, whether they want 32-bit autofs 
>>> applications properly work on 64 or 32 bits.
>>
>> Let's fix old bugs without introducing new bugs.
>> The right fix here seems to be a compat structure, that is, both 64-bit
>> and 32-bit kernels should send the same 32-bit aligned structure, and
>> it has to be the same structure sent by traditional 32-bit kernels.
> 
> Alternatively, a much more simple fix would be to change 64-bit kernels
> not to send the trailing 4 padding bytes of 64-bit aligned
> struct autofs_v5_packet.  That is, just send
> offsetofend(struct autofs_v5_packet, name) bytes instead of
> sizeof(struct autofs_v5_packet) regardless of architecture.
> 

Fair enough, thanks!
But this approach won't work, because autofs pipe has O_DIRECT flag.
Compat structure looks more promising, but not yet clear to me, how to define 
one properly.
Probably by replacing "len" with char array like this:

/* autofs v5 common packet struct */
struct autofs_v5_packet {
struct autofs_packet_hdr hdr;
autofs_wqt_t wait_queue_token;
__u32 dev;
__u64 ino;
__u32 uid;
__u32 gid;
__u32 pid;
__u32 tgid;
__u8  len[4];
char name[NAME_MAX+1];
};

What do you think?
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] autofs: fix autofs_v5_packet structure for compat mode

2017-08-31 Thread Stanislav Kinsburskiy


31.08.2017 13:38, Dmitry V. Levin пишет:
> On Thu, Aug 31, 2017 at 02:11:34PM +0400, Stanislav Kinsburskiy wrote:
>> Due to integer variables alignment size of struct autofs_v5_packet in 300
>> bytes in 32-bit architectures (instead of 304 bytes in 64-bits 
>> architectures).
>>
>> This may lead to memory corruption (64 bits kernel always send 304 bytes,
>> while 32-bit userspace application expects for 300).
>>
>> https://jira.sw.ru/browse/PSBM-71078
>>
>> Signed-off-by: Stanislav Kinsburskiy 
>> ---
>>  include/uapi/linux/auto_fs4.h |2 ++
>>  1 file changed, 2 insertions(+)
>>
>> diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h
>> index e02982f..8729a47 100644
>> --- a/include/uapi/linux/auto_fs4.h
>> +++ b/include/uapi/linux/auto_fs4.h
>> @@ -137,6 +137,8 @@ struct autofs_v5_packet {
>>  __u32 pid;
>>  __u32 tgid;
>>  __u32 len;
>> +__u32 blob; /* This is needed to align structure up to 8
>> +   bytes for ALL archs including 32-bit */
>>  char name[NAME_MAX+1];
>>  };
> 
> This change breaks ABI because it changes offsetof(struct autofs_v5_packet, 
> name).
> If you need to fix the alignment, use  __attribute__((aligned(8))).
> 

Nice to know you're watching.
Yes, attribute is better.
But how ABI is broken? On x86_64 this alignment is implied, so nothing is 
changed.

> An alignment change would also be an ABI breakage on 32-bit architectures,
> though.
> 

True.
But from my POW better have it working on 64bit archs for 32bit apps.
But anyway, upstream guys will device, whether they want 32-bit autofs 
applications properly work on 64 or 32 bits.
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] autofs: fix autofs_v5_packet structure for compat mode

2017-08-31 Thread Stanislav Kinsburskiy
Yes

31.08.2017 13:37, Konstantin Khorenko пишет:
> Will you send it to mainstream as well?
> 
> -- 
> Best regards,
> 
> Konstantin Khorenko,
> Virtuozzo Linux Kernel Team
> 
> On 08/31/2017 01:11 PM, Stanislav Kinsburskiy wrote:
>> Due to integer variables alignment size of struct autofs_v5_packet in 300
>> bytes in 32-bit architectures (instead of 304 bytes in 64-bits 
>> architectures).
>>
>> This may lead to memory corruption (64 bits kernel always send 304 bytes,
>> while 32-bit userspace application expects for 300).
>>
>> https://jira.sw.ru/browse/PSBM-71078
>>
>> Signed-off-by: Stanislav Kinsburskiy 
>> ---
>>  include/uapi/linux/auto_fs4.h |2 ++
>>  1 file changed, 2 insertions(+)
>>
>> diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h
>> index e02982f..8729a47 100644
>> --- a/include/uapi/linux/auto_fs4.h
>> +++ b/include/uapi/linux/auto_fs4.h
>> @@ -137,6 +137,8 @@ struct autofs_v5_packet {
>>  __u32 pid;
>>  __u32 tgid;
>>  __u32 len;
>> +__u32 blob;/* This is needed to align structure up to 8
>> +   bytes for ALL archs including 32-bit */
>>  char name[NAME_MAX+1];
>>  };
>>
>>
>> .
>>
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] autofs: fix autofs_v5_packet structure for compat mode

2017-08-31 Thread Stanislav Kinsburskiy
Due to integer variables alignment size of struct autofs_v5_packet in 300
bytes in 32-bit architectures (instead of 304 bytes in 64-bits architectures).

This may lead to memory corruption (64 bits kernel always send 304 bytes,
while 32-bit userspace application expects for 300).

https://jira.sw.ru/browse/PSBM-71078

Signed-off-by: Stanislav Kinsburskiy 
---
 include/uapi/linux/auto_fs4.h |2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h
index e02982f..8729a47 100644
--- a/include/uapi/linux/auto_fs4.h
+++ b/include/uapi/linux/auto_fs4.h
@@ -137,6 +137,8 @@ struct autofs_v5_packet {
__u32 pid;
__u32 tgid;
__u32 len;
+   __u32 blob; /* This is needed to align structure up to 8
+  bytes for ALL archs including 32-bit */
char name[NAME_MAX+1];
 };
 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] zdtm: fix package memory allocation in autofs.c

2017-08-31 Thread Stanislav Kinsburskiy
Plus some cleanup.

https://jira.sw.ru/browse/PSBM-71078

Signed-off-by: Stanislav Kinsburskiy 
---
 test/zdtm/static/autofs.c |   18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/test/zdtm/static/autofs.c b/test/zdtm/static/autofs.c
index 8d917ee..882289f 100644
--- a/test/zdtm/static/autofs.c
+++ b/test/zdtm/static/autofs.c
@@ -460,10 +460,10 @@ static int automountd_loop(int pipe, const char 
*mountpoint, struct autofs_param
 {
union autofs_v5_packet_union *packet;
ssize_t bytes;
-   size_t psize = sizeof(*packet) * 2;
+   size_t psize = sizeof(*packet);
int err = 0;
 
-   packet = malloc(psize);
+   packet = malloc(psize * 2);
if (!packet) {
pr_err("failed to allocate autofs packet\n");
return -ENOMEM;
@@ -473,7 +473,7 @@ static int automountd_loop(int pipe, const char 
*mountpoint, struct autofs_param
siginterrupt(SIGUSR2, 1);
 
while (!stop && !err) {
-   memset(packet, 0, sizeof(*packet));
+   memset(packet, 0, psize * 2);
 
bytes = read(pipe, packet, psize);
if (bytes < 0) {
@@ -483,12 +483,12 @@ static int automountd_loop(int pipe, const char 
*mountpoint, struct autofs_param
}
continue;
}
-   if (bytes > psize) {
-   pr_err("read more that expected: %zd > %zd\n", bytes, 
psize);
-   return -EINVAL;
-   }
-   if (bytes != sizeof(*packet)) {
-   pr_err("read less than expected: %zd\n", bytes);
+   if (bytes != psize) {
+   pr_err("read %s that expected: %zd %s %zd\n",
+   (bytes > psize) ? "more" : "less",
+   bytes,
+   (bytes > psize) ? ">" : "<",
+   psize);
return -EINVAL;
}
err = automountd_serve(mountpoint, param, packet);

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] zdtm: fix autofs tes compilation

2017-08-30 Thread Stanislav Kinsburskiy
Variables type of size_t have to be printed with "%z" directive.

https://jira.sw.ru/browse/PSBM-71041

Signed-off-by: Stanislav Kinsburskiy 
---
 test/zdtm/static/autofs.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/zdtm/static/autofs.c b/test/zdtm/static/autofs.c
index d854fd0..8d917ee 100644
--- a/test/zdtm/static/autofs.c
+++ b/test/zdtm/static/autofs.c
@@ -484,7 +484,7 @@ static int automountd_loop(int pipe, const char 
*mountpoint, struct autofs_param
continue;
}
if (bytes > psize) {
-   pr_err("read more that expected: %ld > %ld\n", bytes, 
psize);
+   pr_err("read more that expected: %zd > %zd\n", bytes, 
psize);
return -EINVAL;
}
if (bytes != sizeof(*packet)) {

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH rh7 2/2] mm/memcg: reclaim only kmem if kmem limit reached.

2017-08-28 Thread Stanislav Kinsburskiy


25.08.2017 18:38, Andrey Ryabinin пишет:
> If kmem limit on memcg reached, we go into memory reclaim,
> and reclaim everything we can, including page cache and anon.
> Reclaiming page cache or anon won't help since we need to lower
> only kmem usage. This patch fixes the problem by avoiding
> non-kmem reclaim on hitting the kmem limit.
> 

Can't there be a situation, when some object in anon mem or page cache holds 
some object in kmem (indirectly)?

> https://jira.sw.ru/browse/PSBM-69226
> Signed-off-by: Andrey Ryabinin 
> ---
>  include/linux/memcontrol.h | 10 ++
>  include/linux/swap.h   |  2 +-
>  mm/memcontrol.c| 30 --
>  mm/vmscan.c| 31 ---
>  4 files changed, 51 insertions(+), 22 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 1a52e58ab7de..1d6bc80c4c90 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -45,6 +45,16 @@ struct mem_cgroup_reclaim_cookie {
>   unsigned int generation;
>  };
>  
> +/*
> + * Reclaim flags for mem_cgroup_hierarchical_reclaim
> + */
> +#define MEM_CGROUP_RECLAIM_NOSWAP_BIT0x0
> +#define MEM_CGROUP_RECLAIM_NOSWAP(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
> +#define MEM_CGROUP_RECLAIM_SHRINK_BIT0x1
> +#define MEM_CGROUP_RECLAIM_SHRINK(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
> +#define MEM_CGROUP_RECLAIM_KMEM_BIT  0x2
> +#define MEM_CGROUP_RECLAIM_KMEM  (1 << 
> MEM_CGROUP_RECLAIM_KMEM_BIT)
> +
>  #ifdef CONFIG_MEMCG
>  int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
> gfp_t gfp_mask, struct mem_cgroup **memcgp);
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index bd162f9bef0d..bd47451ec95a 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -324,7 +324,7 @@ extern unsigned long try_to_free_pages(struct zonelist 
> *zonelist, int order,
>  extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
>  extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
> unsigned long nr_pages,
> -   gfp_t gfp_mask, bool noswap);
> +   gfp_t gfp_mask, int flags);
>  extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
>   gfp_t gfp_mask, bool noswap,
>   struct zone *zone,
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 97824e281d7a..f9a5f3819a31 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -511,16 +511,6 @@ enum res_type {
>  #define OOM_CONTROL  (0)
>  
>  /*
> - * Reclaim flags for mem_cgroup_hierarchical_reclaim
> - */
> -#define MEM_CGROUP_RECLAIM_NOSWAP_BIT0x0
> -#define MEM_CGROUP_RECLAIM_NOSWAP(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
> -#define MEM_CGROUP_RECLAIM_SHRINK_BIT0x1
> -#define MEM_CGROUP_RECLAIM_SHRINK(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
> -#define MEM_CGROUP_RECLAIM_KMEM_BIT  0x2
> -#define MEM_CGROUP_RECLAIM_KMEM  (1 << 
> MEM_CGROUP_RECLAIM_KMEM_BIT)
> -
> -/*
>   * The memcg_create_mutex will be held whenever a new cgroup is created.
>   * As a consequence, any change that needs to protect against new child 
> cgroups
>   * appearing has to hold it as well.
> @@ -2137,7 +2127,7 @@ static unsigned long mem_cgroup_reclaim(struct 
> mem_cgroup *memcg,
>   if (loop)
>   drain_all_stock_async(memcg);
>   total += try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
> -   gfp_mask, noswap);
> +   gfp_mask, flags);
>   if (test_thread_flag(TIF_MEMDIE) ||
>   fatal_signal_pending(current))
>   return 1;
> @@ -2150,6 +2140,16 @@ static unsigned long mem_cgroup_reclaim(struct 
> mem_cgroup *memcg,
>   break;
>   if (mem_cgroup_margin(memcg, flags & MEM_CGROUP_RECLAIM_KMEM))
>   break;
> +
> + /*
> +  * Try harder to reclaim dcache. dcache reclaim may
> +  * temporarly fail due to dcache->dlock being held
> +  * by someone else. We must try harder to avoid premature
> +  * slab allocation failures.
> +  */
> + if (flags & MEM_CGROUP_RECLAIM_KMEM &&
> + page_counter_read(&memcg->dcache))
> + continue;
>   /*
>* If nothing was reclaimed after two attempts, there
>* may be no reclaimable pages in this hierarchy.
> @@ -2778,11 +2778,13 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t 
> gfp_mask, bool kmem_charge
>   struct mem_cgroup *mem_over_limit;
>   struc

[Devel] [PATCH] zdtm: print autofs request size, if read more than expected

2017-08-23 Thread Stanislav Kinsburskiy
This is more debug patch, than fix. But valuable for debugging.

https://jira.sw.ru/browse/PSBM-70345

Signed-off-by: Stanislav Kinsburskiy 
---
 test/zdtm/static/autofs.c |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/zdtm/static/autofs.c b/test/zdtm/static/autofs.c
index 747ab69..d854fd0 100644
--- a/test/zdtm/static/autofs.c
+++ b/test/zdtm/static/autofs.c
@@ -460,7 +460,7 @@ static int automountd_loop(int pipe, const char 
*mountpoint, struct autofs_param
 {
union autofs_v5_packet_union *packet;
ssize_t bytes;
-   size_t psize = sizeof(*packet) + 1;
+   size_t psize = sizeof(*packet) * 2;
int err = 0;
 
packet = malloc(psize);
@@ -483,8 +483,8 @@ static int automountd_loop(int pipe, const char 
*mountpoint, struct autofs_param
}
continue;
}
-   if (bytes == psize) {
-   pr_err("read more that expected\n");
+   if (bytes > psize) {
+   pr_err("read more that expected: %ld > %ld\n", bytes, 
psize);
return -EINVAL;
}
if (bytes != sizeof(*packet)) {

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


  1   2   3   4   5   >