[Devel] [PATCH] ext4: release leaked posix acl in ext4_acl_chmod
Note: only rh7-3.10.0-693.17.1.el7-based kernels are affected. I.e. starting from rh7-3.10.0-693.17.1.vz7.43.1. Posix acl is used to convert of an extended attribute, provided by user to ext4 attributes. In particular to i_mode in case of ACL_TYPE_ACCESS request. IOW, this object is allocated, used for convertion, not stored anywhere and must be freed. However posix_acl_update_mode() can zerofy the pointer to support ext4_set_acl() logic, but then the object is leaked. So, fix it by releasing new temporary pointer with the same value instead of acl pointer. In scope of https://jira.sw.ru/browse/PSBM-81384 RHEL bug URL: https://bugzilla.redhat.com/show_bug.cgi?id=1543020 Signed-off-by: Stanislav Kinsburskiy --- fs/ext4/acl.c |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index f8a38a2..046b338 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -297,7 +297,7 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) int ext4_acl_chmod(struct inode *inode) { - struct posix_acl *acl; + struct posix_acl *acl, *real_acl; handle_t *handle; int retries = 0; int error; @@ -315,6 +315,8 @@ ext4_acl_chmod(struct inode *inode) error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (error) return error; + + real_acl = acl; retry: handle = ext4_journal_start(inode, EXT4_HT_XATTR, ext4_jbd2_credits_xattr(inode)); @@ -341,7 +343,7 @@ ext4_acl_chmod(struct inode *inode) ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; out: - posix_acl_release(acl); + posix_acl_release(real_acl); return error; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH v2] ext4: release leaked posix acl in ext4_xattr_set_acl
Note: only rh7-3.10.0-693.17.1.el7-based kernels are affcted. I.e. starting from rh7-3.10.0-693.17.1.vz7.43.1. Posix acl is used to convert of an extended attribute, provided by user to ext4 attributes. In particular to i_mode in case of ACL_TYPE_ACCESS request. IOW, this object is allocated, used for convertion, not stored anywhere and must be freed. However posix_acl_update_mode() can zerofy the pointer to support ext4_set_acl() logic, but then the object is leaked. So, fix it by releasing new temporary pointer with the same value instead of acl pointer. https://jira.sw.ru/browse/PSBM-81384 RHEL bug URL: https://bugzilla.redhat.com/show_bug.cgi?id=1543020 v2: Added affected kernel version + RHEL bug URL Signed-off-by: Stanislav Kinsburskiy --- fs/ext4/acl.c |8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 917e819..f8a38a2 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -403,7 +403,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, { struct inode *inode = dentry->d_inode; handle_t *handle; - struct posix_acl *acl; + struct posix_acl *acl, *real_acl; int error, retries = 0; int update_mode = 0; umode_t mode = inode->i_mode; @@ -416,7 +416,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, return -EPERM; if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); + acl = real_acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); else if (acl) { @@ -425,7 +425,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, goto release_and_out; } } else - acl = NULL; + acl = real_acl = NULL; retry: handle = ext4_journal_start(inode, EXT4_HT_XATTR, @@ -452,7 +452,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, goto retry; release_and_out: - posix_acl_release(acl); + posix_acl_release(real_acl); return error; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH] ext4: release leaked posix acl in ext4_xattr_set_acl
07.02.2018 14:51, Dmitry Monakhov пишет: > Stanislav Kinsburskiy writes: > >> Posix acl is used to convert of an extended attribute, provided by user to >> ext4 attributes. In particular to I-mode in case of ACL_TYPE_ACCESS request. >> IOW, this object is allocated, used for convertion, not stored anywhere and >> must be freed. >> However posix_acl_update_mode() can zerofy the pointer to support >> ext4_set_acl() logic, but then the object is leaked. >> So, fix it by releasing new temporary pointer with the same value instead of >> acl pointer. > So you are telling me that: > ext4_xattr_set_acl > L1 acl = posix_acl_from_xattr > L2 -> ext4_set_acl(handle, inode, type, acl) > L3->posix_acl_update_mode(inode, &inode->i_mode, &acl) > *acl = NULL; > You are saying that instruction above can affect value at L1? > HOW? acl passed to ext4_set_acl() by value, so > posix_acl_update_mode() can affect value only in L2 and L3 but not L1. > > Stas, have you drunk a lousy beer today? OMG, Dmitry... Looks like storage work clouded your mind and you are looking to some other sources. There is no "L3" in the bug. Because posix_acl_update_mode() in called from ext4_xattr_set_acl() (i.e on "L2"). Thus pointer is set to NULL on "L2" and released on "L1". IOW, pointer is leaked. >> >> https://jira.sw.ru/browse/PSBM-81384 >> >> Signed-off-by: Stanislav Kinsburskiy >> --- >> fs/ext4/acl.c |8 >> 1 file changed, 4 insertions(+), 4 deletions(-) >> >> diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c >> index 917e819..2640d7b 100644 >> --- a/fs/ext4/acl.c >> +++ b/fs/ext4/acl.c >> @@ -403,7 +403,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char >> *name, const void *value, >> { >> struct inode *inode = dentry->d_inode; >> handle_t *handle; >> -struct posix_acl *acl; >> +struct posix_acl *acl, *tmp; >> int error, retries = 0; >> int update_mode = 0; >> umode_t mode = inode->i_mode; >> @@ -416,7 +416,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char >> *name, const void *value, >> return -EPERM; >> >> if (value) { >> -acl = posix_acl_from_xattr(&init_user_ns, value, size); >> +acl = tmp = posix_acl_from_xattr(&init_user_ns, value, size); >> if (IS_ERR(acl)) >> return PTR_ERR(acl); >> else if (acl) { >> @@ -425,7 +425,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char >> *name, const void *value, >> goto release_and_out; >> } >> } else >> -acl = NULL; >> +acl = tmp = NULL; >> >> retry: >> handle = ext4_journal_start(inode, EXT4_HT_XATTR, >> @@ -452,7 +452,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char >> *name, const void *value, >> goto retry; >> >> release_and_out: >> -posix_acl_release(acl); >> +posix_acl_release(tmp); >> return error; >> } >> ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] ext4: release leaked posix acl in ext4_xattr_set_acl
Posix acl is used to convert of an extended attribute, provided by user to ext4 attributes. In particular to I-mode in case of ACL_TYPE_ACCESS request. IOW, this object is allocated, used for convertion, not stored anywhere and must be freed. However posix_acl_update_mode() can zerofy the pointer to support ext4_set_acl() logic, but then the object is leaked. So, fix it by releasing new temporary pointer with the same value instead of acl pointer. https://jira.sw.ru/browse/PSBM-81384 Signed-off-by: Stanislav Kinsburskiy --- fs/ext4/acl.c |8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 917e819..2640d7b 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -403,7 +403,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, { struct inode *inode = dentry->d_inode; handle_t *handle; - struct posix_acl *acl; + struct posix_acl *acl, *tmp; int error, retries = 0; int update_mode = 0; umode_t mode = inode->i_mode; @@ -416,7 +416,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, return -EPERM; if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); + acl = tmp = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); else if (acl) { @@ -425,7 +425,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, goto release_and_out; } } else - acl = NULL; + acl = tmp = NULL; retry: handle = ext4_journal_start(inode, EXT4_HT_XATTR, @@ -452,7 +452,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, goto retry; release_and_out: - posix_acl_release(acl); + posix_acl_release(tmp); return error; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH v3 1/2] spfs: Handle non-zero exit status in cleanup_spfs_mount()
Looks good to me Acked-by: Stanislav Kinsburskiy 23.01.2018 11:43, Kirill Tkhai пишет: > Consider non-zero exit status of spfs similar to killed > status and do the same cleanups. > > v3: New > > Signed-off-by: Kirill Tkhai > --- > manager/context.c |8 > manager/spfs.c|8 > 2 files changed, 8 insertions(+), 8 deletions(-) > > diff --git a/manager/context.c b/manager/context.c > index 1eb37c9..73d5ada 100644 > --- a/manager/context.c > +++ b/manager/context.c > @@ -45,12 +45,12 @@ const char *mgr_ovz_id(void) > static void cleanup_spfs_mount(struct spfs_manager_context_s *ctx, > struct spfs_info_s *info, int status) > { > - bool killed = WIFSIGNALED(status); > + bool failed = WIFSIGNALED(status) || !!WEXITSTATUS(status); > > pr_debug("removing info %s from the list\n", info->mnt.id); > > - if (killed) > - /* SPFS master was killed. We need to release the reference */ > + if (failed) > + /* SPFS master was failed. We need to release the reference */ > spfs_release_mnt(info); > > info->dead = true; > @@ -59,7 +59,7 @@ static void cleanup_spfs_mount(struct > spfs_manager_context_s *ctx, > if (unlink(info->socket_path)) > pr_perror("failed to unlink %s", info->socket_path); > > - spfs_cleanup_env(info, killed); > + spfs_cleanup_env(info, failed); > > close_namespaces(info->ns_fds); > } > diff --git a/manager/spfs.c b/manager/spfs.c > index 3e0f667..99845b1 100644 > --- a/manager/spfs.c > +++ b/manager/spfs.c > @@ -409,9 +409,9 @@ int spfs_prepare_env(struct spfs_info_s *info, const char > *proxy_dir) > return err ? err : res; > } > > -static int __spfs_cleanup_env(struct spfs_info_s *info, bool killed) > +static int __spfs_cleanup_env(struct spfs_info_s *info, bool failed) > { > - if (killed && umount(info->work_dir)) { > + if (failed && umount(info->work_dir)) { > pr_perror("failed to umount %s", info->work_dir); > return -errno; > } > @@ -423,7 +423,7 @@ static int __spfs_cleanup_env(struct spfs_info_s *info, > bool killed) > return 0; > } > > -int spfs_cleanup_env(struct spfs_info_s *info, bool killed) > +int spfs_cleanup_env(struct spfs_info_s *info, bool failed) > { > int err, res; > unsigned orig_ns_mask; > @@ -432,7 +432,7 @@ int spfs_cleanup_env(struct spfs_info_s *info, bool > killed) > if (res) > return res; > > - err = __spfs_cleanup_env(info, killed); > + err = __spfs_cleanup_env(info, failed); > > res = leave_spfs_context(info, orig_ns_mask); > > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH] spfs: Main process wakes and kills its children on exit
23.01.2018 11:19, Kirill Tkhai пишет: > On 23.01.2018 13:18, Stanislav Kinsburskiy wrote: >> >> >> 23.01.2018 11:17, Kirill Tkhai пишет: >>> On 23.01.2018 13:14, Stanislav Kinsburskiy wrote: >>>> >>>> >>>> 23.01.2018 11:09, Kirill Tkhai пишет: >>>>> On 23.01.2018 13:07, Stanislav Kinsburskiy wrote: >>>>>> >>>>>> >>>>>> 23.01.2018 10:51, Kirill Tkhai пишет: >>>>>>> On 23.01.2018 12:48, Stanislav Kinsburskiy wrote: >>>>>>>> Please, see a couple of nits below >>>>>>>> >>>>>>>> 23.01.2018 10:41, Kirill Tkhai пишет: >>>>>>>>> Stanislav Kinsburskiy says: >>>>>>>>> >>>>>>>>> "SPFS manager has a special "--exit-with-spfs" options, which is used >>>>>>>>> by CRIU. >>>>>>>>> The idea of the option is simple: force SPFS manager to exit, when >>>>>>>>> it has some >>>>>>>>> SPFS processes among its children (i.e. spfs was mounted at least >>>>>>>>> once), >>>>>>>>> but all these processes have exited for whatever reason (which >>>>>>>>> usually happens >>>>>>>>> when restore has failed and spfs mounts where unmounted). >>>>>>>>> Although it works in overall (main SPFS manager process exits), its >>>>>>>>> children >>>>>>>>> (responsible to SPFS replacement) may wait on FUTEX for "release" >>>>>>>>> command >>>>>>>>> for corresponding SPFS mount and thus never stop until they are >>>>>>>>> killed". >>>>>>>>> >>>>>>>>> 1 spfs-manager >>>>>>>>> 2 \_ spfs >>>>>>>>> 3 \_ spfs-manager >>>>>>>>> 4 \_ spfs >>>>>>>>> 5 \_ spfs-manager >>>>>>>>> >>>>>>>>> 2 and 3 are pair of a mount, and 4 and 5 are pair of another mount. >>>>>>>>> The patch makes spfs-manager 1 kill 3 in case of 2 exited. >>>>>>>>> >>>>>>>>> https://jira.sw.ru/browse/PSBM-80055 >>>>>>>>> >>>>>>>>> Signed-off-by: Kirill Tkhai >>>>>>>>> --- >>>>>>>>> manager/context.c |4 >>>>>>>>> manager/spfs.c|1 + >>>>>>>>> 2 files changed, 5 insertions(+) >>>>>>>>> >>>>>>>>> diff --git a/manager/context.c b/manager/context.c >>>>>>>>> index 1eb37c9..4464a23 100644 >>>>>>>>> --- a/manager/context.c >>>>>>>>> +++ b/manager/context.c >>>>>>>>> @@ -53,6 +53,9 @@ static void cleanup_spfs_mount(struct >>>>>>>>> spfs_manager_context_s *ctx, >>>>>>>>> /* SPFS master was killed. We need to release the >>>>>>>>> reference */ >>>>>>>>> spfs_release_mnt(info); >>>>>>>>> >>>>>>>>> + if (killed || WEXITSTATUS(status)) >>>>>>>>> + kill(info->replacer, SIGKILL); >>>>>>>>> + >>>>>>>> >>>>>>>> There is "if (killed)" check above. >>>>>>>> Could you please move this hunk there? >>>>>>> >>>>>>> There is logical OR (||) in the hunk. How should I move it to >>>>>>> unconditional check? >>>>>>> >>>>>> >>>>>> Ah, ok. Then let the check be as it is. >>>>>> Could you please add warning message for this kill with the process pid >>>>>> being killed and the reason why (spfs was killed of exited with error)? >>>>> >>>>> Maybe we should call spfs_release_mnt() in case of exit status != 0 too? >>>>> >>>> >>>> Well, this makes sense. >>>> If spfs exited with non-zero result (either it was killed or exited due to >>>> some error), then there is no need in r
Re: [Devel] [PATCH] spfs: Main process wakes and kills its children on exit
23.01.2018 11:17, Kirill Tkhai пишет: > On 23.01.2018 13:14, Stanislav Kinsburskiy wrote: >> >> >> 23.01.2018 11:09, Kirill Tkhai пишет: >>> On 23.01.2018 13:07, Stanislav Kinsburskiy wrote: >>>> >>>> >>>> 23.01.2018 10:51, Kirill Tkhai пишет: >>>>> On 23.01.2018 12:48, Stanislav Kinsburskiy wrote: >>>>>> Please, see a couple of nits below >>>>>> >>>>>> 23.01.2018 10:41, Kirill Tkhai пишет: >>>>>>> Stanislav Kinsburskiy says: >>>>>>> >>>>>>> "SPFS manager has a special "--exit-with-spfs" options, which is used >>>>>>> by CRIU. >>>>>>> The idea of the option is simple: force SPFS manager to exit, when it >>>>>>> has some >>>>>>> SPFS processes among its children (i.e. spfs was mounted at least >>>>>>> once), >>>>>>> but all these processes have exited for whatever reason (which usually >>>>>>> happens >>>>>>> when restore has failed and spfs mounts where unmounted). >>>>>>> Although it works in overall (main SPFS manager process exits), its >>>>>>> children >>>>>>> (responsible to SPFS replacement) may wait on FUTEX for "release" >>>>>>> command >>>>>>> for corresponding SPFS mount and thus never stop until they are >>>>>>> killed". >>>>>>> >>>>>>> 1 spfs-manager >>>>>>> 2 \_ spfs >>>>>>> 3 \_ spfs-manager >>>>>>> 4 \_ spfs >>>>>>> 5 \_ spfs-manager >>>>>>> >>>>>>> 2 and 3 are pair of a mount, and 4 and 5 are pair of another mount. >>>>>>> The patch makes spfs-manager 1 kill 3 in case of 2 exited. >>>>>>> >>>>>>> https://jira.sw.ru/browse/PSBM-80055 >>>>>>> >>>>>>> Signed-off-by: Kirill Tkhai >>>>>>> --- >>>>>>> manager/context.c |4 >>>>>>> manager/spfs.c|1 + >>>>>>> 2 files changed, 5 insertions(+) >>>>>>> >>>>>>> diff --git a/manager/context.c b/manager/context.c >>>>>>> index 1eb37c9..4464a23 100644 >>>>>>> --- a/manager/context.c >>>>>>> +++ b/manager/context.c >>>>>>> @@ -53,6 +53,9 @@ static void cleanup_spfs_mount(struct >>>>>>> spfs_manager_context_s *ctx, >>>>>>> /* SPFS master was killed. We need to release the >>>>>>> reference */ >>>>>>> spfs_release_mnt(info); >>>>>>> >>>>>>> + if (killed || WEXITSTATUS(status)) >>>>>>> + kill(info->replacer, SIGKILL); >>>>>>> + >>>>>> >>>>>> There is "if (killed)" check above. >>>>>> Could you please move this hunk there? >>>>> >>>>> There is logical OR (||) in the hunk. How should I move it to >>>>> unconditional check? >>>>> >>>> >>>> Ah, ok. Then let the check be as it is. >>>> Could you please add warning message for this kill with the process pid >>>> being killed and the reason why (spfs was killed of exited with error)? >>> >>> Maybe we should call spfs_release_mnt() in case of exit status != 0 too? >>> >> >> Well, this makes sense. >> If spfs exited with non-zero result (either it was killed or exited due to >> some error), then there is no need in replacer. And there is no need in the >> reference. >> So, probably this check you proposed should be used for both >> spfs_release_mnt(info) and kill(info->replacer, SIGKILL). > > Are you OK with this change sent in the only patch, or should we use one more > patch? > I think one patch is OK. Only mention the change explicitly in the patch description, please. ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH] spfs: Main process wakes and kills its children on exit
23.01.2018 11:09, Kirill Tkhai пишет: > On 23.01.2018 13:07, Stanislav Kinsburskiy wrote: >> >> >> 23.01.2018 10:51, Kirill Tkhai пишет: >>> On 23.01.2018 12:48, Stanislav Kinsburskiy wrote: >>>> Please, see a couple of nits below >>>> >>>> 23.01.2018 10:41, Kirill Tkhai пишет: >>>>> Stanislav Kinsburskiy says: >>>>> >>>>> "SPFS manager has a special "--exit-with-spfs" options, which is used by >>>>> CRIU. >>>>> The idea of the option is simple: force SPFS manager to exit, when it >>>>> has some >>>>> SPFS processes among its children (i.e. spfs was mounted at least once), >>>>> but all these processes have exited for whatever reason (which usually >>>>> happens >>>>> when restore has failed and spfs mounts where unmounted). >>>>> Although it works in overall (main SPFS manager process exits), its >>>>> children >>>>> (responsible to SPFS replacement) may wait on FUTEX for "release" command >>>>> for corresponding SPFS mount and thus never stop until they are killed". >>>>> >>>>> 1 spfs-manager >>>>> 2 \_ spfs >>>>> 3 \_ spfs-manager >>>>> 4 \_ spfs >>>>> 5 \_ spfs-manager >>>>> >>>>> 2 and 3 are pair of a mount, and 4 and 5 are pair of another mount. >>>>> The patch makes spfs-manager 1 kill 3 in case of 2 exited. >>>>> >>>>> https://jira.sw.ru/browse/PSBM-80055 >>>>> >>>>> Signed-off-by: Kirill Tkhai >>>>> --- >>>>> manager/context.c |4 >>>>> manager/spfs.c|1 + >>>>> 2 files changed, 5 insertions(+) >>>>> >>>>> diff --git a/manager/context.c b/manager/context.c >>>>> index 1eb37c9..4464a23 100644 >>>>> --- a/manager/context.c >>>>> +++ b/manager/context.c >>>>> @@ -53,6 +53,9 @@ static void cleanup_spfs_mount(struct >>>>> spfs_manager_context_s *ctx, >>>>> /* SPFS master was killed. We need to release the reference */ >>>>> spfs_release_mnt(info); >>>>> >>>>> + if (killed || WEXITSTATUS(status)) >>>>> + kill(info->replacer, SIGKILL); >>>>> + >>>> >>>> There is "if (killed)" check above. >>>> Could you please move this hunk there? >>> >>> There is logical OR (||) in the hunk. How should I move it to unconditional >>> check? >>> >> >> Ah, ok. Then let the check be as it is. >> Could you please add warning message for this kill with the process pid >> being killed and the reason why (spfs was killed of exited with error)? > > Maybe we should call spfs_release_mnt() in case of exit status != 0 too? > Well, this makes sense. If spfs exited with non-zero result (either it was killed or exited due to some error), then there is no need in replacer. And there is no need in the reference. So, probably this check you proposed should be used for both spfs_release_mnt(info) and kill(info->replacer, SIGKILL). ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH] spfs: Main process wakes and kills its children on exit
23.01.2018 10:51, Kirill Tkhai пишет: > On 23.01.2018 12:48, Stanislav Kinsburskiy wrote: >> Please, see a couple of nits below >> >> 23.01.2018 10:41, Kirill Tkhai пишет: >>> Stanislav Kinsburskiy says: >>> >>> "SPFS manager has a special "--exit-with-spfs" options, which is used by >>> CRIU. >>> The idea of the option is simple: force SPFS manager to exit, when it has >>> some >>> SPFS processes among its children (i.e. spfs was mounted at least once), >>> but all these processes have exited for whatever reason (which usually >>> happens >>> when restore has failed and spfs mounts where unmounted). >>> Although it works in overall (main SPFS manager process exits), its >>> children >>> (responsible to SPFS replacement) may wait on FUTEX for "release" command >>> for corresponding SPFS mount and thus never stop until they are killed". >>> >>> 1 spfs-manager >>> 2 \_ spfs >>> 3 \_ spfs-manager >>> 4 \_ spfs >>> 5 \_ spfs-manager >>> >>> 2 and 3 are pair of a mount, and 4 and 5 are pair of another mount. >>> The patch makes spfs-manager 1 kill 3 in case of 2 exited. >>> >>> https://jira.sw.ru/browse/PSBM-80055 >>> >>> Signed-off-by: Kirill Tkhai >>> --- >>> manager/context.c |4 >>> manager/spfs.c|1 + >>> 2 files changed, 5 insertions(+) >>> >>> diff --git a/manager/context.c b/manager/context.c >>> index 1eb37c9..4464a23 100644 >>> --- a/manager/context.c >>> +++ b/manager/context.c >>> @@ -53,6 +53,9 @@ static void cleanup_spfs_mount(struct >>> spfs_manager_context_s *ctx, >>> /* SPFS master was killed. We need to release the reference */ >>> spfs_release_mnt(info); >>> >>> + if (killed || WEXITSTATUS(status)) >>> + kill(info->replacer, SIGKILL); >>> + >> >> There is "if (killed)" check above. >> Could you please move this hunk there? > > There is logical OR (||) in the hunk. How should I move it to unconditional > check? > Ah, ok. Then let the check be as it is. Could you please add warning message for this kill with the process pid being killed and the reason why (spfs was killed of exited with error)? ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH] spfs: Main process wakes and kills its children on exit
Please, see a couple of nits below 23.01.2018 10:41, Kirill Tkhai пишет: > Stanislav Kinsburskiy says: > > "SPFS manager has a special "--exit-with-spfs" options, which is used by CRIU. > The idea of the option is simple: force SPFS manager to exit, when it has > some > SPFS processes among its children (i.e. spfs was mounted at least once), > but all these processes have exited for whatever reason (which usually > happens > when restore has failed and spfs mounts where unmounted). > Although it works in overall (main SPFS manager process exits), its children > (responsible to SPFS replacement) may wait on FUTEX for "release" command > for corresponding SPFS mount and thus never stop until they are killed". > > 1 spfs-manager > 2 \_ spfs > 3 \_ spfs-manager > 4 \_ spfs > 5 \_ spfs-manager > > 2 and 3 are pair of a mount, and 4 and 5 are pair of another mount. > The patch makes spfs-manager 1 kill 3 in case of 2 exited. > > https://jira.sw.ru/browse/PSBM-80055 > > Signed-off-by: Kirill Tkhai > --- > manager/context.c |4 > manager/spfs.c|1 + > 2 files changed, 5 insertions(+) > > diff --git a/manager/context.c b/manager/context.c > index 1eb37c9..4464a23 100644 > --- a/manager/context.c > +++ b/manager/context.c > @@ -53,6 +53,9 @@ static void cleanup_spfs_mount(struct > spfs_manager_context_s *ctx, > /* SPFS master was killed. We need to release the reference */ > spfs_release_mnt(info); > > + if (killed || WEXITSTATUS(status)) > + kill(info->replacer, SIGKILL); > + There is "if (killed)" check above. Could you please move this hunk there? > info->dead = true; > del_spfs_info(ctx->spfs_mounts, info); > > @@ -88,6 +91,7 @@ static void sigchld_handler(int signal, siginfo_t *siginfo, > void *data) >* corresponding fd. >*/ > spfs_release_mnt(info); > + info->replacer = -1; > } else { > info = find_spfs_by_pid(ctx->spfs_mounts, pid); > if (info) { > diff --git a/manager/spfs.c b/manager/spfs.c > index 3e0f667..0b94587 100644 > --- a/manager/spfs.c > +++ b/manager/spfs.c > @@ -39,6 +39,7 @@ int create_spfs_info(const char *id, > pr_err("failed to allocate\n"); > return -ENOMEM; > } > + info->replacer = -1; > Could you, please, move it down to the end of the function where unconditional actions take place? > err = init_mount_info(&info->mnt, id, mountpoint, ns_mountpoint); > if (err) > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH 2/3] libvzctl-4.14: join namespaces in explicitly provided order
This is needed to make sure, that mnt ns is the last (otherwise join other namespaces after mnt ns will fail). Signed-off-by: Stanislav Kinsburskiy --- lib/env_nsops.c | 39 +++ 1 file changed, 11 insertions(+), 28 deletions(-) diff --git a/lib/env_nsops.c b/lib/env_nsops.c index 0771eb2..d885d1c 100644 --- a/lib/env_nsops.c +++ b/lib/env_nsops.c @@ -885,7 +885,7 @@ static int ns_is_env_run(struct vzctl_env_handle *h) return cg_env_get_ve_state(EID(h)); } -int set_ns(pid_t pid, const char *name, int flags) +static int set_ns(pid_t pid, const char *name, int flags) { int ret, fd; char path[PATH_MAX]; @@ -924,11 +924,10 @@ int enter_net_ns(struct vzctl_env_handle *h, pid_t *ct_pid) static int ns_env_enter(struct vzctl_env_handle *h, int flags) { - DIR *dp; - struct dirent *ep; pid_t pid; - char path[PATH_MAX]; - int ret; + int ret, i; + const char *ns[] = {"cgroup", "ipc", "net", "uts", "pid", + "pid_for_children", "user", "mnt"}; ret = reset_loginuid(); if (ret) @@ -939,37 +938,21 @@ static int ns_env_enter(struct vzctl_env_handle *h, int flags) logger(10, 0, "* Attach by pid %d", pid); - snprintf(path, sizeof(path), "/proc/%d/ns", pid); - dp = opendir(path); - if (dp == NULL) - return vzctl_err(-1, errno, "Unable to open dir %s", path); - ret = cg_attach_task(EID(h), getpid(), NULL, NULL); if (ret) - goto err; - - while ((ep = readdir (dp))) { - if (!strcmp(ep->d_name, ".") || - !strcmp(ep->d_name, "..")) - continue; + return ret; - ret = set_ns(pid, ep->d_name, 0); + for (i = 0; i < sizeof(ns) / sizeof(ns[0]); ++i) { + ret = set_ns(pid, ns[i], 0); if (ret) - goto err; + return ret; } /* Clear supplementary group IDs */ - if (setgroups(0, NULL)) { - ret = vzctl_err(-1, errno, "ns_env_enter: setgroups()"); - goto err; - } - - ret = set_personality32(); + if (setgroups(0, NULL)) + return vzctl_err(-1, errno, "ns_env_enter: setgroups()"); -err: - closedir(dp); - - return ret; + return set_personality32(); } static int ns_env_exec(struct vzctl_env_handle *h, struct exec_param *param, ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH 1/3] libvzctl-4.14: clone CT in new cgroup namespace
We are going to use it along with all the other namespaces. Signed-off-by: Stanislav Kinsburskiy --- lib/env_nsops.c |4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/env_nsops.c b/lib/env_nsops.c index 13e65b0..0771eb2 100644 --- a/lib/env_nsops.c +++ b/lib/env_nsops.c @@ -807,8 +807,10 @@ static int do_env_create(struct vzctl_env_handle *h, struct start_param *param) } param->init_p = init_p; +#define CLONE_NEWCGROUP 0x0200 /* New cgroup namespace */ + clone_flags |= CLONE_NEWUTS|CLONE_NEWPID|CLONE_NEWIPC| - CLONE_NEWNET|CLONE_NEWNS|CLONE_NEWUSER; + CLONE_NEWNET|CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWCGROUP; pid = clone(real_ns_env_create, child_stack + sizeof(child_stack), clone_flags|SIGCHLD , (void *) param); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH 0/3] libvzctl: a set a patches to make CT running on 4.14 kernel
This series (plus patch, removing beacounters usage) allows to start container on 4.14 kernel, which is available here: Repo : ssh://g...@git.sw.ru:7999/vzs/vzkernel.git Branch: stable-4.14 --- Stanislav Kinsburskiy (3): libvzctl-4.14: clone CT in new cgroup namespace libvzctl-4.14: join namespaces in explicitly provided order libvzctl-4.14: do not mount sysfs in container lib/env.c |3 --- lib/env_nsops.c | 43 ++- 2 files changed, 14 insertions(+), 32 deletions(-) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH 3/3] libvzctl-4.14: do not mount sysfs in container
Our current approach is to use one sysfs mount for all the containers, but set limited visibility to sysfs dentries in a container. Signed-off-by: Stanislav Kinsburskiy --- lib/env.c |3 --- 1 file changed, 3 deletions(-) diff --git a/lib/env.c b/lib/env.c index 1b03ce4..a284ec9 100644 --- a/lib/env.c +++ b/lib/env.c @@ -772,9 +772,6 @@ int pre_setup_env(const struct start_param *param) if (setup_devtmpfs()) return VZCTL_E_SYSTEM; - if (stat_file("/sys")) - mount("sysfs", "/sys", "sysfs", 0, 0); - if (env->features->mask & VE_FEATURE_NFSD) { mount("nfsd", "/proc/fs/nfsd", "nfsd", 0, 0); make_dir("/var/lib/nfs/rpc_pipefs", 1); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] 4.14: remove beancounter cgroup usage
This might be a temporary patch (since we don't have beaconters in 4.14 yet), but it's needed to proceed with testing. Signed-off-by: Stanislav Kinsburskiy --- lib/cgroup.c| 30 +- lib/cgroup.h|1 - lib/env_nsops.c | 38 ++ 3 files changed, 3 insertions(+), 66 deletions(-) diff --git a/lib/cgroup.c b/lib/cgroup.c index cd71064..52dc013 100644 --- a/lib/cgroup.c +++ b/lib/cgroup.c @@ -56,7 +56,6 @@ static struct cg_ctl cg_ctl_map[] = { {CG_DEVICES}, {CG_BLKIO}, {CG_FREEZER}, - {CG_UB, 1}, {CG_VE, 1}, {CG_PERF_EVENT}, {CG_HUGETLB}, @@ -741,39 +740,12 @@ int cg_env_set_memory(const char *ctid, const char *name, unsigned long value) int cg_env_set_ub(const char *ctid, const char *name, unsigned long b, unsigned long l) { - int rc; - char _name[STR_SIZE]; - - snprintf(_name, sizeof(_name), "beancounter.%s.barrier", name); - rc = cg_set_ul(ctid, CG_UB, _name, b); - if (rc) - return rc; - - snprintf(_name, sizeof(_name), "beancounter.%s.limit", name); - return cg_set_ul(ctid, CG_UB, _name, l); + return 0; } static int cg_env_set_io(const char *ctid, const char *name, unsigned int speed, unsigned int burst, unsigned int latency) { - int ret; - char buf[STR_SIZE]; - - snprintf(buf, sizeof(buf), CG_UB".%s.speed", name); - ret = cg_set_ul(ctid, CG_UB, buf, speed); - if (ret) - return ret; - - snprintf(buf, sizeof(buf), CG_UB".%s.burst", name); - ret = cg_set_ul(ctid, CG_UB, buf, burst); - if (ret) - return ret; - - snprintf(buf, sizeof(buf), CG_UB".%s.latency", name); - ret = cg_set_ul(ctid, CG_UB, buf, latency); - if (ret) - return ret; - return 0; } diff --git a/lib/cgroup.h b/lib/cgroup.h index 53429e9..c0b308d 100644 --- a/lib/cgroup.h +++ b/lib/cgroup.h @@ -29,7 +29,6 @@ #define CG_MEMORY "memory" #define CG_NET_CLS "net_cls" #define CG_VE "ve" -#define CG_UB "beancounter" #define CG_BLKIO "blkio" #define CG_FREEZER "freezer" #define CG_PERF_EVENT "perf_event" diff --git a/lib/env_nsops.c b/lib/env_nsops.c index a30eb6a..13e65b0 100644 --- a/lib/env_nsops.c +++ b/lib/env_nsops.c @@ -566,24 +566,12 @@ static int init_env_cgroup(struct vzctl_env_handle *h, int flags) "cpuset.cpus", "cpuset.mems" }; - char *bc[] = { - "beancounter.memory", - "beancounter.blkio" - }; logger(10, 0, "* init Container cgroup"); if (h->veid && cg_set_veid(EID(h), h->veid) == -1) return vzctl_err(VZCTL_E_RESOURCE, 0, "Failed to set VEID=%u", h->veid); - /* Bind beancounter with blkio/memory cgroups */ - for (i = 0; i < sizeof(bc)/sizeof(bc[0]); i++) { - snprintf(buf, sizeof(buf), "/%s/%s", cg_get_slice_name(), EID(h)); - ret = cg_set_param(EID(h), CG_UB, bc[i], buf); - if (ret) - return ret; - } - ret = cg_env_set_memory(h->ctid, "memory.use_hierarchy", 1); if (ret) return ret; @@ -1674,14 +1662,7 @@ static int ns_set_iolimit(struct vzctl_env_handle *h, unsigned int speed) static int ns_get_iolimit(struct vzctl_env_handle *h, unsigned int *speed) { - int ret; - unsigned long n; - - ret = cg_get_ul(EID(h), CG_UB, CG_UB".iolimit.speed", &n); - - *speed = (unsigned int) n; - - return ret; + return 0; } static int ns_set_ioprio(struct vzctl_env_handle *h, int prio) @@ -1712,13 +1693,7 @@ static int ns_set_iopslimit(struct vzctl_env_handle *h, unsigned int speed) static int ns_get_iopslimit(struct vzctl_env_handle *h, unsigned int *speed) { - int ret; - unsigned long n; - - ret = cg_get_ul(EID(h), CG_UB, CG_UB".iopslimit.speed", &n); - *speed = (unsigned int) n; - - return ret; + return 0; } static int ns_get_runtime_param(struct vzctl_env_handle *h, int flags) @@ -1761,11 +1736,6 @@ int vzctl2_set_limits(struct vzctl_env_handle *h, int release) { int ret; - if (release) { - cg_set_ul("", CG_UB, "tasks", getpid()); - return cg_destroy_cgroup(EID(h)); - } - if (EMPTY_CTID(h->ctid)) vzctl2_generate_ctid(EID(h)); @@ -1786,10 +1756,6 @@ int vzctl2_set_limits(struct vzctl_env_handle *h, int release) if (ret)
[Devel] [PATCH] criu: return original error code from bc_failcnt_check() if failed
Currently this helper doesn't see "/proc/bc/%s/resources" (another mount ns?). But this leads to a situation, when error is swallowed by the helper. I.e. it returns 0 even if there was an error during restore. So, let's return original error code instead. https://jira.sw.ru/browse/PSBM-80056 Signed-off-by: Stanislav Kinsburskiy --- criu/cr-restore.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index bd36523..6337c0f 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2034,7 +2034,7 @@ static int bc_failcnt_check(int ret) snprintf(buf, sizeof(buf), "/proc/bc/%s/resources", veid); f = fopen(buf, "r"); if (!f) { - return 0; + return ret; } while (fgets(buf, sizeof(buf), f)) { ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] spfs: start SPFS manager in containers network namespace
This is needed because in case of killing of a container with SPFS manager inside (due to any restore error) all the network namespaces of container processes will be marked to drop SUNRPC packets (libvzctl does it on fast stop). This in turn happens, because we want to be able to kill container with blocked network and NFS mount inside. Thus all the processes, belonging to VE cgroup have to have containers network namespaces otherwise SUNRPC trafic is dropped in init network namespace, like it happens now. https://jira.sw.ru/browse/PSBM-79733 Signed-off-by: Stanislav Kinsburskiy --- criu/spfs.c | 21 - 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/criu/spfs.c b/criu/spfs.c index 6ce2ac8..d46fe11 100644 --- a/criu/spfs.c +++ b/criu/spfs.c @@ -21,6 +21,7 @@ #include "spfs.h" #include "proc_parse.h" #include "cgroup.h" +#include "net.h" #define SPFS_MANAGER_WORK_DIR "/run/spfs-manager/%d" #define VE_SPFS_MANAGER_WORK_DIR "/vz/private/%s/dump/spfs-manager/%d" @@ -121,7 +122,7 @@ static char *spfs_manager_log_dir(void) return work_dir; } -static int start_spfs_manager(void) +static int __start_spfs_manager(void) { char *spfs_manager = "spfs-manager"; char *socket_path = spfs_manager_socket_path(); @@ -159,6 +160,24 @@ static int start_spfs_manager(void) return sock; } +static int start_spfs_manager(void) +{ + int old_net_ns, sock; + + if (switch_ns(root_item->pid->real, &net_ns_desc, &old_net_ns)) { + pr_err("failed to switch to containers network namespace\n"); + return -1; + } + + sock = __start_spfs_manager(); + + if (restore_ns(old_net_ns, &net_ns_desc)) { + pr_err("failed to restore original usernsd network namespace\n"); + return -1; + } + return sock; +} + static int get_spfs_mngr_sock(void *start, int fd, pid_t pid) { int sock; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH v2 6/7] spfs: return duplicated socket from usernsd
Usernsd closes socket when sent. https://jira.sw.ru/browse/PSBM-79462 v2: duplicate only service descriptor Signed-off-by: Stanislav Kinsburskiy --- criu/spfs.c |7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/criu/spfs.c b/criu/spfs.c index 7ce30e7..6ce2ac8 100644 --- a/criu/spfs.c +++ b/criu/spfs.c @@ -164,8 +164,13 @@ static int get_spfs_mngr_sock(void *start, int fd, pid_t pid) int sock; sock = get_service_fd(SPFS_MNGR_SK); - if (sock < 0 && start) + if (sock != -1) { + sock = dup(sock); + if (sock < 0) + pr_perror("failed to duplicate SPFS manager socket\n"); + } else if (start) sock = start_spfs_manager(); + return sock; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH v2 7/7] spfs: switch mounts mode to STUB after root yard depopulation
Otherwise CRIU stuck on SPFS mounts Signed-off-by: Stanislav Kinsburskiy --- criu/cr-restore.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index eb9f50d..bd36523 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2253,12 +2253,6 @@ static int restore_root_task(struct pstree_item *init) if (ret < 0) goto out_kill; - if (spfs_is_running) { - ret = spfs_set_mode(spfs_sock, SPFS_MODE_STUB); - if (ret < 0) - goto out_kill; - } - if (fault_injected(FI_POST_RESTORE)) goto out_kill; @@ -2279,6 +2273,12 @@ static int restore_root_task(struct pstree_item *init) close_safe(&mnt_ns_fd); + if (spfs_is_running) { + ret = spfs_set_mode(spfs_sock, SPFS_MODE_STUB); + if (ret < 0) + goto out_kill; + } + if (write_restored_pid()) goto out_kill; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH v2 4/7] spfs: improve SPFS manager start debug and error output
Signed-off-by: Stanislav Kinsburskiy --- criu/spfs.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/criu/spfs.c b/criu/spfs.c index 223bf93..89fb9f9 100644 --- a/criu/spfs.c +++ b/criu/spfs.c @@ -127,6 +127,8 @@ static int start_spfs_manager(void) char *socket_path = spfs_manager_socket_path(); int err = -ENOMEM, sock; + pr_info("Starting SPFS manager\n"); + err = cr_system(-1, -1, -1, spfs_manager, (char *[]){ "spfs-manager", "-", "-d", @@ -135,13 +137,17 @@ static int start_spfs_manager(void) "--log-dir", spfs_manager_log_dir(), "--exit-with-spfs", NULL }, 0); - pr_info("%s: spfs manager start result: %d\n", __func__, err); - if (err) + if (err) { + pr_err("failed to start SPFS manager binary: %d\n", err); return err; + } sock = sock_seqpacket_connect(socket_path); - if (sock < 0) + if (sock < 0) { + pr_err("failed to connect to SPFS manager via %s: %d\n", + socket_path, err); return sock; + } err = install_service_fd(SPFS_MNGR_SK, sock); if (err < 0) { ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH v2 2/7] spfs: improve error and debug output for spfs_mount()
Use request_spfs_mngr_sock() for both start and socket request. Signed-off-by: Stanislav Kinsburskiy --- criu/spfs.c | 15 ++- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/criu/spfs.c b/criu/spfs.c index 0d44b72..1ead01b 100644 --- a/criu/spfs.c +++ b/criu/spfs.c @@ -292,18 +292,23 @@ int spfs_mount(struct mount_info *mi, const char *source, sock = start_spfs_mngr(); if (sock < 0) { - pr_err("failed to mount NFS to path %s\n", mi->mountpoint); - return sock; + pr_err("failed to connect to SPFS manager: %d\n", sock); + ret = sock; + goto err; } - ret = spfs_request_mount(sock, mi, source, filesystemtype, mountflags); close(sock); if (ret) { - pr_err("mount of %s (%s) failed: %d\n", source, filesystemtype, ret); - return ret; + pr_err("mount request for %s (%s) failed: %d\n", + source, filesystemtype, ret); + goto err; } return 0; + +err: + pr_err("failed to mount NFS to path %s\n", mi->mountpoint); + return ret; } int spfs_set_env(void) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH v2 5/7] spfs: improve prints in spfs_set_mode() and spfs_release_replace()
To make them more grepable. Signed-off-by: Stanislav Kinsburskiy --- criu/spfs.c |8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/criu/spfs.c b/criu/spfs.c index 89fb9f9..7ce30e7 100644 --- a/criu/spfs.c +++ b/criu/spfs.c @@ -379,9 +379,9 @@ int spfs_set_mode(int sock, const char *mode) ret = spfs_send_request(sock, req, strlen(req) + 1); if (ret) - pr_err("set mode to %s request failed: %d\n", mode, ret); + pr_err("set SPFS mode to %s request failed: %d\n", mode, ret); else - pr_debug("Set mode to %s request succeeded\n", mode); + pr_debug("Set SPFS mode to %s request succeeded\n", mode); free(req); return 0; @@ -394,9 +394,9 @@ int spfs_release_replace(int sock) ret = spfs_send_request(sock, req, strlen(req) + 1); if (ret) - pr_err("release replace request failed: %d\n", ret); + pr_err("SPFS release replace request failed: %d\n", ret); else - pr_debug("Release replace request succeeded\n"); + pr_debug("SPFS \"release replace\" request succeeded\n"); return 0; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH v2 1/7] spfs: introduce request_spfs_mngr_sock)() helper
This generic helper will be responsible for both SPFS manager start via usernsd and socket request. v2: Do not use variable, but explicitly provide fake void pointer ad boolean value to request_spfs_mngr_sock() Signed-off-by: Stanislav Kinsburskiy --- criu/spfs.c | 43 --- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/criu/spfs.c b/criu/spfs.c index a5f6031..0d44b72 100644 --- a/criu/spfs.c +++ b/criu/spfs.c @@ -153,16 +153,36 @@ static int start_spfs_manager(void) return sock; } -static int get_spfs_mngr_sock(void *arg, int fd, pid_t pid) +static int get_spfs_mngr_sock(void *start, int fd, pid_t pid) { int sock; sock = get_service_fd(SPFS_MNGR_SK); - if (sock < 0) + if (sock < 0 && start) sock = start_spfs_manager(); return sock; } +static int request_spfs_mngr_sock(bool *start_mngr) +{ + int ns_fd; + int sock; + + ns_fd = open_proc(PROC_SELF, "ns"); + if (ns_fd < 0) + return ns_fd; + + sock = userns_call(get_spfs_mngr_sock, UNS_FDOUT, start_mngr, 0, ns_fd); + + close(ns_fd); + return sock; +} + +static int start_spfs_mngr(void) +{ + return request_spfs_mngr_sock((void *)1); +} + static int spfs_request_mount(int sock, struct mount_info *mi, const char *source, const char *type, unsigned long mountflags) { @@ -268,15 +288,9 @@ int spfs_mount(struct mount_info *mi, const char *source, const char *filesystemtype, unsigned long mountflags) { int ret; - int ns_fd; int sock; - ns_fd = open_proc(PROC_SELF, "ns"); - if (ns_fd < 0) - return ns_fd; - - sock = userns_call(get_spfs_mngr_sock, UNS_FDOUT, NULL, 0, ns_fd); - close(ns_fd); + sock = start_spfs_mngr(); if (sock < 0) { pr_err("failed to mount NFS to path %s\n", mi->mountpoint); return sock; @@ -345,16 +359,7 @@ int spfs_mngr_status(bool *active) int spfs_mngr_sock(void) { - int ns_fd, fd; - - ns_fd = open_proc(PROC_SELF, "ns"); - if (ns_fd < 0) - return ns_fd; - - fd = userns_call(spfs_service_fd, UNS_FDOUT, NULL, 0, ns_fd); - - close(ns_fd); - return fd; + return request_spfs_mngr_sock((void *)0); } int spfs_set_mode(int sock, const char *mode) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH v2 0/7] spfs: duplicate socket before sending it from usernsd
Usernds closes socket once it was sent. So, id should be duplicated before sending, if the socket is expected to be send multiple times. https://jira.sw.ru/browse/PSBM-79462 The following series implements... --- Stanislav Kinsburskiy (7): spfs: introduce request_spfs_mngr_sock)() helper spfs: improve error and debug output for spfs_mount() spfs: remove redundant spfs_service_fd() helper spfs: improve SPFS manager start debug and error output spfs: improve prints in spfs_set_mode() and spfs_release_replace() spfs: return duplicated socket from usernsd spfs: switch mounts mode to STUB after root yard depopulation criu/cr-restore.c | 12 +++ criu/spfs.c | 92 +++-- 2 files changed, 59 insertions(+), 45 deletions(-) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH v2 3/7] spfs: remove redundant spfs_service_fd() helper
Signed-off-by: Stanislav Kinsburskiy --- criu/spfs.c |9 + 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/criu/spfs.c b/criu/spfs.c index 1ead01b..223bf93 100644 --- a/criu/spfs.c +++ b/criu/spfs.c @@ -336,16 +336,9 @@ int spfs_set_env(void) return 0; } -static int spfs_service_fd(void *arg, int fd, pid_t pid) -{ - return get_service_fd(SPFS_MNGR_SK); -} - static int spfs_service_is_active(void *arg, int fd, pid_t pid) { - if (spfs_service_fd(arg, fd, pid) < 0) - return 0; - return 1; + return get_service_fd(SPFS_MNGR_SK) < 0 ? 0 : 1; } int spfs_mngr_status(bool *active) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH 7/7] spfs: switch mountsmode to STUB after root yard depopulation
Otherwise CRIU stuck on SPFS mounts Signed-off-by: Stanislav Kinsburskiy --- criu/cr-restore.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index eb9f50d..bd36523 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2253,12 +2253,6 @@ static int restore_root_task(struct pstree_item *init) if (ret < 0) goto out_kill; - if (spfs_is_running) { - ret = spfs_set_mode(spfs_sock, SPFS_MODE_STUB); - if (ret < 0) - goto out_kill; - } - if (fault_injected(FI_POST_RESTORE)) goto out_kill; @@ -2279,6 +2273,12 @@ static int restore_root_task(struct pstree_item *init) close_safe(&mnt_ns_fd); + if (spfs_is_running) { + ret = spfs_set_mode(spfs_sock, SPFS_MODE_STUB); + if (ret < 0) + goto out_kill; + } + if (write_restored_pid()) goto out_kill; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH 6/7] spfs: return duplicated socket from usernsd
Usernsd closes socket when sent. https://jira.sw.ru/browse/PSBM-79462 Signed-off-by: Stanislav Kinsburskiy --- criu/spfs.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/criu/spfs.c b/criu/spfs.c index da19179..857a167 100644 --- a/criu/spfs.c +++ b/criu/spfs.c @@ -162,11 +162,19 @@ static int start_spfs_manager(void) static int get_spfs_mngr_sock(void *start, int fd, pid_t pid) { int sock; + int sfd; sock = get_service_fd(SPFS_MNGR_SK); if (sock < 0 && start) sock = start_spfs_manager(); - return sock; + if (sock < 0) + return sock; + + sfd = dup(sock); + if (sfd < 0) + pr_perror("failed to duplicate socket %d", sock); + + return sfd; } static int request_spfs_mngr_sock(bool *start_mngr) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH 3/7] spfs: remove redundant spfs_service_fd() helper
Signed-off-by: Stanislav Kinsburskiy --- criu/spfs.c |9 + 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/criu/spfs.c b/criu/spfs.c index c202b14..be3bca4 100644 --- a/criu/spfs.c +++ b/criu/spfs.c @@ -338,16 +338,9 @@ int spfs_set_env(void) return 0; } -static int spfs_service_fd(void *arg, int fd, pid_t pid) -{ - return get_service_fd(SPFS_MNGR_SK); -} - static int spfs_service_is_active(void *arg, int fd, pid_t pid) { - if (spfs_service_fd(arg, fd, pid) < 0) - return 0; - return 1; + return get_service_fd(SPFS_MNGR_SK) < 0 ? 0 : 1; } int spfs_mngr_status(bool *active) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH 1/7] spfs: introduce request_spfs_mngr_sock)() helper
This generic helper will be responsible for both SPFS manager start via usernsd and socket request. Signed-off-by: Stanislav Kinsburskiy --- criu/spfs.c | 45 ++--- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/criu/spfs.c b/criu/spfs.c index a5f6031..fff7b9f 100644 --- a/criu/spfs.c +++ b/criu/spfs.c @@ -153,16 +153,38 @@ static int start_spfs_manager(void) return sock; } -static int get_spfs_mngr_sock(void *arg, int fd, pid_t pid) +static int get_spfs_mngr_sock(void *start, int fd, pid_t pid) { int sock; sock = get_service_fd(SPFS_MNGR_SK); - if (sock < 0) + if (sock < 0 && start) sock = start_spfs_manager(); return sock; } +static int request_spfs_mngr_sock(bool *start_mngr) +{ + int ns_fd; + int sock; + + ns_fd = open_proc(PROC_SELF, "ns"); + if (ns_fd < 0) + return ns_fd; + + sock = userns_call(get_spfs_mngr_sock, UNS_FDOUT, start_mngr, 0, ns_fd); + + close(ns_fd); + return sock; +} + +static int start_spfs_mngr(void) +{ + bool start; + + return request_spfs_mngr_sock(&start); +} + static int spfs_request_mount(int sock, struct mount_info *mi, const char *source, const char *type, unsigned long mountflags) { @@ -268,15 +290,9 @@ int spfs_mount(struct mount_info *mi, const char *source, const char *filesystemtype, unsigned long mountflags) { int ret; - int ns_fd; int sock; - ns_fd = open_proc(PROC_SELF, "ns"); - if (ns_fd < 0) - return ns_fd; - - sock = userns_call(get_spfs_mngr_sock, UNS_FDOUT, NULL, 0, ns_fd); - close(ns_fd); + sock = start_spfs_mngr(); if (sock < 0) { pr_err("failed to mount NFS to path %s\n", mi->mountpoint); return sock; @@ -345,16 +361,7 @@ int spfs_mngr_status(bool *active) int spfs_mngr_sock(void) { - int ns_fd, fd; - - ns_fd = open_proc(PROC_SELF, "ns"); - if (ns_fd < 0) - return ns_fd; - - fd = userns_call(spfs_service_fd, UNS_FDOUT, NULL, 0, ns_fd); - - close(ns_fd); - return fd; + return request_spfs_mngr_sock(NULL); } int spfs_set_mode(int sock, const char *mode) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH 2/7] spfs: improve error and debug output for spfs_mount()
Use request_spfs_mngr_sock() for both start and socket request. Signed-off-by: Stanislav Kinsburskiy --- criu/spfs.c | 15 ++- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/criu/spfs.c b/criu/spfs.c index fff7b9f..c202b14 100644 --- a/criu/spfs.c +++ b/criu/spfs.c @@ -294,18 +294,23 @@ int spfs_mount(struct mount_info *mi, const char *source, sock = start_spfs_mngr(); if (sock < 0) { - pr_err("failed to mount NFS to path %s\n", mi->mountpoint); - return sock; + pr_err("failed to connect to SPFS manager: %d\n", sock); + ret = sock; + goto err; } - ret = spfs_request_mount(sock, mi, source, filesystemtype, mountflags); close(sock); if (ret) { - pr_err("mount of %s (%s) failed: %d\n", source, filesystemtype, ret); - return ret; + pr_err("mount request for %s (%s) failed: %d\n", + source, filesystemtype, ret); + goto err; } return 0; + +err: + pr_err("failed to mount NFS to path %s\n", mi->mountpoint); + return ret; } int spfs_set_env(void) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH 4/7] spfs: improve SPFS manager start debug and error output
Signed-off-by: Stanislav Kinsburskiy --- criu/spfs.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/criu/spfs.c b/criu/spfs.c index be3bca4..d3391e2 100644 --- a/criu/spfs.c +++ b/criu/spfs.c @@ -127,6 +127,8 @@ static int start_spfs_manager(void) char *socket_path = spfs_manager_socket_path(); int err = -ENOMEM, sock; + pr_info("Starting SPFS manager\n"); + err = cr_system(-1, -1, -1, spfs_manager, (char *[]){ "spfs-manager", "-", "-d", @@ -135,13 +137,17 @@ static int start_spfs_manager(void) "--log-dir", spfs_manager_log_dir(), "--exit-with-spfs", NULL }, 0); - pr_info("%s: spfs manager start result: %d\n", __func__, err); - if (err) + if (err) { + pr_err("failed to start SPFS manager binary: %d\n", err); return err; + } sock = sock_seqpacket_connect(socket_path); - if (sock < 0) + if (sock < 0) { + pr_err("failed to connect to SPFS manager via %s: %d\n", + socket_path, err); return sock; + } err = install_service_fd(SPFS_MNGR_SK, sock); if (err < 0) { ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH 5/7] spfs: improve prints in spfs_set_mode() and spfs_release_replace()
To make them more grepable. Signed-off-by: Stanislav Kinsburskiy --- criu/spfs.c |8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/criu/spfs.c b/criu/spfs.c index d3391e2..da19179 100644 --- a/criu/spfs.c +++ b/criu/spfs.c @@ -381,9 +381,9 @@ int spfs_set_mode(int sock, const char *mode) ret = spfs_send_request(sock, req, strlen(req) + 1); if (ret) - pr_err("set mode to %s request failed: %d\n", mode, ret); + pr_err("set SPFS mode to %s request failed: %d\n", mode, ret); else - pr_debug("Set mode to %s request succeeded\n", mode); + pr_debug("Set SPFS mode to %s request succeeded\n", mode); free(req); return 0; @@ -396,9 +396,9 @@ int spfs_release_replace(int sock) ret = spfs_send_request(sock, req, strlen(req) + 1); if (ret) - pr_err("release replace request failed: %d\n", ret); + pr_err("SPFS release replace request failed: %d\n", ret); else - pr_debug("Release replace request succeeded\n"); + pr_debug("SPFS \"release replace\" request succeeded\n"); return 0; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH 0/7] spfs: duplicate socket before sending it from usernsd
Usernds closes socket once it's sent. So, id should be duplicated before sending, if the socket is expected to be send multiple times. https://jira.sw.ru/browse/PSBM-79462 The following series implements... --- Stanislav Kinsburskiy (7): spfs: introduce request_spfs_mngr_sock)() helper spfs: improve error and debug output for spfs_mount() spfs: remove redundant spfs_service_fd() helper spfs: improve SPFS manager start debug and error output spfs: improve prints in spfs_set_mode() and spfs_release_replace() spfs: return duplicated socket from usernsd spfs: switch mountsmode to STUB after root yard depopulation criu/cr-restore.c | 12 +++ criu/spfs.c | 97 - 2 files changed, 64 insertions(+), 45 deletions(-) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH v2] cgroup/cpuset: emulate cgroup in container
Any changes to this cgroup are skipped in container, but success code is returned. The idea is to fool Docker/Kubernetes. https://jira.sw.ru/browse/PSBM-58423 This patch obsoletes "ve/proc/cpuset: do not show cpuset in CT" v2: Do not attach tasks in cpuset_change_cpumask on cpuset set change, it requested from non-super VE. This is a second part of the logic. The first was to not change cpuset for newly added task. This one - to not set new cpuset for all the tasks in cgroup. Signed-off-by: Stanislav Kinsburskiy --- kernel/cpuset.c | 12 1 file changed, 12 insertions(+) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 26d88eb..43b1410 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -848,6 +848,9 @@ static int cpuset_test_cpumask(struct task_struct *tsk, static void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) { + if (!ve_is_super(get_exec_env())) + return; + set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed)); } @@ -1441,6 +1444,9 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) struct task_struct *task; int ret; + if (!ve_is_super(get_exec_env())) + return 0; + mutex_lock(&cpuset_mutex); ret = -ENOSPC; @@ -1470,6 +1476,9 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) static void cpuset_cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { + if (!ve_is_super(get_exec_env())) + return; + mutex_lock(&cpuset_mutex); cgroup_cs(cgrp)->attach_in_progress--; mutex_unlock(&cpuset_mutex); @@ -1494,6 +1503,9 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *oldcs = cgroup_cs(oldcgrp); + if (!ve_is_super(get_exec_env())) + return; + mutex_lock(&cpuset_mutex); /* prepare for attach */ ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH] cgroup/cpuset: emulate cgroup in container
Nice catch, thanks! I'll first try to tweak this gently, so it will look like cpuset cgroup works, but it won't. 13.12.2017 16:18, Pavel Tikhomirov пишет: > So change to cpuset.cpus is actually applied to the task, that's what I mean. > Correct me if I'm wrong. ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH] cgroup/cpuset: emulate cgroup in container
Hi Pavel, please, see my comments/question below 13.12.2017 13:43, Pavel Tikhomirov пишет: > Personally I don't like these as we still have no unswer to "If cpusets are > optional for docker, why k8s can't work without them?" it seem there is not > enough explanation in VZAP-31. > Well... I have to admit that I don't like it either. I (frankly) don't like the whole idea of putting kuber into container. And the reason is so simple: CT is a cheating technique. Those, who like "Russian dolls" should use VMs with nested virtualization. But the truth is that we do care about our customers (not sure why, since they pay us less and less). And because of this we've put so much various sh*t into our kernel already, that I got tired to trow it away on each major rebase a long time ago. But this all is lyrics. There is a task - and that's the fix. > We also need to revert the patch below to show cpuset in CT: > commit 5160bd34c9bd ("ve/proc/cpuset: do not show cpuset in CT") > Sure! It's mentioned in the patch description. > It seem I can still attach a process to a nested cgroup in CT with these > patch: > > CT-6ecd9be1 /# cat /proc/cgroups | grep cpuset > cpuset 16 1 1 > CT-6ecd9be1 /# ls /sys/fs/cgroup/cpuset/cpuset.cpus > /sys/fs/cgroup/cpuset/cpuset.cpus > CT-6ecd9be1 /# mkdir /sys/fs/cgroup/cpuset/test > CT-6ecd9be1 /# sleep 1000 & > [1] 678 > CT-6ecd9be1 /# echo 678 > /sys/fs/cgroup/cpuset/test/tasks > CT-6ecd9be1 /# cat /sys/fs/cgroup/cpuset/test/tasks > 678 > Isn't it wonderful? :) Poor me, I have to admit, that I didn't know, that this task will be even visible in the nest cgroup... :( I thought, that emulation would be less effective. Nevertheless, if you have some better way to solve the issue, please, share. > On 12/13/2017 01:37 PM, Stanislav Kinsburskiy wrote: >> Any changes to this cgroup are skipped in container, but success code is >> returned. >> The idea is to fool Docker/Kubernetes. >> >> https://jira.sw.ru/browse/PSBM-58423 >> >> This patch obsoletes "ve/proc/cpuset: do not show cpuset in CT" >> >> Signed-off-by: Stanislav Kinsburskiy >> --- >> kernel/cpuset.c | 9 + >> 1 file changed, 9 insertions(+) >> >> diff --git a/kernel/cpuset.c b/kernel/cpuset.c >> index 26d88eb..dfac505 100644 >> --- a/kernel/cpuset.c >> +++ b/kernel/cpuset.c >> @@ -1441,6 +1441,9 @@ static int cpuset_can_attach(struct cgroup *cgrp, >> struct cgroup_taskset *tset) >> struct task_struct *task; >> int ret; >> + if (!ve_is_super(get_exec_env())) >> + return 0; >> + >> mutex_lock(&cpuset_mutex); >> ret = -ENOSPC; >> @@ -1470,6 +1473,9 @@ static int cpuset_can_attach(struct cgroup *cgrp, >> struct cgroup_taskset *tset) >> static void cpuset_cancel_attach(struct cgroup *cgrp, >> struct cgroup_taskset *tset) >> { >> + if (!ve_is_super(get_exec_env())) >> + return; >> + >> mutex_lock(&cpuset_mutex); >> cgroup_cs(cgrp)->attach_in_progress--; >> mutex_unlock(&cpuset_mutex); >> @@ -1494,6 +1500,9 @@ static void cpuset_attach(struct cgroup *cgrp, struct >> cgroup_taskset *tset) >> struct cpuset *cs = cgroup_cs(cgrp); >> struct cpuset *oldcs = cgroup_cs(oldcgrp); >> + if (!ve_is_super(get_exec_env())) >> + return; >> + >> mutex_lock(&cpuset_mutex); >> /* prepare for attach */ >> > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] cgroup/cpuset: emulate cgroup in container
Any changes to this cgroup are skipped in container, but success code is returned. The idea is to fool Docker/Kubernetes. https://jira.sw.ru/browse/PSBM-58423 This patch obsoletes "ve/proc/cpuset: do not show cpuset in CT" Signed-off-by: Stanislav Kinsburskiy --- kernel/cpuset.c |9 + 1 file changed, 9 insertions(+) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 26d88eb..dfac505 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1441,6 +1441,9 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) struct task_struct *task; int ret; + if (!ve_is_super(get_exec_env())) + return 0; + mutex_lock(&cpuset_mutex); ret = -ENOSPC; @@ -1470,6 +1473,9 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) static void cpuset_cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { + if (!ve_is_super(get_exec_env())) + return; + mutex_lock(&cpuset_mutex); cgroup_cs(cgrp)->attach_in_progress--; mutex_unlock(&cpuset_mutex); @@ -1494,6 +1500,9 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *oldcs = cgroup_cs(oldcgrp); + if (!ve_is_super(get_exec_env())) + return; + mutex_lock(&cpuset_mutex); /* prepare for attach */ ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] ve: fix container stopped state check
Checking for empty cgroup is not correct, because init process leaves cgroup early in do_exit. This leads to a situation, when container is treated as stopped but its resources (VEIP for instance) are not yet released. Which in turn leads to container restart failure due to non-releases VEIP address. https://jira.sw.ru/browse/PSBM-78078 Signed-off-by: Stanislav Kinsburskiy --- kernel/ve/ve.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index b0188c3..c628516 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -846,7 +846,7 @@ static int ve_state_read(struct cgroup *cg, struct cftype *cft, if (ve->is_running) seq_puts(m, "RUNNING"); - else if (!nr_threads_ve(ve)) + else if (!ve->init_task) seq_puts(m, "STOPPED"); else if (ve->ve_ns) seq_puts(m, "STOPPING"); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] venet: remove obsolete tgt_veip variable
This was used for "vzredir" feature in vz6 and not used anymore. Signed-off-by: Stanislav Kinsburskiy --- drivers/net/venetdev.c | 12 +++- include/linux/venet.h |1 - 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/net/venetdev.c b/drivers/net/venetdev.c index 11f4a66..dffebdc 100644 --- a/drivers/net/venetdev.c +++ b/drivers/net/venetdev.c @@ -256,8 +256,7 @@ static void __veip_stop(struct ve_struct *ve) ptr = list_entry(p, struct ip_entry_struct, ve_list); ptr->active_env = NULL; - if (ptr->tgt_veip == NULL) - ip_entry_unhash(ptr); + ip_entry_unhash(ptr); } veip_pool_ops->veip_release(ve); @@ -277,8 +276,6 @@ static int veip_entry_conflict(struct ip_entry_struct *entry, struct ve_struct * { if (entry->active_env != NULL) return -EADDRINUSE; - if (entry->tgt_veip && entry->tgt_veip->veid != ve->veid) - return -EADDRNOTAVAIL; entry->active_env = ve; return 0; @@ -340,8 +337,7 @@ static int veip_entry_del(struct ve_struct *ve, struct ve_addr_struct *addr) err = 0; found->active_env = NULL; - if (found->tgt_veip == NULL) - ip_entry_unhash(found); + ip_entry_unhash(found); out: spin_unlock(&veip_lock); return err; @@ -891,7 +887,6 @@ static int veip_seq_show(struct seq_file *m, void *v) { struct hlist_node *p; struct ip_entry_struct *entry; - struct veip_struct *veip; char s[40]; if (v == SEQ_START_TOKEN) { @@ -902,8 +897,7 @@ static int veip_seq_show(struct seq_file *m, void *v) p = (struct hlist_node *)v; entry = hlist_entry(p, struct ip_entry_struct, ip_hash); veaddr_print(s, sizeof(s), &entry->addr); - veip = ACCESS_ONCE(entry->tgt_veip); - seq_printf(m, "%39s %10u\n", s, veip == NULL ? 0 : veip->veid); + seq_printf(m, "%39s 0\n", s); return 0; } diff --git a/include/linux/venet.h b/include/linux/venet.h index 7562996..08d89bc 100644 --- a/include/linux/venet.h +++ b/include/linux/venet.h @@ -28,7 +28,6 @@ struct ip_entry_struct { struct ve_addr_struct addr; struct ve_struct*active_env; - struct veip_struct *tgt_veip; struct hlist_node ip_hash; union { struct list_headve_list; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH] venet: VEIP release debug patch
Sorry, ignore 27.11.2017 15:48, Stanislav Kinsburskiy пишет: > Needed to investigate VEIP release - CT stop race. > > https://jira.sw.ru/browse/PSBM-78078 > > Signed-off-by: Stanislav Kinsburskiy > --- > drivers/net/venetdev.c |6 +- > 1 file changed, 5 insertions(+), 1 deletion(-) > > diff --git a/drivers/net/venetdev.c b/drivers/net/venetdev.c > index 11f4a66..dcdb51d 100644 > --- a/drivers/net/venetdev.c > +++ b/drivers/net/venetdev.c > @@ -256,8 +256,12 @@ static void __veip_stop(struct ve_struct *ve) > ptr = list_entry(p, struct ip_entry_struct, ve_list); > ptr->active_env = NULL; > > - if (ptr->tgt_veip == NULL) > + if (ptr->tgt_veip == NULL) { > + printk("%s: removing IP for ve %d\n", __func__, > + ptr->tgt_veip->veid); > + dump_stack(); > ip_entry_unhash(ptr); > + } > } > > veip_pool_ops->veip_release(ve); > > ___ > Devel mailing list > Devel@openvz.org > https://lists.openvz.org/mailman/listinfo/devel > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] venet: VEIP release debug patch
Needed to investigate VEIP release - CT stop race. https://jira.sw.ru/browse/PSBM-78078 Signed-off-by: Stanislav Kinsburskiy --- drivers/net/venetdev.c |6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/venetdev.c b/drivers/net/venetdev.c index 11f4a66..dcdb51d 100644 --- a/drivers/net/venetdev.c +++ b/drivers/net/venetdev.c @@ -256,8 +256,12 @@ static void __veip_stop(struct ve_struct *ve) ptr = list_entry(p, struct ip_entry_struct, ve_list); ptr->active_env = NULL; - if (ptr->tgt_veip == NULL) + if (ptr->tgt_veip == NULL) { + printk("%s: removing IP for ve %d\n", __func__, + ptr->tgt_veip->veid); + dump_stack(); ip_entry_unhash(ptr); + } } veip_pool_ops->veip_release(ve); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] netfilter: kill the fake untracked conntrack objects
resurrect an old patch from Pablo Neira to remove the untracked objects. Currently, there are four possible states of an skb wrt. conntrack. 1. No conntrack attached, ct is NULL. 2. Normal (kmem cache allocated) ct attached. 3. a template (kmalloc'd), not in any hash tables at any point in time 4. the 'untracked' conntrack, a percpu nf_conn object, tagged via IPS_UNTRACKED_BIT in ct->status. Untracked is supposed to be identical to case 1. It exists only so users can check -m conntrack --ctstate UNTRACKED vs. -m conntrack --ctstate INVALID e.g. attempts to set connmark on INVALID or UNTRACKED conntracks is supposed to be a no-op. Thus currently we need to check ct == NULL || nf_ct_is_untracked(ct) in a lot of places in order to avoid altering untracked objects. The other consequence of the percpu untracked object is that all -j NOTRACK (and, later, kfree_skb of such skbs) result in an atomic op (inc/dec the untracked conntracks refcount). This adds a new kernel-private ctinfo state, IP_CT_UNTRACKED, to make the distinction instead. The (few) places that care about packet invalid (ct is NULL) vs. packet untracked now need to test ct == NULL vs. ctinfo == IP_CT_UNTRACKED, but all other places can omit the nf_ct_is_untracked() check. https://jira.sw.ru/browse/PSBM-71153 Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Stanislav Kinsburskiy --- include/net/ip_vs.h|5 +- include/net/netfilter/nf_conntrack.h | 10 - include/uapi/linux/netfilter/nf_conntrack_common.h |6 ++- net/ipv4/netfilter/nf_dup_ipv4.c |5 +- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |5 +- net/ipv6/netfilter/nf_dup_ipv6.c |5 +- net/netfilter/nf_conntrack_core.c | 42 ++-- net/netfilter/nf_nat_core.c|3 - net/netfilter/xt_CT.c | 21 +- net/netfilter/xt_conntrack.c | 13 +++--- net/netfilter/xt_state.c | 13 +++--- 11 files changed, 41 insertions(+), 87 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index e1599ad..69d2a9a 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1540,9 +1540,8 @@ static inline void ip_vs_notrack(struct sk_buff *skb) if (!ct || !nf_ct_is_untracked(ct)) { nf_conntrack_put(skb->nfct); - skb->nfct = &nf_ct_untracked_get()->ct_general; - skb->nfctinfo = IP_CT_NEW; - nf_conntrack_get(skb->nfct); + skb->nfct = NULL; + skb->nfctinfo = IP_CT_UNTRACKED; } #endif } diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index edb8911..b278973 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -232,14 +232,6 @@ extern s32 (*nf_ct_nat_offset)(const struct nf_conn *ct, enum ip_conntrack_dir dir, u32 seq); -/* Fake conntrack entry for untracked connections */ -DECLARE_PER_CPU(struct nf_conn, nf_conntrack_untracked); -static inline struct nf_conn *nf_ct_untracked_get(void) -{ - return &__raw_get_cpu_var(nf_conntrack_untracked); -} -void nf_ct_untracked_status_or(unsigned long bits); - /* Iterate over all conntracks: if iter returns true, it's deleted. */ void nf_ct_iterate_cleanup(struct net *net, int (*iter)(struct nf_conn *i, void *data), @@ -272,7 +264,7 @@ static inline int nf_ct_is_dying(struct nf_conn *ct) static inline int nf_ct_is_untracked(const struct nf_conn *ct) { - return test_bit(IPS_UNTRACKED_BIT, &ct->status); + return false; } /* Packet is received from loopback */ diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 6d074d1..6c18960 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -28,12 +28,14 @@ enum ip_conntrack_info { /* only for userspace compatibility */ #ifndef __KERNEL__ IP_CT_NEW_REPLY = IP_CT_NUMBER, +#else + IP_CT_UNTRACKED = 7, #endif }; #define NF_CT_STATE_INVALID_BIT(1 << 0) #define NF_CT_STATE_BIT(ctinfo)(1 << ((ctinfo) % IP_CT_IS_REPLY + 1)) -#define NF_CT_STATE_UNTRACKED_BIT (1 << (IP_CT_NUMBER + 1)) +#define NF_CT_STATE_UNTRACKED_BIT (1 << (IP_CT_UNTRACKED + 1)) /* Bitset representing status of connection. */ enum ip_conntrack_status { @@ -90,7 +92,7 @@ enum ip_conntrack_status { IPS_TEMPLATE_BIT = 11, IPS_TEMPLATE = (1 << IPS_TEMPLATE_BIT), - /* Conntrack is a fake untracke
[Devel] [PATCH v2] nfs: abort delegation in dying VE
Don't queue delegation request, if ve init is exiting. https://jira.sw.ru/browse/PSBM-77061 v2: check ve is dying under rcu_lock() Inspired-by: Kirill Tkhai Signed-off-by: Stanislav Kinsburskiy --- fs/nfs/delegation.c | 18 +- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 66af497..1d1500c 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -189,15 +189,31 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, nfs_inode_set_delegation(inode, cred, res); } +static bool ve_abort_delegation(struct inode *inode) +{ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct rpc_xprt *xprt; + bool abort; + + rcu_read_lock(); + xprt = rcu_dereference(clp->cl_rpcclient->cl_xprt); + abort = xprt->xprt_net->owner_ve->ve_netns == NULL; + rcu_read_unlock(); + + return abort; +} + static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) { int res = 0; - if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) + if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) && + !ve_abort_delegation(inode)) { res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync); + } nfs_free_delegation(delegation); return res; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] nfs: abort delegation in dying VE
Don't queue delegation request, if ve init is exiting. https://jira.sw.ru/browse/PSBM-77061 Inspired-by: Kirill Tkhai Signed-off-by: Stanislav Kinsburskiy --- fs/nfs/delegation.c | 16 +++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 66af497..2422754 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -189,15 +189,29 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, nfs_inode_set_delegation(inode, cred, res); } +static bool ve_abort_delegation(struct inode *inode) +{ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct rpc_xprt *xprt; + + rcu_read_lock(); + xprt = rcu_dereference(clp->cl_rpcclient->cl_xprt); + rcu_read_unlock(); + + return xprt->xprt_net->owner_ve->ve_netns == NULL; +} + static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) { int res = 0; - if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) + if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) && + !ve_abort_delegation(inode)) { res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync); + } nfs_free_delegation(delegation); return res; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH v6 0/2] nfs: fix race between callback shutdown and execution
The idea is to use mutex for protecting callback execution agains per-net callback shutdown and destroying all the net-related backchannel requests before transports destruction. --- Stanislav Kinsburskiy (2): sunrpc: bc_svc_flush_queue_net() helper introduced nfs: protect callback execution against per-net callback thread shutdown fs/nfs/callback.c | 20 include/linux/sunrpc/svc.h |3 +++ net/sunrpc/svc.c | 15 +++ 3 files changed, 38 insertions(+) -- ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH v6 2/2] nfs: protect callback execution against per-net callback thread shutdown
From: Stanislav Kinsburskiy Here is the race: CPU #0 CPU#1 cleanup_mnt nfs41_callback_svc (get xprt from the list) nfs_callback_down ... ... ... svc_close_net ... ... ... svc_xprt_free ... svc_bc_sock_freebc_svc_process kfree(xprt) svc_process_common rqstp->rq_xprt->xpt_ops (use after free) The problem is that per-net SUNRPC transports shutdown is done regardless current callback execution. This is a race leading to transport use-after-free in callback handler. This patch fixes it in stright-forward way. I.e. it protects callback execution with the same mutex used for per-net data creation and destruction. Hopefully, it won't slow down NFS client significantly. https://jira.sw.ru/browse/PSBM-75751 v6: destroy all per-net backchannel requests only for NFSv4.1 v5: destroy all per-net backchannel requests before transports on in nfs_callback_down_net v4: use another mutex to protect callback execution agains per-net transports shutdown. This guarantees, that transports won't be destroyed by shutdown callback while execution is in progress and vice versa. v3: Fix mutex deadlock, when shutdown callback waits for thread to exit (with mutex taken), while thread wait for the mutex to take. The idea is to simply check if thread has to exit, if mutex lock has failed. This is a busy loop, but it shouldn't happend often and for long. Signed-off-by: Stanislav Kinsburskiy --- fs/nfs/callback.c | 20 include/linux/sunrpc/svc.h |1 + 2 files changed, 21 insertions(+) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 0beb275..feffccf 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -99,6 +99,8 @@ nfs4_callback_up(struct svc_serv *serv) } #if defined(CONFIG_NFS_V4_1) +static DEFINE_MUTEX(nfs41_callback_mutex); + /* * The callback service for NFSv4.1 callbacks */ @@ -117,6 +119,12 @@ nfs41_callback_svc(void *vrqstp) if (try_to_freeze()) continue; + mutex_lock(&nfs41_callback_mutex); + if (kthread_should_stop()) { + mutex_unlock(&nfs41_callback_mutex); + return 0; + } + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); if (!list_empty(&serv->sv_cb_list)) { @@ -129,8 +137,10 @@ nfs41_callback_svc(void *vrqstp) error = bc_svc_process(serv, req, rqstp); dprintk("bc_svc_process() returned w/ error code= %d\n", error); + mutex_unlock(&nfs41_callback_mutex); } else { spin_unlock_bh(&serv->sv_cb_lock); + mutex_unlock(&nfs41_callback_mutex); schedule(); finish_wait(&serv->sv_cb_waitq, &wq); } @@ -139,6 +149,13 @@ nfs41_callback_svc(void *vrqstp) return 0; } +static void nfs41_callback_down_net(struct svc_serv *serv, struct net *net) +{ + mutex_lock(&nfs41_callback_mutex); + bc_svc_flush_queue_net(serv, net); + mutex_unlock(&nfs41_callback_mutex); +} + /* * Bring up the NFSv4.1 callback service */ @@ -150,6 +167,7 @@ nfs41_callback_up(struct svc_serv *serv) INIT_LIST_HEAD(&serv->sv_cb_list); spin_lock_init(&serv->sv_cb_lock); init_waitqueue_head(&serv->sv_cb_waitq); + serv->svc_cb_down_net = nfs41_callback_down_net; rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp)); return rqstp; @@ -242,6 +260,8 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc return; dprintk("NFS: destroy per-net callback data; net=%p\n", net); + if (serv->svc_cb_down_net) + serv->svc_cb_down_net(serv, net); svc_shutdown_net(serv, net); } diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index fe70ff0..c04ef80 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -108,6 +108,7 @@ struct svc_serv { wait_queue_head_t sv_cb_waitq;/* sleep here if there are no * entries in the svc_cb_list */ struct svc_xprt *sv_bc_xprt;/* callback on fore channel */ + void(*svc_cb_down_net)(struct svc_serv *serv, struct net *net); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ }; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH v6 1/2] sunrpc: bc_svc_flush_queue_net() helper introduced
From: Stanislav Kinsburskiy This helper can be used to remove backchannel requests from callback queue on per-net basis. Signed-off-by: Stanislav Kinsburskiy --- include/linux/sunrpc/svc.h |2 ++ net/sunrpc/svc.c | 15 +++ 2 files changed, 17 insertions(+) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 2b30868..fe70ff0 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -484,6 +484,8 @@ void svc_reserve(struct svc_rqst *rqstp, int space); struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu); char *svc_print_addr(struct svc_rqst *, char *, size_t); +void bc_svc_flush_queue_net(struct svc_serv *serv, struct net *net); + #defineRPC_MAX_ADDRBUFLEN (63U) /* diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index de8cded..2ca4ff7 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1338,6 +1338,21 @@ svc_process(struct svc_rqst *rqstp) EXPORT_SYMBOL_GPL(svc_process); #if defined(CONFIG_SUNRPC_BACKCHANNEL) +void bc_svc_flush_queue_net(struct svc_serv *serv, struct net *net) +{ + struct rpc_rqst *req, *tmp; + + spin_lock_bh(&serv->sv_cb_lock); + list_for_each_entry_safe(req, tmp, &serv->sv_cb_list, rq_bc_list) { + if (req->rq_xprt->xprt_net == net) { + list_del(&req->rq_bc_list); + xprt_free_bc_request(req); + } + } + spin_unlock_bh(&serv->sv_cb_lock); +} +EXPORT_SYMBOL_GPL(bc_svc_flush_queue_net); + /* * Process a backchannel RPC request that arrived over an existing * outbound connection ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH v5 2/2] nfs: protect callback execution against per-net callback thread shutdown
09.11.2017 10:33, Kirill Tkhai пишет: > > Looks good for me, except there is one more call of nfs_callback_down_net() > in nfs_callback_up(). > It's not clear for me, we shouldn't protect it via the mutex too. Should we? > Looks like no, mount is not ready yet and there is no connection to server, thus no request from server can come to the moment and we don't need any additional protection here. ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH v5 2/2] nfs: protect callback execution against per-net callback thread shutdown
From: Stanislav Kinsburskiy Here is the race: CPU #0 CPU#1 cleanup_mnt nfs41_callback_svc (get xprt from the list) nfs_callback_down ... ... ... svc_close_net ... ... ... svc_xprt_free ... svc_bc_sock_freebc_svc_process kfree(xprt) svc_process_common rqstp->rq_xprt->xpt_ops (use after free) The problem is that per-net SUNRPC transports shutdown is done regardless current callback execution. This is a race leading to transport use-after-free in callback handler. This patch fixes it in stright-forward way. I.e. it protects callback execution with the same mutex used for per-net data creation and destruction. Hopefully, it won't slow down NFS client significantly. https://jira.sw.ru/browse/PSBM-75751 v5: destroy all per-net backchannel requests before transports on in nfs_callback_down_net v4: use another mutex to protect callback execution agains per-net transports shutdown. This guarantees, that transports won't be destroyed by shutdown callback while execution is in progress and vice versa. v3: Fix mutex deadlock, when shutdown callback waits for thread to exit (with mutex taken), while thread wait for the mutex to take. The idea is to simply check if thread has to exit, if mutex lock has failed. This is a busy loop, but it shouldn't happend often and for long. Signed-off-by: Stanislav Kinsburskiy --- fs/nfs/callback.c | 17 + 1 file changed, 17 insertions(+) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 0beb275..e18d774 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -99,6 +99,8 @@ nfs4_callback_up(struct svc_serv *serv) } #if defined(CONFIG_NFS_V4_1) +static DEFINE_MUTEX(nfs41_callback_mutex); + /* * The callback service for NFSv4.1 callbacks */ @@ -117,6 +119,12 @@ nfs41_callback_svc(void *vrqstp) if (try_to_freeze()) continue; + mutex_lock(&nfs41_callback_mutex); + if (kthread_should_stop()) { + mutex_unlock(&nfs41_callback_mutex); + return 0; + } + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); if (!list_empty(&serv->sv_cb_list)) { @@ -129,8 +137,10 @@ nfs41_callback_svc(void *vrqstp) error = bc_svc_process(serv, req, rqstp); dprintk("bc_svc_process() returned w/ error code= %d\n", error); + mutex_unlock(&nfs41_callback_mutex); } else { spin_unlock_bh(&serv->sv_cb_lock); + mutex_unlock(&nfs41_callback_mutex); schedule(); finish_wait(&serv->sv_cb_waitq, &wq); } @@ -242,6 +252,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc return; dprintk("NFS: destroy per-net callback data; net=%p\n", net); + bc_svc_flush_queue_net(serv, net); svc_shutdown_net(serv, net); } @@ -377,7 +388,13 @@ void nfs_callback_down(int minorversion, struct net *net) struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; mutex_lock(&nfs_callback_mutex); +#if defined(CONFIG_NFS_V4_1) + mutex_lock(&nfs41_callback_mutex); + nfs_callback_down_net(minorversion, cb_info->serv, net); + mutex_unlock(&nfs41_callback_mutex); +#else nfs_callback_down_net(minorversion, cb_info->serv, net); +#endif cb_info->users--; if (cb_info->users == 0 && cb_info->task != NULL) { kthread_stop(cb_info->task); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH v5 0/2] nfs: fix race between callback shutdown and execution
The idea is to use mutex for protecting callback execution agains per-net callback shutdown and destroying all the net-related backchannel requests before transports destruction. --- Stanislav Kinsburskiy (2): sunrpc: bc_svc_flush_queue_net() helper introduced nfs: protect callback execution against per-net callback thread shutdown fs/nfs/callback.c | 17 + include/linux/sunrpc/svc.h |2 ++ net/sunrpc/svc.c | 15 +++ 3 files changed, 34 insertions(+) -- ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH v5 1/2] sunrpc: bc_svc_flush_queue_net() helper introduced
From: Stanislav Kinsburskiy This helper can be used to remove backchannel requests from callback queue on per-net basis. Signed-off-by: Stanislav Kinsburskiy --- include/linux/sunrpc/svc.h |2 ++ net/sunrpc/svc.c | 15 +++ 2 files changed, 17 insertions(+) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 2b30868..fe70ff0 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -484,6 +484,8 @@ void svc_reserve(struct svc_rqst *rqstp, int space); struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu); char *svc_print_addr(struct svc_rqst *, char *, size_t); +void bc_svc_flush_queue_net(struct svc_serv *serv, struct net *net); + #defineRPC_MAX_ADDRBUFLEN (63U) /* diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index de8cded..2ca4ff7 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1338,6 +1338,21 @@ svc_process(struct svc_rqst *rqstp) EXPORT_SYMBOL_GPL(svc_process); #if defined(CONFIG_SUNRPC_BACKCHANNEL) +void bc_svc_flush_queue_net(struct svc_serv *serv, struct net *net) +{ + struct rpc_rqst *req, *tmp; + + spin_lock_bh(&serv->sv_cb_lock); + list_for_each_entry_safe(req, tmp, &serv->sv_cb_list, rq_bc_list) { + if (req->rq_xprt->xprt_net == net) { + list_del(&req->rq_bc_list); + xprt_free_bc_request(req); + } + } + spin_unlock_bh(&serv->sv_cb_lock); +} +EXPORT_SYMBOL_GPL(bc_svc_flush_queue_net); + /* * Process a backchannel RPC request that arrived over an existing * outbound connection ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH] nfs: protect callback execution against per-net callback thread shutdown
Please, ignore 08.11.2017 15:07, Stanislav Kinsburskiy пишет: > From: Stanislav Kinsburskiy > > Here is the race: > > CPU #0CPU#1 > > cleanup_mnt nfs41_callback_svc (get xprt from the list) > nfs_callback_down ... > ... ... > svc_close_net ... > ... ... > svc_xprt_free ... > svc_bc_sock_free bc_svc_process > kfree(xprt) svc_process_common > rqstp->rq_xprt->xpt_ops (use after free) > > The problem is that per-net SUNRPC transports shutdown is done regardless > current callback execution. This is a race leading to transport use-after-free > in callback handler. > This patch fixes it in stright-forward way. I.e. it protects callback > execution with the same mutex used for per-net data creation and destruction. > Hopefully, it won't slow down NFS client significantly. > > https://jira.sw.ru/browse/PSBM-75751 > > v4: use another mutex to protect callback execution agains per-net transports > shutdown. > This guarantees, that transports won't be destroyed by shutdown callback while > execution is in progress and vice versa. > > v3: Fix mutex deadlock, when shutdown callback waits for thread to exit (with > mutex taken), while thread wait for the mutex to take. > The idea is to simply check if thread has to exit, if mutex lock has failed. > This is a busy loop, but it shouldn't happend often and for long. > > Signed-off-by: Stanislav Kinsburskiy > --- > fs/nfs/callback.c | 16 > 1 file changed, 16 insertions(+) > > diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c > index 0beb275..4d7979b 100644 > --- a/fs/nfs/callback.c > +++ b/fs/nfs/callback.c > @@ -99,6 +99,8 @@ nfs4_callback_up(struct svc_serv *serv) > } > > #if defined(CONFIG_NFS_V4_1) > +static DEFINE_MUTEX(nfs41_callback_mutex); > + > /* > * The callback service for NFSv4.1 callbacks > */ > @@ -117,6 +119,12 @@ nfs41_callback_svc(void *vrqstp) > if (try_to_freeze()) > continue; > > + mutex_lock(&nfs41_callback_mutex); > + if (kthread_should_stop()) { > + mutex_unlock(&nfs41_callback_mutex); > + return 0; > + } > + > prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); > spin_lock_bh(&serv->sv_cb_lock); > if (!list_empty(&serv->sv_cb_list)) { > @@ -129,8 +137,10 @@ nfs41_callback_svc(void *vrqstp) > error = bc_svc_process(serv, req, rqstp); > dprintk("bc_svc_process() returned w/ error code= %d\n", > error); > + mutex_unlock(&nfs41_callback_mutex); > } else { > spin_unlock_bh(&serv->sv_cb_lock); > + mutex_unlock(&nfs41_callback_mutex); > schedule(); > finish_wait(&serv->sv_cb_waitq, &wq); > } > @@ -377,7 +387,13 @@ void nfs_callback_down(int minorversion, struct net *net) > struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; > > mutex_lock(&nfs_callback_mutex); > +#if defined(CONFIG_NFS_V4_1) > + mutex_lock(&nfs41_callback_mutex); > + nfs_callback_down_net(minorversion, cb_info->serv, net); > + mutex_unlock(&nfs41_callback_mutex); > +#else > nfs_callback_down_net(minorversion, cb_info->serv, net); > +#endif > cb_info->users--; > if (cb_info->users == 0 && cb_info->task != NULL) { > kthread_stop(cb_info->task); > > ___ > Devel mailing list > Devel@openvz.org > https://lists.openvz.org/mailman/listinfo/devel > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] nfs: protect callback execution against per-net callback thread shutdown
From: Stanislav Kinsburskiy Here is the race: CPU #0 CPU#1 cleanup_mnt nfs41_callback_svc (get xprt from the list) nfs_callback_down ... ... ... svc_close_net ... ... ... svc_xprt_free ... svc_bc_sock_freebc_svc_process kfree(xprt) svc_process_common rqstp->rq_xprt->xpt_ops (use after free) The problem is that per-net SUNRPC transports shutdown is done regardless current callback execution. This is a race leading to transport use-after-free in callback handler. This patch fixes it in stright-forward way. I.e. it protects callback execution with the same mutex used for per-net data creation and destruction. Hopefully, it won't slow down NFS client significantly. https://jira.sw.ru/browse/PSBM-75751 v4: use another mutex to protect callback execution agains per-net transports shutdown. This guarantees, that transports won't be destroyed by shutdown callback while execution is in progress and vice versa. v3: Fix mutex deadlock, when shutdown callback waits for thread to exit (with mutex taken), while thread wait for the mutex to take. The idea is to simply check if thread has to exit, if mutex lock has failed. This is a busy loop, but it shouldn't happend often and for long. Signed-off-by: Stanislav Kinsburskiy --- fs/nfs/callback.c | 16 1 file changed, 16 insertions(+) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 0beb275..4d7979b 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -99,6 +99,8 @@ nfs4_callback_up(struct svc_serv *serv) } #if defined(CONFIG_NFS_V4_1) +static DEFINE_MUTEX(nfs41_callback_mutex); + /* * The callback service for NFSv4.1 callbacks */ @@ -117,6 +119,12 @@ nfs41_callback_svc(void *vrqstp) if (try_to_freeze()) continue; + mutex_lock(&nfs41_callback_mutex); + if (kthread_should_stop()) { + mutex_unlock(&nfs41_callback_mutex); + return 0; + } + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); if (!list_empty(&serv->sv_cb_list)) { @@ -129,8 +137,10 @@ nfs41_callback_svc(void *vrqstp) error = bc_svc_process(serv, req, rqstp); dprintk("bc_svc_process() returned w/ error code= %d\n", error); + mutex_unlock(&nfs41_callback_mutex); } else { spin_unlock_bh(&serv->sv_cb_lock); + mutex_unlock(&nfs41_callback_mutex); schedule(); finish_wait(&serv->sv_cb_waitq, &wq); } @@ -377,7 +387,13 @@ void nfs_callback_down(int minorversion, struct net *net) struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; mutex_lock(&nfs_callback_mutex); +#if defined(CONFIG_NFS_V4_1) + mutex_lock(&nfs41_callback_mutex); + nfs_callback_down_net(minorversion, cb_info->serv, net); + mutex_unlock(&nfs41_callback_mutex); +#else nfs_callback_down_net(minorversion, cb_info->serv, net); +#endif cb_info->users--; if (cb_info->users == 0 && cb_info->task != NULL) { kthread_stop(cb_info->task); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH v4] nfs: protect callback execution against per-net callback thread shutdown
From: Stanislav Kinsburskiy Here is the race: CPU #0 CPU#1 cleanup_mnt nfs41_callback_svc (get xprt from the list) nfs_callback_down ... ... ... svc_close_net ... ... ... svc_xprt_free ... svc_bc_sock_freebc_svc_process kfree(xprt) svc_process_common rqstp->rq_xprt->xpt_ops (use after free) The problem is that per-net SUNRPC transports shutdown is done regardless current callback execution. This is a race leading to transport use-after-free in callback handler. This patch fixes it in stright-forward way. I.e. it protects callback execution with the same mutex used for per-net data creation and destruction. Hopefully, it won't slow down NFS client significantly. https://jira.sw.ru/browse/PSBM-75751 v4: use another mutex to protect callback execution agains per-net transports shutdown. This guarantees, that transports won't be destroyed by shutdown callback while execution is in progress and vice versa. v3: Fix mutex deadlock, when shutdown callback waits for thread to exit (with mutex taken), while thread wait for the mutex to take. The idea is to simply check if thread has to exit, if mutex lock has failed. This is a busy loop, but it shouldn't happend often and for long. Signed-off-by: Stanislav Kinsburskiy --- fs/nfs/callback.c | 16 1 file changed, 16 insertions(+) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 0beb275..4d7979b 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -99,6 +99,8 @@ nfs4_callback_up(struct svc_serv *serv) } #if defined(CONFIG_NFS_V4_1) +static DEFINE_MUTEX(nfs41_callback_mutex); + /* * The callback service for NFSv4.1 callbacks */ @@ -117,6 +119,12 @@ nfs41_callback_svc(void *vrqstp) if (try_to_freeze()) continue; + mutex_lock(&nfs41_callback_mutex); + if (kthread_should_stop()) { + mutex_unlock(&nfs41_callback_mutex); + return 0; + } + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); if (!list_empty(&serv->sv_cb_list)) { @@ -129,8 +137,10 @@ nfs41_callback_svc(void *vrqstp) error = bc_svc_process(serv, req, rqstp); dprintk("bc_svc_process() returned w/ error code= %d\n", error); + mutex_unlock(&nfs41_callback_mutex); } else { spin_unlock_bh(&serv->sv_cb_lock); + mutex_unlock(&nfs41_callback_mutex); schedule(); finish_wait(&serv->sv_cb_waitq, &wq); } @@ -377,7 +387,13 @@ void nfs_callback_down(int minorversion, struct net *net) struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; mutex_lock(&nfs_callback_mutex); +#if defined(CONFIG_NFS_V4_1) + mutex_lock(&nfs41_callback_mutex); + nfs_callback_down_net(minorversion, cb_info->serv, net); + mutex_unlock(&nfs41_callback_mutex); +#else nfs_callback_down_net(minorversion, cb_info->serv, net); +#endif cb_info->users--; if (cb_info->users == 0 && cb_info->task != NULL) { kthread_stop(cb_info->task); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH v3] nfs: protect callback execution against per-net callback thread shutdown
Please, see my comments below. 07.11.2017 12:30, Kirill Tkhai пишет: > On 07.11.2017 13:39, Stanislav Kinsburskiy wrote: >> From: Stanislav Kinsburskiy >> >> Here is the race: >> >> CPU #0 CPU#1 >> >> cleanup_mnt nfs41_callback_svc (get xprt from the list) >> nfs_callback_down... >> ... ... >> svc_close_net... >> ... ... >> svc_xprt_free... >> svc_bc_sock_free bc_svc_process >> kfree(xprt) svc_process_common >> rqstp->rq_xprt->xpt_ops (use after free) >> >> The problem is that per-net SUNRPC transports shutdown is done regardless >> current callback execution. This is a race leading to transport >> use-after-free >> in callback handler. >> This patch fixes it in stright-forward way. I.e. it protects callback >> execution with the same mutex used for per-net data creation and destruction. >> Hopefully, it won't slow down NFS client significantly. >> >> https://jira.sw.ru/browse/PSBM-75751 >> >> v3: Fix mutex deadlock, when shutdown callback waits for thread to exit (with >> mutex taken), while thread wait for the mutex to take. >> The idea is to simply check if thread has to exit, if mutex lock has failed. >> This is a busy loop, but it shouldn't happend often and for long. >> >> Signed-off-by: Stanislav Kinsburskiy >> --- >> fs/nfs/callback.c |6 ++ >> 1 file changed, 6 insertions(+) >> >> diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c >> index 0beb275..bbb07e4 100644 >> --- a/fs/nfs/callback.c >> +++ b/fs/nfs/callback.c >> @@ -117,7 +117,11 @@ nfs41_callback_svc(void *vrqstp) >> if (try_to_freeze()) >> continue; >> >> +if (!mutex_trylock(&nfs_callback_mutex)) >> + continue; > > This looks like a busy loop (that especially bad, because mutex-owner may > sleep > at the moment we are looping). > Well, yes. It a busy loop. And that's not good. But I have to mention, that this busy loop can happen only on umount request. > It seems the solution may be to change nfs_callback_down() function. > Can't we flush pending request in nfs_callback_down()? Well, this already happens. The problem is in race between transport usage (unconditional) and its destruction (also unconditional). IOW, there should be either reference counting or some critical section. Looks like the former will need a way more code and thus more error-prone, the latter is uglier, but code should be simpler at first sight. > Or just delete it from sv_cb_list without handling (does nfs proto allow > that)? > Well, this looks like a promising idea. But yet again, there should be some protection against transport usage in shutdown helper. And it's not yet clear, how to implement it without significant code change... ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH v3] nfs: protect callback execution against per-net callback thread shutdown
From: Stanislav Kinsburskiy Here is the race: CPU #0 CPU#1 cleanup_mnt nfs41_callback_svc (get xprt from the list) nfs_callback_down ... ... ... svc_close_net ... ... ... svc_xprt_free ... svc_bc_sock_freebc_svc_process kfree(xprt) svc_process_common rqstp->rq_xprt->xpt_ops (use after free) The problem is that per-net SUNRPC transports shutdown is done regardless current callback execution. This is a race leading to transport use-after-free in callback handler. This patch fixes it in stright-forward way. I.e. it protects callback execution with the same mutex used for per-net data creation and destruction. Hopefully, it won't slow down NFS client significantly. https://jira.sw.ru/browse/PSBM-75751 v3: Fix mutex deadlock, when shutdown callback waits for thread to exit (with mutex taken), while thread wait for the mutex to take. The idea is to simply check if thread has to exit, if mutex lock has failed. This is a busy loop, but it shouldn't happend often and for long. Signed-off-by: Stanislav Kinsburskiy --- fs/nfs/callback.c |6 ++ 1 file changed, 6 insertions(+) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 0beb275..bbb07e4 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -117,7 +117,11 @@ nfs41_callback_svc(void *vrqstp) if (try_to_freeze()) continue; + if (!mutex_trylock(&nfs_callback_mutex)) + continue; + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); + spin_lock_bh(&serv->sv_cb_lock); if (!list_empty(&serv->sv_cb_list)) { req = list_first_entry(&serv->sv_cb_list, @@ -129,8 +133,10 @@ nfs41_callback_svc(void *vrqstp) error = bc_svc_process(serv, req, rqstp); dprintk("bc_svc_process() returned w/ error code= %d\n", error); + mutex_unlock(&nfs_callback_mutex); } else { spin_unlock_bh(&serv->sv_cb_lock); + mutex_unlock(&nfs_callback_mutex); schedule(); finish_wait(&serv->sv_cb_waitq, &wq); } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH v2] nfs: protect callback execution against per-net callback thread shutdown
Sorry, this one is bad as well. 07.11.2017 11:28, Stanislav Kinsburskiy пишет: > Here is the race: > > CPU #0CPU#1 > > cleanup_mnt nfs41_callback_svc (get xprt from the list) > nfs_callback_down ... > ... ... > svc_close_net ... > ... ... > svc_xprt_free ... > svc_bc_sock_free bc_svc_process > kfree(xprt) svc_process_common > rqstp->rq_xprt->xpt_ops (use after free) > > The problem is that per-net SUNRPC transports shutdown is done regardless > current callback execution. This is a race leading to transport use-after-free > in callback handler. > This patch fixes it in stright-forward way. I.e. it protects callback > execution with the same mutex used for per-net data creation and destruction. > Hopefully, it won't slow down NFS client significantly. > > https://jira.sw.ru/browse/PSBM-75751 > > v2: Fix mutex deadlock, when shutdown callback waits for thread to exit (with > mutex taken), while thread wait for the mutex to take. > > Signed-off-by: Stanislav Kinsburskiy > --- > fs/nfs/callback.c |7 +++ > 1 file changed, 7 insertions(+) > > diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c > index 0beb275..eed861a 100644 > --- a/fs/nfs/callback.c > +++ b/fs/nfs/callback.c > @@ -118,6 +118,11 @@ nfs41_callback_svc(void *vrqstp) > continue; > > prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); > + > + if (!mutex_trylock(&nfs_callback_mutex) && > + kthread_should_stop()) > + return 0; > + > spin_lock_bh(&serv->sv_cb_lock); > if (!list_empty(&serv->sv_cb_list)) { > req = list_first_entry(&serv->sv_cb_list, > @@ -129,8 +134,10 @@ nfs41_callback_svc(void *vrqstp) > error = bc_svc_process(serv, req, rqstp); > dprintk("bc_svc_process() returned w/ error code= %d\n", > error); > + mutex_unlock(&nfs_callback_mutex); > } else { > spin_unlock_bh(&serv->sv_cb_lock); > + mutex_unlock(&nfs_callback_mutex); > schedule(); > finish_wait(&serv->sv_cb_waitq, &wq); > } > > ___ > Devel mailing list > Devel@openvz.org > https://lists.openvz.org/mailman/listinfo/devel > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH v2] nfs: protect callback execution against per-net callback thread shutdown
Here is the race: CPU #0 CPU#1 cleanup_mnt nfs41_callback_svc (get xprt from the list) nfs_callback_down ... ... ... svc_close_net ... ... ... svc_xprt_free ... svc_bc_sock_freebc_svc_process kfree(xprt) svc_process_common rqstp->rq_xprt->xpt_ops (use after free) The problem is that per-net SUNRPC transports shutdown is done regardless current callback execution. This is a race leading to transport use-after-free in callback handler. This patch fixes it in stright-forward way. I.e. it protects callback execution with the same mutex used for per-net data creation and destruction. Hopefully, it won't slow down NFS client significantly. https://jira.sw.ru/browse/PSBM-75751 v2: Fix mutex deadlock, when shutdown callback waits for thread to exit (with mutex taken), while thread wait for the mutex to take. Signed-off-by: Stanislav Kinsburskiy --- fs/nfs/callback.c |7 +++ 1 file changed, 7 insertions(+) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 0beb275..eed861a 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -118,6 +118,11 @@ nfs41_callback_svc(void *vrqstp) continue; prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); + + if (!mutex_trylock(&nfs_callback_mutex) && + kthread_should_stop()) + return 0; + spin_lock_bh(&serv->sv_cb_lock); if (!list_empty(&serv->sv_cb_list)) { req = list_first_entry(&serv->sv_cb_list, @@ -129,8 +134,10 @@ nfs41_callback_svc(void *vrqstp) error = bc_svc_process(serv, req, rqstp); dprintk("bc_svc_process() returned w/ error code= %d\n", error); + mutex_unlock(&nfs_callback_mutex); } else { spin_unlock_bh(&serv->sv_cb_lock); + mutex_unlock(&nfs_callback_mutex); schedule(); finish_wait(&serv->sv_cb_waitq, &wq); } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH] nfs: protect callback execution against per-net callback thread shutdown
07.11.2017 11:03, Kirill Tkhai пишет: >>> Couldn't this change introduce a deadlock like below? >>> [thread] >>> nfs_callback_down() >>> nfs41_callback_svc() >>>mutex_lock(&nfs_callback_mutex); >>>kthread_stop(cb_info->task); >>> wake_up_process(); >>> wait_for_completion(&kthread->exited); > > And what about above one? > Good catch. Need to think more about it then. ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH] nfs: protect callback execution against per-net callback thread shutdown
Sure, here it is: CPU #0 CPU#1 cleanup_mnt nfs41_callback_svc ... (get transport from the list) nfs_callback_down ... ... ... svc_close_net ... ... ... svc_xprt_free ... svc_bc_sock_freebc_svc_process kfree(xprt) svc_process_common rqstp->rq_xprt->xpt_ops (use after free) 07.11.2017 10:49, Kirill Tkhai пишет: > On 03.11.2017 19:47, Stanislav Kinsburskiy wrote: >> From: Stanislav Kinsburskiy >> >> The problem is that per-net SUNRPC transports shutdown is done regardless >> current callback execution. This is a race leading to transport >> use-after-free >> in callback handler. > > Could you please draw the race to show the interaction between functions? > >> This patch fixes it in stright-forward way. I.e. it protects callback >> execution with the same mutex used for per-net data creation and destruction. >> Hopefully, it won't slow down NFS client significantly. >> >> https://jira.sw.ru/browse/PSBM-75751 >> >> Signed-off-by: Stanislav Kinsburskiy >> --- >> fs/nfs/callback.c |3 +++ >> 1 file changed, 3 insertions(+) >> >> diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c >> index 0beb275..82e8ed1 100644 >> --- a/fs/nfs/callback.c >> +++ b/fs/nfs/callback.c >> @@ -118,6 +118,7 @@ nfs41_callback_svc(void *vrqstp) >> continue; >> >> prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); >> +mutex_lock(&nfs_callback_mutex); >> spin_lock_bh(&serv->sv_cb_lock); >> if (!list_empty(&serv->sv_cb_list)) { >> req = list_first_entry(&serv->sv_cb_list, >> @@ -129,8 +130,10 @@ nfs41_callback_svc(void *vrqstp) >> error = bc_svc_process(serv, req, rqstp); >> dprintk("bc_svc_process() returned w/ error code= %d\n", >> error); >> +mutex_unlock(&nfs_callback_mutex); >> } else { >> spin_unlock_bh(&serv->sv_cb_lock); >> +mutex_unlock(&nfs_callback_mutex); >> schedule(); >> finish_wait(&serv->sv_cb_waitq, &wq); >> } > > Couldn't this change introduce a deadlock like below? > [thread] > nfs_callback_down() nfs41_callback_svc() >mutex_lock(&nfs_callback_mutex); >kthread_stop(cb_info->task); > wake_up_process(); > wait_for_completion(&kthread->exited); > > > mutex_lock(&nfs_callback_mutex); > > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH] nfs: protect callback execution against per-net callback thread shutdown
Well... Maybe. Let's check, how it works with our kernel. 07.11.2017 10:16, Konstantin Khorenko пишет: > Going to send it to mainstream as well? > > -- > Best regards, > > Konstantin Khorenko, > Virtuozzo Linux Kernel Team > > On 11/03/2017 07:47 PM, Stanislav Kinsburskiy wrote: >> From: Stanislav Kinsburskiy >> >> The problem is that per-net SUNRPC transports shutdown is done regardless >> current callback execution. This is a race leading to transport >> use-after-free >> in callback handler. >> This patch fixes it in stright-forward way. I.e. it protects callback >> execution with the same mutex used for per-net data creation and destruction. >> Hopefully, it won't slow down NFS client significantly. >> >> https://jira.sw.ru/browse/PSBM-75751 >> >> Signed-off-by: Stanislav Kinsburskiy >> --- >> fs/nfs/callback.c | 3 +++ >> 1 file changed, 3 insertions(+) >> >> diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c >> index 0beb275..82e8ed1 100644 >> --- a/fs/nfs/callback.c >> +++ b/fs/nfs/callback.c >> @@ -118,6 +118,7 @@ nfs41_callback_svc(void *vrqstp) >> continue; >> >> prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); >> + mutex_lock(&nfs_callback_mutex); >> spin_lock_bh(&serv->sv_cb_lock); >> if (!list_empty(&serv->sv_cb_list)) { >> req = list_first_entry(&serv->sv_cb_list, >> @@ -129,8 +130,10 @@ nfs41_callback_svc(void *vrqstp) >> error = bc_svc_process(serv, req, rqstp); >> dprintk("bc_svc_process() returned w/ error code= %d\n", >> error); >> + mutex_unlock(&nfs_callback_mutex); >> } else { >> spin_unlock_bh(&serv->sv_cb_lock); >> + mutex_unlock(&nfs_callback_mutex); >> schedule(); >> finish_wait(&serv->sv_cb_waitq, &wq); >> } >> >> ___ >> Devel mailing list >> Devel@openvz.org >> https://lists.openvz.org/mailman/listinfo/devel >> . >> ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] nfs: protect callback execution against per-net callback thread shutdown
From: Stanislav Kinsburskiy The problem is that per-net SUNRPC transports shutdown is done regardless current callback execution. This is a race leading to transport use-after-free in callback handler. This patch fixes it in stright-forward way. I.e. it protects callback execution with the same mutex used for per-net data creation and destruction. Hopefully, it won't slow down NFS client significantly. https://jira.sw.ru/browse/PSBM-75751 Signed-off-by: Stanislav Kinsburskiy --- fs/nfs/callback.c |3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 0beb275..82e8ed1 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -118,6 +118,7 @@ nfs41_callback_svc(void *vrqstp) continue; prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); + mutex_lock(&nfs_callback_mutex); spin_lock_bh(&serv->sv_cb_lock); if (!list_empty(&serv->sv_cb_list)) { req = list_first_entry(&serv->sv_cb_list, @@ -129,8 +130,10 @@ nfs41_callback_svc(void *vrqstp) error = bc_svc_process(serv, req, rqstp); dprintk("bc_svc_process() returned w/ error code= %d\n", error); + mutex_unlock(&nfs_callback_mutex); } else { spin_unlock_bh(&serv->sv_cb_lock); + mutex_unlock(&nfs_callback_mutex); schedule(); finish_wait(&serv->sv_cb_waitq, &wq); } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH rh7] netfilter: Allow xt_owner in any user namespace
17.10.2017 08:53, Andrei Vagin пишет: > On Mon, Oct 16, 2017 at 05:50:38PM +0200, Stanislav Kinsburskiy wrote: >> Well, patch looks ok. >> But shouldn't all the ve_init_user_ns() replaced by the par->net? > > This patch does this. > Yes, but not everywhere. Say, there are owner_mt_ve0 and owner_mt6_ve0. Shouldn't there functions also patched? >> >> 14.10.2017 01:20, Andrei Vagin пишет: >>> From: "Eric W. Biederman" >>> >>> ML: 9847371a84b0be330f4bc4aaa98904101ee8573d >>> https://jira.sw.ru/browse/PSBM-69409? >>> >>> Making this work is a little tricky as it really isn't kosher to >>> change the xt_owner_match_info in a check function. >>> >>> Without changing xt_owner_match_info we need to know the user >>> namespace the uids and gids are specified in. In the common case >>> net->user_ns == current_user_ns(). Verify net->user_ns == >>> current_user_ns() in owner_check so we can later assume it in >>> owner_mt. >>> >>> In owner_check also verify that all of the uids and gids specified are >>> in net->user_ns and that the expected min/max relationship exists >>> between the uids and gids in xt_owner_match_info. >>> >>> In owner_mt get the network namespace from the outgoing socket, as this >>> must be the same network namespace as the netfilter rules, and use that >>> network namespace to find the user namespace the uids and gids in >>> xt_match_owner_info are encoded in. Then convert from their encoded >>> from into the kernel internal format for uids and gids and perform the >>> owner match. >>> >>> Similar to ping_group_range, this code does not try to detect >>> noncontiguous UID/GID ranges. >>> >>> Signed-off-by: "Eric W. Biederman" >>> Signed-off-by: Kevin Cernekee >>> Signed-off-by: Pablo Neira Ayuso >>> Signed-off-by: Andrei Vagin >>> --- >>> net/netfilter/xt_owner.c | 41 +++-- >>> 1 file changed, 35 insertions(+), 6 deletions(-) >>> >>> diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c >>> index 31dec4a..1744f78 100644 >>> --- a/net/netfilter/xt_owner.c >>> +++ b/net/netfilter/xt_owner.c >>> @@ -80,11 +80,39 @@ owner_mt6_v0(const struct sk_buff *skb, struct >>> xt_action_param *par) >>> static int owner_check(const struct xt_mtchk_param *par) >>> { >>> struct xt_owner_match_info *info = par->matchinfo; >>> + struct net *net = par->net; >>> >>> - /* For now only allow adding matches from the initial user namespace */ >>> + /* Only allow the common case where the userns of the writer >>> +* matches the userns of the network namespace. >>> +*/ >>> if ((info->match & (XT_OWNER_UID|XT_OWNER_GID)) && >>> - !current_user_ns_initial()) >>> + (current_user_ns() != net->user_ns)) >>> return -EINVAL; >>> + >>> + /* Ensure the uids are valid */ >>> + if (info->match & XT_OWNER_UID) { >>> + kuid_t uid_min = make_kuid(net->user_ns, info->uid_min); >>> + kuid_t uid_max = make_kuid(net->user_ns, info->uid_max); >>> + >>> + if (!uid_valid(uid_min) || !uid_valid(uid_max) || >>> + (info->uid_max < info->uid_min) || >>> + uid_lt(uid_max, uid_min)) { >>> + return -EINVAL; >>> + } >>> + } >>> + >>> + /* Ensure the gids are valid */ >>> + if (info->match & XT_OWNER_GID) { >>> + kgid_t gid_min = make_kgid(net->user_ns, info->gid_min); >>> + kgid_t gid_max = make_kgid(net->user_ns, info->gid_max); >>> + >>> + if (!gid_valid(gid_min) || !gid_valid(gid_max) || >>> + (info->gid_max < info->gid_min) || >>> + gid_lt(gid_max, gid_min)) { >>> + return -EINVAL; >>> + } >>> + } >>> + >>> return 0; >>> } >>> >>> @@ -93,6 +121,7 @@ owner_mt(const struct sk_buff *skb, struct >>> xt_action_param *par) >>> { >>> const struct xt_owner_match_info *info = par->matchinfo; >>> const struct file *filp; >>> + struct net *net = dev_net(par->in ? par->in : par->out); &
Re: [Devel] [PATCH rh7] netfilter: Allow xt_owner in any user namespace
Well, patch looks ok. But shouldn't all the ve_init_user_ns() replaced by the par->net? 14.10.2017 01:20, Andrei Vagin пишет: > From: "Eric W. Biederman" > > ML: 9847371a84b0be330f4bc4aaa98904101ee8573d > https://jira.sw.ru/browse/PSBM-69409? > > Making this work is a little tricky as it really isn't kosher to > change the xt_owner_match_info in a check function. > > Without changing xt_owner_match_info we need to know the user > namespace the uids and gids are specified in. In the common case > net->user_ns == current_user_ns(). Verify net->user_ns == > current_user_ns() in owner_check so we can later assume it in > owner_mt. > > In owner_check also verify that all of the uids and gids specified are > in net->user_ns and that the expected min/max relationship exists > between the uids and gids in xt_owner_match_info. > > In owner_mt get the network namespace from the outgoing socket, as this > must be the same network namespace as the netfilter rules, and use that > network namespace to find the user namespace the uids and gids in > xt_match_owner_info are encoded in. Then convert from their encoded > from into the kernel internal format for uids and gids and perform the > owner match. > > Similar to ping_group_range, this code does not try to detect > noncontiguous UID/GID ranges. > > Signed-off-by: "Eric W. Biederman" > Signed-off-by: Kevin Cernekee > Signed-off-by: Pablo Neira Ayuso > Signed-off-by: Andrei Vagin > --- > net/netfilter/xt_owner.c | 41 +++-- > 1 file changed, 35 insertions(+), 6 deletions(-) > > diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c > index 31dec4a..1744f78 100644 > --- a/net/netfilter/xt_owner.c > +++ b/net/netfilter/xt_owner.c > @@ -80,11 +80,39 @@ owner_mt6_v0(const struct sk_buff *skb, struct > xt_action_param *par) > static int owner_check(const struct xt_mtchk_param *par) > { > struct xt_owner_match_info *info = par->matchinfo; > + struct net *net = par->net; > > - /* For now only allow adding matches from the initial user namespace */ > + /* Only allow the common case where the userns of the writer > + * matches the userns of the network namespace. > + */ > if ((info->match & (XT_OWNER_UID|XT_OWNER_GID)) && > - !current_user_ns_initial()) > + (current_user_ns() != net->user_ns)) > return -EINVAL; > + > + /* Ensure the uids are valid */ > + if (info->match & XT_OWNER_UID) { > + kuid_t uid_min = make_kuid(net->user_ns, info->uid_min); > + kuid_t uid_max = make_kuid(net->user_ns, info->uid_max); > + > + if (!uid_valid(uid_min) || !uid_valid(uid_max) || > + (info->uid_max < info->uid_min) || > + uid_lt(uid_max, uid_min)) { > + return -EINVAL; > + } > + } > + > + /* Ensure the gids are valid */ > + if (info->match & XT_OWNER_GID) { > + kgid_t gid_min = make_kgid(net->user_ns, info->gid_min); > + kgid_t gid_max = make_kgid(net->user_ns, info->gid_max); > + > + if (!gid_valid(gid_min) || !gid_valid(gid_max) || > + (info->gid_max < info->gid_min) || > + gid_lt(gid_max, gid_min)) { > + return -EINVAL; > + } > + } > + > return 0; > } > > @@ -93,6 +121,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param > *par) > { > const struct xt_owner_match_info *info = par->matchinfo; > const struct file *filp; > + struct net *net = dev_net(par->in ? par->in : par->out); > > if (skb->sk == NULL || skb->sk->sk_socket == NULL) > return (info->match ^ info->invert) == 0; > @@ -109,8 +138,8 @@ owner_mt(const struct sk_buff *skb, struct > xt_action_param *par) > (XT_OWNER_UID | XT_OWNER_GID)) == 0; > > if (info->match & XT_OWNER_UID) { > - kuid_t uid_min = make_kuid(ve_init_user_ns(), info->uid_min); > - kuid_t uid_max = make_kuid(ve_init_user_ns(), info->uid_max); > + kuid_t uid_min = make_kuid(net->user_ns, info->uid_min); > + kuid_t uid_max = make_kuid(net->user_ns, info->uid_max); > if ((uid_gte(filp->f_cred->fsuid, uid_min) && >uid_lte(filp->f_cred->fsuid, uid_max)) ^ > !(info->invert & XT_OWNER_UID)) > @@ -118,8 +147,8 @@ owner_mt(const struct sk_buff *skb, struct > xt_action_param *par) > } > > if (info->match & XT_OWNER_GID) { > - kgid_t gid_min = make_kgid(ve_init_user_ns(), info->gid_min); > - kgid_t gid_max = make_kgid(ve_init_user_ns(), info->gid_max); > + kgid_t gid_min = make_kgid(net->user_ns, info->gid_min); > + kgid_t gid_max = make_kgid(net->user_ns, info->gid_max); > if ((gid_gte(filp->f_cred->fsgid, gid_min) && >gid_lte(filp->f_cred->fsgid
[Devel] [PATCH] venet: destroy VE IP on venet destruction in NFS is enabled
We skip VE IP destruion in shutdown hook, if NFS is enabled in CT (to allow NFS mounts to disappear. Thus we have to destroy it with venet device. https://jira.sw.ru/browse/PSBM-75120 Signed-off-by: Stanislav Kinsburskiy --- drivers/net/venetdev.c |7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/venetdev.c b/drivers/net/venetdev.c index 7a546cc..fad232b 100644 --- a/drivers/net/venetdev.c +++ b/drivers/net/venetdev.c @@ -759,9 +759,12 @@ static void venet_dellink(struct net_device *dev, struct list_head *head) struct ve_struct *env = dev->nd_net->owner_ve; /* We check ve_netns to avoid races with veip SHUTDOWN hook, called from -* ve_exit_ns() +* ve_exit_ns(). +* Also, in veip SHUTDOWN hook we skip veip destructionif, if container +* has VE_FEATURE_NFS enabled. Thus here we have to destroy veip in +* this case. */ - if (env->ve_netns) + if (env->ve_netns || (env->features & VE_FEATURE_NFS)) veip_shutdown(env); env->_venet_dev = NULL; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] venet: do not destroy VE IP on shutdown hook if NFS is allowed
Mounts are destroyed asynchroniously and thus can race with VE IP destruction leading to dead lock in kernel: NFS can'n be unmounted. Fix it by skipping VE IP desctruction, is NFS feature is enabled in the container. https://jira.sw.ru/browse/PSBM-73614 Signed-off-by: Stanislav Kinsburskiy --- drivers/net/venetdev.c | 16 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/venetdev.c b/drivers/net/venetdev.c index 5710792..7a546cc 100644 --- a/drivers/net/venetdev.c +++ b/drivers/net/venetdev.c @@ -744,10 +744,8 @@ static void venet_setup(struct net_device *dev) SET_ETHTOOL_OPS(dev, &venet_ethtool_ops); } -static void veip_shutdown(void *data) +static void veip_shutdown(struct ve_struct *ve) { - struct ve_struct *ve = data; - spin_lock(&veip_lock); if (ve->veip) { __venet_ext_clean(ve); @@ -1178,8 +1176,18 @@ static struct rtnl_link_ops venet_link_ops = { .maxtype= VENET_INFO_MAX, }; +static void veip_shutdown_fini(void *data) +{ + struct ve_struct *ve = data; + + if (ve->features & VE_FEATURE_NFS) + return; + + veip_shutdown(ve); +} + static struct ve_hook veip_shutdown_hook = { - .fini = veip_shutdown, + .fini = veip_shutdown_fini, .priority = HOOK_PRIO_FINISHING, .owner = THIS_MODULE, }; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH] ve: don't call shutdown hook for now
It was me. The reason is explained in the patch description (you can find more details in description of the commit, introduced the hook). I'm waiting for khorenko@ to convince him, that the idea to "podkovat' blohy" was wrong by design and we should drop the hook. Once I succeed, I'll sent appropriate series to get rid of it completely. So this patch is temporary one, because too many tests have failed because of this race. 2 окт. 2017 г. 8:05 PM пользователь Andrey Vagin написал: On Mon, Oct 02, 2017 at 06:47:19PM +0400, Stanislav Kinsburskiy wrote: > This hook is needed only for releaseing venet IP address early (thus allowing > to restart container with the same IP faster). > But mount are destroyed asynchroniosly, and thus NFS mount can be destroyed > after IP address is dropped. > Let's fir this race it the way how all the world does things: release IP with > network namespace. Why don't you remove this code? Who added this hook? What was a reason? > > https://jira.sw.ru/browse/PSBM-73193 > > Signed-off-by: Stanislav Kinsburskiy > --- > kernel/ve/ve.c |3 ++- > 1 file changed, 2 insertions(+), 1 deletion(-) > > diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c > index b0188c3..4efb9d9 100644 > --- a/kernel/ve/ve.c > +++ b/kernel/ve/ve.c > @@ -451,7 +451,8 @@ static void ve_drop_context(struct ve_struct *ve) >synchronize_rcu(); >put_nsproxy(ve_ns); > > - ve_hook_iterate_fini(VE_SHUTDOWN_CHAIN, ve); > + /* This have to be revisited */ > +// ve_hook_iterate_fini(VE_SHUTDOWN_CHAIN, ve); > >put_cred(ve->init_cred); >ve->init_cred = NULL; > > ___ > Devel mailing list > Devel@openvz.org > https://lists.openvz.org/mailman/listinfo/devel ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] ve: don't call shutdown hook for now
This hook is needed only for releaseing venet IP address early (thus allowing to restart container with the same IP faster). But mount are destroyed asynchroniosly, and thus NFS mount can be destroyed after IP address is dropped. Let's fir this race it the way how all the world does things: release IP with network namespace. https://jira.sw.ru/browse/PSBM-73193 Signed-off-by: Stanislav Kinsburskiy --- kernel/ve/ve.c |3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index b0188c3..4efb9d9 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -451,7 +451,8 @@ static void ve_drop_context(struct ve_struct *ve) synchronize_rcu(); put_nsproxy(ve_ns); - ve_hook_iterate_fini(VE_SHUTDOWN_CHAIN, ve); + /* This have to be revisited */ +// ve_hook_iterate_fini(VE_SHUTDOWN_CHAIN, ve); put_cred(ve->init_cred); ve->init_cred = NULL; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH] scripts: add "-w" to iptables command
How old should it be? I checked with v1.4.21 28.09.2017 12:55, Kirill Tkhai пишет: > Could you please to say will it work on old iptables? > > On 28.09.2017 13:03, Stanislav Kinsburskiy wrote: >> What a brilliant idea it was to ignore unknown keys. >> Should take it into account. >> >> 28.09.2017 10:26, Vasily Averin пишет: >>> kthai@ explained that old version of iptables ignores unknown keys, so >>> adding -w is safe. >>> >>> On 2017-09-28 10:40, Pavel Tikhomirov wrote: >>>> Can we have these script running with older iptables version which does >>>> not have "-w"? >>>> >>>> On 09/27/2017 02:11 PM, Stanislav Kinsburskiy wrote: >>>>> Neede to support new versions of iptables. >>>>> >>>>> https://jira.sw.ru/browse/PSBM-73153 >>>>> >>>>> Signed-off-by: Stanislav Kinsburskiy >>>>> --- >>>>> scripts/nfs-ports-allow.sh | 16 >>>>> 1 file changed, 8 insertions(+), 8 deletions(-) >>>>> >>>>> diff --git a/scripts/nfs-ports-allow.sh b/scripts/nfs-ports-allow.sh >>>>> index 97541dc..ac5cf5f 100644 >>>>> --- a/scripts/nfs-ports-allow.sh >>>>> +++ b/scripts/nfs-ports-allow.sh >>>>> @@ -36,10 +36,10 @@ function add_accept_rules { >>>>> local server=$1 >>>>> local port=$2 >>>>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s >>>>> $server --sport $port -j ACCEPT && >>>>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d >>>>> $server --dport $port -j ACCEPT && >>>>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s >>>>> $server --sport $port -j ACCEPT && >>>>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d >>>>> $server --dport $port -j ACCEPT >>>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s >>>>> $server --sport $port -j ACCEPT && >>>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d >>>>> $server --dport $port -j ACCEPT && >>>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s >>>>> $server --sport $port -j ACCEPT && >>>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d >>>>> $server --dport $port -j ACCEPT >>>>> } >>>>> function iptables_allow_nfs_ports { >>>>> @@ -63,10 +63,10 @@ function allow_portmapper_port { >>>>> local server=$1 >>>>> local port=111 >>>>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s >>>>> $server --sport $port -j ACCEPT && >>>>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d >>>>> $server --dport $port -j ACCEPT && >>>>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s >>>>> $server --sport $port -j ACCEPT && >>>>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d >>>>> $server --dport $port -j ACCEPT >>>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s >>>>> $server --sport $port -j ACCEPT && >>>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d >>>>> $server --dport $port -j ACCEPT && >>>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s >>>>> $server --sport $port -j ACCEPT && >>>>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d >>>>> $server --dport $port -j ACCEPT >>>>> } >>>>> for s in $servers; do >>>>> >>>>> ___ >>>>> Devel mailing list >>>>> Devel@openvz.org >>>>> https://lists.openvz.org/mailman/listinfo/devel >>>>> >>>> >>> ___ >>> Devel mailing list >>> Devel@openvz.org >>> https://lists.openvz.org/mailman/listinfo/devel >>> >> ___ >> Devel mailing list >> Devel@openvz.org >> https://lists.openvz.org/mailman/listinfo/devel >> ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH] scripts: add "-w" to iptables command
What a brilliant idea it was to ignore unknown keys. Should take it into account. 28.09.2017 10:26, Vasily Averin пишет: > kthai@ explained that old version of iptables ignores unknown keys, so adding > -w is safe. > > On 2017-09-28 10:40, Pavel Tikhomirov wrote: >> Can we have these script running with older iptables version which does not >> have "-w"? >> >> On 09/27/2017 02:11 PM, Stanislav Kinsburskiy wrote: >>> Neede to support new versions of iptables. >>> >>> https://jira.sw.ru/browse/PSBM-73153 >>> >>> Signed-off-by: Stanislav Kinsburskiy >>> --- >>> scripts/nfs-ports-allow.sh | 16 >>> 1 file changed, 8 insertions(+), 8 deletions(-) >>> >>> diff --git a/scripts/nfs-ports-allow.sh b/scripts/nfs-ports-allow.sh >>> index 97541dc..ac5cf5f 100644 >>> --- a/scripts/nfs-ports-allow.sh >>> +++ b/scripts/nfs-ports-allow.sh >>> @@ -36,10 +36,10 @@ function add_accept_rules { >>> local server=$1 >>> local port=$2 >>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s >>> $server --sport $port -j ACCEPT && >>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d $server >>> --dport $port -j ACCEPT && >>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s $server >>> --sport $port -j ACCEPT && >>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d $server >>> --dport $port -j ACCEPT >>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s >>> $server --sport $port -j ACCEPT && >>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d >>> $server --dport $port -j ACCEPT && >>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s >>> $server --sport $port -j ACCEPT && >>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d >>> $server --dport $port -j ACCEPT >>> } >>> function iptables_allow_nfs_ports { >>> @@ -63,10 +63,10 @@ function allow_portmapper_port { >>> local server=$1 >>> local port=111 >>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s >>> $server --sport $port -j ACCEPT && >>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d $server >>> --dport $port -j ACCEPT && >>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s $server >>> --sport $port -j ACCEPT && >>> -${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d $server >>> --dport $port -j ACCEPT >>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s >>> $server --sport $port -j ACCEPT && >>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d >>> $server --dport $port -j ACCEPT && >>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s >>> $server --sport $port -j ACCEPT && >>> +${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d >>> $server --dport $port -j ACCEPT >>> } >>> for s in $servers; do >>> >>> ___ >>> Devel mailing list >>> Devel@openvz.org >>> https://lists.openvz.org/mailman/listinfo/devel >>> >> > ___ > Devel mailing list > Devel@openvz.org > https://lists.openvz.org/mailman/listinfo/devel > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] scripts: add "-w" to iptables command
Neede to support new versions of iptables. https://jira.sw.ru/browse/PSBM-73153 Signed-off-by: Stanislav Kinsburskiy --- scripts/nfs-ports-allow.sh | 16 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/nfs-ports-allow.sh b/scripts/nfs-ports-allow.sh index 97541dc..ac5cf5f 100644 --- a/scripts/nfs-ports-allow.sh +++ b/scripts/nfs-ports-allow.sh @@ -36,10 +36,10 @@ function add_accept_rules { local server=$1 local port=$2 - ${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s $server --sport $port -j ACCEPT && - ${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d $server --dport $port -j ACCEPT && - ${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s $server --sport $port -j ACCEPT && - ${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d $server --dport $port -j ACCEPT + ${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s $server --sport $port -j ACCEPT && + ${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d $server --dport $port -j ACCEPT && + ${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s $server --sport $port -j ACCEPT && + ${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d $server --dport $port -j ACCEPT } function iptables_allow_nfs_ports { @@ -63,10 +63,10 @@ function allow_portmapper_port { local server=$1 local port=111 - ${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s $server --sport $port -j ACCEPT && - ${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d $server --dport $port -j ACCEPT && - ${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s $server --sport $port -j ACCEPT && - ${JOIN_CT} ${IPTABLES} -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d $server --dport $port -j ACCEPT + ${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -s $server --sport $port -j ACCEPT && + ${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p udp -d $server --dport $port -j ACCEPT && + ${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -s $server --sport $port -j ACCEPT && + ${JOIN_CT} ${IPTABLES} -w -I ${CRTOOLS_IPTABLES_TABLE} -p tcp -d $server --dport $port -j ACCEPT } for s in $servers; do ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] connector: bump skb->users before callback invocation
From: Florian Westphal Backport of commit 55285bf09427c5abf43ee1d54e892f352092b1f1. Dmitry reports memleak with syskaller program. Problem is that connector bumps skb usecount but might not invoke callback. So move skb_get to where we invoke the callback. https://jira.sw.ru/browse/PSBM-71904 Reported-by: Dmitry Vyukov Signed-off-by: Florian Westphal Signed-off-by: David S. Miller Signed-off-by: Stanislav Kinsburskiy --- drivers/connector/connector.c | 11 +++ 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c index 752c692..57285ba 100644 --- a/drivers/connector/connector.c +++ b/drivers/connector/connector.c @@ -161,26 +161,21 @@ static int cn_call_callback(struct sk_buff *skb) * * It checks skb, netlink header and msg sizes, and calls callback helper. */ -static void cn_rx_skb(struct sk_buff *__skb) +static void cn_rx_skb(struct sk_buff *skb) { struct nlmsghdr *nlh; - struct sk_buff *skb; int len, err; - skb = skb_get(__skb); - if (skb->len >= NLMSG_HDRLEN) { nlh = nlmsg_hdr(skb); len = nlmsg_len(nlh); if (len < (int)sizeof(struct cn_msg) || skb->len < nlh->nlmsg_len || - len > CONNECTOR_MAX_MSG_SIZE) { - kfree_skb(skb); + len > CONNECTOR_MAX_MSG_SIZE) return; - } - err = cn_call_callback(skb); + err = cn_call_callback(skb_get(skb)); if (err < 0) kfree_skb(skb); } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [RFC PATCH 1/2] autofs: set compat flag on sbi when daemon uses 32bit addressation
14.09.2017 13:45, Ian Kent пишет: > On 14/09/17 19:39, Stanislav Kinsburskiy wrote: >> >> >> 14.09.2017 13:29, Ian Kent пишет: >>> On 14/09/17 17:24, Stanislav Kinsburskiy wrote: >>>> >>>> >>>> 14.09.2017 02:38, Ian Kent пишет: >>>>> On 01/09/17 19:21, Stanislav Kinsburskiy wrote: >>>>>> Signed-off-by: Stanislav Kinsburskiy >>>>>> --- >>>>>> fs/autofs4/autofs_i.h |3 +++ >>>>>> fs/autofs4/dev-ioctl.c |3 +++ >>>>>> fs/autofs4/inode.c |4 +++- >>>>>> 3 files changed, 9 insertions(+), 1 deletion(-) >>>>>> >>>>>> diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h >>>>>> index 4737615..3da105f 100644 >>>>>> --- a/fs/autofs4/autofs_i.h >>>>>> +++ b/fs/autofs4/autofs_i.h >>>>>> @@ -120,6 +120,9 @@ struct autofs_sb_info { >>>>>> struct list_head active_list; >>>>>> struct list_head expiring_list; >>>>>> struct rcu_head rcu; >>>>>> +#ifdef CONFIG_COMPAT >>>>>> +unsigned is32bit:1; >>>>>> +#endif >>>>>> }; >>>>>> >>>>>> static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb) >>>>>> diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c >>>>>> index b7c816f..467d6c4 100644 >>>>>> --- a/fs/autofs4/dev-ioctl.c >>>>>> +++ b/fs/autofs4/dev-ioctl.c >>>>>> @@ -397,6 +397,9 @@ static int autofs_dev_ioctl_setpipefd(struct file >>>>>> *fp, >>>>>> sbi->pipefd = pipefd; >>>>>> sbi->pipe = pipe; >>>>>> sbi->catatonic = 0; >>>>>> +#ifdef CONFIG_COMPAT >>>>>> +sbi->is32bit = is_compat_task(); >>>>>> +#endif >>>>>> } >>>>>> out: >>>>>> put_pid(new_pid); >>>>>> diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c >>>>>> index 09e7d68..21d3c0b 100644 >>>>>> --- a/fs/autofs4/inode.c >>>>>> +++ b/fs/autofs4/inode.c >>>>>> @@ -301,7 +301,9 @@ int autofs4_fill_super(struct super_block *s, void >>>>>> *data, int silent) >>>>>> } else { >>>>>> sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); >>>>>> } >>>>>> - >>>>>> +#ifdef CONFIG_COMPAT >>>>>> +sbi->is32bit = is_compat_task(); >>>>>> +#endif >>>>>> if (autofs_type_trigger(sbi->type)) >>>>>> __managed_dentry_set_managed(root); >>>>>> >>>>>> >>>>> >>>>> Not sure about this. >>>>> >>>>> Don't you think it would be better to avoid the in code #ifdefs by doing >>>>> some >>>>> checks and defines in the header file and defining what's need to just use >>>>> is_compat_task(). >>>>> >>>> >>>> Yes, might be... >>>> >>>>> Not sure 2 patches are needed for this either .. >>>>> >>>> >>>> Well, I found this issue occasionally. >>> >>> I'm wondering what the symptoms are? >>> >> >> Size of struct autofs_v5_packet is 300 bytes for x86 and 304 bytes for >> x86_64. >> Which means, that 32bit task can read more than size of autofs_v5_packet on >> 64bit kernel. > > Are you sure? > > Shouldn't that be a short read on the x86 side of a 4 bytes longer > structure on the x86_64 side. > > I didn't think you could have a 64 bit client on a 32 bit kernel > so the converse (the read past end of struct) doesn't apply. > Sorry for the confusion, I had to add brackets like this: > Which means, that 32bit task can read more than size of autofs_v5_packet (on > 64bit kernel). IOW, 32bit task expects to read 300 bytes (size of struct autofs_v5_packet) while there are 304 bytes available on the "wire" from the 64bit kernel. > Ian > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [RFC PATCH 1/2] autofs: set compat flag on sbi when daemon uses 32bit addressation
14.09.2017 13:29, Ian Kent пишет: > On 14/09/17 17:24, Stanislav Kinsburskiy wrote: >> >> >> 14.09.2017 02:38, Ian Kent пишет: >>> On 01/09/17 19:21, Stanislav Kinsburskiy wrote: >>>> Signed-off-by: Stanislav Kinsburskiy >>>> --- >>>> fs/autofs4/autofs_i.h |3 +++ >>>> fs/autofs4/dev-ioctl.c |3 +++ >>>> fs/autofs4/inode.c |4 +++- >>>> 3 files changed, 9 insertions(+), 1 deletion(-) >>>> >>>> diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h >>>> index 4737615..3da105f 100644 >>>> --- a/fs/autofs4/autofs_i.h >>>> +++ b/fs/autofs4/autofs_i.h >>>> @@ -120,6 +120,9 @@ struct autofs_sb_info { >>>>struct list_head active_list; >>>>struct list_head expiring_list; >>>>struct rcu_head rcu; >>>> +#ifdef CONFIG_COMPAT >>>> + unsigned is32bit:1; >>>> +#endif >>>> }; >>>> >>>> static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb) >>>> diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c >>>> index b7c816f..467d6c4 100644 >>>> --- a/fs/autofs4/dev-ioctl.c >>>> +++ b/fs/autofs4/dev-ioctl.c >>>> @@ -397,6 +397,9 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, >>>>sbi->pipefd = pipefd; >>>>sbi->pipe = pipe; >>>>sbi->catatonic = 0; >>>> +#ifdef CONFIG_COMPAT >>>> + sbi->is32bit = is_compat_task(); >>>> +#endif >>>>} >>>> out: >>>>put_pid(new_pid); >>>> diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c >>>> index 09e7d68..21d3c0b 100644 >>>> --- a/fs/autofs4/inode.c >>>> +++ b/fs/autofs4/inode.c >>>> @@ -301,7 +301,9 @@ int autofs4_fill_super(struct super_block *s, void >>>> *data, int silent) >>>>} else { >>>>sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); >>>>} >>>> - >>>> +#ifdef CONFIG_COMPAT >>>> + sbi->is32bit = is_compat_task(); >>>> +#endif >>>>if (autofs_type_trigger(sbi->type)) >>>>__managed_dentry_set_managed(root); >>>> >>>> >>> >>> Not sure about this. >>> >>> Don't you think it would be better to avoid the in code #ifdefs by doing >>> some >>> checks and defines in the header file and defining what's need to just use >>> is_compat_task(). >>> >> >> Yes, might be... >> >>> Not sure 2 patches are needed for this either .. >>> >> >> Well, I found this issue occasionally. > > I'm wondering what the symptoms are? > Size of struct autofs_v5_packet is 300 bytes for x86 and 304 bytes for x86_64. Which means, that 32bit task can read more than size of autofs_v5_packet on 64bit kernel. >> And, frankly speaking, it's not clear to me, whether this issue is important >> at all, so I wanted to clarify this first. >> Thanks to O_DIRECT, the only way to catch the issue is to try to read more, >> than expected, in compat task (that's how I found it). > > Right, the O_DIRECT patch from Linus was expected to fix the structure > alignment problem. The stuct field offsets are ok aren't they? > Yes, they are ok. >> I don't see any other flaw so far. And if so, that, probably, we shouldn't >> care about the issue at all. >> What do you think? > > If we are seeing hangs, incorrect struct fields or similar something > should be done about it but if all is actually working ok then the > O_DIRECT fix is doing it's job and further changes aren't necessary. > Well, yes. O_DIRECT fix covers the issue. Ok then. Thanks for the clarification! > Ian > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [RFC PATCH 1/2] autofs: set compat flag on sbi when daemon uses 32bit addressation
14.09.2017 02:38, Ian Kent пишет: > On 01/09/17 19:21, Stanislav Kinsburskiy wrote: >> Signed-off-by: Stanislav Kinsburskiy >> --- >> fs/autofs4/autofs_i.h |3 +++ >> fs/autofs4/dev-ioctl.c |3 +++ >> fs/autofs4/inode.c |4 +++- >> 3 files changed, 9 insertions(+), 1 deletion(-) >> >> diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h >> index 4737615..3da105f 100644 >> --- a/fs/autofs4/autofs_i.h >> +++ b/fs/autofs4/autofs_i.h >> @@ -120,6 +120,9 @@ struct autofs_sb_info { >> struct list_head active_list; >> struct list_head expiring_list; >> struct rcu_head rcu; >> +#ifdef CONFIG_COMPAT >> +unsigned is32bit:1; >> +#endif >> }; >> >> static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb) >> diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c >> index b7c816f..467d6c4 100644 >> --- a/fs/autofs4/dev-ioctl.c >> +++ b/fs/autofs4/dev-ioctl.c >> @@ -397,6 +397,9 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, >> sbi->pipefd = pipefd; >> sbi->pipe = pipe; >> sbi->catatonic = 0; >> +#ifdef CONFIG_COMPAT >> +sbi->is32bit = is_compat_task(); >> +#endif >> } >> out: >> put_pid(new_pid); >> diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c >> index 09e7d68..21d3c0b 100644 >> --- a/fs/autofs4/inode.c >> +++ b/fs/autofs4/inode.c >> @@ -301,7 +301,9 @@ int autofs4_fill_super(struct super_block *s, void >> *data, int silent) >> } else { >> sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); >> } >> - >> +#ifdef CONFIG_COMPAT >> +sbi->is32bit = is_compat_task(); >> +#endif >> if (autofs_type_trigger(sbi->type)) >> __managed_dentry_set_managed(root); >> >> > > Not sure about this. > > Don't you think it would be better to avoid the in code #ifdefs by doing some > checks and defines in the header file and defining what's need to just use > is_compat_task(). > Yes, might be... > Not sure 2 patches are needed for this either .. > Well, I found this issue occasionally. And, frankly speaking, it's not clear to me, whether this issue is important at all, so I wanted to clarify this first. Thanks to O_DIRECT, the only way to catch the issue is to try to read more, than expected, in compat task (that's how I found it). I don't see any other flaw so far. And if so, that, probably, we shouldn't care about the issue at all. What do you think? > Ian > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] sysfs: don't use VE namespace for "dev/char" directory
Looks like we don't have devices, using it. Or lost some patches. Anyway, currently it doesn't work. To make this VE namespace work, "namespace" callback has to be added to ve_kobj_type. And this callback has to get VE structure from kobject somehow. IOW, device has to be allocated per-ve. Another issue here is that it's unclear, how to mark generic devices as allocated in VE#0 thus making them visible in VE#0. Looks like the whole idea was wrong... https://jira.sw.ru/browse/PSBM-71811 Signed-off-by: Stanislav Kinsburskiy --- drivers/base/core.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index 80cf3eb..86540d9 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1538,7 +1538,7 @@ int __init devices_init(void) sysfs_dev_block_kobj = kobject_create_and_add("block", dev_kobj); if (!sysfs_dev_block_kobj) goto block_kobj_err; - sysfs_dev_char_kobj = kobject_create_and_add_ve("char", dev_kobj); + sysfs_dev_char_kobj = kobject_create_and_add("char", dev_kobj); if (!sysfs_dev_char_kobj) goto char_kobj_err; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH] zdtm: fix package memory allocation in autofs.c
No, we don't. But this one could be applied: "tests: do not try to read more than packet in AutoFS test" 09.09.2017 02:01, Andrei Vagin пишет: > On Thu, Aug 31, 2017 at 12:10:40PM +0300, Stanislav Kinsburskiy wrote: >> Plus some cleanup. > > Do we need this patch for the upstream criu? Could you send it to > the criu mailing list? > >> >> https://jira.sw.ru/browse/PSBM-71078 >> >> Signed-off-by: Stanislav Kinsburskiy >> --- >> test/zdtm/static/autofs.c | 18 +- >> 1 file changed, 9 insertions(+), 9 deletions(-) >> >> diff --git a/test/zdtm/static/autofs.c b/test/zdtm/static/autofs.c >> index 8d917ee..882289f 100644 >> --- a/test/zdtm/static/autofs.c >> +++ b/test/zdtm/static/autofs.c >> @@ -460,10 +460,10 @@ static int automountd_loop(int pipe, const char >> *mountpoint, struct autofs_param >> { >> union autofs_v5_packet_union *packet; >> ssize_t bytes; >> -size_t psize = sizeof(*packet) * 2; >> +size_t psize = sizeof(*packet); >> int err = 0; >> >> -packet = malloc(psize); >> +packet = malloc(psize * 2); >> if (!packet) { >> pr_err("failed to allocate autofs packet\n"); >> return -ENOMEM; >> @@ -473,7 +473,7 @@ static int automountd_loop(int pipe, const char >> *mountpoint, struct autofs_param >> siginterrupt(SIGUSR2, 1); >> >> while (!stop && !err) { >> -memset(packet, 0, sizeof(*packet)); >> +memset(packet, 0, psize * 2); >> >> bytes = read(pipe, packet, psize); >> if (bytes < 0) { >> @@ -483,12 +483,12 @@ static int automountd_loop(int pipe, const char >> *mountpoint, struct autofs_param >> } >> continue; >> } >> -if (bytes > psize) { >> -pr_err("read more that expected: %zd > %zd\n", bytes, >> psize); >> -return -EINVAL; >> -} >> -if (bytes != sizeof(*packet)) { >> -pr_err("read less than expected: %zd\n", bytes); >> +if (bytes != psize) { >> +pr_err("read %s that expected: %zd %s %zd\n", >> +(bytes > psize) ? "more" : "less", >> +bytes, >> +(bytes > psize) ? ">" : "<", >> +psize); >> return -EINVAL; >> } >> err = automountd_serve(mountpoint, param, packet); >> >> ___ >> Devel mailing list >> Devel@openvz.org >> https://lists.openvz.org/mailman/listinfo/devel ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] tcp: fix 0 divide in __tcp_select_window()
From: Eric Dumazet syszkaller fuzzer was able to trigger a divide by zero, when TCP window scaling is not enabled. SO_RCVBUF can be used not only to increase sk_rcvbuf, also to decrease it below current receive buffers utilization. If mss is negative or 0, just return a zero TCP window. https://jira.sw.ru/browse/PSBM-71285 Back-port of ML commit: 06425c308b92eaf60767bc71d359f4cbc7a561f8 Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Acked-by: Neal Cardwell Signed-off-by: David S. Miller Signed-off-by: Stanislav Kinsburskiy --- net/ipv4/tcp_output.c |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f395a09..3d24481 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2295,9 +2295,11 @@ u32 __tcp_select_window(struct sock *sk) int full_space = min_t(int, tp->window_clamp, allowed_space); int window; - if (mss > full_space) + if (unlikely(mss > full_space)) { mss = full_space; - + if (mss <= 0) + return 0; + } if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [RFC PATCH 2/2] autofs: sent 32-bit sized packet for 32-bit process
03.09.2017 21:23, Dmitry V. Levin P?P8QP5Q: > On Sat, Sep 02, 2017 at 02:26:26PM +0300, Stanislav Kinsburskiy wrote: >> 01.09.2017 21:23, Dmitry V. Levin P?P8QP5Q: >>> On Fri, Sep 01, 2017 at 05:02:45PM +0300, Stanislav Kinsburskiy wrote: >>>> 01.09.2017 16:53, Dmitry V. Levin P?P8QP5Q: >>>>> On Fri, Sep 01, 2017 at 12:15:17PM +0300, Stanislav Kinsburskiy wrote: >>>>>> 31.08.2017 20:22, Dmitry V. Levin P?P8QP5Q: >>>>>>> On Thu, Aug 31, 2017 at 05:57:11PM +0400, Stanislav Kinsburskiy wrote: >>>>>>>> The structure autofs_v5_packet (except name) is not aligned by 8 >>>>>>>> bytes, which >>>>>>>> lead to different sizes in 32 and 64-bit architectures. >>>>>>>> Let's form 32-bit compatible packet when daemon has 32-bit >>>>>>>> addressation. >>>>>>>> >>>>>>>> Signed-off-by: Stanislav Kinsburskiy >>>>>>>> --- >>>>>>>> fs/autofs4/waitq.c | 11 +-- >>>>>>>> 1 file changed, 9 insertions(+), 2 deletions(-) >>>>>>>> >>>>>>>> diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c >>>>>>>> index 309ca6b..484cf2e 100644 >>>>>>>> --- a/fs/autofs4/waitq.c >>>>>>>> +++ b/fs/autofs4/waitq.c >>>>>>>> @@ -153,12 +153,19 @@ static void autofs4_notify_daemon(struct >>>>>>>> autofs_sb_info *sbi, >>>>>>>>{ >>>>>>>>struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; >>>>>>>>struct user_namespace *user_ns = >>>>>>>> sbi->pipe->f_cred->user_ns; >>>>>>>> + size_t name_offset; >>>>>>>> >>>>>>>> - pktsz = sizeof(*packet); >>>>>>>> + if (sbi->is32bit) >>>>>>>> + name_offset = offsetof(struct autofs_v5_packet, >>>>>>>> len) + >>>>>>>> +sizeof(packet->len); >>>>>>>> + else >>>>>>>> + name_offset = offsetof(struct autofs_v5_packet, >>>>>>>> name); >>>>>>> >>>>>>> This doesn't help at all because the offset of struct >>>>>>> autofs_v5_packet.name >>>>>>> does not change. >>>>>>> >>>>>>>> + pktsz = name_offset + sizeof(packet->name); >>>>>>> >>>>>>> What changes is pktsz: it's either sizeof(struct autofs_v5_packet) >>>>>>> or 4 bytes less, depending on the architecture. >>>>>> >>>>>> Indeed. Thanks! >>>>>> >>>>>>> For example, >>>>>>> >>>>>>> #ifdef CONFIG_COMPAT >>>>>>> if (__alignof__(compat_u64) < __alignof__(u64) && sbi->is32bit) >>>>>> >>>>>> Unfortunately I don't get this "alignof" checks. >>>>>> The intention of "is32bit" was to define this compat case completely. >>>>>> Why are they required? >>>>> >>>>> On some 32-bit architectures like arm, u64 is 64-bit aligned, on others >>>>> like x86 it is not. This alignof check ensures that compat 32-bit >>>>> architectures with 64-bit alignment are not going to be broken >>>>> by the change. >>>> >>>> Thanks for the explanation! >>>> But looks like the issue is hidden so deep (thanks to O_DIRECT), that >>>> becomes unimportant. >>> >>> I'm not quite sure how O_DIRECT makes this unimportant. >>> For example, automount_dispatch_io from systemd tries to read exactly >>> sizeof(union autofs_v5_packet_union) bytes and fails in case of short read. >> >> That's weird. Can you reproduce it? >> The problem is that kernel always send arch packet (i.e. 300 bytes for x86, >> or 304 for x86_64). > > Well, if sending a lengthy packet is not a problem, what is the problem > you were trying to solve by patching struct autofs_v5_packet? > Well, nothing anymore. I took some time to realize, that this issue doesn't hurt anyone. So I fixed our test instead. ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [RFC PATCH 2/2] autofs: sent 32-bit sized packet for 32-bit process
01.09.2017 21:23, Dmitry V. Levin P?P8QP5Q: > On Fri, Sep 01, 2017 at 05:02:45PM +0300, Stanislav Kinsburskiy wrote: >> >> >> 01.09.2017 16:53, Dmitry V. Levin P?P8QP5Q: >>> On Fri, Sep 01, 2017 at 12:15:17PM +0300, Stanislav Kinsburskiy wrote: >>>> 31.08.2017 20:22, Dmitry V. Levin P?P8QP5Q: >>>>> On Thu, Aug 31, 2017 at 05:57:11PM +0400, Stanislav Kinsburskiy wrote: >>>>>> The structure autofs_v5_packet (except name) is not aligned by 8 bytes, >>>>>> which >>>>>> lead to different sizes in 32 and 64-bit architectures. >>>>>> Let's form 32-bit compatible packet when daemon has 32-bit addressation. >>>>>> >>>>>> Signed-off-by: Stanislav Kinsburskiy >>>>>> --- >>>>>> fs/autofs4/waitq.c | 11 +-- >>>>>> 1 file changed, 9 insertions(+), 2 deletions(-) >>>>>> >>>>>> diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c >>>>>> index 309ca6b..484cf2e 100644 >>>>>> --- a/fs/autofs4/waitq.c >>>>>> +++ b/fs/autofs4/waitq.c >>>>>> @@ -153,12 +153,19 @@ static void autofs4_notify_daemon(struct >>>>>> autofs_sb_info *sbi, >>>>>> { >>>>>> struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; >>>>>> struct user_namespace *user_ns = >>>>>> sbi->pipe->f_cred->user_ns; >>>>>> +size_t name_offset; >>>>>> >>>>>> -pktsz = sizeof(*packet); >>>>>> +if (sbi->is32bit) >>>>>> +name_offset = offsetof(struct autofs_v5_packet, >>>>>> len) + >>>>>> + sizeof(packet->len); >>>>>> +else >>>>>> +name_offset = offsetof(struct autofs_v5_packet, >>>>>> name); >>>>> >>>>> This doesn't help at all because the offset of struct >>>>> autofs_v5_packet.name >>>>> does not change. >>>>> >>>>>> +pktsz = name_offset + sizeof(packet->name); >>>>> >>>>> What changes is pktsz: it's either sizeof(struct autofs_v5_packet) >>>>> or 4 bytes less, depending on the architecture. >>>> >>>> Indeed. Thanks! >>>> >>>>> For example, >>>>> >>>>> #ifdef CONFIG_COMPAT >>>>> if (__alignof__(compat_u64) < __alignof__(u64) && sbi->is32bit) >>>> >>>> Unfortunately I don't get this "alignof" checks. >>>> The intention of "is32bit" was to define this compat case completely. >>>> Why are they required? >>> >>> On some 32-bit architectures like arm, u64 is 64-bit aligned, on others >>> like x86 it is not. This alignof check ensures that compat 32-bit >>> architectures with 64-bit alignment are not going to be broken >>> by the change. >> >> Thanks for the explanation! >> But looks like the issue is hidden so deep (thanks to O_DIRECT), that >> becomes unimportant. > > I'm not quite sure how O_DIRECT makes this unimportant. > For example, automount_dispatch_io from systemd tries to read exactly > sizeof(union autofs_v5_packet_union) bytes and fails in case of short read. > That's weird. Can you reproduce it? The problem is that kernel always send arch packet (i.e. 300 bytes for x86, or 304 for x86_64). So, how can systemd get less? It can get more, but only if it'll try to read more. Otherwise, as "man pipe(2)" says: * If a read(2) specifies a buffer size that is smaller than the next packet, then the requested number of bytes are read, and the excess bytes in the packet are discarded. > > > > ___ > Devel mailing list > Devel@openvz.org > https://lists.openvz.org/mailman/listinfo/devel > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [RFC PATCH 2/2] autofs: sent 32-bit sized packet for 32-bit process
01.09.2017 16:53, Dmitry V. Levin P?P8QP5Q: > On Fri, Sep 01, 2017 at 12:15:17PM +0300, Stanislav Kinsburskiy wrote: >> 31.08.2017 20:22, Dmitry V. Levin P?P8QP5Q: >>> On Thu, Aug 31, 2017 at 05:57:11PM +0400, Stanislav Kinsburskiy wrote: >>>> The structure autofs_v5_packet (except name) is not aligned by 8 bytes, >>>> which >>>> lead to different sizes in 32 and 64-bit architectures. >>>> Let's form 32-bit compatible packet when daemon has 32-bit addressation. >>>> >>>> Signed-off-by: Stanislav Kinsburskiy >>>> --- >>>> fs/autofs4/waitq.c | 11 +-- >>>> 1 file changed, 9 insertions(+), 2 deletions(-) >>>> >>>> diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c >>>> index 309ca6b..484cf2e 100644 >>>> --- a/fs/autofs4/waitq.c >>>> +++ b/fs/autofs4/waitq.c >>>> @@ -153,12 +153,19 @@ static void autofs4_notify_daemon(struct >>>> autofs_sb_info *sbi, >>>>{ >>>>struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; >>>>struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns; >>>> + size_t name_offset; >>>> >>>> - pktsz = sizeof(*packet); >>>> + if (sbi->is32bit) >>>> + name_offset = offsetof(struct autofs_v5_packet, len) + >>>> +sizeof(packet->len); >>>> + else >>>> + name_offset = offsetof(struct autofs_v5_packet, name); >>> >>> This doesn't help at all because the offset of struct autofs_v5_packet.name >>> does not change. >>> >>>> + pktsz = name_offset + sizeof(packet->name); >>> >>> What changes is pktsz: it's either sizeof(struct autofs_v5_packet) >>> or 4 bytes less, depending on the architecture. >> >> Indeed. Thanks! >> >>> For example, >>> >>> #ifdef CONFIG_COMPAT >>> if (__alignof__(compat_u64) < __alignof__(u64) && sbi->is32bit) >> >> Unfortunately I don't get this "alignof" checks. >> The intention of "is32bit" was to define this compat case completely. >> Why are they required? > > On some 32-bit architectures like arm, u64 is 64-bit aligned, on others > like x86 it is not. This alignof check ensures that compat 32-bit > architectures with 64-bit alignment are not going to be broken > by the change. > Thanks for the explanation! But looks like the issue is hidden so deep (thanks to O_DIRECT), that becomes unimportant. > > > > ___ > Devel mailing list > Devel@openvz.org > https://lists.openvz.org/mailman/listinfo/devel > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] zdtm: fix autofs to work with autofs_v5_packet_union issue
The only patch in the series discards the following CRIU commits: 27ce0613e7319c97cc7bfc0a240e3f5f61f3d912 e90b5ed57af866010bc8f5c7c9730aa68995cb77 2fc59ffe3ead2eff44542a415f16b1559e2c8140 The following series implements... --- Stanislav Kinsburskiy (1): tests: do not try to read more than packet in AutoFS test test/zdtm/static/autofs.c | 13 + 1 file changed, 5 insertions(+), 8 deletions(-) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] tests: do not try to read more than packet in AutoFS test
The intention was to make sure, that only one packet is sent at a time. And thus read has to return exactly the size of one packet. But it doesnt' work as expected, because size of autofs_v5_packet_union differs on 32 bit and 64 bit architectures. This is a bug, but it's hidden so deeply, that no one really cares by the following 2 aspects: 1) Autofs pipe has O_DIRECT flag, which means excess bytes will be discarded upon read. 2) No one tries to read more than one packet at a time. So let's fix the test instead and do not try to read more bytes, than expected. https://jira.sw.ru/browse/PSBM-71078 Signed-off-by: Stanislav Kinsburskiy --- test/zdtm/static/autofs.c | 13 + 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/test/zdtm/static/autofs.c b/test/zdtm/static/autofs.c index 747ab69..ae78538 100644 --- a/test/zdtm/static/autofs.c +++ b/test/zdtm/static/autofs.c @@ -460,7 +460,7 @@ static int automountd_loop(int pipe, const char *mountpoint, struct autofs_param { union autofs_v5_packet_union *packet; ssize_t bytes; - size_t psize = sizeof(*packet) + 1; + size_t psize = sizeof(*packet); int err = 0; packet = malloc(psize); @@ -473,7 +473,7 @@ static int automountd_loop(int pipe, const char *mountpoint, struct autofs_param siginterrupt(SIGUSR2, 1); while (!stop && !err) { - memset(packet, 0, sizeof(*packet)); + memset(packet, 0, psize); bytes = read(pipe, packet, psize); if (bytes < 0) { @@ -483,12 +483,9 @@ static int automountd_loop(int pipe, const char *mountpoint, struct autofs_param } continue; } - if (bytes == psize) { - pr_err("read more that expected\n"); - return -EINVAL; - } - if (bytes != sizeof(*packet)) { - pr_err("read less than expected: %zd\n", bytes); + if (bytes != psize) { + pr_err("read less than expected: %zd < %zd\n", + bytes, psize); return -EINVAL; } err = automountd_serve(mountpoint, param, packet); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [RFC PATCH 2/2] autofs: sent 32-bit sized packet for 32-bit process
The structure autofs_v5_packet (except name) is not aligned by 8 bytes, which leads to different sizes in 32 and 64-bit architectures. Let's form 32-bit compatible packet when daemon has 32-bit addressation. Suggested-by: Dmitry V. Levin Signed-off-by: Stanislav Kinsburskiy --- fs/autofs4/waitq.c |5 + 1 file changed, 5 insertions(+) diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 24a58bf..1f9b7d8 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -151,6 +151,11 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns; +#ifdef CONFIG_COMPAT + if (sbi->is32bit) + pktsz = offsetofend(struct autofs_v5_packet, name); + else +#endif pktsz = sizeof(*packet); packet->wait_queue_token = wq->wait_queue_token; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [RFC PATCH 1/2] autofs: set compat flag on sbi when daemon uses 32bit addressation
Signed-off-by: Stanislav Kinsburskiy --- fs/autofs4/autofs_i.h |3 +++ fs/autofs4/dev-ioctl.c |3 +++ fs/autofs4/inode.c |4 +++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 4737615..3da105f 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -120,6 +120,9 @@ struct autofs_sb_info { struct list_head active_list; struct list_head expiring_list; struct rcu_head rcu; +#ifdef CONFIG_COMPAT + unsigned is32bit:1; +#endif }; static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb) diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index b7c816f..467d6c4 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -397,6 +397,9 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, sbi->pipefd = pipefd; sbi->pipe = pipe; sbi->catatonic = 0; +#ifdef CONFIG_COMPAT + sbi->is32bit = is_compat_task(); +#endif } out: put_pid(new_pid); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 09e7d68..21d3c0b 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -301,7 +301,9 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) } else { sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); } - +#ifdef CONFIG_COMPAT + sbi->is32bit = is_compat_task(); +#endif if (autofs_type_trigger(sbi->type)) __managed_dentry_set_managed(root); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [RFC PATCH 0/2] autofs: fix autofs_v5_packet dlivery in compat mode
The problem is that in compat mode struct autofs_v5_packet has to have different size (i.e. 4 bytes less). This is RFC because: 1) This issue is hidden, because autofs pipe has O_DIRECT and the rest of the epacket is truncated when read. 2) X86 arch doesn't have is_compat_task() helper 3) It's now clear, what to do if "pgrp" option is specified. The following series implements... --- Stanislav Kinsburskiy (2): autofs: set compat flag on sbi when daemon uses 32bit addressation autofs: sent 32-bit sized packet for 32-bit process fs/autofs4/autofs_i.h |3 +++ fs/autofs4/dev-ioctl.c |3 +++ fs/autofs4/inode.c |4 +++- fs/autofs4/waitq.c |5 + 4 files changed, 14 insertions(+), 1 deletion(-) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] autofs: fix leaked pid on error path in autofs4_fill_super
Check for protocol happens after pid get. Signed-off-by: Stanislav Kinsburskiy --- fs/autofs4/inode.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 0ba9c02..2cd4e7e 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -298,7 +298,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) "daemon (%d, %d) kernel (%d, %d)\n", sbi->min_proto, sbi->max_proto, AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); - goto fail_dput; + goto fail_put_pid; } /* Establish highest kernel protocol version */ ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] autofs: fix double pid put in error path
Signed-off-by: Stanislav Kinsburskiy --- fs/autofs4/inode.c |1 - 1 file changed, 1 deletion(-) diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index b23cf2a..0ba9c02 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -343,7 +343,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) fail_ino: kfree(ino); fail_free: - put_pid(sbi->oz_pgrp); kfree(sbi); s->s_fs_info = NULL; return ret; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [RFC PATCH 2/2] autofs: sent 32-bit sized packet for 32-bit process
31.08.2017 20:22, Dmitry V. Levin пишет: > On Thu, Aug 31, 2017 at 05:57:11PM +0400, Stanislav Kinsburskiy wrote: >> The structure autofs_v5_packet (except name) is not aligned by 8 bytes, which >> lead to different sizes in 32 and 64-bit architectures. >> Let's form 32-bit compatible packet when daemon has 32-bit addressation. >> >> Signed-off-by: Stanislav Kinsburskiy >> --- >> fs/autofs4/waitq.c | 11 +-- >> 1 file changed, 9 insertions(+), 2 deletions(-) >> >> diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c >> index 309ca6b..484cf2e 100644 >> --- a/fs/autofs4/waitq.c >> +++ b/fs/autofs4/waitq.c >> @@ -153,12 +153,19 @@ static void autofs4_notify_daemon(struct >> autofs_sb_info *sbi, >> { >> struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; >> struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns; >> +size_t name_offset; >> >> -pktsz = sizeof(*packet); >> +if (sbi->is32bit) >> +name_offset = offsetof(struct autofs_v5_packet, len) + >> + sizeof(packet->len); >> +else >> +name_offset = offsetof(struct autofs_v5_packet, name); > > This doesn't help at all because the offset of struct autofs_v5_packet.name > does not change. > >> +pktsz = name_offset + sizeof(packet->name); > > What changes is pktsz: it's either sizeof(struct autofs_v5_packet) > or 4 bytes less, depending on the architecture. Indeed. Thanks! > For example, > > #ifdef CONFIG_COMPAT > if (__alignof__(compat_u64) < __alignof__(u64) && sbi->is32bit) Unfortunately I don't get this "alignof" checks. The intention of "is32bit" was to define this compat case completely. Why are they required? > pktsz = offsetofend(struct autofs_v5_packet, name); > else > #endif > pktsz = sizeof(*packet); > > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [RFC PATCH 2/2] autofs: sent 32-bit sized packet for 32-bit process
The structure autofs_v5_packet (except name) is not aligned by 8 bytes, which lead to different sizes in 32 and 64-bit architectures. Let's form 32-bit compatible packet when daemon has 32-bit addressation. Signed-off-by: Stanislav Kinsburskiy --- fs/autofs4/waitq.c | 11 +-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 309ca6b..484cf2e 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -153,12 +153,19 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, { struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns; + size_t name_offset; - pktsz = sizeof(*packet); + if (sbi->is32bit) + name_offset = offsetof(struct autofs_v5_packet, len) + + sizeof(packet->len); + else + name_offset = offsetof(struct autofs_v5_packet, name); + + pktsz = name_offset + sizeof(packet->name); packet->wait_queue_token = wq->wait_queue_token; packet->len = wq->name.len; - memcpy(packet->name, wq->name.name, wq->name.len); + memcpy(packet + name_offset, wq->name.name, wq->name.len); packet->name[wq->name.len] = '\0'; packet->dev = wq->dev; packet->ino = wq->ino; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [RFC PATCH 1/2] autofs: set compat flag on sbi when daemon uses 32bit addressation
Signed-off-by: Stanislav Kinsburskiy --- fs/autofs4/inode.c | 16 1 file changed, 16 insertions(+) diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index b23cf2a..989ac38 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -217,6 +217,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) int pgrp; bool pgrp_set = false; int ret = -EINVAL; + struct task_struct *tsk; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -281,10 +282,25 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) pgrp); goto fail_dput; } + tsk = get_pid_task(sbi->oz_pgrp, PIDTYPE_PGID); + if (!tsk) { + pr_warn("autofs: could not find process group leader %d\n", + pgrp); + goto fail_put_pid; + } } else { sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); + get_task_struct(current); + tsk = current; } + if (test_tsk_thread_flag(tsk, TIF_ADDR32)) + sbi->is32bit = 1; + else + sbi->is32bit = 0; + + put_task_struct(tsk); + if (autofs_type_trigger(sbi->type)) __managed_dentry_set_managed(root); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [RFC PATCH 0/2] autofs: add "compat" support
The idea is simple: reduce autofs_v5_packet for 32bit damon on 64bit architectures. --- Stanislav Kinsburskiy (2): autofs: set compat flag on sbi when daemon uses 32bit addressation autofs: sent 32-bit sized packet for 32-bit process fs/autofs4/inode.c | 16 fs/autofs4/waitq.c | 11 +-- 2 files changed, 25 insertions(+), 2 deletions(-) -- ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH] autofs: fix autofs_v5_packet structure for compat mode
31.08.2017 15:05, Dmitry V. Levin пишет: > On Thu, Aug 31, 2017 at 02:40:23PM +0300, Dmitry V. Levin wrote: >> On Thu, Aug 31, 2017 at 01:48:27PM +0300, Stanislav Kinsburskiy wrote: >>> >>> >>> 31.08.2017 13:38, Dmitry V. Levin пишет: >>>> On Thu, Aug 31, 2017 at 02:11:34PM +0400, Stanislav Kinsburskiy wrote: >>>>> Due to integer variables alignment size of struct autofs_v5_packet in 300 >>>>> bytes in 32-bit architectures (instead of 304 bytes in 64-bits >>>>> architectures). >>>>> >>>>> This may lead to memory corruption (64 bits kernel always send 304 bytes, >>>>> while 32-bit userspace application expects for 300). >>>>> >>>>> https://jira.sw.ru/browse/PSBM-71078 >>>>> >>>>> Signed-off-by: Stanislav Kinsburskiy >>>>> --- >>>>> include/uapi/linux/auto_fs4.h |2 ++ >>>>> 1 file changed, 2 insertions(+) >>>>> >>>>> diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h >>>>> index e02982f..8729a47 100644 >>>>> --- a/include/uapi/linux/auto_fs4.h >>>>> +++ b/include/uapi/linux/auto_fs4.h >>>>> @@ -137,6 +137,8 @@ struct autofs_v5_packet { >>>>> __u32 pid; >>>>> __u32 tgid; >>>>> __u32 len; >>>>> + __u32 blob; /* This is needed to align structure up to 8 >>>>> +bytes for ALL archs including 32-bit */ >>>>> char name[NAME_MAX+1]; >>>>> }; >>>> >>>> This change breaks ABI because it changes offsetof(struct >>>> autofs_v5_packet, name). >>>> If you need to fix the alignment, use __attribute__((aligned(8))). >>>> >>> >>> Nice to know you're watching. >>> Yes, attribute is better. >>> But how ABI is broken? On x86_64 this alignment is implied, so nothing is >>> changed. >> >> Your change increases offsetof(struct autofs_v5_packet, name) by 4 on all >> architectures. On architectures where the structure is 32-bit aligned >> this also leads to increase of its size by 4. >> >>>> An alignment change would also be an ABI breakage on 32-bit architectures, >>>> though. >>>> >>> >>> True. >>> But from my POW better have it working on 64bit archs for 32bit apps. >>> But anyway, upstream guys will device, whether they want 32-bit autofs >>> applications properly work on 64 or 32 bits. >> >> Let's fix old bugs without introducing new bugs. >> The right fix here seems to be a compat structure, that is, both 64-bit >> and 32-bit kernels should send the same 32-bit aligned structure, and >> it has to be the same structure sent by traditional 32-bit kernels. > > Alternatively, a much more simple fix would be to change 64-bit kernels > not to send the trailing 4 padding bytes of 64-bit aligned > struct autofs_v5_packet. That is, just send > offsetofend(struct autofs_v5_packet, name) bytes instead of > sizeof(struct autofs_v5_packet) regardless of architecture. > Fair enough, thanks! But this approach won't work, because autofs pipe has O_DIRECT flag. Compat structure looks more promising, but not yet clear to me, how to define one properly. Probably by replacing "len" with char array like this: /* autofs v5 common packet struct */ struct autofs_v5_packet { struct autofs_packet_hdr hdr; autofs_wqt_t wait_queue_token; __u32 dev; __u64 ino; __u32 uid; __u32 gid; __u32 pid; __u32 tgid; __u8 len[4]; char name[NAME_MAX+1]; }; What do you think? ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH] autofs: fix autofs_v5_packet structure for compat mode
31.08.2017 13:38, Dmitry V. Levin пишет: > On Thu, Aug 31, 2017 at 02:11:34PM +0400, Stanislav Kinsburskiy wrote: >> Due to integer variables alignment size of struct autofs_v5_packet in 300 >> bytes in 32-bit architectures (instead of 304 bytes in 64-bits >> architectures). >> >> This may lead to memory corruption (64 bits kernel always send 304 bytes, >> while 32-bit userspace application expects for 300). >> >> https://jira.sw.ru/browse/PSBM-71078 >> >> Signed-off-by: Stanislav Kinsburskiy >> --- >> include/uapi/linux/auto_fs4.h |2 ++ >> 1 file changed, 2 insertions(+) >> >> diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h >> index e02982f..8729a47 100644 >> --- a/include/uapi/linux/auto_fs4.h >> +++ b/include/uapi/linux/auto_fs4.h >> @@ -137,6 +137,8 @@ struct autofs_v5_packet { >> __u32 pid; >> __u32 tgid; >> __u32 len; >> +__u32 blob; /* This is needed to align structure up to 8 >> + bytes for ALL archs including 32-bit */ >> char name[NAME_MAX+1]; >> }; > > This change breaks ABI because it changes offsetof(struct autofs_v5_packet, > name). > If you need to fix the alignment, use __attribute__((aligned(8))). > Nice to know you're watching. Yes, attribute is better. But how ABI is broken? On x86_64 this alignment is implied, so nothing is changed. > An alignment change would also be an ABI breakage on 32-bit architectures, > though. > True. But from my POW better have it working on 64bit archs for 32bit apps. But anyway, upstream guys will device, whether they want 32-bit autofs applications properly work on 64 or 32 bits. ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH] autofs: fix autofs_v5_packet structure for compat mode
Yes 31.08.2017 13:37, Konstantin Khorenko пишет: > Will you send it to mainstream as well? > > -- > Best regards, > > Konstantin Khorenko, > Virtuozzo Linux Kernel Team > > On 08/31/2017 01:11 PM, Stanislav Kinsburskiy wrote: >> Due to integer variables alignment size of struct autofs_v5_packet in 300 >> bytes in 32-bit architectures (instead of 304 bytes in 64-bits >> architectures). >> >> This may lead to memory corruption (64 bits kernel always send 304 bytes, >> while 32-bit userspace application expects for 300). >> >> https://jira.sw.ru/browse/PSBM-71078 >> >> Signed-off-by: Stanislav Kinsburskiy >> --- >> include/uapi/linux/auto_fs4.h |2 ++ >> 1 file changed, 2 insertions(+) >> >> diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h >> index e02982f..8729a47 100644 >> --- a/include/uapi/linux/auto_fs4.h >> +++ b/include/uapi/linux/auto_fs4.h >> @@ -137,6 +137,8 @@ struct autofs_v5_packet { >> __u32 pid; >> __u32 tgid; >> __u32 len; >> +__u32 blob;/* This is needed to align structure up to 8 >> + bytes for ALL archs including 32-bit */ >> char name[NAME_MAX+1]; >> }; >> >> >> . >> ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] autofs: fix autofs_v5_packet structure for compat mode
Due to integer variables alignment size of struct autofs_v5_packet in 300 bytes in 32-bit architectures (instead of 304 bytes in 64-bits architectures). This may lead to memory corruption (64 bits kernel always send 304 bytes, while 32-bit userspace application expects for 300). https://jira.sw.ru/browse/PSBM-71078 Signed-off-by: Stanislav Kinsburskiy --- include/uapi/linux/auto_fs4.h |2 ++ 1 file changed, 2 insertions(+) diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h index e02982f..8729a47 100644 --- a/include/uapi/linux/auto_fs4.h +++ b/include/uapi/linux/auto_fs4.h @@ -137,6 +137,8 @@ struct autofs_v5_packet { __u32 pid; __u32 tgid; __u32 len; + __u32 blob; /* This is needed to align structure up to 8 + bytes for ALL archs including 32-bit */ char name[NAME_MAX+1]; }; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] zdtm: fix package memory allocation in autofs.c
Plus some cleanup. https://jira.sw.ru/browse/PSBM-71078 Signed-off-by: Stanislav Kinsburskiy --- test/zdtm/static/autofs.c | 18 +- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/test/zdtm/static/autofs.c b/test/zdtm/static/autofs.c index 8d917ee..882289f 100644 --- a/test/zdtm/static/autofs.c +++ b/test/zdtm/static/autofs.c @@ -460,10 +460,10 @@ static int automountd_loop(int pipe, const char *mountpoint, struct autofs_param { union autofs_v5_packet_union *packet; ssize_t bytes; - size_t psize = sizeof(*packet) * 2; + size_t psize = sizeof(*packet); int err = 0; - packet = malloc(psize); + packet = malloc(psize * 2); if (!packet) { pr_err("failed to allocate autofs packet\n"); return -ENOMEM; @@ -473,7 +473,7 @@ static int automountd_loop(int pipe, const char *mountpoint, struct autofs_param siginterrupt(SIGUSR2, 1); while (!stop && !err) { - memset(packet, 0, sizeof(*packet)); + memset(packet, 0, psize * 2); bytes = read(pipe, packet, psize); if (bytes < 0) { @@ -483,12 +483,12 @@ static int automountd_loop(int pipe, const char *mountpoint, struct autofs_param } continue; } - if (bytes > psize) { - pr_err("read more that expected: %zd > %zd\n", bytes, psize); - return -EINVAL; - } - if (bytes != sizeof(*packet)) { - pr_err("read less than expected: %zd\n", bytes); + if (bytes != psize) { + pr_err("read %s that expected: %zd %s %zd\n", + (bytes > psize) ? "more" : "less", + bytes, + (bytes > psize) ? ">" : "<", + psize); return -EINVAL; } err = automountd_serve(mountpoint, param, packet); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] zdtm: fix autofs tes compilation
Variables type of size_t have to be printed with "%z" directive. https://jira.sw.ru/browse/PSBM-71041 Signed-off-by: Stanislav Kinsburskiy --- test/zdtm/static/autofs.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm/static/autofs.c b/test/zdtm/static/autofs.c index d854fd0..8d917ee 100644 --- a/test/zdtm/static/autofs.c +++ b/test/zdtm/static/autofs.c @@ -484,7 +484,7 @@ static int automountd_loop(int pipe, const char *mountpoint, struct autofs_param continue; } if (bytes > psize) { - pr_err("read more that expected: %ld > %ld\n", bytes, psize); + pr_err("read more that expected: %zd > %zd\n", bytes, psize); return -EINVAL; } if (bytes != sizeof(*packet)) { ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH rh7 2/2] mm/memcg: reclaim only kmem if kmem limit reached.
25.08.2017 18:38, Andrey Ryabinin пишет: > If kmem limit on memcg reached, we go into memory reclaim, > and reclaim everything we can, including page cache and anon. > Reclaiming page cache or anon won't help since we need to lower > only kmem usage. This patch fixes the problem by avoiding > non-kmem reclaim on hitting the kmem limit. > Can't there be a situation, when some object in anon mem or page cache holds some object in kmem (indirectly)? > https://jira.sw.ru/browse/PSBM-69226 > Signed-off-by: Andrey Ryabinin > --- > include/linux/memcontrol.h | 10 ++ > include/linux/swap.h | 2 +- > mm/memcontrol.c| 30 -- > mm/vmscan.c| 31 --- > 4 files changed, 51 insertions(+), 22 deletions(-) > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index 1a52e58ab7de..1d6bc80c4c90 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -45,6 +45,16 @@ struct mem_cgroup_reclaim_cookie { > unsigned int generation; > }; > > +/* > + * Reclaim flags for mem_cgroup_hierarchical_reclaim > + */ > +#define MEM_CGROUP_RECLAIM_NOSWAP_BIT0x0 > +#define MEM_CGROUP_RECLAIM_NOSWAP(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) > +#define MEM_CGROUP_RECLAIM_SHRINK_BIT0x1 > +#define MEM_CGROUP_RECLAIM_SHRINK(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) > +#define MEM_CGROUP_RECLAIM_KMEM_BIT 0x2 > +#define MEM_CGROUP_RECLAIM_KMEM (1 << > MEM_CGROUP_RECLAIM_KMEM_BIT) > + > #ifdef CONFIG_MEMCG > int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, > gfp_t gfp_mask, struct mem_cgroup **memcgp); > diff --git a/include/linux/swap.h b/include/linux/swap.h > index bd162f9bef0d..bd47451ec95a 100644 > --- a/include/linux/swap.h > +++ b/include/linux/swap.h > @@ -324,7 +324,7 @@ extern unsigned long try_to_free_pages(struct zonelist > *zonelist, int order, > extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); > extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, > unsigned long nr_pages, > - gfp_t gfp_mask, bool noswap); > + gfp_t gfp_mask, int flags); > extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, > gfp_t gfp_mask, bool noswap, > struct zone *zone, > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 97824e281d7a..f9a5f3819a31 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -511,16 +511,6 @@ enum res_type { > #define OOM_CONTROL (0) > > /* > - * Reclaim flags for mem_cgroup_hierarchical_reclaim > - */ > -#define MEM_CGROUP_RECLAIM_NOSWAP_BIT0x0 > -#define MEM_CGROUP_RECLAIM_NOSWAP(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) > -#define MEM_CGROUP_RECLAIM_SHRINK_BIT0x1 > -#define MEM_CGROUP_RECLAIM_SHRINK(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) > -#define MEM_CGROUP_RECLAIM_KMEM_BIT 0x2 > -#define MEM_CGROUP_RECLAIM_KMEM (1 << > MEM_CGROUP_RECLAIM_KMEM_BIT) > - > -/* > * The memcg_create_mutex will be held whenever a new cgroup is created. > * As a consequence, any change that needs to protect against new child > cgroups > * appearing has to hold it as well. > @@ -2137,7 +2127,7 @@ static unsigned long mem_cgroup_reclaim(struct > mem_cgroup *memcg, > if (loop) > drain_all_stock_async(memcg); > total += try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX, > - gfp_mask, noswap); > + gfp_mask, flags); > if (test_thread_flag(TIF_MEMDIE) || > fatal_signal_pending(current)) > return 1; > @@ -2150,6 +2140,16 @@ static unsigned long mem_cgroup_reclaim(struct > mem_cgroup *memcg, > break; > if (mem_cgroup_margin(memcg, flags & MEM_CGROUP_RECLAIM_KMEM)) > break; > + > + /* > + * Try harder to reclaim dcache. dcache reclaim may > + * temporarly fail due to dcache->dlock being held > + * by someone else. We must try harder to avoid premature > + * slab allocation failures. > + */ > + if (flags & MEM_CGROUP_RECLAIM_KMEM && > + page_counter_read(&memcg->dcache)) > + continue; > /* >* If nothing was reclaimed after two attempts, there >* may be no reclaimable pages in this hierarchy. > @@ -2778,11 +2778,13 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t > gfp_mask, bool kmem_charge > struct mem_cgroup *mem_over_limit; > struc
[Devel] [PATCH] zdtm: print autofs request size, if read more than expected
This is more debug patch, than fix. But valuable for debugging. https://jira.sw.ru/browse/PSBM-70345 Signed-off-by: Stanislav Kinsburskiy --- test/zdtm/static/autofs.c |6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/zdtm/static/autofs.c b/test/zdtm/static/autofs.c index 747ab69..d854fd0 100644 --- a/test/zdtm/static/autofs.c +++ b/test/zdtm/static/autofs.c @@ -460,7 +460,7 @@ static int automountd_loop(int pipe, const char *mountpoint, struct autofs_param { union autofs_v5_packet_union *packet; ssize_t bytes; - size_t psize = sizeof(*packet) + 1; + size_t psize = sizeof(*packet) * 2; int err = 0; packet = malloc(psize); @@ -483,8 +483,8 @@ static int automountd_loop(int pipe, const char *mountpoint, struct autofs_param } continue; } - if (bytes == psize) { - pr_err("read more that expected\n"); + if (bytes > psize) { + pr_err("read more that expected: %ld > %ld\n", bytes, psize); return -EINVAL; } if (bytes != sizeof(*packet)) { ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel