Zenya,
1) an idea from Volodya Davydov: we can store the pointer to ve_struct, not the
veid,
thus we'll eliminate possible races with veid reuse.
2) there are 2 more comments from Stas (3 total), please take a look.
i don't mind leaving it as a global sysctl because hopefully it will never used
=>
less time spent to rework. And we currently really don't need it to be per-CT.
--
Best regards,
Konstantin Khorenko,
Virtuozzo Linux Kernel Team
On 11/05/2015 10:35 AM, Evgenii Shatokhin wrote:
02.11.2015 18:47, Stanislav Kinsburskiy пишет:
Yet again these sysctl handles...(((
I have some questions.
Please, see inlined.
02.11.2015 16:11, Evgenii Shatokhin пишет:
https://jira.sw.ru/browse/PSBM-34438
(This fix was adapted from PCS6.)
It is possible for a container to create lots of mount points, which may
make operations with them slower. As some of these operations take
global locks (namespace_sem, vfsmount_lock), it might affect other
containers as well.
Let us limit the maximum number of mount points a VE may create. The
limit can be customized via /proc/sys/fs/ve-mount-nr knob.
Signed-off-by: Evgenii Shatokhin <eshatok...@odin.com>
---
fs/mount.h | 2 ++
fs/namespace.c | 34 ++++++++++++++++++++++++++++++++--
include/linux/mnt_namespace.h | 2 ++
include/linux/ve.h | 1 +
include/linux/ve_proto.h | 11 +++++++++++
kernel/sysctl.c | 8 ++++++++
kernel/ve/ve.c | 2 ++
7 files changed, 58 insertions(+), 2 deletions(-)
diff --git a/fs/mount.h b/fs/mount.h
index 3e16b57..e192ff9 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -50,6 +50,8 @@ struct mount {
int mnt_expiry_mark; /* true if marked for expiry */
int mnt_pinned;
int mnt_ghosts;
+
+ envid_t mnt_owner_veid;
};
Do we really need it? Can't we just take get_exec_env() on mount and
umount?
Yes, in this case we won't account mount-umount operations from ve0.
But maybe we could just do not care about them?
If yes, then we can avoid adding another field to struct mount (which
would be great, btw).
Good point. Personally, I would rather leave struct mount alone.
However, I am not 100% sure yet it is safe.
Is it possible for mount to be called from a VE and umount from VE0? If
so, then without that field, ve->mnt_nr would not be decremented during
free_vfsmnt() and would become unbalanced.
If such conditions are impossible, then yes, I suppose, we can do
without additional fields in struct mount.
I introduced it only because the patch for PCS6 I backported used this
approach.
Perhaps, the authors of that patch could shed some light?
#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any
mnt_namespace */
diff --git a/fs/namespace.c b/fs/namespace.c
index 8909c13..be80e8d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -42,6 +42,14 @@ static struct list_head
mountpoint_hashtable[HASH_SIZE];
static struct kmem_cache *mnt_cache __read_mostly;
static struct rw_semaphore namespace_sem;
+/*
+ * Operations with a big amount of mount points can require a lot of
time.
+ * These operations take the global lock namespace_sem, so they can
affect
+ * other containers. Let us allow no more than sysctl_ve_mount_nr mount
+ * points for a VE.
+ */
+unsigned int sysctl_ve_mount_nr = 4096;
+
Defining sysctl to fs/namespace.c looks unnecessary.
The same apply to adding sysctl handling in kernel/sysctl.c below.
Since it's anyway a global variable, can't we place it in /proc/sys/fs/
with other ve sysctl's (vz_sysfs_table)?
/* /sys/fs */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);
@@ -165,7 +173,15 @@ unsigned int mnt_get_count(struct mount *mnt)
static struct mount *alloc_vfsmnt(const char *name)
{
- struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
+ struct mount *mnt;
+ struct ve_struct *ve = get_exec_env();
+
+ if (ve &&
+ atomic_add_return(1, &ve->mnt_nr) > sysctl_ve_mount_nr &&
+ !ve_is_super(ve))
+ goto out_mnt_nr_dec;
+
+ mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
if (mnt) {
int err;
@@ -190,6 +206,8 @@ static struct mount *alloc_vfsmnt(const char *name)
mnt->mnt_writers = 0;
#endif
+ mnt->mnt_owner_veid = get_veid(ve);
+
INIT_LIST_HEAD(&mnt->mnt_hash);
INIT_LIST_HEAD(&mnt->mnt_child);
INIT_LIST_HEAD(&mnt->mnt_mounts);
@@ -201,7 +219,9 @@ static struct mount *alloc_vfsmnt(const char *name)
#ifdef CONFIG_FSNOTIFY
INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
#endif
- }
+ } else
+ goto out_mnt_nr_dec;
+
return mnt;
#ifdef CONFIG_SMP
@@ -212,6 +232,9 @@ out_free_id:
mnt_free_id(mnt);
out_free_cache:
kmem_cache_free(mnt_cache, mnt);
+out_mnt_nr_dec:
+ if (ve)
+ atomic_dec(&ve->mnt_nr);
return NULL;
}
@@ -542,6 +565,13 @@ int sb_prepare_remount_readonly(struct
super_block *sb)
static void free_vfsmnt(struct mount *mnt)
{
+ struct ve_struct *ve = get_ve_by_id(mnt->mnt_owner_veid);
+
+ if (ve) {
+ atomic_dec(&ve->mnt_nr);
+ put_ve(ve);
+ }
+
kfree(mnt->mnt_devname);
mnt_free_id(mnt);
#ifdef CONFIG_SMP
diff --git a/include/linux/mnt_namespace.h
b/include/linux/mnt_namespace.h
index 12b2ab5..4d83d3c 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -14,5 +14,7 @@ extern const struct file_operations
proc_mounts_operations;
extern const struct file_operations proc_mountinfo_operations;
extern const struct file_operations proc_mountstats_operations;
+extern unsigned int sysctl_ve_mount_nr;
+
#endif
#endif
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 86b95c3..c6530c2 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -128,6 +128,7 @@ struct ve_struct {
unsigned long aio_nr;
unsigned long aio_max_nr;
#endif
+ atomic_t mnt_nr;
};
struct ve_devmnt {
diff --git a/include/linux/ve_proto.h b/include/linux/ve_proto.h
index 0f5898e..3894c60 100644
--- a/include/linux/ve_proto.h
+++ b/include/linux/ve_proto.h
@@ -32,6 +32,7 @@ static inline bool ve_is_super(struct ve_struct *ve)
#define get_exec_env() (current->task_ve)
#define get_env_init(ve) (ve->ve_ns->pid_ns->child_reaper)
+#define get_veid(ve) (ve->veid)
const char *ve_name(struct ve_struct *ve);
@@ -124,6 +125,11 @@ static inline struct ve_struct *get_exec_env(void)
#define get_env_init(ve) (ve->ve_ns->pid_ns->child_reaper)
+static inline envid_t get_veid(struct ve_struct *ve)
+{
+ return 0;
+}
+
static inline bool ve_is_super(struct ve_struct *ve)
{
return true;
@@ -139,6 +145,11 @@ static inline const char *task_ve_name(struct
task_struct *task)
return "0";
}
+static inline struct ve_struct *get_ve_by_id(envid_t veid)
+{
+ return NULL;
+}
+
#define nr_threads_ve(ve) (nr_threads)
#endif /* CONFIG_VE */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9c081e3..ba69200 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -64,6 +64,7 @@
#include <linux/sched/sysctl.h>
#include <linux/kexec.h>
#include <linux/ve.h>
+#include <linux/mnt_namespace.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -1740,6 +1741,13 @@ static struct ctl_table fs_table[] = {
.proc_handler = &pipe_proc_fn,
.extra1 = &pipe_min_size,
},
+ {
+ .procname = "ve-mount-nr",
+ .data = &sysctl_ve_mount_nr,
+ .maxlen = sizeof(sysctl_ve_mount_nr),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ }
};
And, actually, the final question: is sysctl the best way to solve this
task?
This magic "4096" number of mounts also looks odd.
Maybe it makes sense to make this feature per-ve, and place it's handler
not to sysctls, but to ve cgroup control files?
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index e9219e6..216608c 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -82,6 +82,7 @@ struct ve_struct ve0 = {
#endif
.sched_lat_ve.cur = &ve0_lat_stats,
.init_cred = &init_cred,
+ .mnt_nr = ATOMIC_INIT(0),
};
EXPORT_SYMBOL(ve0);
@@ -653,6 +654,7 @@ do_init:
ve->aio_nr = 0;
ve->aio_max_nr = AIO_MAX_NR_DEFAULT;
#endif
+ atomic_set(&ve->mnt_nr, 0);
return &ve->css;
.
Regards,
Evgenii
_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel