[Devel] [PATCH rh7 2/5] ub: get rid of dcache accounting related stuff
dcache is now accounted as part of memcg:kmem, so remove the leftovers. If we decide to account dcache separately, we will re-implement/port what we really need. Signed-off-by: Vladimir Davydov vdavy...@parallels.com --- fs/namei.c |1 - include/bc/beancounter.h |6 -- include/bc/dcache.h | 18 kernel/bc/beancounter.c |5 - kernel/bc/dcache.c | 269 -- kernel/bc/proc.c |3 - kernel/bc/vm_pages.c |5 +- kernel/ve/vecalls.c |2 - 8 files changed, 2 insertions(+), 307 deletions(-) delete mode 100644 include/bc/dcache.h delete mode 100644 kernel/bc/dcache.c diff --git a/fs/namei.c b/fs/namei.c index 5b0146255e94..b62c93df99d1 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -142,7 +142,6 @@ getname_flags(const char __user *filename, int flags, int *empty) if (result) return result; - /*ub_dentry_checkup();*/ result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); diff --git a/include/bc/beancounter.h b/include/bc/beancounter.h index 31671ff459da..4337e1363eeb 100644 --- a/include/bc/beancounter.h +++ b/include/bc/beancounter.h @@ -149,12 +149,6 @@ struct user_beancounter { void*private_data2; - struct list_headub_dentry_lru; - struct list_headub_dentry_top; - int ub_dentry_unused; - int ub_dentry_batch; - unsigned long ub_dentry_pruned; - /* resources statistic and settings */ struct ubparm ub_parms[UB_RESOURCES]; /* resources statistic for last interval */ diff --git a/include/bc/dcache.h b/include/bc/dcache.h deleted file mode 100644 index 186e0fc895d5.. 
--- a/include/bc/dcache.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef __UB_DCACHE_H__ -#define __UB_DCACHE_H__ - -#include bc/decl.h - -extern unsigned int ub_dcache_threshold; - -UB_DECLARE_FUNC(int, ub_dcache_charge(struct user_beancounter *ub, int name_len)) -UB_DECLARE_VOID_FUNC(ub_dcache_uncharge(struct user_beancounter *ub, int name_len)) -UB_DECLARE_VOID_FUNC(ub_dcache_set_owner(struct dentry *d, struct user_beancounter *ub)) -UB_DECLARE_VOID_FUNC(ub_dcache_change_owner(struct dentry *dentry, struct user_beancounter *ub)) -UB_DECLARE_VOID_FUNC(ub_dcache_clear_owner(struct dentry *dentry)) -UB_DECLARE_VOID_FUNC(ub_dcache_unuse(struct user_beancounter *ub)) -UB_DECLARE_VOID_FUNC(ub_dcache_reclaim(struct user_beancounter *ub, unsigned long numerator, unsigned long denominator)) -UB_DECLARE_FUNC(int, ub_dcache_shrink(struct user_beancounter *ub, unsigned long size, gfp_t gfp_mask)) -UB_DECLARE_FUNC(unsigned long, ub_dcache_get_size(struct dentry *dentry)) - -#endif diff --git a/kernel/bc/beancounter.c b/kernel/bc/beancounter.c index cdbe846bf839..5cc0688131ae 100644 --- a/kernel/bc/beancounter.c +++ b/kernel/bc/beancounter.c @@ -42,7 +42,6 @@ #include bc/beancounter.h #include bc/io_acct.h #include bc/vmpages.h -#include bc/dcache.h #include bc/proc.h static struct kmem_cache *ub_cachep; @@ -465,8 +464,6 @@ static inline int bc_verify_held(struct user_beancounter *ub) clean = verify_res(ub, pincount, __ub_percpu_sum(ub, pincount)); - clean = verify_res(ub, dcache, !list_empty(ub-ub_dentry_lru)); - ub_debug_trace(!clean, 5, 60*HZ); return clean; @@ -958,8 +955,6 @@ static void init_beancounter_struct(struct user_beancounter *ub) spin_lock_init(ub-ub_lock); INIT_LIST_HEAD(ub-ub_tcp_sk_list); INIT_LIST_HEAD(ub-ub_other_sk_list); - INIT_LIST_HEAD(ub-ub_dentry_lru); - INIT_LIST_HEAD(ub-ub_dentry_top); init_oom_control(ub-oom_ctrl); spin_lock_init(ub-rl_lock); ub-rl_wall.tv64 = LLONG_MIN; diff --git a/kernel/bc/dcache.c b/kernel/bc/dcache.c deleted file mode 100644 index 
2727e690fbb4.. --- a/kernel/bc/dcache.c +++ /dev/null @@ -1,269 +0,0 @@ -#include linux/slab.h -#include linux/dcache.h -#include linux/fs.h -#include linux/module.h -#include linux/sched.h - -#include bc/beancounter.h -#include bc/vmpages.h -#include bc/dcache.h -#include bc/kmem.h - -static unsigned int dcache_charge_size(int name_len) -{ - return dentry_cache-objuse + kmem_cache_objuse(inode_cachep) + - (name_len DNAME_INLINE_LEN ? name_len : 0); -} - -int ub_dcache_shrink(struct user_beancounter *ub, - unsigned long size, gfp_t gfp_mask) -{ - int count, pruned; - - if (!(gfp_mask __GFP_FS)) - return -EBUSY; - - count = DIV_ROUND_UP(size, dcache_charge_size(0)); - spin_lock(dcache_lock); - pruned = __shrink_dcache_ub(ub, count); - spin_unlock(dcache_lock); - if (!pruned) - return -ENOMEM; - - return 0; -} - -static int __ub_dcache_charge(struct user_beancounter *ub, - unsigned long
[Devel] [PATCH rh7] net-namespace: Don't forget to put_ve on error path
If an error happens during creation of a new net namespace, we might end up with a VE reference taken and never put back. | copy_net_ns | setup_net | ... | net->owner_ve = get_ve(get_exec_env()); | ... | error = ops_init(ops, net); | if (error < 0) |goto out_undo; | ... | return error; | put_user_ns(user_ns); | net_drop_ns(net); | net_free(ns); |kfree(net->gen); |kmem_cache_free(net_cachep, net); So let's call put_ve to balance it. Signed-off-by: Cyrill Gorcunov gorcu...@odin.com CC: Vladimir Davydov vdavy...@odin.com CC: Konstantin Khorenko khore...@odin.com CC: Andrey Vagin ava...@odin.com --- net/core/net_namespace.c |3 +++ 1 file changed, 3 insertions(+) Index: linux-pcs7.git/net/core/net_namespace.c === --- linux-pcs7.git.orig/net/core/net_namespace.c +++ linux-pcs7.git/net/core/net_namespace.c @@ -192,6 +192,9 @@ out_undo: ops_free_list(ops, &net_exit_list); rcu_barrier(); +#ifdef CONFIG_VE + put_ve(net->owner_ve); +#endif goto out; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] mm/tswap/tcache: enable tcache and tswap by default
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit a1cd5a98145e5032cad97a0bf15e3e0904fad8d0 Author: Vladimir Davydov vdavy...@parallels.com Date: Mon May 18 17:00:04 2015 +0400 mm/tswap/tcache: enable tcache and tswap by default We use both of them => enable tcache and tswap by default. To disable them, add the appropriate kernel boot options: tcache.enabled=0 tswap.enabled=0 https://jira.sw.ru/browse/PSBM-31757 https://jira.sw.ru/browse/PSBM-32063 Signed-off-by: Vladimir Davydov vdavy...@parallels.com --- mm/tcache.c | 2 +- mm/tswap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/tcache.c b/mm/tcache.c index bc740f0..e83ad05 100644 --- a/mm/tcache.c +++ b/mm/tcache.c @@ -125,7 +125,7 @@ static struct tcache_lru *tcache_lru_node; */ /* Enable/disable tcache backend (set at boot time) */ -static bool tcache_enabled __read_mostly; +static bool tcache_enabled __read_mostly = true; module_param_named(enabled, tcache_enabled, bool, 0444); /* Enable/disable populating the cache */ diff --git a/mm/tswap.c b/mm/tswap.c index c4effa3..4b792cd 100644 --- a/mm/tswap.c +++ b/mm/tswap.c @@ -27,7 +27,7 @@ struct tswap_lru { static struct tswap_lru *tswap_lru_node; /* Enable/disable tswap backend (set at boot time) */ -static bool tswap_enabled __read_mostly; +static bool tswap_enabled __read_mostly = true; module_param_named(enabled, tswap_enabled, bool, 0444); /* Enable/disable populating the cache */ ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH rh7] ve: cgroups -- Allow to attach non-self into ve cgroups, v2
On 05/18/2015 03:52 PM, Cyrill Gorcunov wrote: On Mon, May 18, 2015 at 11:21:40AM +0300, Konstantin Khorenko wrote: Is it true that without these checks a single thread of a multithreaded process can enter CT? If no - where is the check for this case? If yes - let's prohibit this. An update is attached: either the task we're attaching should be a single-threaded task, or all threads should be moved at once (which, as far as I understand, is prepared by the caller code). Looks OK ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 3/5] bc: sysinfo: remove dead code
If meminfo_val != VE_MEMINFO_DEFAULT in bc_fill_sysinfo, it equals VE_MEMINFO_SYSTEM, in which case we return from bc_fill_sysinfo immediately. Signed-off-by: Vladimir Davydov vdavy...@parallels.com --- kernel/bc/vm_pages.c | 22 -- 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/kernel/bc/vm_pages.c b/kernel/bc/vm_pages.c index 7f5eece57aa7..b3d0dd09f8cf 100644 --- a/kernel/bc/vm_pages.c +++ b/kernel/bc/vm_pages.c @@ -202,18 +202,8 @@ static int bc_fill_sysinfo(struct user_beancounter *ub, total = physpages.limit; used = physpages.held; - if (total == UB_MAXVALUE) { - if (meminfo_val == VE_MEMINFO_DEFAULT) - total = totalram; - else { - total = min(meminfo_val, totalram); - used = __get_beancounter_usage_percpu(ub, UB_PRIVVMPAGES); - if (glob_ve_meminfo) { - ub_update_resources(ub); - used = ub-ub_parms[UB_OOMGUARPAGES].held; - } - } - } + if (total == UB_MAXVALUE) + total = totalram; si-totalram = total; si-freeram = (total used ? total - used : 0); @@ -221,12 +211,8 @@ static int bc_fill_sysinfo(struct user_beancounter *ub, total = swappages.limit; used = swappages.held; - if (total == UB_MAXVALUE) { - if (meminfo_val == VE_MEMINFO_DEFAULT) - total = totalswap; - else - total = 0; - } + if (total == UB_MAXVALUE) + total = totalswap; si-totalswap = total; si-freeswap = (total used ? total - used : 0); -- 1.7.10.4 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 1/5] ub: remove CONFIG_BC_RSS_ACCOUNTING
There's no point in it, because w/o it beancounters are useless. Plus, it isn't actually used throughout the code, because rss accounting is up to memcg now. So, just make CONFIG_BEANCOUNTERS depend on memcg and remove the option. Also, remove dependency on CONFIG_CGROUP_HUGETLB, because we don't actually require it. Signed-off-by: Vladimir Davydov vdavy...@parallels.com --- config.OpenVZ |1 - kernel/bc/Kconfig | 19 --- kernel/bc/proc.c |8 3 files changed, 4 insertions(+), 24 deletions(-) diff --git a/config.OpenVZ b/config.OpenVZ index 73614cfd13e7..93f8d6ff4a22 100644 --- a/config.OpenVZ +++ b/config.OpenVZ @@ -5300,7 +5300,6 @@ CONFIG_VZ_IOLIMIT=m # User resources # CONFIG_BEANCOUNTERS=y -CONFIG_BC_RSS_ACCOUNTING=y CONFIG_BC_IO_ACCOUNTING=y CONFIG_BC_IO_PRIORITY=y CONFIG_BC_PROC=y diff --git a/kernel/bc/Kconfig b/kernel/bc/Kconfig index 4b8156690c46..a3379f949d3c 100644 --- a/kernel/bc/Kconfig +++ b/kernel/bc/Kconfig @@ -12,6 +12,10 @@ config BEANCOUNTERS bool Enable user resource accounting default y select CGROUPS + select MEMCG + select MEMCG_KMEM + select MEMCG_SWAP if SWAP + select MEMCG_SWAP_ENABLED if SWAP help This patch provides accounting and allows to configure limits for user's consumption of exhaustible system resources. @@ -26,21 +30,6 @@ config BEANCOUNTERS per-process basis. Per-process accounting doesn't prevent malicious users from spawning a lot of resource-consuming processes. -config BC_RSS_ACCOUNTING - bool Account physical memory usage - default y - depends on BEANCOUNTERS - select RESOURCE_COUNTERS - select MEMCG - select MEMCG_KMEM - select MEMCG_SWAP if SWAP - select MEMCG_SWAP_ENABLED if SWAP - select CGROUP_HUGETLB if HUGETLBFS - help - This allows to estimate per beancounter physical memory usage. - Implemented alghorithm accounts shared pages of memory as well, - dividing them by number of beancounter which use the page. 
- config BC_IO_ACCOUNTING bool Account file I/O default y diff --git a/kernel/bc/proc.c b/kernel/bc/proc.c index dd33d44a2cb0..af6a610a3e08 100644 --- a/kernel/bc/proc.c +++ b/kernel/bc/proc.c @@ -198,12 +198,7 @@ static struct bc_proc_entry bc_meminfo_entry = { .u.show = bc_proc_meminfo_show, }; -#ifdef CONFIG_BC_RSS_ACCOUNTING - -#include linux/memcontrol.h - #define K(x) ((x) (PAGE_SHIFT - 10)) - static int bc_proc_nodeinfo_show(struct seq_file *f, void *v) { int nid; @@ -241,7 +236,6 @@ static struct bc_proc_entry bc_nodeinfo_entry = { .name = nodeinfo, .u.show = bc_proc_nodeinfo_show, }; -#endif #if 0 @@ -931,9 +925,7 @@ static int __init ub_init_proc(void) // bc_register_proc_entry(bc_dcacheinfo_entry); bc_register_proc_root_entry(bc_all_resources_entry); bc_register_proc_entry(bc_meminfo_entry); -#ifdef CONFIG_BC_RSS_ACCOUNTING bc_register_proc_entry(bc_nodeinfo_entry); -#endif entry = proc_create(user_beancounters, S_IRUSR|S_ISVTX, NULL, ub_file_operations); -- 1.7.10.4 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 4/5] proc: fix oom_score output
oom_badness now returns absolute badness, not per mille. So we have to revert the chunk of PCS6 code that doesn't know that. Note, I use the global totalpages rather than per ub as it used to be, because ub's oom killer doesn't work anyway for now and will be reimplemented in the scope of the memory cgroup. Then I'll change it to per-memcg value. Signed-off-by: Vladimir Davydov vdavy...@parallels.com --- fs/proc/base.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 12d9ea1eca6d..79ee3c875e76 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -449,15 +449,15 @@ static const struct file_operations proc_cpuset_operations = { static int proc_oom_score(struct task_struct *task, char *buffer) { - int points = 0; + unsigned long totalpages = totalram_pages + total_swap_pages; + unsigned long points = 0; read_lock(tasklist_lock); - if (pid_alive(task)) { - points = oom_badness(task, NULL, NULL, ub_oom_total_pages(get_exec_ub())); - points = clamp(points, 0, 1000); - } + if (pid_alive(task)) + points = oom_badness(task, NULL, NULL, totalpages) * + 1000 / totalpages; read_unlock(tasklist_lock); - return sprintf(buffer, %d\n, points); + return sprintf(buffer, %lu\n, points); } struct limit_names { -- 1.7.10.4 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 0/5] ub: remove some dead code
Vladimir Davydov (5): ub: remove CONFIG_BC_RSS_ACCOUNTING ub: get rid of dcache accounting related stuff bc: sysinfo: remove dead code proc: fix oom_score output bc: zap oom kill related stuff arch/x86/mm/fault.c |2 - config.OpenVZ|1 - drivers/tty/sysrq.c |3 - fs/namei.c |1 - fs/proc/base.c | 14 +-- include/bc/beancounter.h | 14 --- include/bc/dcache.h | 18 --- include/bc/oom_kill.h| 19 --- include/bc/vmpages.h |2 - include/linux/mm_types.h |2 - include/linux/oom.h | 13 --- kernel/bc/Kconfig| 19 +-- kernel/bc/beancounter.c |9 -- kernel/bc/dcache.c | 269 -- kernel/bc/oom_kill.c | 289 -- kernel/bc/proc.c | 13 --- kernel/bc/statd.c|3 - kernel/bc/vm_pages.c | 73 +--- kernel/exit.c|1 - kernel/fork.c|5 - kernel/ve/vecalls.c |2 - mm/page_alloc.c |4 - 22 files changed, 16 insertions(+), 760 deletions(-) delete mode 100644 include/bc/dcache.h delete mode 100644 include/bc/oom_kill.h delete mode 100644 kernel/bc/dcache.c delete mode 100644 kernel/bc/oom_kill.c -- 1.7.10.4 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ve/cgroups: fake num_cgroups in /proc/cgroups output
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 213b5800cbf1e1f36efaab61f2f49ea198bdb1e8 Author: Vasily Averin v...@odin.com Date: Mon May 18 16:32:55 2015 +0400 ve/cgroups: fake num_cgroups in /proc/cgroups output Like in rh6-based kernels, /proc/cgroups output inside container will show 1 in 'num_cgroups' column. https://jira.sw.ru/browse/PSBM-33400 Signed-off-by: Vasily Averin v...@openvz.org khorenko@: This is done in order to prevent people to try guessing the number of Containers running on a Hardware Node because even if the guess is correct, it gives no useful info, but people can easily come to wrong conclusions. --- kernel/cgroup.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f881f69..f897042 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4815,6 +4815,8 @@ out: return retval; } +#define _cg_virtualized(x) ((ve_is_super(get_exec_env())) ? (x) : 1) + /* Display information about each subsystem and each hierarchy */ static int proc_cgroupstats_show(struct seq_file *m, void *v) { @@ -4829,11 +4831,14 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) mutex_lock(cgroup_mutex); for (i = 0; i CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + int num; + if (ss == NULL) continue; + num = _cg_virtualized(ss-root-number_of_cgroups); seq_printf(m, %s\t%d\t%d\t%d\n, ss-name, ss-root-hierarchy_id, - ss-root-number_of_cgroups, !ss-disabled); + num, !ss-disabled); } mutex_unlock(cgroup_mutex); return 0; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH rh7] ve: device cgroup -- Implement devcgroup_seq_show_ve
On Mon, May 18, 2015 at 01:22:22PM +0300, Cyrill Gorcunov wrote: In PCS7 cgroups are configured from user space, so there is no longer connection from ve to device cgroup via css as it was in PCS6. Instead we should open device cgroup explicitly. https://jira.sw.ru/browse/PSBM-33555 Signed-off-by: Cyrill Gorcunov gorcu...@odin.com CC: Vladimir Davydov vdavy...@odin.com CC: Konstantin Khorenko khore...@odin.com CC: Andrey Vagin ava...@odin.com Reviewed-by: Vladimir Davydov vdavy...@parallels.com ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ve/net/printk: net_veboth_ratelimited introduced
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 16d8b1d984f26100bf006ed93fcd47642401dd26 Author: Vasily Averin v...@odin.com Date: Mon May 18 12:29:44 2015 +0400 ve/net/printk: net_veboth_ratelimited introduced net_veboth_ratelimited is required to save net-ratelimited messages both into host and into containers dmesg buffers Signed-off-by: Vasily Averin v...@openvz.org Acked-by: Kirill Tkhai ktk...@odin.com --- include/linux/net.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/net.h b/include/linux/net.h index d7b2205..7e59abe 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -249,6 +249,8 @@ do { \ net_ratelimited_function(pr_debug, fmt, ##__VA_ARGS__) #define net_velog_ratelimited(fmt, ...)\ net_ratelimited_function(ve_printk, VE_LOG, fmt, ##__VA_ARGS__) +#define net_veboth_ratelimited(fmt, ...) \ + net_ratelimited_function(ve_printk, VE_LOG_BOTH, fmt, ##__VA_ARGS__) #define net_random() prandom_u32() ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [RFC rh7] ve: cgroups -- Allow to attach non-self into ve cgroups
On 05/14/2015 07:52 PM, Cyrill Gorcunov wrote: In vzctl/libvzctl bundle we restore container like - create ve/$ctid cgroup - move self into this cgroup - run criu from inside So that kernel code passes ve_can_attach test. In turn for our P.Haul project (which is managing live migration) the situation is different -- it opens ve/$ctid but moves criu service pid instead (so that the service will start restore procedure). Which leads to situation where ve_can_attach fails with -EINVAL. Reported-by: Nikita Spiridonov nspirido...@odin.com Signed-off-by: Cyrill Gorcunov gorcu...@odin.com CC: Vladimir Davydov vdavy...@odin.com CC: Konstantin Khorenko khore...@odin.com CC: Pavel Emelyanov xe...@odin.com CC: Andrey Vagin ava...@odin.com --- Guys, could you please take a look, especially from security POV, is it safe to remove all these checks? kernel/ve/ve.c | 31 +-- 1 file changed, 13 insertions(+), 18 deletions(-) Index: linux-pcs7.git/kernel/ve/ve.c === --- linux-pcs7.git.orig/kernel/ve/ve.c +++ linux-pcs7.git/kernel/ve/ve.c @@ -750,13 +750,6 @@ static void ve_destroy(struct cgroup *cg static int ve_can_attach(struct cgroup *cg, struct cgroup_taskset *tset) { struct ve_struct *ve = cgroup_ve(cg); - struct task_struct *task = current; - - if (cgroup_taskset_size(tset) != 1 || - cgroup_taskset_first(tset) != task || - !thread_group_leader(task) || - !thread_group_empty(task)) - return -EINVAL; Is this true that without these checks a single thread of a multithread process can enter CT? If no - where is the check for this case? If yes - let's prohibit this. 
if (ve-is_locked) return -EBUSY; @@ -775,20 +768,22 @@ static int ve_can_attach(struct cgroup * static void ve_attach(struct cgroup *cg, struct cgroup_taskset *tset) { struct ve_struct *ve = cgroup_ve(cg); - struct task_struct *tsk = current; - - /* this probihibts ptracing of task entered to VE from host system */ - if (ve-is_running tsk-mm) - tsk-mm-vps_dumpable = VD_VE_ENTER_TASK; + struct task_struct *tsk; - /* Drop OOM protection. */ - tsk-signal-oom_score_adj = 0; - tsk-signal-oom_score_adj_min = 0; + cgroup_taskset_for_each(tsk, cg, tset) { + /* this probihibts ptracing of task entered to VE from host system */ + if (ve-is_running tsk-mm) + tsk-mm-vps_dumpable = VD_VE_ENTER_TASK; + + /* Drop OOM protection. */ + tsk-signal-oom_score_adj = 0; + tsk-signal-oom_score_adj_min = 0; - /* Leave parent exec domain */ - tsk-parent_exec_id--; + /* Leave parent exec domain */ + tsk-parent_exec_id--; - tsk-task_ve = ve; + tsk-task_ve = ve; + } } static int ve_state_read(struct cgroup *cg, struct cftype *cft, ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ve/netfilter: ve_printk for nf_conntrack: table full
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 8782918c418820d5127afa4a5db74c9b3eac3b82 Author: Vasily Averin v...@odin.com Date: Mon May 18 12:29:57 2015 +0400 ve/netfilter: ve_printk for nf_conntrack: table full port of diff-ve-printk-conntrack-tables-full from rh6-based kernels nf_conntrack: table full, dropping packet message should be visible both in CT and on HN and should contain CTID for reading simplicity. https://bugzilla.openvz.org/show_bug.cgi?id=2940 Signed-off-by: Vasily Averin v...@openvz.org Acked-by: Kirill Tkhai ktk...@odin.com --- net/netfilter/nf_conntrack_core.c | 4 +++- net/netfilter/nf_conntrack_expect.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 495b859..017c755 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -696,7 +696,9 @@ __nf_conntrack_alloc(struct net *net, u16 zone, unlikely(atomic_read(net-ct.count) ct_max)) { if (!early_drop(net, hash_bucket(hash, net))) { atomic_dec(net-ct.count); - net_warn_ratelimited(nf_conntrack: table full, dropping packet\n); + net_veboth_ratelimited(KERN_WARNING VE%u: + nf_conntrack table full, dropping packet\n, + net-owner_ve-veid); return ERR_PTR(-ENOMEM); } } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index d80db92..bfa95fd 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -408,7 +408,9 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) } if (net-ct.expect_count = init_net.ct.expect_max) { - net_warn_ratelimited(nf_conntrack: expectation table full\n); + net_veboth_ratelimited(KERN_WARNING VE%u + nf_conntrack: expectation table full\n, + net-owner_ve-veid); ret = -EMFILE; } out: ___ Devel mailing list Devel@openvz.org 
https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH rh7] ve: cgroups -- Allow to attach non-self into ve cgroups, v2
On Mon, May 18, 2015 at 07:42:50PM +0300, Cyrill Gorcunov wrote: On Mon, May 18, 2015 at 07:34:45PM +0300, Vladimir Davydov wrote: /* + * We either moving the whole group of threads, + * either a single thread process. + */ + if (cgroup_taskset_size(tset) == 1) { != ? + task = cgroup_taskset_first(tset); + if (!thread_group_leader(task) && !thread_group_empty(task)) + return -EINVAL; No, ==. The thing is that the kernel cares about multithreaded tasks and groups all their threads into the array. In turn, when a task is attached via pid (i.e. ve/ctid/tasks), the kernel simply looks up the task, puts it into an array and passes it to us. So it's our duty to check that only one task has been passed and, if so, to check that it's not a thread from some multithreaded application. OK, I see, thanks. But if we are attaching one thread which is thread_group_leader, we will not fail even if the thread group is not empty and other threads are not moved, will we? ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7] ve: cgroups -- Allow to attach non-self into ve cgroups, v3
On Mon, May 18, 2015 at 08:41:33PM +0300, Vladimir Davydov wrote: OK, I see, thanks. But if we are attaching one thread which is thread_group_leader, we will not fail even if the thread group is not empty and other threads are not moved, will we? Yeah ;) It should be OR here. Thank you! From: Cyrill Gorcunov gorcu...@odin.com Subject: ve: cgroups -- Allow to attach non-self into ve cgroups In the vzctl/libvzctl bundle we restore a container like this: - create ve/$ctid cgroup - move self into this cgroup - run criu from inside, so that the kernel code passes the ve_can_attach test. In turn, for our P.Haul project (which manages live migration) the situation is different -- it opens ve/$ctid but moves the criu service pid instead (so that the service will start the restore procedure), which leads to a situation where ve_can_attach fails with -EINVAL. Basically we need to 1) Check that if a single task is being attached to a VE cgroup, it is a single-threaded task. 2) In the case of a multithreaded task, all threads should be moved in one pass (this is actually prepared by the cgroup_attach_task caller). 3) If the VE is stopping or starting, only kernel threads can attach. 
Reported-by: Nikita Spiridonov nspirido...@odin.com Signed-off-by: Cyrill Gorcunov gorcu...@odin.com CC: Vladimir Davydov vdavy...@odin.com CC: Konstantin Khorenko khore...@odin.com CC: Pavel Emelyanov xe...@odin.com CC: Andrey Vagin ava...@odin.com --- kernel/ve/ve.c | 53 +++-- 1 file changed, 31 insertions(+), 22 deletions(-) Index: linux-pcs7.git/kernel/ve/ve.c === --- linux-pcs7.git.orig/kernel/ve/ve.c +++ linux-pcs7.git/kernel/ve/ve.c @@ -775,24 +775,31 @@ static void ve_destroy(struct cgroup *cg static int ve_can_attach(struct cgroup *cg, struct cgroup_taskset *tset) { struct ve_struct *ve = cgroup_ve(cg); - struct task_struct *task = current; - - if (cgroup_taskset_size(tset) != 1 || - cgroup_taskset_first(tset) != task || - !thread_group_leader(task) || - !thread_group_empty(task)) - return -EINVAL; + struct task_struct *task; if (ve-is_locked) return -EBUSY; /* +* We either moving the whole group of threads, +* either a single thread process. +*/ + if (cgroup_taskset_size(tset) == 1) { + task = cgroup_taskset_first(tset); + if (!thread_group_leader(task) || !thread_group_empty(task)) + return -EINVAL; + } + + /* * Forbid userspace tasks to enter during starting or stopping. -* Permit attaching kernel threads and init task for this containers. +* Permit attaching kernel threads for this containers. 
*/ - if (!ve-is_running (ve-ve_ns || nr_threads_ve(ve)) - !(task-flags PF_KTHREAD)) - return -EPIPE; + if (!ve-is_running (ve-ve_ns || nr_threads_ve(ve))) { + cgroup_taskset_for_each(task, cg, tset) { + if (!(task-flags PF_KTHREAD)) + return -EPIPE; + } + } return 0; } @@ -800,20 +807,22 @@ static int ve_can_attach(struct cgroup * static void ve_attach(struct cgroup *cg, struct cgroup_taskset *tset) { struct ve_struct *ve = cgroup_ve(cg); - struct task_struct *tsk = current; - - /* this probihibts ptracing of task entered to VE from host system */ - if (ve-is_running tsk-mm) - tsk-mm-vps_dumpable = VD_VE_ENTER_TASK; + struct task_struct *task; - /* Drop OOM protection. */ - tsk-signal-oom_score_adj = 0; - tsk-signal-oom_score_adj_min = 0; + cgroup_taskset_for_each(task, cg, tset) { + /* this probihibts ptracing of task entered to VE from host system */ + if (ve-is_running task-mm) + task-mm-vps_dumpable = VD_VE_ENTER_TASK; + + /* Drop OOM protection. */ + task-signal-oom_score_adj = 0; + task-signal-oom_score_adj_min = 0; - /* Leave parent exec domain */ - tsk-parent_exec_id--; + /* Leave parent exec domain */ + task-parent_exec_id--; - tsk-task_ve = ve; + task-task_ve = ve; + } } static int ve_state_read(struct cgroup *cg, struct cftype *cft, ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH rh7] ve: cgroups -- Allow to attach non-self into ve cgroups, v2
On Mon, May 18, 2015 at 07:34:45PM +0300, Vladimir Davydov wrote: /* +* We either moving the whole group of threads, +* either a single thread process. +*/ + if (cgroup_taskset_size(tset) == 1) { != ? + task = cgroup_taskset_first(tset); + if (!thread_group_leader(task) && !thread_group_empty(task)) + return -EINVAL; No, ==. The thing is that the kernel cares about multithreaded tasks and groups all their threads into the array. In turn, when a task is attached via pid (i.e. ve/ctid/tasks), the kernel simply looks up the task, puts it into an array and passes it to us. So it's our duty to check that only one task has been passed and, if so, to check that it's not a thread from some multithreaded application. ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH rh7] ve: device cgroup -- Implement devcgroup_seq_show_ve
On Mon, May 18, 2015 at 07:43:40PM +0300, Cyrill Gorcunov wrote: For uuid-named cgroups ve-veid != cgroup name. You should use ve-name instead. Please fix. Oh, i forgot about this new approach with uuid containers. Sure will do, thank you! Attached. From: Cyrill Gorcunov gorcu...@odin.com Subject: ve: device cgroup -- Implement devcgroup_seq_show_ve In PCS7 cgroups are configured from user space, so there is no longer connection from ve to device cgroup via css as it was in PCS6. Instead we should open device cgroup explicitly. https://jira.sw.ru/browse/PSBM-33555 v2 (by vdavydov@): - use ve::ve_name because we're switching to UUID based containers Signed-off-by: Cyrill Gorcunov gorcu...@odin.com CC: Vladimir Davydov vdavy...@odin.com CC: Konstantin Khorenko khore...@odin.com CC: Andrey Vagin ava...@odin.com --- include/linux/device_cgroup.h |3 ++- kernel/ve/vecalls.c |2 +- security/device_cgroup.c | 14 +++--- 3 files changed, 14 insertions(+), 5 deletions(-) Index: linux-pcs7.git/include/linux/device_cgroup.h === --- linux-pcs7.git.orig/include/linux/device_cgroup.h +++ linux-pcs7.git/include/linux/device_cgroup.h @@ -19,7 +19,8 @@ extern int devcgroup_device_visible(umod struct cgroup; int devcgroup_default_perms_ve(struct cgroup *cgroup); int devcgroup_set_perms_ve(struct cgroup *cgroup, unsigned, dev_t, unsigned); -int devcgroup_seq_show_ve(struct cgroup *cgroup, unsigned veid, struct seq_file *m); +struct ve_struct; +int devcgroup_seq_show_ve(struct cgroup *devices_root, struct ve_struct *ve, struct seq_file *m); #else static inline int devcgroup_inode_permission(struct inode *inode, int mask) Index: linux-pcs7.git/kernel/ve/vecalls.c === --- linux-pcs7.git.orig/kernel/ve/vecalls.c +++ linux-pcs7.git/kernel/ve/vecalls.c @@ -891,7 +891,7 @@ static int devperms_seq_show(struct seq_ if (ve_is_super(ve)) seq_printf(m, %10u b 016 *:*\n%10u c 006 *:*\n, 0, 0); else - devcgroup_seq_show_ve(ve-css.cgroup, ve-veid, m); + devcgroup_seq_show_ve(devices_root, ve, m); 
return 0; } Index: linux-pcs7.git/security/device_cgroup.c === --- linux-pcs7.git.orig/security/device_cgroup.c +++ linux-pcs7.git/security/device_cgroup.c @@ -17,6 +17,7 @@ #include linux/major.h #include linux/module.h #include linux/capability.h +#include linux/ve.h #define ACC_MKNOD 1 #define ACC_READ 2 @@ -1091,10 +1092,16 @@ int devcgroup_set_perms_ve(struct cgroup } EXPORT_SYMBOL(devcgroup_set_perms_ve); -int devcgroup_seq_show_ve(struct cgroup *cgroup, unsigned veid, struct seq_file *m) +int devcgroup_seq_show_ve(struct cgroup *devices_root, struct ve_struct *ve, struct seq_file *m) { - struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup); struct dev_exception_item *wh; + struct dev_cgroup *devcgroup; + struct cgroup *cgroup; + + cgroup = cgroup_kernel_open(devices_root, 0, ve_name(ve)); + if (IS_ERR(cgroup)) + return PTR_ERR(cgroup); + devcgroup = cgroup_to_devcgroup(cgroup); rcu_read_lock(); list_for_each_entry_rcu(wh, devcgroup-exceptions, list) { @@ -1112,12 +1119,13 @@ int devcgroup_seq_show_ve(struct cgroup perm |= S_IXOTH; seq_printf(m, %10u %c %03o %s:%s\n, - veid, + ve-veid, type_to_char(wh-type), perm, maj, min); } rcu_read_unlock(); + cgroup_kernel_close(cgroup); return 0; } EXPORT_SYMBOL(devcgroup_seq_show_ve); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH rh7] ve: device cgroup -- Implement devcgroup_seq_show_ve
On Mon, May 18, 2015 at 07:33:41PM +0300, Vladimir Davydov wrote: On Mon, May 18, 2015 at 01:22:22PM +0300, Cyrill Gorcunov wrote: --- linux-pcs7.git.orig/security/device_cgroup.c +++ linux-pcs7.git/security/device_cgroup.c @@ -1091,10 +1091,16 @@ int devcgroup_set_perms_ve(struct cgroup } EXPORT_SYMBOL(devcgroup_set_perms_ve); -int devcgroup_seq_show_ve(struct cgroup *cgroup, unsigned veid, struct seq_file *m) +int devcgroup_seq_show_ve(struct cgroup *devices_root, envid_t veid, struct seq_file *m) { - struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup); struct dev_exception_item *wh; + struct dev_cgroup *devcgroup; + struct cgroup *cgroup; + + cgroup = ve_cgroup_open(devices_root, 0, veid); For uuid-named cgroups ve-veid != cgroup name. You should use ve-name instead. Please fix. Oh, i forgot about this new approach with uuid containers. Sure will do, thank you! ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH rh7] ve: device cgroup -- Implement devcgroup_seq_show_ve
On Mon, May 18, 2015 at 01:22:22PM +0300, Cyrill Gorcunov wrote: --- linux-pcs7.git.orig/security/device_cgroup.c +++ linux-pcs7.git/security/device_cgroup.c @@ -1091,10 +1091,16 @@ int devcgroup_set_perms_ve(struct cgroup } EXPORT_SYMBOL(devcgroup_set_perms_ve); -int devcgroup_seq_show_ve(struct cgroup *cgroup, unsigned veid, struct seq_file *m) +int devcgroup_seq_show_ve(struct cgroup *devices_root, envid_t veid, struct seq_file *m) { - struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup); struct dev_exception_item *wh; + struct dev_cgroup *devcgroup; + struct cgroup *cgroup; + + cgroup = ve_cgroup_open(devices_root, 0, veid); For uuid-named cgroups ve-veid != cgroup name. You should use ve-name instead. Please fix. + if (IS_ERR(cgroup)) + return PTR_ERR(cgroup); + devcgroup = cgroup_to_devcgroup(cgroup); rcu_read_lock(); list_for_each_entry_rcu(wh, devcgroup-exceptions, list) { @@ -1118,6 +1124,7 @@ int devcgroup_seq_show_ve(struct cgroup } rcu_read_unlock(); + cgroup_kernel_close(cgroup); return 0; } EXPORT_SYMBOL(devcgroup_seq_show_ve); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH rh7] ve: cgroups -- Allow to attach non-self into ve cgroups, v2
On Mon, May 18, 2015 at 03:52:35PM +0300, Cyrill Gorcunov wrote: --- linux-pcs7.git.orig/kernel/ve/ve.c +++ linux-pcs7.git/kernel/ve/ve.c @@ -750,24 +750,31 @@ static void ve_destroy(struct cgroup *cg static int ve_can_attach(struct cgroup *cg, struct cgroup_taskset *tset) { struct ve_struct *ve = cgroup_ve(cg); - struct task_struct *task = current; - - if (cgroup_taskset_size(tset) != 1 || - cgroup_taskset_first(tset) != task || - !thread_group_leader(task) || - !thread_group_empty(task)) - return -EINVAL; + struct task_struct *task; if (ve-is_locked) return -EBUSY; /* + * We either moving the whole group of threads, + * either a single thread process. + */ + if (cgroup_taskset_size(tset) == 1) { != ? + task = cgroup_taskset_first(tset); + if (!thread_group_leader(task) !thread_group_empty(task)) + return -EINVAL; + } + + /* * Forbid userspace tasks to enter during starting or stopping. - * Permit attaching kernel threads and init task for this containers. + * Permit attaching kernel threads for this containers. */ - if (!ve-is_running (ve-ve_ns || nr_threads_ve(ve)) - !(task-flags PF_KTHREAD)) - return -EPIPE; + if (!ve-is_running (ve-ve_ns || nr_threads_ve(ve))) { + cgroup_taskset_for_each(task, cg, tset) { + if (!(task-flags PF_KTHREAD)) + return -EPIPE; + } + } return 0; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH rh7] ve: device cgroup -- Implement devcgroup_seq_show_ve
On Mon, May 18, 2015 at 08:04:27PM +0300, Cyrill Gorcunov wrote: From: Cyrill Gorcunov gorcu...@odin.com Subject: ve: device cgroup -- Implement devcgroup_seq_show_ve In PCS7 cgroups are configured from user space, so there is no longer connection from ve to device cgroup via css as it was in PCS6. Instead we should open device cgroup explicitly. https://jira.sw.ru/browse/PSBM-33555 v2 (by vdavydov@): - use ve::ve_name because we're switching to UUID based containers Signed-off-by: Cyrill Gorcunov gorcu...@odin.com CC: Vladimir Davydov vdavy...@odin.com CC: Konstantin Khorenko khore...@odin.com CC: Andrey Vagin ava...@odin.com Reviewed-by: Vladimir Davydov vdavy...@parallels.com ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH] properly charge and uncharge shmem
From: Andrew Perepechko pa...@cloudlinux.com Currently, shmem_lock immediately and unconditionally uncharges what it has just charged for a lock request. This, indeed, causes a double uncharge with something like the following: shmid = shmget(12345, 8192, IPC_CREAT | 0666); rc = shmctl(shmid, SHM_LOCK, NULL); shmctl(shmid, IPC_RMID, 0); with the following in the kernel log: [ 455.815025] Uncharging too much 2 h 0, res lockedpages ub 0 Signed-off-by: Andrew Perepechko pa...@cloudlinux.com --- mm/shmem.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index a6b3e30..d09a230 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1359,11 +1359,13 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) mapping_set_unevictable(file-f_mapping); } if (!lock (info-flags VM_LOCKED) user) { + ub_lockedshm_uncharge(info, inode-i_size); user_shm_unlock(inode-i_size, user); info-flags = ~VM_LOCKED; mapping_clear_unevictable(file-f_mapping); } - retval = 0; + spin_unlock(info-lock); + return 0; out_nomem: ub_lockedshm_uncharge(info, inode-i_size); -- 1.9.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ve/cgroups: Allow to attach non-self into ve cgroups, v3
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 729323172bc760a2daf4d790a5bffc74ec10c04d Author: Cyrill Gorcunov gorcu...@odin.com Date: Tue May 19 00:43:44 2015 +0400 ve/cgroups: Allow to attach non-self into ve cgroups, v3 In the vzctl/libvzctl bundle we restore a container like this: - create ve/$ctid cgroup - move self into this cgroup - run criu from inside so that the kernel code passes the ve_can_attach test. In turn, for our P.Haul project (which manages live migration) the situation is different -- it opens ve/$ctid but moves the criu service pid instead (so that the service will start the restore procedure). This leads to a situation where ve_can_attach fails with -EINVAL. Basically we need to 1) Check that if a task is getting attached to a VE cgroup, it is a single-threaded task. 2) In case of a multithreaded task, all threads should be moved in one pass (this is actually prepared by the cgroup_attach_task caller). 3) If the VE is stopping or starting, only kernel threads can attach. khorenko@: Check for thread_group_empty(task) is enough to be sure the task is single-threaded. 
https://jira.sw.ru/browse/PSBM-33561 Reported-by: Nikita Spiridonov nspirido...@odin.com Signed-off-by: Cyrill Gorcunov gorcu...@odin.com CC: Vladimir Davydov vdavy...@odin.com CC: Konstantin Khorenko khore...@odin.com CC: Pavel Emelyanov xe...@odin.com CC: Andrey Vagin ava...@odin.com --- kernel/ve/ve.c | 51 ++- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index e598d15..cf7c848 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -775,24 +775,31 @@ static void ve_destroy(struct cgroup *cg) static int ve_can_attach(struct cgroup *cg, struct cgroup_taskset *tset) { struct ve_struct *ve = cgroup_ve(cg); - struct task_struct *task = current; - - if (cgroup_taskset_size(tset) != 1 || - cgroup_taskset_first(tset) != task || - !thread_group_leader(task) || - !thread_group_empty(task)) - return -EINVAL; + struct task_struct *task; if (ve-is_locked) return -EBUSY; /* +* We either moving the whole group of threads, +* either a single thread process. +*/ + if (cgroup_taskset_size(tset) == 1) { + task = cgroup_taskset_first(tset); + if (!thread_group_empty(task)) + return -EINVAL; + } + + /* * Forbid userspace tasks to enter during starting or stopping. -* Permit attaching kernel threads and init task for this containers. +* Permit attaching kernel threads for this containers. 
*/ - if (!ve-is_running (ve-ve_ns || nr_threads_ve(ve)) - !(task-flags PF_KTHREAD)) - return -EPIPE; + if (!ve-is_running (ve-ve_ns || nr_threads_ve(ve))) { + cgroup_taskset_for_each(task, cg, tset) { + if (!(task-flags PF_KTHREAD)) + return -EPIPE; + } + } return 0; } @@ -800,20 +807,22 @@ static int ve_can_attach(struct cgroup *cg, struct cgroup_taskset *tset) static void ve_attach(struct cgroup *cg, struct cgroup_taskset *tset) { struct ve_struct *ve = cgroup_ve(cg); - struct task_struct *tsk = current; + struct task_struct *task; - /* this probihibts ptracing of task entered to VE from host system */ - if (ve-is_running tsk-mm) - tsk-mm-vps_dumpable = VD_VE_ENTER_TASK; + cgroup_taskset_for_each(task, cg, tset) { + /* this probihibts ptracing of task entered to VE from host system */ + if (ve-is_running task-mm) + task-mm-vps_dumpable = VD_VE_ENTER_TASK; - /* Drop OOM protection. */ - tsk-signal-oom_score_adj = 0; - tsk-signal-oom_score_adj_min = 0; + /* Drop OOM protection. */ + task-signal-oom_score_adj = 0; + task-signal-oom_score_adj_min = 0; - /* Leave parent exec domain */ - tsk-parent_exec_id--; + /* Leave parent exec domain */ + task-parent_exec_id--; - tsk-task_ve = ve; + task-task_ve = ve; + } } static int ve_state_read(struct cgroup *cg, struct cftype *cft, ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: check new size of block device on ioctl(GROW)
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 0385f754e9f680c7d5095ae981fe29c1b6e7323a Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:26:55 2015 +0400 ploop: check new size of block device on ioctl(GROW) Return error if userspace attempts to grow block device above limits imposed by ploop1 formats. https://jira.sw.ru/browse/PSBM-21027 Signed-off-by: Maxim Patlasov mpatla...@parallels.com --- drivers/block/ploop/fmt_ploop1.c | 4 drivers/block/ploop/ploop1_image.h | 13 + 2 files changed, 17 insertions(+) diff --git a/drivers/block/ploop/fmt_ploop1.c b/drivers/block/ploop/fmt_ploop1.c index 624bdc1..fb12c30 100644 --- a/drivers/block/ploop/fmt_ploop1.c +++ b/drivers/block/ploop/fmt_ploop1.c @@ -458,6 +458,10 @@ ploop1_prepare_grow(struct ploop_delta * delta, u64 *new_size, int *reloc) if (*new_size ((1 delta-cluster_log) - 1)) return -EINVAL; + if (*new_size ploop1_max_size(1 delta-plo-cluster_log, + delta-plo-fmt_version)) + return -EFBIG; + vh = (struct ploop_pvd_header *)page_address(ph-dyn_page); n_present = le32_to_cpu(vh-m_FirstBlockOffset) log; BUG_ON (!n_present); diff --git a/drivers/block/ploop/ploop1_image.h b/drivers/block/ploop/ploop1_image.h index 337c05b..c4efe87 100644 --- a/drivers/block/ploop/ploop1_image.h +++ b/drivers/block/ploop/ploop1_image.h @@ -247,6 +247,19 @@ ploop1_version(struct ploop_pvd_header *vh) return -1; } +static inline __u64 +ploop1_max_size(__u32 blocksize, int version) +{ + switch (version) { + case PLOOP_FMT_V1: + return (__u32)-1; + case PLOOP_FMT_V2: + return 0xUL * blocksize; + } + + return 0; +} + #ifdef __KERNEL__ static inline u64 get_SizeInSectors_from_le(struct ploop_pvd_header *vh, int version) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: fix a race condition on relocation of blocks
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit a762247cf8ff0b2ec0ba6e8a9742f7a5e38a8b15 Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:02 2015 +0400 ploop: fix a race condition on relocation of blocks map_release() are not atomic, because it calls atomic_read and atomic_dec_and_test. Looks like it was designed to be called under plo-lock. https://jira.sw.ru/browse/PSBM-23905 Signed-off-by: Andrey Vagin ava...@openvz.org Acked-by: Maxim Patlasov mpatla...@parallels.com --- drivers/block/ploop/dev.c | 6 ++ drivers/block/ploop/map.c | 6 ++ 2 files changed, 12 insertions(+) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index 353fb35..e3422d8 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -1471,12 +1471,14 @@ static int prepare_merge_req(struct ploop_request * preq) return res; drop_map: + spin_lock_irq(plo-lock); map_release(preq-trans_map); preq-trans_map = NULL; if (preq-map) { map_release(preq-map); preq-map = NULL; } + spin_unlock_irq(plo-lock); return 1; } @@ -1688,8 +1690,10 @@ ploop_entry_reloc_a_req(struct ploop_request *preq, iblock_t *iblk) if (*clu = MAP_MAX_IND(preq)) break; + spin_lock_irq(plo-lock); map_release(preq-map); preq-map = NULL; + spin_unlock_irq(plo-lock); } if (*clu = plo-map.max_index) { @@ -1814,8 +1818,10 @@ static int discard_get_index(struct ploop_request *preq) preq-iblock = 0; if (preq-map) { + spin_lock_irq(plo-lock); map_release(preq-map); preq-map = NULL; + spin_unlock_irq(plo-lock); } return 0; diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c index 5f50f81..2e971cd 100644 --- a/drivers/block/ploop/map.c +++ b/drivers/block/ploop/map.c @@ -145,6 +145,10 @@ static void flush_lru_buffer(struct ploop_map * map) map-lru_buffer_ptr = 0; } +/* + * map_release() must be called under plo-lock, because + * The pair atomic_read 
atomic_dec_and_test is not atomic. + */ void map_release(struct map_node * m) { struct ploop_map * map = m-parent; @@ -1026,9 +1030,11 @@ static void map_wb_complete_post_process(struct ploop_map *map, } if (test_bit(PLOOP_REQ_RELOC_S, preq-state)) { + spin_lock_irq(plo-lock); del_lockout(preq); map_release(preq-map); preq-map = NULL; + spin_unlock_irq(plo-lock); requeue_req(preq, PLOOP_E_RELOC_COMPLETE); return; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: prioritize BAT operations
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit d742aa564de94c3816a9d3a7991adb00d23678d4 Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:04 2015 +0400 ploop: prioritize BAT operations Ploop uses -read_page and -write_page methods of pio_direct to read/write index table. These operations are rare and usually someone is blocked on them. Let's give them a priority by setting SYNCIO flag. Signed-off-by: Maxim Patlasov mpatla...@parallels.com --- drivers/block/ploop/io_direct.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c index c18d2f0..e5eb66a 100644 --- a/drivers/block/ploop/io_direct.c +++ b/drivers/block/ploop/io_direct.c @@ -1432,7 +1432,7 @@ static void dio_read_page(struct ploop_io * io, struct ploop_request * preq, struct page * page, sector_t sec) { - dio_io_page(io, READ, preq, page, sec); + dio_io_page(io, READ | REQ_SYNC, preq, page, sec); } static void @@ -1444,7 +1444,8 @@ dio_write_page(struct ploop_io * io, struct ploop_request * preq, return; } - dio_io_page(io, WRITE | (fua ? REQ_FUA : 0), preq, page, sec); + dio_io_page(io, WRITE | (fua ? REQ_FUA : 0) | REQ_SYNC, + preq, page, sec); } static int ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: make manual abort transition verbose
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 9a5fe498a7a1d9c1ecf4001c0766f325f1139079 Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:09 2015 +0400 ploop: make manual abort transition verbose Signed-off-by: Dmitry Monakhov dmonak...@openvz.org --- drivers/block/ploop/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/ploop/sysfs.c b/drivers/block/ploop/sysfs.c index 3ef53ac..07a4829 100644 --- a/drivers/block/ploop/sysfs.c +++ b/drivers/block/ploop/sysfs.c @@ -326,6 +326,9 @@ static u32 show_aborted(struct ploop_device * plo) static int store_aborted(struct ploop_device * plo, u32 val) { + printk(KERN_INFO ploop: Force %s aborted state for ploop%d\n, + val ? set : clear, plo-index); + if (val) set_bit(PLOOP_S_ABORT, plo-state); else ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: warning on disk full condition
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit b6eb7575242d5e266d231ed53a4f7e03e47b2a68 Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:10 2015 +0400 ploop: warning on disk full condition People complain that it's not always obvious why an app in CT gets -ENOSPC while there remains some space on host filesystem. The patch adds time ratelimited printk about disk full condition. Maximal rate is 1 per hour. https://bugzilla.openvz.org/show_bug.cgi?id=3045 Signed-off-by: Maxim Patlasov mpatla...@parallels.com --- drivers/block/ploop/dev.c | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index 9aaab4a..ab99724 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -3533,8 +3533,18 @@ static int ploop_bd_full(struct backing_dev_info *bdi, long long nr, int root) current-journal_info = NULL; ret = sb-s_op-statfs(F_DENTRY(file), buf); - if (ret || buf.f_bfree * buf.f_bsize reserved + nr) + if (ret || buf.f_bfree * buf.f_bsize reserved + nr) { + static unsigned long full_warn_time; + + if (printk_timed_ratelimit(full_warn_time, 60*60*HZ)) + printk(KERN_WARNING + ploop%d: host disk is almost full + (%llu %llu); CT sees -ENOSPC !\n, + plo-index, buf.f_bfree * buf.f_bsize, + reserved + nr); + rc = 1; + } fput(file); current-journal_info = jctx; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: fix busyloop on secondary discard bio
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit a5678140dd8f793272b5e562e81e27e2a249e4fd Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:11 2015 +0400 ploop: fix busyloop on secondary discard bio After diff-ploop-add-a-separate-queue-for-discard-bio-s, ploop_thread() skips processing previously queued discard bio-s if any discard bio is already under processing (fbd->fbd_dbl is not empty). ploop_wait() must take care of such a case, otherwise a busyloop may happen: ploop_thread() believes that it has to go to sleep because all incoming queues are empty except plo->bio_discard_list, which cannot be processed right now, and calls ploop_wait(); the latter returns immediately because plo->bio_discard_list is not empty and hence needs processing. The patch also fixes a trivial bug in discard bio accounting: ploop_bio_queue() is called for all bio-s including discard bio-s and it decrements bio_qlen unconditionally. This is incorrect: it has to decrement either bio_qlen or bio_discard_qlen depending on the type of bio. 
https://jira.sw.ru/browse/PSBM-30451 https://bugzilla.openvz.org/show_bug.cgi?id=3124 Signed-off-by: Maxim Patlasov mpatla...@parallels.com Acked-by: Andrew Vagin ava...@parallels.com --- drivers/block/ploop/dev.c | 9 +++-- drivers/block/ploop/freeblks.c | 12 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index e2ff0aa..ac0f28f 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -551,7 +551,11 @@ ploop_bio_queue(struct ploop_device * plo, struct bio * bio, __TRACE(A %p %u\n, preq, preq-req_cluster); - plo-bio_qlen--; + if (unlikely(bio-bi_rw REQ_DISCARD)) + plo-bio_discard_qlen--; + else + plo-bio_qlen--; + ploop_entry_add(plo, preq); if (bio-bi_size !(bio-bi_rw REQ_DISCARD)) @@ -2563,7 +2567,8 @@ static void ploop_wait(struct ploop_device * plo, int once, struct blk_plug *plu !plo-active_reqs)) break; } else if (plo-bio_head || - !bio_list_empty(plo-bio_discard_list)) { + (!bio_list_empty(plo-bio_discard_list) +!ploop_discard_is_inprogress(plo-fbd))) { /* ready_queue and entry_queue are empty, but * bio list not. 
Obviously, we'd like to process * bio_list instead of sleeping */ diff --git a/drivers/block/ploop/freeblks.c b/drivers/block/ploop/freeblks.c index cf48d3a..89108c7 100644 --- a/drivers/block/ploop/freeblks.c +++ b/drivers/block/ploop/freeblks.c @@ -696,20 +696,24 @@ int ploop_fb_get_free_block(struct ploop_freeblks_desc *fbd, static void fbd_complete_bio(struct ploop_freeblks_desc *fbd, int err) { + struct ploop_device *plo = fbd-plo; unsigned int nr_completed = 0; while (fbd-fbd_dbl.head) { struct bio * bio = fbd-fbd_dbl.head; fbd-fbd_dbl.head = bio-bi_next; bio-bi_next = NULL; - BIO_ENDIO(fbd-plo-queue, bio, err); + BIO_ENDIO(plo-queue, bio, err); nr_completed++; } fbd-fbd_dbl.tail = NULL; - spin_lock_irq(fbd-plo-lock); - fbd-plo-bio_total -= nr_completed; - spin_unlock_irq(fbd-plo-lock); + spin_lock_irq(plo-lock); + plo-bio_total -= nr_completed; + if (!bio_list_empty(plo-bio_discard_list) + waitqueue_active(plo-waitq)) + wake_up_interruptible(plo-waitq); + spin_unlock_irq(plo-lock); } void ploop_fb_reinit(struct ploop_freeblks_desc *fbd, int err) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: mark reloc reqs to force FUA before write of relocated data
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 622b02378d190968a9ad04f5e8161a1574a1d2df Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:15 2015 +0400 ploop: mark reloc reqs to force FUA before write of relocated data Series description: During relocation of ploop clusters (resize/balloon) we need to FUA/fsync the image file after such operations: a) new data block written b) BAT update c) nullify old data block for BAT grow. We already do this for the nullify of the old data block in the format module - complete_grow callback. This patch forces fsync(kaio), FUA(direct) of reloc write I/O to the image by marking such reloc reqs(A|S) with appropriate flags. Kaio/direct modules are tuned by the patch to force fsync/FUA if these flags are set. This code does FUA/fsync only for cases a) and b), while c) is already implemented. Also the patch fixes inconsistent bio list FUA processing in the direct module. The problem is that for a bunch of bios we only set FUA on the last bio. It's possible in case of a power outage that the last bio will be stored and the previous ones are not, because they are stored only in cache at the time of the power failure. To solve the problem this patch marks the last bio as FLUSH|FUA if there is more than one bio in the list. Moreover, for KAIO, if fsync is possible at the BAT update stage we do that like we did in the direct case instead of 2 fsync's. For the direct case, if we are going to make FUA at BAT update only (optimization trick that already exists) then we need to mark the req to FLUSH previously written (without FUA) data. Performance: Overall (includes EXT4 resize up to 16T) resize performance degraded by -5% of time. 
https://jira.sw.ru/browse/PSBM-31222 https://jira.sw.ru/browse/PSBM-31225 https://jira.sw.ru/browse/PSBM-31321 Signed-off-by: Andrey Smetanin asmeta...@parallels.com Andrey Smetanin (7): ploop: define struct ploop_request-state flags to force pre FLUSH before write IO and FUA/fsync at I/O complete ploop: mark reloc reqs to force FUA/fsync(kaio) for index update I/O ploop: mark reloc reqs to force FUA before write of relocated data ploop: direct: to support truly FLUSH/FUA of req we need mark first bio FLUSH, write all bios and mark last bio as FLUSH/FUA ploop: added ploop_req_delay_fua_possible() func that detects possible delaying of upcoming FUA to index update stage. This function will be lately used in direct/kaio code to detect and delay FUA ploop: make image fsync at I/O complete if it's required by FUA/fsync force flag or by req-req_rw ploop: do preflush or postfua according force FUA/flush flags, and delay FUA if possible but add force FLUSH to req if so This patch description: Need to force FUA/fsync of relocated data write for consistent resize. Signed-off-by: Andrey Smetanin asmeta...@parallels.com Reviewed-by: Andrew Vagin ava...@parallels.com --- drivers/block/ploop/dev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index ac0f28f..bd5fe37 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -2434,6 +2434,9 @@ restart: top_delta = ploop_top_delta(plo); sbl.head = sbl.tail = preq-aux_bio; + /* Relocated data write required sync before BAT updatee */ + set_bit(PLOOP_REQ_FORCE_FUA, preq-state); + if (test_bit(PLOOP_REQ_RELOC_S, preq-state)) { preq-eng_state = PLOOP_E_DATA_WBI; plo-st.bio_out++; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ms/memcg/proc: add kpagecgroup file
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 3e48c113a59f801934292fc89e6915b1d8a341a7 Author: Vladimir Davydov vdavy...@parallels.com Date: Tue May 19 08:23:48 2015 +0400 ms/memcg/proc: add kpagecgroup file Patchset description: idle memory tracking This patch set backports https://lkml.org/lkml/2015/5/12/449 which is required by vcmmd. It is not yet clear if the original patch set will be accepted upstream as is, there still may be changes. However, I hope the user API will be preserved. If it is not, we will have to fix this in our kernel too. https://jira.sw.ru/browse/PSBM-32460 Vladimir Davydov (3): memcg: add page_cgroup_ino helper proc: add kpagecgroup file proc: add kpageidle file === This patch description: /proc/kpagecgroup contains a 64-bit inode number of the memory cgroup each page is charged to, indexed by PFN. Having this information is useful for estimating a cgroup working set size. The file is present if CONFIG_PROC_PAGE_MONITOR CONFIG_MEMCG. Signed-off-by: Vladimir Davydov vdavy...@parallels.com --- Documentation/vm/pagemap.txt | 6 - fs/proc/Kconfig | 5 +++-- fs/proc/page.c | 53 3 files changed, 61 insertions(+), 3 deletions(-) diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index fd7c3cf..e37cff9 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt @@ -5,7 +5,7 @@ pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow userspace programs to examine the page tables and related information by reading files in /proc. -There are three components to pagemap: +There are four components to pagemap: * /proc/pid/pagemap. This file lets a userspace process find out which physical frame each virtual page is mapped to. It contains one 64-bit @@ -63,6 +63,10 @@ There are three components to pagemap: 21. KSM 22. THP + * /proc/kpagecgroup. 
This file contains a 64-bit inode number of the + memory cgroup each page is charged to, indexed by PFN. Only available when + CONFIG_MEMCG is set. + Short descriptions to the page flags: 0. LOCKED diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 15af622..e8ed22d 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -65,5 +65,6 @@ config PROC_PAGE_MONITOR help Various /proc files exist to monitor process memory utilization: /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, - /proc/kpagecount, and /proc/kpageflags. Disabling these - interfaces will reduce the size of the kernel by approximately 4kb. + /proc/kpagecount, /proc/kpageflags, and /proc/kpagecgroup. + Disabling these interfaces will reduce the size of the kernel + by approximately 4kb. diff --git a/fs/proc/page.c b/fs/proc/page.c index cab84b6..c9cbed3 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -8,6 +8,7 @@ #include linux/proc_fs.h #include linux/seq_file.h #include linux/hugetlb.h +#include linux/memcontrol.h #include linux/kernel-page-flags.h #include asm/uaccess.h #include internal.h @@ -213,10 +214,62 @@ static const struct file_operations proc_kpageflags_operations = { .read = kpageflags_read, }; +#ifdef CONFIG_MEMCG +static ssize_t kpagecgroup_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + u64 ino; + + pfn = src / KPMSIZE; + count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); + if (src KPMMASK || count KPMMASK) + return -EINVAL; + + while (count 0) { + if (pfn_valid(pfn)) + ppage = pfn_to_page(pfn); + else + ppage = NULL; + + if (ppage) + ino = page_cgroup_ino(ppage); + else + ino = 0; + + if (put_user(ino, out)) { + ret = -EFAULT; + break; + } + + pfn++; + out++; + count -= KPMSIZE; + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static const struct 
file_operations proc_kpagecgroup_operations = { + .llseek = mem_lseek, + .read = kpagecgroup_read, +}; +#endif /* CONFIG_MEMCG */ + static int __init proc_page_init(void) { proc_create(kpagecount, S_IRUSR,
[Devel] [PATCH RHEL7 COMMIT] ploop: prevent dangerous ploop-umount
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 8854414d2d97abd7ab86d4c9d1c74d9b2fc04c3c Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:26:56 2015 +0400 ploop: prevent dangerous ploop-umount Umounting ploop device if inner fs is still mounted on it leads to numerous complains in kernel logs like: VFS: Busy inodes after unmount. sb = 880108987000, fs type = ext4, sb count = 2, sb-s_root = / and is not what user expected. The patch adds some protection from dummy userspace mistakes: do not allow to stop ploop device (this is the first step of ploop-umount) if user uses /dev/ploopNp1 for ioctl, or if someone (inner fs) is still using the device. https://jira.sw.ru/browse/PSBM-21474 Signed-off-by: Maxim Patlasov mpatla...@parallels.com --- drivers/block/ploop/dev.c | 14 ++ 1 file changed, 14 insertions(+) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index 5a3a5ec..2f4928d 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -3548,6 +3548,20 @@ static int ploop_stop(struct ploop_device * plo, struct block_device *bdev) struct ploop_delta * delta; int cnt; + if (bdev != bdev-bd_contains) { + if (printk_ratelimit()) + printk(KERN_INFO stop ploop%d failed (wrong bdev)\n, + plo-index); + return -ENODEV; + } + + if (bdev-bd_contains-bd_holders) { + if (printk_ratelimit()) + printk(KERN_INFO stop ploop%d failed (holders=%d)\n, + plo-index, bdev-bd_contains-bd_holders); + return -EBUSY; + } + if (!test_bit(PLOOP_S_RUNNING, plo-state)) return -EINVAL; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: fix iblk-to-sector calculations
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 96f009d1061c9e1ec9b6c7699eef565bcd44f26a Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:26:59 2015 +0400 ploop: fix iblk-to-sector calculations iblk stands for image-file block number. Its size is the same as u32. The size of 'sector' is the same as long. While converting the former to the latter like this: sec = iblk shift, we must always cast 'iblk' to long. And we actually do in most cases. The patch fixes a place in io_direct module where it was forgotten. https://jira.sw.ru/browse/PSBM-22961 Signed-off-by: Maxim Patlasov mpatla...@parallels.com --- drivers/block/ploop/io_direct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c index ab74849..56b9f37 100644 --- a/drivers/block/ploop/io_direct.c +++ b/drivers/block/ploop/io_direct.c @@ -119,8 +119,8 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq, goto out_em_err; if (write em-block_start == BLOCK_UNINIT) { - sector_t end = (iblk + 1) preq-plo-cluster_log; - sec = iblk preq-plo-cluster_log; + sector_t end = (sector_t)(iblk + 1) preq-plo-cluster_log; + sec = (sector_t)iblk preq-plo-cluster_log; if (em-start = sec) sec = em-end; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: reverse order of fdatawait and fsync fop
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 0ac13b3ba07b42573f151c21d9727a2cbcd415d1 Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:00 2015 +0400 ploop: reverse order of fdatawait and fsync fop dio_fsync_thread must call filemap_fdatawrite() before file-f_op-fsync(). Otherwise: 8,06 82 0.003095587 12328 D WS 441706496 + 512 [ploop19054] 8,06 83 0.003103726 12328 D WS 441707008 + 512 [ploop19054] 8,06 84 0.003108627 12328 D WS 441707520 + 512 [ploop19054] 8,06 85 0.003113176 12328 D WS 441708032 + 512 [ploop19054] ... 8,06 102 0.003149386 1299 D WS 3950526248 + 24 [jbd2/dm-1-8] ... 8,06 103 0.003305550 0 C WS 441706496 + 512 [0] 8,06 104 0.003458057 0 C WS 441707008 + 512 [0] 8,06 105 0.003608325 0 C WS 441707520 + 512 [0] 8,06 106 0.003758297 0 C WS 441708032 + 512 [0] 8,06 107 0.003794543 0 C WS 3950526248 + 24 [0] And if the node crashes (or reboot happens) after last dispatch, journal data may come to the disk while user bulk data -- not. The result would be ploop image corruption. The patch re-arranges the sequence of calls to make it safe and natural (the same way as in vfs_fsync_range()). Signed-off-by: Maxim Patlasov mpatla...@parallels.com --- drivers/block/ploop/io_direct.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c index babc940..c18d2f0 100644 --- a/drivers/block/ploop/io_direct.c +++ b/drivers/block/ploop/io_direct.c @@ -735,14 +735,13 @@ static int dio_fsync_thread(void * data) spin_unlock_irq(plo-lock); /* filemap_fdatawrite() has been made already */ + filemap_fdatawait(io-files.mapping); err = 0; if (io-files.file-f_op-fsync) err = io-files.file-f_op-FOP_FSYNC(io-files.file, 0); - filemap_fdatawait(io-files.mapping); - /* Do we need to invalidate page cache? 
Not really, * because we use it only to create full new pages, * which we overwrite completely. Probably, we should ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: support 4K block-size of host block-device
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 5e02bd5942dd5cfd66f5b4096e966ae9b134b5ea Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:01 2015 +0400 ploop: support 4K block-size of host block-device Avoid 512-bytes reads/writes. They were used by 'expanded' format module to get and save format header. Let's use 4K reads/writes instead. Customer's problem: [root@pcstest10 ~]# ploop mount /vz3/test.hdd add delta dev=/dev/ploop19025 img=/vz3/test.hdd (rw) Can't add image /vz3/test.hdd: Input/output error [root@pcstest10 ~]# Right after trying to mount the image the kernel throws the following: [1564044.775584] sd 13:0:0:0: [sde] Bad block number requested The block size of this device is not 512 as for other direct attached disks. It is 4096 and the device is an iSCSI target. https://jira.sw.ru/browse/PSBM-21989 Signed-off-by: Maxim Patlasov mpatla...@parallels.com --- drivers/block/ploop/fmt_ploop1.c | 28 ++-- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/block/ploop/fmt_ploop1.c b/drivers/block/ploop/fmt_ploop1.c index fb12c30..5ce6915 100644 --- a/drivers/block/ploop/fmt_ploop1.c +++ b/drivers/block/ploop/fmt_ploop1.c @@ -78,7 +78,7 @@ static int ploop1_stop(struct ploop_delta * delta) vh = (struct ploop_pvd_header *)page_address(ph-dyn_page); - err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 512, 0, 0); + err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 4096, 0, 0); if (err) return err; @@ -90,7 +90,7 @@ static int ploop1_stop(struct ploop_delta * delta) vh-m_DiskInUse = 0; - err = delta-io.ops-sync_write(delta-io, ph-dyn_page, 512, 0, 0); + err = delta-io.ops-sync_write(delta-io, ph-dyn_page, 4096, 0, 0); if (err) return err; @@ -128,7 +128,7 @@ ploop1_open(struct ploop_delta * delta) goto out_err; /* IO engine is ready. 
*/ - err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 512, 0, 0); + err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 4096, 0, 0); if (err) goto out_err; @@ -168,7 +168,7 @@ ploop1_open(struct ploop_delta * delta) if (!(delta-flags PLOOP_FMT_RDONLY)) { vh-m_DiskInUse = cpu_to_le32(SIGNATURE_DISK_IN_USE); - err = delta-io.ops-sync_write(delta-io, ph-dyn_page, 512, 0, 0); + err = delta-io.ops-sync_write(delta-io, ph-dyn_page, 4096, 0, 0); if (err) goto out_err; } @@ -198,7 +198,7 @@ ploop1_refresh(struct ploop_delta * delta) vh = (struct ploop_pvd_header *)page_address(ph-dyn_page); - err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 512, 0, 0); + err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 4096, 0, 0); if (err) return err; @@ -266,7 +266,7 @@ ploop1_sync(struct ploop_delta * delta) if (err) return err; - err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 512, 0, 0); + err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 4096, 0, 0); if (err) return err; @@ -279,7 +279,7 @@ ploop1_sync(struct ploop_delta * delta) vh-m_Flags = cpu_to_le32(vh-m_Flags); } - err = delta-io.ops-sync_write(delta-io, ph-dyn_page, 512, 0, 0); + err = delta-io.ops-sync_write(delta-io, ph-dyn_page, 4096, 0, 0); if (err) return err; @@ -312,7 +312,7 @@ ploop1_complete_snapshot(struct ploop_delta * delta, struct ploop_snapdata * sd) if (err) goto out; - err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 512, 0, 0); + err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 4096, 0, 0); if (err) goto out; @@ -335,7 +335,7 @@ ploop1_complete_snapshot(struct ploop_delta * delta, struct ploop_snapdata * sd) * remain valid. 
*/ - err = delta->io.ops->sync_write(&delta->io, ph->dyn_page, 512, 0, 0); + err = delta->io.ops->sync_write(&delta->io, ph->dyn_page, 4096, 0, 0); if (err) goto out; @@ -367,7 +367,7 @@ ploop1_prepare_merge(struct ploop_delta * delta, struct ploop_snapdata * sd) vh = (struct ploop_pvd_header *)page_address(ph->dyn_page); - err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 512, 0, 0); + err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0); if (err) return err; @@ -403,7 +403,7 @@ ploop1_start_merge(struct ploop_delta * delta, struct ploop_snapdata * sd) return -EIO; } - err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 512, 0, 0); + err
[Devel] [PATCH RHEL7 COMMIT] ploop: bug on bad fiemap (v2)
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit e3b634ed036e618d74643faaa478dc3951c2f781 Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:05 2015 +0400 ploop: bug on bad fiemap (v2) Based on crash analysis, one of extents from ploop em-tree is bad: 883fe6230ae0 start = 19380224 end = 19447808 block_start = 0 refs = { counter = 1 } ploop never calculates em-block_start other than by direct assigning: em-block_start = fi_extent.fe_physical 9; The patch attempts to catch erroneous (zero) output immediately after fiemap call. Changed in v2: - WARN_ON (instead of BUG_ON) for delalloc extents https://jira.sw.ru/browse/PSBM-26762 Signed-off-by: Maxim Patlasov mpatla...@parallels.com --- drivers/block/ploop/io_direct_map.c | 23 ++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/block/ploop/io_direct_map.c b/drivers/block/ploop/io_direct_map.c index b3cb04d..b9a0ce9 100644 --- a/drivers/block/ploop/io_direct_map.c +++ b/drivers/block/ploop/io_direct_map.c @@ -641,6 +641,7 @@ static struct extent_map *__map_extent_bmap(struct ploop_io *io, { struct extent_map_tree *tree = io-files.em_tree; struct inode *inode = mapping-host; + loff_t start_off = (loff_t)start 9; struct extent_map *em; struct fiemap_extent_info fieinfo; struct fiemap_extent fi_extent; @@ -681,6 +682,25 @@ again: old_fs = get_fs(); set_fs(KERNEL_DS); ret = inode-i_op-fiemap(inode, fieinfo, start 9, 1); + + /* chase for PSBM-26762: em-block_start == 0 */ + if (!ret fieinfo.fi_extents_mapped == 1 + !(fi_extent.fe_flags FIEMAP_EXTENT_UNWRITTEN) + (fi_extent.fe_physical 9) == 0) { + /* see how ext4_fill_fiemap_extents() implemented */ + if (!(fi_extent.fe_flags FIEMAP_EXTENT_DELALLOC)) { + printk(bad fiemap(%ld,%ld) on inode=%p fieinfo=%p +i_size=%lld\n, start, len, inode, fieinfo, + i_size_read(inode)); + BUG(); + } + /* complain about delalloc case -- ploop 
always fallocate + * before buffered write */ + WARN(1, ploop%d: delalloc extent [%lld,%lld] for [%lld,%ld]; +i_size=%lld\n, io-plo-index, fi_extent.fe_logical, + fi_extent.fe_length, start_off, len 9, i_size_read(inode)); + ret = -ENOENT; + } set_fs(old_fs); if (ret) { @@ -808,9 +828,10 @@ void trim_extent_mappings(struct extent_map_tree *tree, sector_t start) while ((em = lookup_extent_mapping(tree, start, ((sector_t)(-1ULL)) - start))) { remove_extent_mapping(tree, em); + WARN_ON(atomic_read(em-refs) != 2); /* once for us */ extent_put(em); - /* _XXX_ This cannot be correct in the case of concurrent lookups */ + /* No concurrent lookups due to ploop_quiesce(). See WARN_ON above */ /* once for the tree */ extent_put(em); } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: put top-delta back if merge failed
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit ee2968cd8321728956effea19e98959befec32d0 Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:07 2015 +0400 ploop: put top-delta back if merge failed Before merge, we move top-delta to a temporary plo-trans_map list. Since then, it's not present in the main plo-map list anymore. If merge failed, we must put it back to plo-map. Otherwise the delta will be lost forever (visible in /sys/block/ploop*/pdelta/*, but not accessible from ploop). https://jira.sw.ru/browse/PSBM-25252 Signed-off-by: Maxim Patlasov mpatla...@parallels.com Acked-by: Pavel Emelyanov xe...@parallels.com --- drivers/block/ploop/dev.c | 49 +++ 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index d2a9eb4..2e6302f 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -3280,6 +3280,26 @@ static void ploop_update_fmt_version(struct ploop_device * plo) } } +static void ploop_merge_cleanup(struct ploop_device * plo, + struct ploop_map * map, + struct ploop_delta * delta, int err) +{ + ploop_quiesce(plo); + mutex_lock(plo-sysfs_mutex); + list_del(delta-list); + + if (err) + list_add(delta-list, plo-map.delta_list); + else + ploop_update_fmt_version(plo); + + plo-trans_map = NULL; + plo-maintenance_type = PLOOP_MNTN_OFF; + mutex_unlock(plo-sysfs_mutex); + ploop_map_destroy(map); + ploop_relax(plo); +} + static int ploop_merge(struct ploop_device * plo) { int err; @@ -3368,32 +3388,19 @@ already: if (test_bit(PLOOP_S_ABORT, plo-state)) { printk(KERN_WARNING merge for ploop%d failed (state ABORT)\n, plo-index); - plo-trans_map = NULL; - plo-maintenance_type = PLOOP_MNTN_OFF; err = -EIO; - goto out; } - ploop_quiesce(plo); - mutex_lock(plo-sysfs_mutex); - plo-trans_map = NULL; - plo-maintenance_type = PLOOP_MNTN_OFF; - list_del(delta-list); - 
ploop_update_fmt_version(plo); - mutex_unlock(&plo->sysfs_mutex); - ploop_map_destroy(map); - ploop_relax(plo); + ploop_merge_cleanup(plo, map, delta, err); - kfree(map); - - kobject_del(&delta->kobj); - kobject_put(&plo->kobj); - - delta->ops->stop(delta); - delta->ops->destroy(delta); - kobject_put(&delta->kobj); - return 0; + if (!err) { + kobject_del(&delta->kobj); + kobject_put(&plo->kobj); + delta->ops->stop(delta); + delta->ops->destroy(delta); + kobject_put(&delta->kobj); + } out: kfree(map); return err; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: added ploop_req_delay_fua_possible() func that detects possible delaying of upcoming FUA to index update stage
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit abc95cfc45bd725c7aba2f7697e322413ae5725a Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:12 2015 +0400 ploop: added ploop_req_delay_fua_possible() func that detects possible delaying of upcoming FUA to index update stage During relocation of ploop clusters (resize/baloon) we need to FUA/fsync image file after such operations: a) new data block wrote b) BAT update c) nullify old data block for BAT grow. We do this already nullify of old data block at format module - complete_grow callback. This patch forses fsync(kaio), FUA(direct) of reloc write I/O to image by marking such reloc reqs(A|S) with appropriate flags. Kaio/direct modules tuned by patch to force fsync/FUA if these flags are set. This code does FUA/fsync only for a) and b) cases, while c) already implemented. Also patch fixes inconsistent bio list FUA processing in direct module. The problem is that for bunch of bios we only set FUA at last bio. Its possible in case of power outage that last bio will be stored and previos are not because they are stored only in cache at the time of power failure. To solve problem this patch marking last bio as FLUSH|FUA if more than one bio in list. Moreover for KAIO if fsync possible at BAT update stage we do that like we did in direct case instead of 2 fsync's. For direct case if we going to make FUA at BAT update only(optimization trick that already exists) then we need to mark req to FLUSH previously written(without FUA) data. Performance: Overall(includes EXT4 resize upto 16T) resize performance degradated by -5% of time. 
https://jira.sw.ru/browse/PSBM-31222 https://jira.sw.ru/browse/PSBM-31225 https://jira.sw.ru/browse/PSBM-31321 Signed-off-by: Andrey Smetanin asmeta...@parallels.com Andrey Smetanin (7): ploop: define struct ploop_request-state flags to force pre FLUSH before write IO and FUA/fsync at I/O complete ploop: mark reloc reqs to force FUA/fsync(kaio) for index update I/O ploop: mark reloc reqs to force FUA before write of relocated data ploop: direct: to support truly FLUSH/FUA of req we need mark first bio FLUSH, write all bios and mark last bio as FLUSH/FUA ploop: added ploop_req_delay_fua_possible() func that detects possible delaying of upcoming FUA to index update stage. This function will be lately used in direct/kaio code to detect and delay FUA ploop: make image fsync at I/O complete if it's required by FUA/fsync force flag or by req-req_rw ploop: do preflush or postfua according force FUA/flush flags, and delay FUA if possible but add force FLUSH to req if so This patch description: This function will be lately used in direct/kaio code to detect and delay FUA. https://jira.sw.ru/browse/PSBM-31222 https://jira.sw.ru/browse/PSBM-31225 https://jira.sw.ru/browse/PSBM-31321 Signed-off-by: Andrey Smetanin asmeta...@parallels.com Reviewed-by: Andrew Vagin ava...@parallels.com --- include/linux/ploop/ploop.h | 17 + 1 file changed, 17 insertions(+) diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h index eacd36a..d8b83a6 100644 --- a/include/linux/ploop/ploop.h +++ b/include/linux/ploop/ploop.h @@ -577,6 +577,23 @@ void ploop_fail_request(struct ploop_request * preq, int err); void ploop_preq_drop(struct ploop_device * plo, struct list_head *drop_list, int keep_locked); + +static inline int ploop_req_delay_fua_possible(unsigned long rw, + struct ploop_request *preq) +{ + int delay_fua = 0; + + /* In case of eng_state != COMPLETE, we'll do FUA in +* ploop_index_update(). Otherwise, we should post +* fua. 
+*/ + if (rw & REQ_FUA) { + if (preq->eng_state != PLOOP_E_COMPLETE) + delay_fua = 1; + } + return delay_fua; +} + static inline void ploop_set_error(struct ploop_request * preq, int err) { if (!preq->error) { ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: force FUA of nullified blocks for BAT grow
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 051a2a154c0e040d7d15ab9a3b56b77d9de021b3 Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:16 2015 +0400 ploop: force FUA of nullified blocks for BAT grow Lately we think we does sync of nullified blocks at format driver by image fsync before header BAT size grow update. But we write this data directly into underlying device bypassing EXT4 by usage of extent map tree (see dio_submit()). So fsync of EXT4 image doesnt help us. We need to force sync of nullified blocks. This patch does it by marking preq via PLOOP_REQ_FORCE_FUA flag. https://jira.sw.ru/browse/PSBM-31969 Signed-off-by: Andrey Smetanin asmeta...@parallels.com Acked-by: Andrew Vagin ava...@parallels.com --- drivers/block/ploop/map.c | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c index 67e2852..8ea67e9 100644 --- a/drivers/block/ploop/map.c +++ b/drivers/block/ploop/map.c @@ -1056,10 +1056,14 @@ static void map_wb_complete_post_process(struct ploop_map *map, 0, PAGE_SIZE); /* -* FUA of this data occures at format driver -complete_grow() by -* all image sync. After that header size increased to use this -* cluster as BAT cluster. +* Lately we think we does sync of nullified blocks at format +* driver by image fsync before header update. +* But we write this data directly into underlying device +* bypassing EXT4 by usage of extent map tree +* (see dio_submit()). So fsync of EXT4 image doesnt help us. +* We need to force sync of nullified blocks. */ + set_bit(PLOOP_REQ_FORCE_FUA, preq-state); top_delta-io.ops-submit(top_delta-io, preq, preq-req_rw, sbl, preq-iblock, 1plo-cluster_log); } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] vzctl PRE_CREATE hook
(I previously replied to Nikolay only -- re-sending with devel@ included) On 05/12/2015 02:36 AM, Nikolay Tenev wrote: Hello devs, In my project I wanted to make every OpenVZ container to use for a private directory (VE_PRIVATE) separated block device (HDD partition, lvm volume, NFS share, etc.). To use wrapper script over vzctl was one option, but PRE_CREATE hook, in which to create, mkfs and mount LVM volume would be even better. So, I'm not a developer, but using code from POST_CREATE hook I was able to create the PRE_CREATE one, which can be used as the other hooks e.g. add in /etc/vz/dists/ default PRE_CREATE = precreate.sh and during vzctl --create ... it will call /etc/vz/dists/scripts/precreate.sh with VEID as argument Nope. These scripts are per-distribution scripts, i.e. they are targeted for various distro-specific things, such as setting IP addresses etc. What you need is a global script, not dependent on CT distro. I suggest a precreate.sh script similar to prestart.sh one (for details, see commit https://github.com/kolyshkin/vzctl/commit/0807ef4) Currently I have a patch to vzctl master branch which implements this PRE_CREATE hook and I'm ready to share it. So my questions are: - Do you find this for interesting and/or useful? - If 'yes', what is the right way to send this patch: here, by email; or to create pull request in git repo? The best way would be to redo as advised above and send a patch to devel@ list. Thanks, Kir. Best regards! Nikolay Tenev ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ms/memcg: add page_cgroup_ino helper
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 77c59afe2b55a1dd631c3b8a6d3763eff8d09941 Author: Vladimir Davydov vdavy...@parallels.com Date: Tue May 19 08:23:31 2015 +0400 ms/memcg: add page_cgroup_ino helper Patchset description: idle memory tracking This patch set backports https://lkml.org/lkml/2015/5/12/449 which is required by vcmmd. It is not yet clear if the original patch set will be accepted upstream as is, there still may be changes. However, I hope the user API will be preserved. If it is not, we will have to fix this in our kernel too. https://jira.sw.ru/browse/PSBM-32460 Vladimir Davydov (3): memcg: add page_cgroup_ino helper proc: add kpagecgroup file proc: add kpageidle file === This patch description: Hwpoison allows to filter pages by memory cgroup ino. To ahieve that, it calls try_get_mem_cgroup_from_page(), then mem_cgroup_css(), and finally extracts the inode number from the cgroup returned. This looks bulky. Since in the next patch I need to get the ino of the memory cgroup a page is charged to too, in this patch I introduce the page_cgroup_ino() helper. Note that page_cgroup_ino() only considers those pages that are charged to mem_cgroup-res (i.e. page_cgroup-mem_cgroup != NULL), and for others it returns 0, while try_get_mem_cgroup_page(), used by hwpoison before, may extract the cgroup from a swapcache readahead page too. Ignoring swapcache readahead pages allows to call page_cgroup_ino() on unlocked pages, which is nice. Hwpoison users will hardly see any difference. 
Signed-off-by: Vladimir Davydov vdavy...@parallels.com --- include/linux/memcontrol.h | 3 +++ mm/hwpoison-inject.c | 3 --- mm/memcontrol.c| 22 ++ mm/memory-failure.c| 18 +- 4 files changed, 26 insertions(+), 20 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 675b4c5..5507be5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -200,6 +200,9 @@ void mem_cgroup_split_huge_fixup(struct page *head); bool mem_cgroup_bad_page_check(struct page *page); void mem_cgroup_print_bad_page(struct page *page); #endif + +unsigned long page_cgroup_ino(struct page *page); + #else /* CONFIG_MEMCG */ struct mem_cgroup; diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 3a61efc..bd580f8 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -44,12 +44,9 @@ static int hwpoison_inject(void *data, u64 val) /* * do a racy check with elevated page count, to make sure PG_hwpoison * will only be set for the targeted owner (or on a free page). -* We temporarily take page lock for try_get_mem_cgroup_from_page(). * memory_failure() will redo the check reliably inside page lock. */ - lock_page(hpage); err = hwpoison_filter(hpage); - unlock_page(hpage); if (err) return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e772a06..9dda309 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2877,6 +2877,28 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) return memcg; } +/** + * page_cgroup_ino - return inode number of page's memcg + * @page: the page + * + * Look up the memory cgroup @page is charged to and return its inode number. + * It is safe to call this function without taking a reference to the page. 
+ */ +unsigned long page_cgroup_ino(struct page *page) +{ + struct mem_cgroup *memcg; + struct page_cgroup *pc; + unsigned long ino = 0; + + pc = lookup_page_cgroup(page); + lock_page_cgroup(pc); + memcg = pc-mem_cgroup; + if (PageCgroupUsed(pc) memcg) + ino = memcg-css.cgroup-dentry-d_inode-i_ino; + unlock_page_cgroup(pc); + return ino; +} + static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, struct page *page, unsigned int nr_pages, diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 06f8d308..b3b1a2d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -133,26 +133,10 @@ u64 hwpoison_filter_memcg; EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); static int hwpoison_filter_task(struct page *p) { - struct mem_cgroup *mem; - struct cgroup_subsys_state *css; - unsigned long ino; - if (!hwpoison_filter_memcg) return 0; - mem = try_get_mem_cgroup_from_page(p); - if (!mem) - return -EINVAL; - - css = mem_cgroup_css(mem); - /* root_mem_cgroup has NULL dentries */ -
[Devel] [PATCH RHEL7 COMMIT] ms/mm/proc: add kpageidle file
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 35dcabf891ce1931294c5bf3d98e1203ff656432 Author: Vladimir Davydov vdavy...@parallels.com Date: Tue May 19 08:23:57 2015 +0400 ms/mm/proc: add kpageidle file Patchset description: idle memory tracking This patch set backports https://lkml.org/lkml/2015/5/12/449 which is required by vcmmd. It is not yet clear if the original patch set will be accepted upstream as is, there still may be changes. However, I hope the user API will be preserved. If it is not, we will have to fix this in our kernel too. https://jira.sw.ru/browse/PSBM-32460 Vladimir Davydov (3): memcg: add page_cgroup_ino helper proc: add kpagecgroup file proc: add kpageidle file === This patch description: Knowing the portion of memory that is not used by a certain application or memory cgroup (idle memory) can be useful for partitioning the system efficiently, e.g. by setting memory cgroup limits appropriately. Currently, the only means to estimate the amount of idle memory provided by the kernel is /proc/PID/{clear_refs,smaps}: the user can clear the access bit for all pages mapped to a particular process by writing 1 to clear_refs, wait for some time, and then count smaps:Referenced. However, this method has two serious shortcomings: - it does not count unmapped file pages - it affects the reclaimer logic To overcome these drawbacks, this patch introduces two new page flags, Idle and Young, and a new proc file, /proc/kpageidle. A page's Idle flag can only be set from userspace by setting bit in /proc/kpageidle at the offset corresponding to the page, and it is cleared whenever the page is accessed either through page tables (it is cleared in page_referenced() in this case) or using the read(2) system call (mark_page_accessed()). Thus by setting the Idle flag for pages of a particular workload, which can be found e.g. 
by reading /proc/PID/pagemap, waiting for some time to let the workload access its working set, and then reading the kpageidle file, one can estimate the amount of pages that are not used by the workload. The Young page flag is used to avoid interference with the memory reclaimer. A page's Young flag is set whenever the Access bit of a page table entry pointing to the page is cleared by writing to kpageidle. If page_referenced() is called on a Young page, it will add 1 to its return value, therefore concealing the fact that the Access bit was cleared. Note, since there is no room for extra page flags on 32 bit, this feature uses extended page flags when compiled on 32 bit. (on RH7 page ext is not available so make it depend on 64 bit) Signed-off-by: Vladimir Davydov vdavy...@parallels.com --- Documentation/vm/pagemap.txt | 12 +++- fs/proc/page.c | 168 +++ fs/proc/task_mmu.c | 3 +- include/linux/mm.h | 50 + include/linux/page-flags.h | 9 +++ mm/Kconfig | 12 mm/page_alloc.c | 4 ++ mm/rmap.c| 9 +++ mm/swap.c| 2 + 9 files changed, 267 insertions(+), 2 deletions(-) diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index e37cff9..a4fe9b2 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt @@ -5,7 +5,7 @@ pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow userspace programs to examine the page tables and related information by reading files in /proc. -There are four components to pagemap: +There are five components to pagemap: * /proc/pid/pagemap. This file lets a userspace process find out which physical frame each virtual page is mapped to. It contains one 64-bit @@ -67,6 +67,16 @@ There are four components to pagemap: memory cgroup each page is charged to, indexed by PFN. Only available when CONFIG_MEMCG is set. + * /proc/kpageidle. This file implements a bitmap where each bit corresponds + to a page, indexed by PFN. When the bit is set, the corresponding page is + idle. 
A page is considered idle if it has not been accessed since it was + marked idle. To mark a page idle one should set the bit corresponding to the + page by writing to the file. A value written to the file is OR-ed with the + current bitmap value. Only user memory pages can be marked idle, for other + page types input is silently ignored. Writing to this file beyond max PFN + results in the ENXIO error. Only available when CONFIG_IDLE_PAGE_TRACKING is + set. + Short descriptions to the page flags: 0.
[Devel] [PATCH RHEL7 COMMIT] ploop: prevent disclosure 4 bytes of the stack kernel
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit c25ed54c1a19bc8c11fcc472c3e4869c210eca97 Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:26:57 2015 +0400 ploop: prevent disclosure 4 bytes of the stack kernel Memory leak (4 bytes) in the ploop_getdevice_ioc function. 217401 +static int ploop_getdevice_ioc(unsigned long arg) 217402 +{ 217403 + int err; 217404 + int index = 0; 217405 + struct rb_node *n; 217406 + struct ploop_getdevice_ctl ctl; 217407 + 217408 + mutex_lock(ploop_devices_mutex); 217409 + for (n = rb_first(ploop_devices_tree); n; n = rb_next(n), index++) { 217410 + struct ploop_device *plo; 217411 + plo = rb_entry(n, struct ploop_device, link); 217412 + if (plo-index != index || list_empty(plo-map.delta_list)) 217413 + break; 217414 + } 217415 + mutex_unlock(ploop_devices_mutex); 217416 + 217417 + ctl.minor = index PLOOP_PART_SHIFT; 217418 + if (ctl.minor ~MINORMASK) 217419 + return -ERANGE; 217420 + err = copy_to_user((void*)arg, ctl, sizeof(ctl)); 217421 + return err; 217422 +} The ploop_getdevice_ioc() function copy to user the ploop_getdevice_ctl structure but it initialize juste the 'minor' attribute. It's possible to disclosure 4 bytes of the stack kernel via the '__mbz1' attribute. 
Below the 'ploop_getdevice_ctl' structure : 3772915 +struct ploop_getdevice_ctl 3772916 +{ 3772917 + __u32 minor; 3772918 + __u32 __mbz1; 3772919 +} __attribute__ ((aligned (8))); Signed-off-by: Andrey Vagin ava...@openvz.org Reported-by: Jonathan Salwan (Sysdream Security Laboratory) jonathan.sal...@gmail.com --- drivers/block/ploop/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index 2f4928d..8556af2 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -4277,7 +4277,7 @@ static int ploop_getdevice_ioc(unsigned long arg) int err; int index = 0; struct rb_node *n; - struct ploop_getdevice_ctl ctl; + struct ploop_getdevice_ctl ctl = {}; mutex_lock(ploop_devices_mutex); for (n = rb_first(ploop_devices_tree); n; n = rb_next(n), index++) { ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: skip writes of zeroes to unallocated blocks by default
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 6051dc5f6e200cef2011e2174d1c3b76280fe75f Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:01 2015 +0400 ploop: skip writes of zeroes to unallocated blocks by default Reading from unallocated blocks returns zeroes = we can safely skip writes of zeroes to unallocated blocks. As a lot of tests do dd if=/dev/zero ..., this optimization is valuable. Feature enabled, test results: [root@p2 ~]# echo 1 /sys/block/ploop37803/ptune/check_zeros [root@p2 ~]# dd if=/dev/zero of=/mnt/sb-io-test bs=1M count=1k oflag=dsync 1024+0 records in 1024+0 records out 1073741824 bytes (1.1 GB) copied, 1.58975 s, 675 MB/s The impact on CPU utilization is negligible. https://jira.sw.ru/browse/PSBM-22506 https://jira.sw.ru/browse/PSBM-22381 Signed-off-by: Konstantin Khorenko khore...@parallels.com Acked-by: Maxim V. Patlasov mpatla...@parallels.com --- include/linux/ploop/ploop.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h index d295cba..434789e 100644 --- a/include/linux/ploop/ploop.h +++ b/include/linux/ploop/ploop.h @@ -323,6 +323,7 @@ struct ploop_tunable .congestion_low_watermark = DEFAULT_PLOOP_MAXRQ/2, \ .pass_flushes = 1, \ .pass_fuas = 1, \ +.check_zeros = 1, \ .max_active_requests = DEFAULT_PLOOP_BATCH_ENTRY_QLEN / 2, } struct ploop_stats ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: fix spurious hole complains
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit e4c1ce43241df81fad73953200d887c6a402d82f Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:07 2015 +0400 ploop: fix spurious hole complains Spurious complains were triggered by fiemap-ahead logic of pio_direct module. Fix it by suppressing complains if fiemap behind EOF failed. Also print more details about a hole. Signed-off-by: Maxim Patlasov mpatla...@parallels.com Acked-by: Andrew Vagin ava...@parallels.com --- drivers/block/ploop/io_direct_map.c | 9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/block/ploop/io_direct_map.c b/drivers/block/ploop/io_direct_map.c index b9a0ce9..c1d889b 100644 --- a/drivers/block/ploop/io_direct_map.c +++ b/drivers/block/ploop/io_direct_map.c @@ -681,7 +681,7 @@ again: old_fs = get_fs(); set_fs(KERNEL_DS); - ret = inode-i_op-fiemap(inode, fieinfo, start 9, 1); + ret = inode-i_op-fiemap(inode, fieinfo, start_off, 1); /* chase for PSBM-26762: em-block_start == 0 */ if (!ret fieinfo.fi_extents_mapped == 1 @@ -709,8 +709,11 @@ again: } if (fieinfo.fi_extents_mapped != 1) { - ploop_msg_once(io-plo, a hole in image file detected (%d), - fieinfo.fi_extents_mapped); + if (start_off i_size_read(inode)) + ploop_msg_once(io-plo, a hole in image file detected + (mapped=%d i_size=%llu off=%llu), + fieinfo.fi_extents_mapped, + i_size_read(inode), start_off); extent_put(em); return ERR_PTR(-EINVAL); } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: notify blktrace about bio completions
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 3ebe6f8f4178ebf89b3aff5b064657e1e9615dce Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:11 2015 +0400 ploop: notify blktrace about bio completions Signed-off-by: Andrey Smetanin asmeta...@virtuozzo.com --- drivers/block/ploop/dev.c | 14 -- drivers/block/ploop/freeblks.c | 4 +++- include/linux/ploop/compat.h | 6 +- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index 225c2ab..e2ff0aa 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -13,6 +13,8 @@ #include linux/ve.h #include asm/uaccess.h +#include trace/events/block.h + #include linux/ploop/ploop.h #include ploop_events.h #include freeblks.h @@ -518,7 +520,7 @@ ploop_bio_queue(struct ploop_device * plo, struct bio * bio, bio-bi_bdev = plo-bdev; clear_bit(BIO_BDEV_REUSED, bio-bi_flags); } - BIO_ENDIO(bio, err); + BIO_ENDIO(plo-queue, bio, err); list_add(preq-list, plo-free_list); plo-bio_qlen--; plo-bio_discard_qlen--; @@ -591,7 +593,7 @@ DEFINE_BIO_CB(ploop_fast_end_io) plo = orig-bi_bdev-bd_disk-private_data; - BIO_ENDIO(orig, err); + BIO_ENDIO(plo-queue, orig, err); /* End of fast bio wakes up main process only when this could * mean exit from ATTENTION state. @@ -800,13 +802,13 @@ static void ploop_make_request(struct request_queue *q, struct bio *bio) * marked as FLUSH, otherwise just warn and complete. 
*/ if (!(bio-bi_rw REQ_FLUSH)) { WARN_ON(1); - BIO_ENDIO(bio, 0); + BIO_ENDIO(q, bio, 0); return; } /* useless to pass this bio further */ if (!plo-tune.pass_flushes) { ploop_acc_ff_in(plo, bio-bi_rw); - BIO_ENDIO(bio, 0); + BIO_ENDIO(q, bio, 0); return; } } @@ -862,7 +864,7 @@ static void ploop_make_request(struct request_queue *q, struct bio *bio) plo-bio_total--; spin_unlock_irq(plo-lock); - BIO_ENDIO(bio, -EIO); + BIO_ENDIO(q, bio, -EIO); if (nbio) bio_put(nbio); return; @@ -1208,7 +1210,7 @@ static void ploop_complete_request(struct ploop_request * preq) struct bio * bio = preq-bl.head; preq-bl.head = bio-bi_next; bio-bi_next = NULL; - BIO_ENDIO(bio, preq-error); + BIO_ENDIO(plo-queue, bio, preq-error); nr_completed++; } preq-bl.tail = NULL; diff --git a/drivers/block/ploop/freeblks.c b/drivers/block/ploop/freeblks.c index 569cb94..cf48d3a 100644 --- a/drivers/block/ploop/freeblks.c +++ b/drivers/block/ploop/freeblks.c @@ -8,6 +8,8 @@ #include linux/buffer_head.h #include linux/kthread.h +#include trace/events/block.h + #include linux/ploop/ploop.h #include freeblks.h @@ -700,7 +702,7 @@ static void fbd_complete_bio(struct ploop_freeblks_desc *fbd, int err) struct bio * bio = fbd-fbd_dbl.head; fbd-fbd_dbl.head = bio-bi_next; bio-bi_next = NULL; - BIO_ENDIO(bio, err); + BIO_ENDIO(fbd-plo-queue, bio, err); nr_completed++; } fbd-fbd_dbl.tail = NULL; diff --git a/include/linux/ploop/compat.h b/include/linux/ploop/compat.h index ace8ec1..03c3ae3 100644 --- a/include/linux/ploop/compat.h +++ b/include/linux/ploop/compat.h @@ -44,7 +44,11 @@ static void func(struct bio *bio, int err) { #define END_BIO_CB(func) } -#define BIO_ENDIO(_bio, _err) bio_endio(_bio, _err) +#define BIO_ENDIO(_queue, _bio, _err) \ + do {\ + trace_block_bio_complete((_queue), (_bio), (_err)); \ + bio_endio((_bio), (_err)); \ + } while (0); #define F_DENTRY(file) (file)-f_path.dentry #define F_MNT(file)(file)-f_path.mnt ___ Devel mailing list Devel@openvz.org 
https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: add a separate queue for discard bio-s (v2)
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit fa6f3b8595f13c13eebd452bc0947754ac249c2c Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:10 2015 +0400 ploop: add a separate queue for discard bio-s (v2) When I added support for discard requests, process_bio_queue was called from ploop_thread, so I used ploop_quiesce & ploop_relax for synchronization. Now it is called from ploop_make_request too, so my synchronization doesn't work any more. The race was added by diff-ploop-converting-bio-into-ploop-request-in-function-ploop_make_request. This patch adds a separate queue for discard requests, which is handled only from ploop_thread(). In addition, we get the ability to postpone discard bio-s while we are handling others. So we will not fail if a bio is received while another one is processed. In the future this will allow us to handle more than one bio concurrently. v2: fix comments from Maxim Also, ploop_preq_drop() and ploop_complete_request() must wake up ploop-thread if !bio_list_empty(&plo->bio_discard_list) as well. https://jira.sw.ru/browse/PSBM-27676 Note that this is a plain (no logic changes) port to RHEL7 of Andrew Vagin's original patch (RHEL6). 
Signed-off-by: Andrew Vagin ava...@openvz.org --- drivers/block/ploop/dev.c | 54 ++ drivers/block/ploop/freeblks.c | 5 drivers/block/ploop/freeblks.h | 1 + drivers/block/ploop/sysfs.c| 6 + include/linux/ploop/ploop.h| 2 ++ 5 files changed, 58 insertions(+), 10 deletions(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index ab99724..225c2ab 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -117,8 +117,9 @@ static void mitigation_timeout(unsigned long data) spin_lock_irq(plo-lock); if (test_bit(PLOOP_S_WAIT_PROCESS, plo-state) (!list_empty(plo-entry_queue) || -(plo-bio_head !list_empty(plo-free_list))) - waitqueue_active(plo-waitq)) +((plo-bio_head !bio_list_empty(plo-bio_discard_list)) + !list_empty(plo-free_list))) + waitqueue_active(plo-waitq)) wake_up_interruptible(plo-waitq); spin_unlock_irq(plo-lock); } @@ -237,7 +238,8 @@ void ploop_preq_drop(struct ploop_device * plo, struct list_head *drop_list, if (waitqueue_active(plo-req_waitq)) wake_up(plo-req_waitq); else if (test_bit(PLOOP_S_WAIT_PROCESS, plo-state) - waitqueue_active(plo-waitq) plo-bio_head) + waitqueue_active(plo-waitq) + (plo-bio_head || !bio_list_empty(plo-bio_discard_list))) wake_up_interruptible(plo-waitq); ploop_uncongest(plo); @@ -519,6 +521,7 @@ ploop_bio_queue(struct ploop_device * plo, struct bio * bio, BIO_ENDIO(bio, err); list_add(preq-list, plo-free_list); plo-bio_qlen--; + plo-bio_discard_qlen--; plo-bio_total--; return; } @@ -756,6 +759,28 @@ static void ploop_unplug(struct blk_plug_cb *cb, bool from_schedule) kfree(cb); } +static void +process_discard_bio_queue(struct ploop_device *plo, struct list_head *drop_list) +{ + bool discard = test_bit(PLOOP_S_DISCARD, plo-state); + while (!list_empty(plo-free_list)) { + struct bio *tmp; + + /* Only one discard bio can be handled concurrently */ + if (discard ploop_discard_is_inprogress(plo-fbd)) + return; + + tmp = bio_list_pop(plo-bio_discard_list); + if (tmp == NULL) + break; + + /* If 
PLOOP_S_DISCARD isn't set, ploop_bio_queue +* will complete it with a proper error. +*/ + ploop_bio_queue(plo, tmp, drop_list); + } +} + static void ploop_make_request(struct request_queue *q, struct bio *bio) { struct bio * nbio; @@ -843,6 +868,12 @@ static void ploop_make_request(struct request_queue *q, struct bio *bio) return; } + if (bio-bi_rw REQ_DISCARD) { + bio_list_add(plo-bio_discard_list, bio); + plo-bio_discard_qlen++; + goto queued; + } + /* Write tracking in fast path does not work at the moment. */ if (unlikely(test_bit(PLOOP_S_TRACK, plo-state) (bio-bi_rw WRITE))) @@ -864,9 +895,6 @@ static void ploop_make_request(struct request_queue *q, struct bio *bio) if (unlikely(nbio == NULL)) goto queue; - if (bio-bi_rw REQ_DISCARD) -
[Devel] [PATCH RHEL7 COMMIT] ploop: define struct ploop_request->state flags to force pre FLUSH before write IO and FUA/fsync at I/O complete
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit ebf1008ff2e19354244317140b41ae3c2854f74b Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:13 2015 +0400 ploop: define struct ploop_request->state flags to force pre FLUSH before write IO and FUA/fsync at I/O complete Series description: During relocation of ploop clusters (resize/balloon) we need to FUA/fsync the image file after the following operations: a) new data block written, b) BAT update, c) nullification of the old data block for BAT grow. We already do the nullification of the old data block in the format module's complete_grow callback. This patch forces fsync (kaio) or FUA (direct) of reloc write I/O to the image by marking such reloc reqs (A|S) with appropriate flags. The kaio/direct modules are tuned by the patch to force fsync/FUA if these flags are set. This code does FUA/fsync only for cases a) and b), while c) is already implemented. The patch also fixes inconsistent bio list FUA processing in the direct module. The problem is that for a bunch of bios we only set FUA on the last bio. It's possible in case of a power outage that the last bio will be stored and the previous ones will not, because they are stored only in cache at the time of the power failure. To solve the problem, this patch marks the last bio as FLUSH|FUA if there is more than one bio in the list. Moreover, for KAIO, if fsync is possible at the BAT update stage, we do it there, as we did in the direct case, instead of doing 2 fsyncs. For the direct case, if we are going to make FUA at BAT update only (an optimization trick that already exists), then we need to mark the req to FLUSH previously written (without FUA) data. Performance: Overall (includes EXT4 resize up to 16T) resize performance degraded by 5% in time. 
https://jira.sw.ru/browse/PSBM-31222 https://jira.sw.ru/browse/PSBM-31225 https://jira.sw.ru/browse/PSBM-31321 Signed-off-by: Andrey Smetanin asmeta...@parallels.com Andrey Smetanin (7): ploop: define struct ploop_request-state flags to force pre FLUSH before write IO and FUA/fsync at I/O complete ploop: mark reloc reqs to force FUA/fsync(kaio) for index update I/O ploop: mark reloc reqs to force FUA before write of relocated data ploop: direct: to support truly FLUSH/FUA of req we need mark first bio FLUSH, write all bios and mark last bio as FLUSH/FUA ploop: added ploop_req_delay_fua_possible() func that detects possible delaying of upcoming FUA to index update stage. This function will be lately used in direct/kaio code to detect and delay FUA ploop: make image fsync at I/O complete if it's required by FUA/fsync force flag or by req-req_rw ploop: do preflush or postfua according force FUA/flush flags, and delay FUA if possible but add force FLUSH to req if so This patch description: Need such defines to force FUA/FLUSH/fsync in direct/kaio modules. https://jira.sw.ru/browse/PSBM-31222 https://jira.sw.ru/browse/PSBM-31225 https://jira.sw.ru/browse/PSBM-31321 Signed-off-by: Andrey Smetanin asmeta...@parallels.com Reviewed-by: Andrew Vagin ava...@parallels.com --- include/linux/ploop/ploop.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h index d8b83a6..73280e0 100644 --- a/include/linux/ploop/ploop.h +++ b/include/linux/ploop/ploop.h @@ -456,6 +456,9 @@ enum PLOOP_REQ_ZERO, PLOOP_REQ_DISCARD, PLOOP_REQ_RSYNC, + PLOOP_REQ_FORCE_FUA,/*force fua of req write I/O by engine */ + PLOOP_REQ_FORCE_FLUSH, /*force flush by engine */ + PLOOP_REQ_KAIO_FSYNC, /*force image fsync by KAIO module */ }; enum ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: direct: to support truly FLUSH/FUA of req we need mark first bio FLUSH, write all bios and mark last bio as FLUSH/FUA
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 822c64967450485c2a26b9cfbf388d85ad022781 Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:13 2015 +0400 ploop: direct: to support truly FLUSH/FUA of req we need mark first bio FLUSH, write all bios and mark last bio as FLUSH/FUA Series description: During relocation of ploop clusters (resize/balloon) we need to FUA/fsync the image file after the following operations: a) new data block written, b) BAT update, c) nullification of the old data block for BAT grow. We already do the nullification of the old data block in the format module's complete_grow callback. This patch forces fsync (kaio) or FUA (direct) of reloc write I/O to the image by marking such reloc reqs (A|S) with appropriate flags. The kaio/direct modules are tuned by the patch to force fsync/FUA if these flags are set. This code does FUA/fsync only for cases a) and b), while c) is already implemented. The patch also fixes inconsistent bio list FUA processing in the direct module. The problem is that for a bunch of bios we only set FUA on the last bio. It's possible in case of a power outage that the last bio will be stored and the previous ones will not, because they are stored only in cache at the time of the power failure. To solve the problem, this patch marks the last bio as FLUSH|FUA if there is more than one bio in the list. Moreover, for KAIO, if fsync is possible at the BAT update stage, we do it there, as we did in the direct case, instead of doing 2 fsyncs. For the direct case, if we are going to make FUA at BAT update only (an optimization trick that already exists), then we need to mark the req to FLUSH previously written (without FUA) data. Performance: Overall (includes EXT4 resize up to 16T) resize performance degraded by 5% in time. 
https://jira.sw.ru/browse/PSBM-31222 https://jira.sw.ru/browse/PSBM-31225 https://jira.sw.ru/browse/PSBM-31321 Signed-off-by: Andrey Smetanin asmeta...@parallels.com Andrey Smetanin (7): ploop: define struct ploop_request-state flags to force pre FLUSH before write IO and FUA/fsync at I/O complete ploop: mark reloc reqs to force FUA/fsync(kaio) for index update I/O ploop: mark reloc reqs to force FUA before write of relocated data ploop: direct: to support truly FLUSH/FUA of req we need mark first bio FLUSH, write all bios and mark last bio as FLUSH/FUA ploop: added ploop_req_delay_fua_possible() func that detects possible delaying of upcoming FUA to index update stage. This function will be lately used in direct/kaio code to detect and delay FUA ploop: make image fsync at I/O complete if it's required by FUA/fsync force flag or by req-req_rw ploop: do preflush or postfua according force FUA/flush flags, and delay FUA if possible but add force FLUSH to req if so This patch description: Patch fixes inconsistent bio list FUA processing in direct module. The problem is that for bunch of bios we only set FUA at last bio. Its possible in case of power outage that last bio will be stored and previos are not because they are stored only in cache at the time of power failure. To solve problem this patch marking last bio as FLUSH|FUA if more than one bio in list. 
https://jira.sw.ru/browse/PSBM-31222 https://jira.sw.ru/browse/PSBM-31225 https://jira.sw.ru/browse/PSBM-31321 Signed-off-by: Andrey Smetanin asmeta...@parallels.com Reviewed-by: Andrew Vagin ava...@parallels.com --- drivers/block/ploop/io_direct.c | 19 --- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c index 5e2e078..2e81d81 100644 --- a/drivers/block/ploop/io_direct.c +++ b/drivers/block/ploop/io_direct.c @@ -85,6 +85,7 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq, int preflush; int postfua = 0; int write = !!(rw REQ_WRITE); + int bio_num; trace_submit(preq); @@ -215,6 +216,7 @@ flush_bio: } extent_put(em); + bio_num = 0; while (bl.head) { struct bio * b = bl.head; unsigned long rw2 = rw; @@ -230,10 +232,11 @@ flush_bio: preflush = 0; } if (unlikely(postfua !bl.head)) - rw2 |= REQ_FUA; + rw2 |= (REQ_FUA | ((bio_num) ? REQ_FLUSH : 0)); ploop_acc_ff_out(preq-plo, rw2 | b-bi_rw); submit_bio(rw2 ~(bl.head ? REQ_SYNC : 0), b); + bio_num++; } ploop_complete_io_request(preq); @@ -1341,9 +1344,12 @@ dio_io_page(struct ploop_io * io, unsigned long rw, int err; int off; int postfua; + int
[Devel] [PATCH RHEL7 COMMIT] ploop: added printk of function, line, backtrace before ploop_set_error
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit a24f999555ce0caf879864e9b623908830e54232 Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:17 2015 +0400 ploop: added printk of function, line, backtrace before ploop_set_error There are several bugs where we only see messages like: ploop_set_error=-7 on ploop28808. Sometimes such infomation insufficient to find fastly why and where error in ploop happens. This patch added extra printk of function, line and stack backtrace in case of ploop request failure. Signed-off-by: Andrey Smetanin asmeta...@virtuozzo.com --- drivers/block/ploop/dev.c| 32 +++- drivers/block/ploop/fmt_ploop1.c | 2 +- drivers/block/ploop/io_direct.c | 20 ++-- drivers/block/ploop/io_kaio.c| 16 drivers/block/ploop/map.c| 12 ++-- include/linux/ploop/ploop.h | 36 +++- 6 files changed, 79 insertions(+), 39 deletions(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index bd5fe37..f780618 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -1331,7 +1331,7 @@ void ploop_fail_request(struct ploop_request * preq, int err) { struct ploop_device * plo = preq-plo; - ploop_set_error(preq, err); + ploop_req_set_error(preq, err); spin_lock_irq(plo-lock); if (err == -ENOSPC) { @@ -1351,13 +1351,19 @@ void ploop_fail_immediate(struct ploop_request * preq, int err) { struct ploop_device * plo = preq-plo; - ploop_set_error(preq, err); + ploop_req_set_error(preq, err); set_bit(PLOOP_S_ABORT, plo-state); preq-eng_state = PLOOP_E_COMPLETE; ploop_complete_request(preq); } +#define PLOOP_REQ_FAIL_IMMEDIATE(preq, err)\ + do {\ + PLOOP_REQ_TRACE_ERROR(preq, err); \ + ploop_fail_immediate(preq, err);\ + } while (0); + void ploop_complete_io_state(struct ploop_request * preq) { struct ploop_device * plo = preq-plo; @@ -1577,7 +1583,7 @@ ploop_reloc_sched_read(struct ploop_request *preq, iblock_t iblk) if 
(!preq-aux_bio || fill_bio(plo, preq-aux_bio, preq-req_cluster)) { - ploop_fail_immediate(preq, -ENOMEM); + PLOOP_REQ_FAIL_IMMEDIATE(preq, -ENOMEM); return; } } @@ -2064,7 +2070,7 @@ restart: if (!preq-aux_bio || fill_bio(plo, preq-aux_bio, preq-req_cluster)) { - ploop_fail_immediate(preq, -ENOMEM); + PLOOP_REQ_FAIL_IMMEDIATE(preq, -ENOMEM); return; } @@ -2170,7 +2176,7 @@ delta_io: if (!preq-aux_bio || fill_bio(plo, preq-aux_bio, preq-req_cluster)) { - ploop_fail_immediate(preq, -ENOMEM); + PLOOP_REQ_FAIL_IMMEDIATE(preq, -ENOMEM); return; } spin_lock_irq(plo-lock); @@ -2225,7 +2231,7 @@ delta_io: return; error: - ploop_fail_immediate(preq, err); + PLOOP_REQ_FAIL_IMMEDIATE(preq, err); } static void ploop_req_state_process(struct ploop_request * preq) @@ -2271,7 +2277,7 @@ restart: if (preq-error || ((preq-req_rw REQ_WRITE) test_bit(PLOOP_S_ABORT, plo-state))) { - ploop_fail_immediate(preq, preq-error ? : -EIO); + PLOOP_REQ_FAIL_IMMEDIATE(preq, preq-error ? : -EIO); break; } @@ -2346,7 +2352,7 @@ restart: */ if (preq-error || test_bit(PLOOP_S_ABORT, plo-state)) { - ploop_fail_immediate(preq, preq-error ? : -EIO); + PLOOP_REQ_FAIL_IMMEDIATE(preq, preq-error ? : -EIO); break; } @@ -2386,7 +2392,7 @@ restart: if (!preq-aux_bio || fill_bio(plo, preq-aux_bio, preq-req_cluster)) { - ploop_fail_immediate(preq, -ENOMEM); + PLOOP_REQ_FAIL_IMMEDIATE(preq, -ENOMEM); break; } @@ -2425,7 +2431,7 @@ restart: if (preq-error || test_bit(PLOOP_S_ABORT, plo-state)) { - ploop_fail_immediate(preq, preq-error ? : -EIO); +
[Devel] [PATCH RHEL7 COMMIT] ploop: fix accounting ploop_io_images_size
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 09fe781813a8c4e74ad6f9621cf26acb705424f4 Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:26:54 2015 +0400 ploop: fix accounting ploop_io_images_size ploop_io_images_size is a global counter supposed to be the total sum of io->size for all io structs. However, the actual size of an image can be changed by userspace, e.g. when userspace grows the lower delta for a merge operation. This means that by the time ploop_dio_close() is called, the actual size may differ slightly from the size we initially accounted. The patch fixes the problem by accurate accounting of image sizes: we can subtract from ploop_io_images_size only as many bytes as we actually added earlier. Another fix is for growing the lower delta in userspace: we try to catch up with changes made by userspace when kernel merge starts and ploop_dio_upgrade is being called. https://jira.sw.ru/browse/PSBM-19906 Signed-off-by: Maxim V. 
Patlasov mpatla...@parallels.com --- drivers/block/ploop/io_direct.c | 12 +--- drivers/block/ploop/io_direct_map.c | 20 +++- drivers/block/ploop/io_direct_map.h | 6 +++--- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c index 8d716ca..cbb7edc 100644 --- a/drivers/block/ploop/io_direct.c +++ b/drivers/block/ploop/io_direct.c @@ -852,7 +852,7 @@ static void dio_destroy(struct ploop_io * io) if (io-files.em_tree) { io-files.em_tree = NULL; mutex_lock(io-files.inode-i_mutex); - ploop_dio_close(io-files.mapping, delta-flags PLOOP_FMT_RDONLY); + ploop_dio_close(io, delta-flags PLOOP_FMT_RDONLY); (void)dio_invalidate_cache(io-files.mapping, io-files.bdev); mutex_unlock(io-files.inode-i_mutex); } @@ -910,7 +910,7 @@ static int dio_open(struct ploop_io * io) dio_fsync(file); mutex_lock(io-files.inode-i_mutex); - em_tree = ploop_dio_open(io-files.file, (delta-flags PLOOP_FMT_RDONLY)); + em_tree = ploop_dio_open(io, (delta-flags PLOOP_FMT_RDONLY)); err = PTR_ERR(em_tree); if (IS_ERR(em_tree)) goto out; @@ -920,7 +920,7 @@ static int dio_open(struct ploop_io * io) err = dio_invalidate_cache(io-files.mapping, io-files.bdev); if (err) { io-files.em_tree = NULL; - ploop_dio_close(io-files.mapping, 0); + ploop_dio_close(io, 0); goto out; } @@ -930,7 +930,7 @@ static int dio_open(struct ploop_io * io) delta-plo-index); if (io-fsync_thread == NULL) { io-files.em_tree = NULL; - ploop_dio_close(io-files.mapping, 0); + ploop_dio_close(io, 0); goto out; } wake_up_process(io-fsync_thread); @@ -938,8 +938,6 @@ static int dio_open(struct ploop_io * io) out: mutex_unlock(io-files.inode-i_mutex); - if (!err) - io-size = i_size_read(io-files.inode); return err; } @@ -1644,7 +1642,7 @@ static int dio_prepare_merge(struct ploop_io * io, struct ploop_snapdata *sd) return err; } - err = ploop_dio_upgrade(io-files.mapping); + err = ploop_dio_upgrade(io); if (err) { mutex_unlock(io-files.inode-i_mutex); fput(file); 
diff --git a/drivers/block/ploop/io_direct_map.c b/drivers/block/ploop/io_direct_map.c index 62984bf..2ddf93a 100644 --- a/drivers/block/ploop/io_direct_map.c +++ b/drivers/block/ploop/io_direct_map.c @@ -52,10 +52,11 @@ extern atomic_long_t ploop_io_images_size; */ struct extent_map_tree * -ploop_dio_open(struct file * file, int rdonly) +ploop_dio_open(struct ploop_io * io, int rdonly) { int err; struct ploop_mapping *m, *pm; + struct file * file = io-files.file; struct address_space * mapping = file-f_mapping; pm = kzalloc(sizeof(struct ploop_mapping), GFP_KERNEL); @@ -100,7 +101,8 @@ out_unlock: pm-readers = rdonly ? 1 : -1; list_add(pm-list, ploop_mappings); mapping-host-i_flags |= S_SWAPFILE; - atomic_long_add(i_size_read(mapping-host), ploop_io_images_size); + io-size = i_size_read(mapping-host); + atomic_long_add(io-size, ploop_io_images_size); pm-saved_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, @@ -125,8 +127,9 @@ out_unlock: } int -ploop_dio_close(struct address_space * mapping, int rdonly) +ploop_dio_close(struct ploop_io *
[Devel] [PATCH RHEL7 COMMIT] ploop: fix dio_fsync wait
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 732676a0eafac636b18f060f06b79312e0ad829d Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:26:55 2015 +0400 ploop: fix dio_fsync wait we MUST wait for writeback to finish before file-f_op-fsync. https://jira.sw.ru/browse/PSBM-18049 Signed-off-by: Dmitry Monakhov dmonak...@openvz.org Acked-by: Maxim V. Patlasov mpatla...@parallels.com --- drivers/block/ploop/io_direct.c | 5 + 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c index cbb7edc..a9910ba 100644 --- a/drivers/block/ploop/io_direct.c +++ b/drivers/block/ploop/io_direct.c @@ -774,16 +774,13 @@ static int dio_fsync(struct file * file) int err, ret; struct address_space *mapping = file-f_mapping; - ret = filemap_fdatawrite(mapping); + ret = filemap_write_and_wait(mapping); err = 0; if (file-f_op file-f_op-fsync) { err = file-f_op-FOP_FSYNC(file, 0); if (!ret) ret = err; } - err = filemap_fdatawait(mapping); - if (!ret) - ret = err; return ret; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: rework accounting images_size
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 102acd98b5d6ddf9a310a0c0a54e21d6903d23ab Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:26:58 2015 +0400 ploop: rework accounting images_size The way how we kept ploop_io_images_size up-to-date (should be always equal to total number of bytes of all loaded image files) was very prone to errors: first delta loaded kept actual io-size, then backup delta was initialized with io-size=0; since then, if first delta was unloaded before the second, unloading the second delta led to subtracting io-size=0 from ploop_io_images_size. This is obviously incorrect. The patch makes the accounting much more straightforward: the size of image is actually the property of mapping, not a delta (because several deltas may point to the same mapping). So, let's keep actual 'size' in ploop_mapping structure and let every delta point to it. No extra locking is needed because the image is either opened by several device read-only, or one and only one device read-write. https://jira.sw.ru/browse/PSBM-20432 Signed-off-by: Maxim Patlasov mpatla...@parallels.com --- drivers/block/ploop/io_direct.c | 10 +- drivers/block/ploop/io_direct_map.c | 16 ++-- include/linux/ploop/ploop.h | 2 +- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c index 17dbf6c..ab74849 100644 --- a/drivers/block/ploop/io_direct.c +++ b/drivers/block/ploop/io_direct.c @@ -31,7 +31,7 @@ int max_extent_map_pages __read_mostly; int min_extent_map_entries __read_mostly; -/* total sum of io-size for all io structs */ +/* total sum of m-size for all ploop_mapping structs */ atomic_long_t ploop_io_images_size = ATOMIC_LONG_INIT(0); /* Direct IO from/to file. 
@@ -436,8 +436,8 @@ try_again: mutex_unlock(io-files.inode-i_mutex); new_size = i_size_read(io-files.inode); - atomic_long_add(new_size - io-size, ploop_io_images_size); - io-size = new_size; + atomic_long_add(new_size - *io-size_ptr, ploop_io_images_size); + *io-size_ptr = new_size; if (!err) err = filemap_fdatawrite(io-files.mapping); @@ -1684,8 +1684,8 @@ static int dio_truncate(struct ploop_io * io, struct file * file, mutex_unlock(io-files.inode-i_mutex); new_size = i_size_read(io-files.inode); - atomic_long_sub(io-size - new_size, ploop_io_images_size); - io-size = new_size; + atomic_long_sub(*io-size_ptr - new_size, ploop_io_images_size); + *io-size_ptr = new_size; if (!err) err = dio_fsync(file); diff --git a/drivers/block/ploop/io_direct_map.c b/drivers/block/ploop/io_direct_map.c index 6b0886c..b3cb04d 100644 --- a/drivers/block/ploop/io_direct_map.c +++ b/drivers/block/ploop/io_direct_map.c @@ -31,6 +31,7 @@ struct ploop_mapping struct address_space* mapping; int readers; unsigned long saved_gfp_mask; + loff_t size; struct extent_map_tree extent_root; }; @@ -81,6 +82,8 @@ out_unlock: spin_unlock(ploop_mappings_lock); if (pm) kfree(pm); + if (!err) + io-size_ptr = m-size; return err ? ERR_PTR(err) : m-extent_root; } } @@ -101,8 +104,9 @@ out_unlock: pm-readers = rdonly ? 
1 : -1; list_add(pm-list, ploop_mappings); mapping-host-i_flags |= S_SWAPFILE; - io-size = i_size_read(mapping-host); - atomic_long_add(io-size, ploop_io_images_size); + io-size_ptr = pm-size; + *io-size_ptr = i_size_read(mapping-host); + atomic_long_add(*io-size_ptr, ploop_io_images_size); pm-saved_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, @@ -143,9 +147,9 @@ ploop_dio_close(struct ploop_io * io, int rdonly) } if (m-readers == 0) { - atomic_long_sub(io-size, + atomic_long_sub(*io-size_ptr, ploop_io_images_size); - io-size = 0; + *io-size_ptr = 0; mapping-host-i_flags = ~S_SWAPFILE; list_del(m-list); pm = m; @@ -191,9 +195,9 @@ int ploop_dio_upgrade(struct ploop_io * io) err = -EBUSY; if (m-readers == 1) { loff_t new_size = i_size_read(io-files.inode); -
[Devel] [PATCH RHEL7 COMMIT] ploop: fix race in ploop_tracker_init()
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit df915f10c4c348fb40ab7fded3ae860b715d7103 Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:03 2015 +0400 ploop: fix race in ploop_tracker_init() ploop_tracker_init() may acquire the current alloc_head only after quiescing ploop. Otherwise a race is possible: 1) we acquire an alloc_head: e.end = (u64)ploop_top_delta(plo)->io.alloc_head << (plo->cluster_log + 9); 2) then the alloc_head is advanced due to submit_alloc writes 3) we turn write tracker ON: set_bit(PLOOP_S_TRACK, &plo->state). The result is disastrous: the 1st iteration of userspace vzmigrate won't copy blocks allocated in step 2) because we reported the old e.end; and then vzmigrate also won't copy the blocks because they were allocated when write tracker was off. https://jira.sw.ru/browse/PSBM-22993 Signed-off-by: Maxim Patlasov mpatla...@parallels.com --- drivers/block/ploop/tracker.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/block/ploop/tracker.c b/drivers/block/ploop/tracker.c index 5dbb7c9..3210006 100644 --- a/drivers/block/ploop/tracker.c +++ b/drivers/block/ploop/tracker.c @@ -101,12 +101,15 @@ int ploop_tracker_init(struct ploop_device * plo, unsigned long arg) if (list_empty(plo-map.delta_list)) return -ENOENT; + ploop_quiesce(plo); + e.start = 0; e.end = (u64)ploop_top_delta(plo)-io.alloc_head (plo-cluster_log + 9); - if (copy_to_user((void*)arg, e, sizeof(struct ploop_track_extent))) + if (copy_to_user((void*)arg, e, sizeof(struct ploop_track_extent))) { + ploop_relax(plo); return -EFAULT; + } - ploop_quiesce(plo); set_bit(PLOOP_S_TRACK, plo-state); plo-maintenance_type = PLOOP_MNTN_TRACK; plo-track_end = 0; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: add ioctl to limit size of top delta (v2)
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit fd12acccf76e1a56f073322414cd43cbffa598ba Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:04 2015 +0400 ploop: add ioctl to limit size of top delta (v2) A customer created an online backup => backup.tib file. Most probably the .tib file is inconsistent and in order to correctly access data inside we need to replay the journal. => We have to provide to the container a bundle of the .tib file (read-only) and a tiny read-writable ploop delta - journal replayed data will be stored there. A cunning customer can notice that the delta is writable and fill it with his own data - unlimited. => We need an ability to limit the ploop delta max size. https://jira.sw.ru/browse/PSBM-22002 v2: move declaration of PLOOP_IOC_MAX_DELTA_SIZE Signed-off-by: Andrew Vagin ava...@openvz.org Acked-by: Maxim V. Patlasov mpatla...@parallels.com --- drivers/block/ploop/dev.c| 20 drivers/block/ploop/fmt_ploop1.c | 5 + include/linux/ploop/ploop.h | 2 ++ include/linux/ploop/ploop_if.h | 3 +++ 4 files changed, 30 insertions(+) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index 0124349..33f8442 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -2770,6 +2770,7 @@ init_delta(struct ploop_device * plo, struct ploop_ctl * ctl, int level) delta-plo = plo; delta-ops = ops; delta-flags = ctl-pctl_flags PLOOP_FMT_FLAGS; + delta-max_delta_size = ULLONG_MAX; KOBJECT_INIT(delta-kobj, ploop_delta_ktype); return delta; @@ -2780,6 +2781,22 @@ out_err: } +static int ploop_set_max_delta_size(struct ploop_device *plo, unsigned long arg) +{ + struct ploop_delta * top_delta = ploop_top_delta(plo); + u64 max_delta_size; + + if (copy_from_user(max_delta_size, (void*)arg, sizeof(u64))) + return -EFAULT; + + if (top_delta == NULL) + return -EINVAL; + + top_delta-max_delta_size = max_delta_size; + + return 0; 
+} + static int ploop_add_delta(struct ploop_device * plo, unsigned long arg) { int err; @@ -4419,6 +4436,9 @@ static int ploop_ioctl(struct block_device *bdev, fmode_t fmode, unsigned int cm case PLOOP_IOC_DISCARD_WAIT: err = ploop_discard_wait_ioc(plo); break; + case PLOOP_IOC_MAX_DELTA_SIZE: + err = ploop_set_max_delta_size(plo, arg); + break; default: err = -EINVAL; } diff --git a/drivers/block/ploop/fmt_ploop1.c b/drivers/block/ploop/fmt_ploop1.c index 5ce6915..585f6ce 100644 --- a/drivers/block/ploop/fmt_ploop1.c +++ b/drivers/block/ploop/fmt_ploop1.c @@ -222,6 +222,11 @@ static void ploop1_allocate(struct ploop_delta * delta, struct ploop_request * preq, struct bio_list * sbl, unsigned int size) { + if (delta-io.alloc_head = + (delta-max_delta_size delta-cluster_log)) { + ploop_fail_request(preq, -E2BIG); + return; + } delta-io.ops-submit_alloc(delta-io, preq, sbl, size); } diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h index 434789e..ae3dbfc 100644 --- a/include/linux/ploop/ploop.h +++ b/include/linux/ploop/ploop.h @@ -286,6 +286,8 @@ struct ploop_delta struct ploop_delta_ops *ops; struct kobject kobj; + + u64 max_delta_size; /* in sectors */ }; struct ploop_tunable diff --git a/include/linux/ploop/ploop_if.h b/include/linux/ploop/ploop_if.h index 45b74fc..aacddb3 100644 --- a/include/linux/ploop/ploop_if.h +++ b/include/linux/ploop/ploop_if.h @@ -299,6 +299,9 @@ struct ploop_track_extent /* Filter extents with sizes less than arg */ #define PLOOP_IOC_FBFILTER _IOR(PLOOPCTLTYPE, 27, unsigned long) +/* Set maximum size for the top delta . */ +#define PLOOP_IOC_MAX_DELTA_SIZE _IOW(PLOOPCTLTYPE, 28, __u64) + /* Events exposed via /sys/block/ploopN/pstate/event */ #define PLOOP_EVENT_ABORTED1 #define PLOOP_EVENT_STOPPED2 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: expose open_count to sysfs
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 97ba7085978ee4245beaf43c5543cb230094189d Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:06 2015 +0400 ploop: expose open_count to sysfs The patch shows number of opened instances of ploop-device (plo-open_count) in /sys/block/ploopN/pstate/open_count. This will allow userspace to decide whether a ploop-device is used by someone (e.g. backup) by scanning the open_count for all ploop-devices. https://jira.sw.ru/browse/PSBM-24754 Signed-off-by: Maxim Patlasov mpatla...@parallels.com --- drivers/block/ploop/sysfs.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/ploop/sysfs.c b/drivers/block/ploop/sysfs.c index 5c31826..3ef53ac 100644 --- a/drivers/block/ploop/sysfs.c +++ b/drivers/block/ploop/sysfs.c @@ -383,6 +383,11 @@ static u32 show_event(struct ploop_device * plo) return ret; } +static u32 show_open_count(struct ploop_device * plo) +{ + return atomic_read(plo-open_count); +} + static ssize_t print_cookie(struct ploop_device * plo, char * page) { return sprintf(page, %s\n, plo-cookie); @@ -466,6 +471,7 @@ static struct attribute *state_attributes[] = { _A(top), _A(event), _A3(cookie), + _A(open_count), NULL }; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: ensure non-empty delta list on running ploop
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit ace664d9412efd2fa814851b0be55751dc9ed13a Author: Andrey Smetanin asmeta...@virtuozzo.com Date: Tue May 19 08:27:05 2015 +0400 ploop: ensure non-empty delta list on running ploop The only allowed use-case for delta removal on running ploop is the merge operation. But merge never removes base delta. Hence, we can prohibit base delta removal if ploop is in RUNNING state. This resolves the following issue: buggy userspace removes all deltas from a running ploop, but leaves ploop in RUNNING state making any further ploop-mount impossible. https://jira.sw.ru/browse/PSBM-25102 Signed-off-by: Maxim Patlasov mpatla...@parallels.com Acked-by: Pavel Emelyanov xe...@parallels.com --- drivers/block/ploop/dev.c | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index 33f8442..d2a9eb4 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -3173,6 +3173,12 @@ static int ploop_del_delta(struct ploop_device * plo, unsigned long arg) if (plo-maintenance_type != PLOOP_MNTN_OFF) return -EBUSY; + if (level == 0 test_bit(PLOOP_S_RUNNING, plo-state)) { + printk(KERN_INFO Can't del base delta on running ploop%d\n, + plo-index); + return -EBUSY; + } + delta = find_delta(plo, level); if (delta == NULL) @@ -3197,6 +3203,8 @@ static int ploop_del_delta(struct ploop_device * plo, unsigned long arg) delta-ops-stop(delta); delta-ops-destroy(delta); kobject_put(delta-kobj); + BUG_ON(test_bit(PLOOP_S_RUNNING, plo-state) + list_empty(plo-map.delta_list)); return 0; } @@ -3573,6 +3581,7 @@ static int ploop_start(struct ploop_device * plo, struct block_device *bdev) wake_up_process(plo-thread); set_bit(PLOOP_S_RUNNING, plo-state); + BUG_ON(list_empty(plo-map.delta_list)); return 0; out_err: @@ -3605,8 +3614,11 @@ static int ploop_stop(struct ploop_device 
* plo, struct block_device *bdev) if (!test_bit(PLOOP_S_RUNNING, plo-state)) return -EINVAL; - if (list_empty(plo-map.delta_list)) + if (list_empty(plo-map.delta_list)) { + printk(KERN_INFO stop ploop%d failed (no deltas)\n, + plo-index); return -ENOENT; + } cnt = atomic_read(plo-open_count); if (cnt 1) { @@ -3755,6 +3767,7 @@ static int ploop_clear(struct ploop_device * plo, struct block_device * bdev) plo-maintenance_type = PLOOP_MNTN_OFF; plo-bd_size = 0; plo-state = (1 PLOOP_S_CHANGED); + BUG_ON(test_bit(PLOOP_S_RUNNING, plo-state)); return 0; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ve: device cgroup -- Implement devcgroup_seq_show_ve
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit bc411f061cc8878edb65db52b8e58ab2fa218186 Author: Cyrill Gorcunov gorcu...@odin.com Date: Tue May 19 00:43:32 2015 +0400 ve: device cgroup -- Implement devcgroup_seq_show_ve In PCS7 cgroups are configured from user space, so there is no longer connection from ve to device cgroup via css as it was in PCS6. Instead we should open device cgroup explicitly. https://jira.sw.ru/browse/PSBM-33555 v2 (by vdavydov@): - use ve::ve_name because we're switching to UUID based containers Signed-off-by: Cyrill Gorcunov gorcu...@odin.com Reviewed-by: Vladimir Davydov vdavy...@parallels.com CC: Konstantin Khorenko khore...@odin.com CC: Andrey Vagin ava...@odin.com --- include/linux/device_cgroup.h | 3 ++- kernel/ve/vecalls.c | 2 +- security/device_cgroup.c | 14 +++--- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h index bc58c4c..32588bb 100644 --- a/include/linux/device_cgroup.h +++ b/include/linux/device_cgroup.h @@ -19,7 +19,8 @@ extern int devcgroup_device_visible(umode_t mode, int major, struct cgroup; int devcgroup_default_perms_ve(struct cgroup *cgroup); int devcgroup_set_perms_ve(struct cgroup *cgroup, unsigned, dev_t, unsigned); -int devcgroup_seq_show_ve(struct cgroup *cgroup, unsigned veid, struct seq_file *m); +struct ve_struct; +int devcgroup_seq_show_ve(struct cgroup *devices_root, struct ve_struct *ve, struct seq_file *m); #else static inline int devcgroup_inode_permission(struct inode *inode, int mask) diff --git a/kernel/ve/vecalls.c b/kernel/ve/vecalls.c index 7c574b3..2613a1e 100644 --- a/kernel/ve/vecalls.c +++ b/kernel/ve/vecalls.c @@ -891,7 +891,7 @@ static int devperms_seq_show(struct seq_file *m, void *v) if (ve_is_super(ve)) seq_printf(m, %10u b 016 *:*\n%10u c 006 *:*\n, 0, 0); else - devcgroup_seq_show_ve(ve-css.cgroup, 
ve-veid, m); + devcgroup_seq_show_ve(devices_root, ve, m); return 0; } diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 31024f7..33a9883 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -17,6 +17,7 @@ #include linux/major.h #include linux/module.h #include linux/capability.h +#include linux/ve.h #define ACC_MKNOD 1 #define ACC_READ 2 @@ -1091,10 +1092,16 @@ int devcgroup_set_perms_ve(struct cgroup *cgroup, } EXPORT_SYMBOL(devcgroup_set_perms_ve); -int devcgroup_seq_show_ve(struct cgroup *cgroup, unsigned veid, struct seq_file *m) +int devcgroup_seq_show_ve(struct cgroup *devices_root, struct ve_struct *ve, struct seq_file *m) { - struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup); struct dev_exception_item *wh; + struct dev_cgroup *devcgroup; + struct cgroup *cgroup; + + cgroup = cgroup_kernel_open(devices_root, 0, ve_name(ve)); + if (IS_ERR(cgroup)) + return PTR_ERR(cgroup); + devcgroup = cgroup_to_devcgroup(cgroup); rcu_read_lock(); list_for_each_entry_rcu(wh, devcgroup-exceptions, list) { @@ -1112,12 +1119,13 @@ int devcgroup_seq_show_ve(struct cgroup *cgroup, unsigned veid, struct seq_file perm |= S_IXOTH; seq_printf(m, %10u %c %03o %s:%s\n, - veid, + ve-veid, type_to_char(wh-type), perm, maj, min); } rcu_read_unlock(); + cgroup_kernel_close(cgroup); return 0; } EXPORT_SYMBOL(devcgroup_seq_show_ve); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7] scripts: Delete generated binary files from kernel tree.
Signed-off-by: Kirill Tkhai ktk...@odin.com --- scripts/basic/fixdep | Bin scripts/kconfig/conf | Bin 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100755 scripts/basic/fixdep delete mode 100755 scripts/kconfig/conf diff --git a/scripts/basic/fixdep b/scripts/basic/fixdep deleted file mode 100755 index 2d8a408aef61f56512dc8ac61eb828492789022b.. GIT binary patch literal 0 HcmV?d1 literal 13875 zcmeHOdvH@%dOwm47-J;yaN-aGmBxY5`@hwHeezd8Lp^ICS;m$hEL#tc@*sq$|ML zOae7#Q5C0jH{G3ZI-6HX?CaWsNfO|e-U*kscw+1YN=vfEIW5DAmUn+I{S0rTv zdyaG^teO3z{g30({m%D0-|M{ZM`PI-x{mM!ep|tRgAcyDJ~8fS7h(ztiWp6WaeUX z*qv-5kO}y?cnMJC+Vm#EoqUED*!FTuf}QsHC7kTNIKhqnWQp8qU0}=Tq0}GtG5}f zqL#XAU%rJF^99D%{(LNQ4vSc6r)_~5lngLa##TNqSBMCaJ6+)s22l!fun*1(+Z z5$5673?*4g8{XGjDL=cEv!E`Xku;y)A3=}e{C$%{2yPVq^0mu|sGpQWeYok+m|g_ z*wMP6BNS9Ua-4n*@9(DoYAmz5wADdC)u^@H*-(29Tjj?znSa7@+cB{qDWPw_jG4 zy_l$}pZ?Nl1fp#_mezzCE4KhqKw61CG2+mD(`x3cJ0RrIXr-5rqyFxSasiYRD zgU`={_vXQ=-sAc0VwJ$gIDLl_vOL=3Aln^Zh8p7c=4sg9r2A+wVh=E3jCgVXdH zFP=yq{JA`MV;;N)_%wD0yPz~9$C#Xtk0VcbG6!cE$1OB$m$AU^pw2Y!!+BNZ(`5 zNF*F#$VXXcr@xCu^+-n`#5x0=EuF-*hGRPCy1H4XzoR4E!ZfW#@9qj{+cn}K0voI| z79w`Yugw{W@#g6$$DA*3uD23WWYCfm9q!I0jTR@cKF;a!0UH==12pa#u|u66o@ zkd6cb5Z%!k4hdLeYkl7O)!HKG;iThNjkg8xs+PMI?f-!EC$`vy%`DN@W-bz(*^ zIF)AZ8G*gXWzlJEMVsZADb1NOg_$or!+dZnNym(=_M6;*)Oz)i=~$$lZ6=C@2` zqQ52qpm=0bOt@HLc~ON4FOb5ZRVJL)ADL=QIQfzWx}ZrGN~qVv=ct372Dz${sP{ zXrOdzHsQr-l(BXbZf_O6JBDH-)+LtLFu%|gxk_6V+TyQ-Gm=B;j+)E_c*oW|BW_ z!p-yf6%#(gB!9$)7mf7aT6}KJYJ;sTrE-)6E@$jFtv9;pO72`to95Q5jCZ;rS%XQ zoA1HZKG%gD@$Dp$x-f`ew3qbW-OyzleAmGN}R1-$#4_@kcm+H}Nz?QinM|i+CCu zse_!KPCN~X)E?k6Hh}S)x~)$@iYWd7A-8Ht^K-sSTX}n0V^)l#BCMh^MYjRdN0z z@zljBh4cSOJaui#!TFyPPhFZ~oPU#edMr}4FL0hLOgY0H_Ca5Kmo~I-4}iKi}0 z4RHR)#8X$LjS}Z;;D;Lhrw^r0%_X2;APf|3=WOA6~!4-upTDp0PrE?_+3JHL;~g zjgP2_osRyGX1bUfFa6s=(WU#vg!dfzVszx;-!BBl6(;s)kNJlk*~g1J;_5fO`kx zgZ2ZXFsa5b#?SOqS0yL#0{ej@1SjJDz*2T-q=w6M)jCG{_bnkeJ8l!O0w89I;O@? 
zl+No_}S#W5D?5q$u}Spe=k{2l6!}!F4Y5%ZyQt-rDJy?v_##7QiYwofzT0=k!yJ z8GgI}`v8+aG9W8KCn5fi5Kol8gDySUHG?bNj-q~wDcOgdnwa`LL{7{DaFT-TAB9-* zk9kpD0zQ*l(h6G*g9rTH?8n46os5h8lqPzXI;W{jxAwOUk_RbCI*pWz`g8MEK z)C^fjcjQ0o*lR_-Nn?ir@A;fs}#firt4#4pSB%5{%@FYo7DwB1wpn)cE_mtlI{U z!#ba6pKj}?qOlA1xqApZPMnlKHCh09HU1Oz)Rl)+%Q^M!(HN9Zkf7T8bN$n8Dmsws zUljU#f$pM#$wYpLo@jdx^%`2|EEJ{hFs`k_!ng#)VPCA5{oBI{qWkY`K+_uy= zpn^K4q6T$0!O-hYW$OA^;c-?_P?Q1V`6-L*qcr@5@HMU$*NxKY{Kezz`ktQ8 zwz;acA!|q^Z2o)O7BxgKhS?9%R1e-s0ZuwDlqca!^dHjgSx_V-*Iv8s^=uNCF zS9`}|#lxjs9uC#S+~lAC4!(QO$40zo3(H}Td9gHk_kP7Hn_28)#rV+AD;D;^M$%G z{*fC0AA8Ro19ck=l1=7%L(k`}~D6}@ETR=sFtvAy?uG;n*)SbViNVwb$}Yb%%A z_eFsD;-9H~ua{qdC$;i3HGU;INDC=q=IpJX5Rx;%##0sQPDeK-**`mO-t0#NbVOU zZ$LTqmmzrl2^?o5h_WDl0zuzJ#XK)3(@pEjjF|#sfnehXDRUU0uQWz7vwR8Ui*^f zz|tS1s6TE%F9V|XfQ@uWd;zFKEwBK1%8Tp=mccTndXd`KRKCF;5WFbqvd-@ulJgL z-w-T#5)kK0foUCBLXxoKPsX}_I=-?dO*hCJb#~Uwfh2S5l?+d8)qf)uzAue^*D zs_gq;fpuT}GB$1TrYPGKTwB7glHl{DELvAgyH}vv4TPZjvwp2P8a%WJIc#; zKH*L@usT{5V{iE4!|IByyE^boS^OClmw07%s`w7gWdCo4qpO!CFAt6cH*hKy9 z{n%!N$yEEvH?I|{MX||yzSQi!Pg3wbr!B+3@FgfJGS{sLj~ffKA-#)Mv*N_K8cLC zZPnJf%VN7G09Qu66*dRQJW=}-g5`K;T-PHU4pOHpGEWCG?`vIg(6J4ECCam*I? 
zHun5ZIEpKGxZlY+yKMFKcwvbQ)D;(@o*WR5B6jA77XB-7{HE|#d?V^{;{twVriuD zYda9t$?Ea*qX7_LS5f^idHyYVi-FV`HgRnAuqB9;chMoZnj)OzGPxgMg%NLWHa z_8m0_+FuxfZy@czhh1pThW3ejq+)*s?9pr$Cx$A-$2+{qf^LHNSI!!zr@PmkE8g z^w?3@^~FC(J_w240k7S2RjvHB+H-xP{rSn520b56svdB^e)_Lw@2x$(v($dFsbCTf zH69NPw?WmEzC0ff!ls;$k1yM5^K6NRT~MIlB}EPZ#H*~vag}=nUua6orF?sl^ zx9=H8auWREE(ap^eY6a!eccXf$oLH!rfTw?#8{c(U4a9H=uHA+(Jx+C+;ta2QeM6 z=#_A9n1KM-c$SjGtKMmC-r+Kx%b@A^v2{g@{(ZniIqn%0)AstXwkAHXF+)Q?G?cy zd+*nCvtqk79I-tE0!28VHH5K2%;DqxZ=%JLhpqul68bu^-Ow;P?OEj7oc6i zAWSQsvEIhV+*{E0?#r#zq^}!ee(}YQLyUvEg-tdSNVSRdG*xw0=ffCe)bu)%!m3M zX~Y-5;)`GNz{c3jU#UHV7PWRbc7*XO=Gt$MflkwuAg$5)y+5!hZMb7~9jZVhxT zSGxRXh7l5d@GbtSi$qwj({Fj+QP9=D{g$G9G6hu`L+dp|pfUZNVK%Td*UrT;XOF zr1FYg!H(7zf236jhFUset$~GxAR0~Di3Xnvgxf0c+*hg0t5cRJ;fS;5{At{s5Rh zg#yvL(2SncEdNN%~Agj_%PI!m57;~u2NPSfg;n=*@{;fmMcr^phR0P(!TQ!$t2 z`Ug~_jD+D=R$xaU6o~lsKh$(Ob=6aobAjXX=z^=s%;Vx1iejGCfQg8co^!Y9+X zb7vsJqTyJiC7{iZD$DDX`O)HH2FFpsZ!KmDKm}BW$)CV!nZ+8+w8n=$n|%$kP|A) z+bfl=tayHOIXXaCtAybUv%Hw0k3n|c{mG3l|W~g-i_uXKU90modqjgptPF*;FGa2 zdKXcM0l5-u(PBTfx78A$}C}66jUXCM-uCC?5hn3OW-QCCgJAnPs=ba%ah;qJx%6 z4#HS=YgI3NI0Vr$-!43c*sj%HdLZH_gzvUT;yNdR(hi0v~Z*lqEWMT^m}cx z*j-0Jz0L8Q)om-=U*NGRJ%xc{TiF_$quy5Jn*!8rbF8(|!-@~Tr|}zso}Zp}p!cWN zdRy7^1@$(iuW*g6;*WHZB_dxdTcd4lh)b}-(4_bEwITZB=et1!N!#9T;}g!^{Kt zy@lTZ;{9ui7oPT096beVZAUEDw-Co_o5L-{dCXry;i5G0-%Qf2J+uP{u{`@O8!;y z-)M7njOUxTw+H@;upDL2bTWD?-D-hbEpV#^ZneOz7P!{Fw^5*LS?1(hig;SEW z#9-NA{Woech8sb%83$v|Dhzmuij~}F+VK1Yah;r_#L*CAt@(^x2KEFwsL+7m_2 z-w6gU-^mYH4IZ~`JbfT2-GfjnGYBgcRFVRjQJQ9-YdxO)bc)z?)@m;ZV+~OqKoGw z{)pgZ+^Ue1@A{GJU;UJlnHq}NO4}a=ZN5wqJF~6|2vW2v2z{$QuOO(L9Yoqak7hx z-X`e1f-Vzuji6fuZ4-2-p!)^=j-W3KdQ8wWg8ov_%Yt4LbYijaFDM1GcJ=DzN(H_o zS1L=KHO@uKqN?g8Rn?2Cl!}dkRzye2EL$rStVo6Xm_XX-wvur4B9Spcp3{tx|lN* z)ov=y?24`-yygip_p@f43BjSg00L+T)RKoYZ2?A=oph9x+PrfD;XdLs^W5vPhuA
[Devel] [PATCH RHEL7 COMMIT] ve/devmnt: Introduce ve::devmnt list
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 16a628a25414e4cca83d7aa8574fbe22f3e8f500 Author: Kirill Tkhai ktk...@odin.com Date: Mon May 18 12:52:09 2015 +0400 ve/devmnt: Introduce ve::devmnt list 1)Porting patch ve: mount option list by Maxim Patlasov: The patch adds new fields to ve_struct: devmnt_list and devmnt_mutex. devmnt_list is the head of list of ve_devmnt structs. Each host block device visible from CT can have no more than one struct ve_devmnt linked in ve->devmnt_list. If ve_devmnt is present, it can be found by 'dev' field. Each ve_devmnt struct may bear two strings: hidden and allowed options. hidden_options will be automatically added to CT-user-supplied mount options after checking allowed_options. Only options listed in allowed_options are allowed. devmnt_mutex is to protect operations on the list of ve_devmnt structs. 2)Porting patch vecalls: VE_CONFIGURE_MOUNT_OPTIONS by Maxim Patlasov. Reworking the interface using cgroups. Each CT now has a file: [ve_cgroup_mnt_pnt]/[CTID]/ve.mount_opts for configuring permissions for a block device. Below is an example permissions line: 0 major:minor;1 balloon_ino=12,pfcache_csum,pfcache=/vz/pfcache;2 barrier=1 Here, major:minor is a device, '1' starts comma-separated list of hidden options, and '2' is allowed ones. 
https://jira.sw.ru/browse/PSBM-32273 Signed-off-by: Kirill Tkhai ktk...@odin.com Acked-by: Maxim Patlasov mpatla...@openvz.org --- include/linux/ve.h | 11 kernel/ve/ve.c | 151 + 2 files changed, 162 insertions(+) diff --git a/include/linux/ve.h b/include/linux/ve.h index 03e90e4..8b70dbe 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -123,12 +123,23 @@ struct ve_struct { struct net *ve_netns; struct mutexsync_mutex; + struct list_headdevmnt_list; + struct mutexdevmnt_mutex; + struct kmapset_key ve_sysfs_perms; #if IS_ENABLED(CONFIG_DEVTMPFS) struct path devtmpfs_root; #endif }; +struct ve_devmnt { + struct list_headlink; + + dev_t dev; + char*allowed_options; + char*hidden_options; /* balloon_ino, etc. */ +}; + #define VE_MEMINFO_DEFAULT 1 /* default behaviour */ #define VE_MEMINFO_SYSTEM 0 /* disable meminfo virtualization */ diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index 609ea75..6ab409f 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -40,6 +40,7 @@ #include linux/task_work.h #include linux/tty.h #include linux/console.h +#include linux/ctype.h #include uapi/linux/vzcalluser.h #include linux/venet.h @@ -710,6 +711,8 @@ do_init: mutex_init(ve-sync_mutex); INIT_LIST_HEAD(ve-devices); INIT_LIST_HEAD(ve-ve_list); + INIT_LIST_HEAD(ve-devmnt_list); + mutex_init(ve-devmnt_mutex); kmapset_init_key(ve-ve_sysfs_perms); return ve-css; @@ -734,11 +737,33 @@ static void ve_offline(struct cgroup *cg) veid_free(ve-veid); } +static void ve_devmnt_free(struct ve_devmnt *devmnt) +{ + if (!devmnt) + return; + + kfree(devmnt-allowed_options); + kfree(devmnt-hidden_options); + kfree(devmnt); +} + +static void free_ve_devmnts(struct ve_struct *ve) +{ + while (!list_empty(ve-devmnt_list)) { + struct ve_devmnt *devmnt; + + devmnt = list_first_entry(ve-devmnt_list, struct ve_devmnt, link); + list_del(devmnt-link); + ve_devmnt_free(devmnt); + } +} + static void ve_destroy(struct cgroup *cg) { struct ve_struct *ve = cgroup_ve(cg); kmapset_unlink(ve-ve_sysfs_perms, 
ve_sysfs_perms); + free_ve_devmnts(ve); ve_log_destroy(ve); kfree(ve-binfmt_misc); @@ -886,6 +911,127 @@ static int ve_legacy_veid_read(struct cgroup *cg, struct cftype *cft, return seq_printf(m, %u\n, cgroup_ve(cg)-veid); } +/* + * 'data' for VE_CONFIGURE_MOUNT_OPTIONS is a zero-terminated string + * consisting of substrings separated by MNTOPT_DELIM. + */ +#define MNTOPT_DELIM ';' + +/* + * Each substring has the form of type comma-separated-list-of-options + * where types are: + */ +enum { + MNTOPT_DEVICE = 0, + MNTOPT_HIDDEN = 1, + MNTOPT_ALLOWED = 2, +}; + +/* + * 'ptr' points to the first character of buffer to parse + * 'endp' points to the last character of buffer to parse + */ +static int ve_parse_mount_options(const char *ptr, const char *endp, + struct ve_devmnt *devmnt) +{ + while (*ptr) { + const char *delim =
[Devel] [PATCH RHEL7 COMMIT] ve/uts_ns: Implement cgroup interface to configure ve's os_release
The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-123.1.2.vz7.5.1 -- commit 3c5b30b0520f4ca33608737b25aef6c45c05fb71 Author: Kirill Tkhai ktk...@odin.com Date: Mon May 18 13:52:48 2015 +0400 ve/uts_ns: Implement cgroup interface to configure ve's os_release It is similar to the VZCTL_VE_CONFIGURE ioctl in PCS6. Note: max_write_len is __NEW_UTS_LEN + 1, because I want to allow echo ... > ve.os_release, which adds a trailing '\n' to the string (see man echo for details). The extra character will be cut in ve_os_release_write(). https://jira.sw.ru/browse/PSBM-32273 Signed-off-by: Kirill Tkhai ktk...@odin.com Reviewed-by: Cyrill Gorcunov gorcu...@odin.com --- kernel/ve/ve.c | 55 +++ 1 file changed, 55 insertions(+) diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index 6ab409f..e598d15 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -1032,6 +1032,54 @@ static int ve_mount_opts_write(struct cgroup *cg, struct cftype *cft, return 0; } +static int ve_os_release_read(struct cgroup *cg, struct cftype *cft, + struct seq_file *m) +{ + struct ve_struct *ve = cgroup_ve(cg); + int ret = 0; + + down_read(ve-op_sem); + + if (!ve-ve_ns) { + ret = -ENOENT; + goto up_opsem; + } + + down_read(uts_sem); + seq_puts(m, ve-ve_ns-uts_ns-name.release); + seq_putc(m, '\n'); + up_read(uts_sem); +up_opsem: + up_read(ve-op_sem); + + return ret; +} + +static int ve_os_release_write(struct cgroup *cg, struct cftype *cft, + const char *buffer) +{ + struct ve_struct *ve = cgroup_ve(cg); + char *release; + int ret = 0; + + down_read(ve-op_sem); + + if (!ve-ve_ns) { + ret = -ENOENT; + goto up_opsem; + } + + down_write(uts_sem); + release = ve-ve_ns-uts_ns-name.release; + strncpy(release, buffer, __NEW_UTS_LEN); + release[__NEW_UTS_LEN] = '\0'; + up_write(uts_sem); +up_opsem: + up_read(ve-op_sem); + + return ret; +} + static struct cftype ve_cftypes[] = { { .name = state, @@ -1049,6 +1097,13 @@ static struct cftype ve_cftypes[] = 
{ .flags = CFTYPE_NOT_ON_ROOT, .write_string = ve_mount_opts_write, }, + { + .name = os_release, + .max_write_len = __NEW_UTS_LEN + 1, + .flags = CFTYPE_NOT_ON_ROOT, + .read_seq_string = ve_os_release_read, + .write_string = ve_os_release_write, + }, { } }; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7] ve: device cgroup -- Implement devcgroup_seq_show_ve
In PCS7 cgroups are configured from user space, so there is no longer connection from ve to device cgroup via css as it was in PCS6. Instead we should open device cgroup explicitly. https://jira.sw.ru/browse/PSBM-33555 Signed-off-by: Cyrill Gorcunov gorcu...@odin.com CC: Vladimir Davydov vdavy...@odin.com CC: Konstantin Khorenko khore...@odin.com CC: Andrey Vagin ava...@odin.com --- include/linux/device_cgroup.h |2 +- kernel/ve/vecalls.c |2 +- security/device_cgroup.c | 11 +-- 3 files changed, 11 insertions(+), 4 deletions(-) Index: linux-pcs7.git/include/linux/device_cgroup.h === --- linux-pcs7.git.orig/include/linux/device_cgroup.h +++ linux-pcs7.git/include/linux/device_cgroup.h @@ -19,7 +19,7 @@ extern int devcgroup_device_visible(umod struct cgroup; int devcgroup_default_perms_ve(struct cgroup *cgroup); int devcgroup_set_perms_ve(struct cgroup *cgroup, unsigned, dev_t, unsigned); -int devcgroup_seq_show_ve(struct cgroup *cgroup, unsigned veid, struct seq_file *m); +int devcgroup_seq_show_ve(struct cgroup *devices_root, envid_t veid, struct seq_file *m); #else static inline int devcgroup_inode_permission(struct inode *inode, int mask) Index: linux-pcs7.git/kernel/ve/vecalls.c === --- linux-pcs7.git.orig/kernel/ve/vecalls.c +++ linux-pcs7.git/kernel/ve/vecalls.c @@ -891,7 +891,7 @@ static int devperms_seq_show(struct seq_ if (ve_is_super(ve)) seq_printf(m, %10u b 016 *:*\n%10u c 006 *:*\n, 0, 0); else - devcgroup_seq_show_ve(ve-css.cgroup, ve-veid, m); + devcgroup_seq_show_ve(devices_root, ve-veid, m); return 0; } Index: linux-pcs7.git/security/device_cgroup.c === --- linux-pcs7.git.orig/security/device_cgroup.c +++ linux-pcs7.git/security/device_cgroup.c @@ -1091,10 +1091,16 @@ int devcgroup_set_perms_ve(struct cgroup } EXPORT_SYMBOL(devcgroup_set_perms_ve); -int devcgroup_seq_show_ve(struct cgroup *cgroup, unsigned veid, struct seq_file *m) +int devcgroup_seq_show_ve(struct cgroup *devices_root, envid_t veid, struct seq_file *m) { - struct dev_cgroup 
*devcgroup = cgroup_to_devcgroup(cgroup); struct dev_exception_item *wh; + struct dev_cgroup *devcgroup; + struct cgroup *cgroup; + + cgroup = ve_cgroup_open(devices_root, 0, veid); + if (IS_ERR(cgroup)) + return PTR_ERR(cgroup); + devcgroup = cgroup_to_devcgroup(cgroup); rcu_read_lock(); list_for_each_entry_rcu(wh, devcgroup-exceptions, list) { @@ -1118,6 +1124,7 @@ int devcgroup_seq_show_ve(struct cgroup } rcu_read_unlock(); + cgroup_kernel_close(cgroup); return 0; } EXPORT_SYMBOL(devcgroup_seq_show_ve); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 v2 3/3] proc: add kpageidle file
Knowing the portion of memory that is not used by a certain application or memory cgroup (idle memory) can be useful for partitioning the system efficiently, e.g. by setting memory cgroup limits appropriately. Currently, the only means to estimate the amount of idle memory provided by the kernel is /proc/PID/{clear_refs,smaps}: the user can clear the access bit for all pages mapped to a particular process by writing 1 to clear_refs, wait for some time, and then count smaps:Referenced. However, this method has two serious shortcomings: - it does not count unmapped file pages - it affects the reclaimer logic To overcome these drawbacks, this patch introduces two new page flags, Idle and Young, and a new proc file, /proc/kpageidle. A page's Idle flag can only be set from userspace by setting the bit in /proc/kpageidle at the offset corresponding to the page, and it is cleared whenever the page is accessed either through page tables (it is cleared in page_referenced() in this case) or using the read(2) system call (mark_page_accessed()). Thus by setting the Idle flag for pages of a particular workload, which can be found e.g. by reading /proc/PID/pagemap, waiting for some time to let the workload access its working set, and then reading the kpageidle file, one can estimate the number of pages that are not used by the workload. The Young page flag is used to avoid interference with the memory reclaimer. A page's Young flag is set whenever the Access bit of a page table entry pointing to the page is cleared by writing to kpageidle. If page_referenced() is called on a Young page, it will add 1 to its return value, therefore concealing the fact that the Access bit was cleared. Note, since there is no room for extra page flags on 32 bit, this feature uses extended page flags when compiled on 32 bit. 
(on RH7 page ext is not available so make it depend on 64 bit) Signed-off-by: Vladimir Davydov vdavy...@parallels.com --- Documentation/vm/pagemap.txt | 12 ++- fs/proc/page.c | 168 ++ fs/proc/task_mmu.c |3 +- include/linux/mm.h | 50 + include/linux/page-flags.h |9 +++ mm/Kconfig | 12 +++ mm/page_alloc.c |4 + mm/rmap.c|9 +++ mm/swap.c|2 + 9 files changed, 267 insertions(+), 2 deletions(-) diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index e37cff950ae8..a4fe9b25a6c9 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt @@ -5,7 +5,7 @@ pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow userspace programs to examine the page tables and related information by reading files in /proc. -There are four components to pagemap: +There are five components to pagemap: * /proc/pid/pagemap. This file lets a userspace process find out which physical frame each virtual page is mapped to. It contains one 64-bit @@ -67,6 +67,16 @@ There are four components to pagemap: memory cgroup each page is charged to, indexed by PFN. Only available when CONFIG_MEMCG is set. + * /proc/kpageidle. This file implements a bitmap where each bit corresponds + to a page, indexed by PFN. When the bit is set, the corresponding page is + idle. A page is considered idle if it has not been accessed since it was + marked idle. To mark a page idle one should set the bit corresponding to the + page by writing to the file. A value written to the file is OR-ed with the + current bitmap value. Only user memory pages can be marked idle, for other + page types input is silently ignored. Writing to this file beyond max PFN + results in the ENXIO error. Only available when CONFIG_IDLE_PAGE_TRACKING is + set. + Short descriptions to the page flags: 0. 
LOCKED diff --git a/fs/proc/page.c b/fs/proc/page.c index c9cbed32be43..49aebd2e3596 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -15,6 +15,7 @@ #define KPMSIZE sizeof(u64) #define KPMMASK (KPMSIZE - 1) +#define KPMBITS (KPMSIZE * BITS_PER_BYTE) /* /proc/kpagecount - an array exposing page counts * @@ -263,6 +264,169 @@ static const struct file_operations proc_kpagecgroup_operations = { }; #endif /* CONFIG_MEMCG */ +#ifdef CONFIG_IDLE_PAGE_TRACKING +/* + * Idle page tracking only considers user memory pages, for other types of + * pages the idle flag is always unset and an attempt to set it is silently + * ignored. + * + * We treat a page as a user memory page if it is on an LRU list, because it is + * always safe to pass such a page to page_referenced(), which is essential for + * idle page tracking. With such an indicator of user pages we can skip + * isolated pages, but since there are not usually many of them, it will hardly + * affect the overall result. + * + * This function tries to get a user memory page by pfn as described above. + */ +static struct page
[Devel] [PATCH rh7 v2 0/3] idle memory tracking
This patch set backports https://lkml.org/lkml/2015/5/12/449 which is required by vcmmd. It is not yet clear whether the original patch set will be accepted upstream as is; there may still be changes. However, I hope the user API will be preserved. If it is not, we will have to fix this in our kernel too. https://jira.sw.ru/browse/PSBM-32460 Vladimir Davydov (3): memcg: add page_cgroup_ino helper proc: add kpagecgroup file proc: add kpageidle file Documentation/vm/pagemap.txt | 16 ++- fs/proc/Kconfig |5 +- fs/proc/page.c | 221 ++ fs/proc/task_mmu.c |3 +- include/linux/memcontrol.h |3 + include/linux/mm.h | 50 ++ include/linux/page-flags.h |9 ++ mm/Kconfig | 12 +++ mm/hwpoison-inject.c |3 - mm/memcontrol.c | 22 + mm/memory-failure.c | 18 +--- mm/page_alloc.c |4 + mm/rmap.c|9 ++ mm/swap.c|2 + 14 files changed, 353 insertions(+), 24 deletions(-) -- 1.7.10.4 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 v2 2/3] proc: add kpagecgroup file
/proc/kpagecgroup contains a 64-bit inode number of the memory cgroup each page is charged to, indexed by PFN. Having this information is useful for estimating a cgroup working set size. The file is present if CONFIG_PROC_PAGE_MONITOR && CONFIG_MEMCG. Signed-off-by: Vladimir Davydov vdavy...@parallels.com --- Documentation/vm/pagemap.txt |6 - fs/proc/Kconfig |5 ++-- fs/proc/page.c | 53 ++ 3 files changed, 61 insertions(+), 3 deletions(-) diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index fd7c3cfddd8e..e37cff950ae8 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt @@ -5,7 +5,7 @@ pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow userspace programs to examine the page tables and related information by reading files in /proc. -There are three components to pagemap: +There are four components to pagemap: * /proc/pid/pagemap. This file lets a userspace process find out which physical frame each virtual page is mapped to. It contains one 64-bit @@ -63,6 +63,10 @@ There are three components to pagemap: 21. KSM 22. THP + * /proc/kpagecgroup. This file contains a 64-bit inode number of the + memory cgroup each page is charged to, indexed by PFN. Only available when + CONFIG_MEMCG is set. + Short descriptions to the page flags: 0. LOCKED diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 15af6222f8a4..e8ed22d2ba5b 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -65,5 +65,6 @@ config PROC_PAGE_MONITOR help Various /proc files exist to monitor process memory utilization: /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, - /proc/kpagecount, and /proc/kpageflags. Disabling these - interfaces will reduce the size of the kernel by approximately 4kb. + /proc/kpagecount, /proc/kpageflags, and /proc/kpagecgroup. + Disabling these interfaces will reduce the size of the kernel + by approximately 4kb. 
diff --git a/fs/proc/page.c b/fs/proc/page.c index cab84b6272ed..c9cbed32be43 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -8,6 +8,7 @@ #include linux/proc_fs.h #include linux/seq_file.h #include linux/hugetlb.h +#include linux/memcontrol.h #include linux/kernel-page-flags.h #include asm/uaccess.h #include internal.h @@ -213,10 +214,62 @@ static const struct file_operations proc_kpageflags_operations = { .read = kpageflags_read, }; +#ifdef CONFIG_MEMCG +static ssize_t kpagecgroup_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + u64 ino; + + pfn = src / KPMSIZE; + count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); + if (src KPMMASK || count KPMMASK) + return -EINVAL; + + while (count 0) { + if (pfn_valid(pfn)) + ppage = pfn_to_page(pfn); + else + ppage = NULL; + + if (ppage) + ino = page_cgroup_ino(ppage); + else + ino = 0; + + if (put_user(ino, out)) { + ret = -EFAULT; + break; + } + + pfn++; + out++; + count -= KPMSIZE; + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static const struct file_operations proc_kpagecgroup_operations = { + .llseek = mem_lseek, + .read = kpagecgroup_read, +}; +#endif /* CONFIG_MEMCG */ + static int __init proc_page_init(void) { proc_create(kpagecount, S_IRUSR, NULL, proc_kpagecount_operations); proc_create(kpageflags, S_IRUSR, NULL, proc_kpageflags_operations); +#ifdef CONFIG_MEMCG + proc_create(kpagecgroup, S_IRUSR, NULL, proc_kpagecgroup_operations); +#endif return 0; } module_init(proc_page_init); -- 1.7.10.4 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel