date:20150518

[Devel] [PATCH rh7 2/5] ub: get rid of dcache accounting related stuff

2015-05-18 Thread Vladimir Davydov

dcache is now accounted as part of memcg:kmem, so remove the leftovers.

If we decide to account dcache separately, we will re-implement/port
what we really need.

Signed-off-by: Vladimir Davydov vdavy...@parallels.com
---
 fs/namei.c   |1 -
 include/bc/beancounter.h |6 --
 include/bc/dcache.h  |   18 
 kernel/bc/beancounter.c  |5 -
 kernel/bc/dcache.c   |  269 --
 kernel/bc/proc.c |3 -
 kernel/bc/vm_pages.c |5 +-
 kernel/ve/vecalls.c  |2 -
 8 files changed, 2 insertions(+), 307 deletions(-)
 delete mode 100644 include/bc/dcache.h
 delete mode 100644 kernel/bc/dcache.c

diff --git a/fs/namei.c b/fs/namei.c
index 5b0146255e94..b62c93df99d1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -142,7 +142,6 @@ getname_flags(const char __user *filename, int flags, int 
*empty)
if (result)
return result;
 
-   /*ub_dentry_checkup();*/
result = __getname();
if (unlikely(!result))
return ERR_PTR(-ENOMEM);
diff --git a/include/bc/beancounter.h b/include/bc/beancounter.h
index 31671ff459da..4337e1363eeb 100644
--- a/include/bc/beancounter.h
+++ b/include/bc/beancounter.h
@@ -149,12 +149,6 @@ struct user_beancounter {
 
void*private_data2;
 
-   struct list_headub_dentry_lru;
-   struct list_headub_dentry_top;
-   int ub_dentry_unused;
-   int ub_dentry_batch;
-   unsigned long   ub_dentry_pruned;
-
/* resources statistic and settings */
struct ubparm   ub_parms[UB_RESOURCES];
/* resources statistic for last interval */
diff --git a/include/bc/dcache.h b/include/bc/dcache.h
deleted file mode 100644
index 186e0fc895d5..
--- a/include/bc/dcache.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef __UB_DCACHE_H__
-#define __UB_DCACHE_H__
-
-#include bc/decl.h
-
-extern unsigned int ub_dcache_threshold;
-
-UB_DECLARE_FUNC(int, ub_dcache_charge(struct user_beancounter *ub, int 
name_len))
-UB_DECLARE_VOID_FUNC(ub_dcache_uncharge(struct user_beancounter *ub, int 
name_len))
-UB_DECLARE_VOID_FUNC(ub_dcache_set_owner(struct dentry *d, struct 
user_beancounter *ub))
-UB_DECLARE_VOID_FUNC(ub_dcache_change_owner(struct dentry *dentry, struct 
user_beancounter *ub))
-UB_DECLARE_VOID_FUNC(ub_dcache_clear_owner(struct dentry *dentry))
-UB_DECLARE_VOID_FUNC(ub_dcache_unuse(struct user_beancounter *ub))
-UB_DECLARE_VOID_FUNC(ub_dcache_reclaim(struct user_beancounter *ub, unsigned 
long numerator, unsigned long denominator))
-UB_DECLARE_FUNC(int, ub_dcache_shrink(struct user_beancounter *ub, unsigned 
long size, gfp_t gfp_mask))
-UB_DECLARE_FUNC(unsigned long, ub_dcache_get_size(struct dentry *dentry))
-
-#endif
diff --git a/kernel/bc/beancounter.c b/kernel/bc/beancounter.c
index cdbe846bf839..5cc0688131ae 100644
--- a/kernel/bc/beancounter.c
+++ b/kernel/bc/beancounter.c
@@ -42,7 +42,6 @@
 #include bc/beancounter.h
 #include bc/io_acct.h
 #include bc/vmpages.h
-#include bc/dcache.h
 #include bc/proc.h
 
 static struct kmem_cache *ub_cachep;
@@ -465,8 +464,6 @@ static inline int bc_verify_held(struct user_beancounter 
*ub)
 
clean = verify_res(ub, pincount, __ub_percpu_sum(ub, pincount));
 
-   clean = verify_res(ub, dcache, !list_empty(ub-ub_dentry_lru));
-
ub_debug_trace(!clean, 5, 60*HZ);
 
return clean;
@@ -958,8 +955,6 @@ static void init_beancounter_struct(struct user_beancounter 
*ub)
spin_lock_init(ub-ub_lock);
INIT_LIST_HEAD(ub-ub_tcp_sk_list);
INIT_LIST_HEAD(ub-ub_other_sk_list);
-   INIT_LIST_HEAD(ub-ub_dentry_lru);
-   INIT_LIST_HEAD(ub-ub_dentry_top);
init_oom_control(ub-oom_ctrl);
spin_lock_init(ub-rl_lock);
ub-rl_wall.tv64 = LLONG_MIN;
diff --git a/kernel/bc/dcache.c b/kernel/bc/dcache.c
deleted file mode 100644
index 2727e690fbb4..
--- a/kernel/bc/dcache.c
+++ /dev/null
@@ -1,269 +0,0 @@
-#include linux/slab.h
-#include linux/dcache.h
-#include linux/fs.h
-#include linux/module.h
-#include linux/sched.h
-
-#include bc/beancounter.h
-#include bc/vmpages.h
-#include bc/dcache.h
-#include bc/kmem.h
-
-static unsigned int dcache_charge_size(int name_len)
-{
-   return dentry_cache-objuse + kmem_cache_objuse(inode_cachep) +
-   (name_len  DNAME_INLINE_LEN ? name_len : 0);
-}
-
-int ub_dcache_shrink(struct user_beancounter *ub,
-   unsigned long size, gfp_t gfp_mask)
-{
-   int count, pruned;
-
-   if (!(gfp_mask  __GFP_FS))
-   return -EBUSY;
-
-   count = DIV_ROUND_UP(size, dcache_charge_size(0));
-   spin_lock(dcache_lock);
-   pruned = __shrink_dcache_ub(ub, count);
-   spin_unlock(dcache_lock);
-   if (!pruned)
-   return -ENOMEM;
-
-   return 0;
-}
-
-static int __ub_dcache_charge(struct user_beancounter *ub,
-   unsigned long

[Devel] [PATCH rh7] net-namespace: Don't forget to put_ve on error path

2015-05-18 Thread Cyrill Gorcunov

If an error happens during new net-namespace creation, we might
end up with a VE reference taken and never put back.

 | copy_net_ns
 |  setup_net
 |   ...
 |   net-owner_ve = get_ve(get_exec_env());
 |   ...
 |   error = ops_init(ops, net);
 |   if (error  0)
 |goto out_undo;
 |   ...
 |   return error;
 |  put_user_ns(user_ns);
 |  net_drop_ns(net);
 |   net_free(ns);
 |kfree(net-gen);
 |kmem_cache_free(net_cachep, net);

So let's call put_ve to balance it.

Signed-off-by: Cyrill Gorcunov gorcu...@odin.com
CC: Vladimir Davydov vdavy...@odin.com
CC: Konstantin Khorenko khore...@odin.com
CC: Andrey Vagin ava...@odin.com
---
 net/core/net_namespace.c |3 +++
 1 file changed, 3 insertions(+)

Index: linux-pcs7.git/net/core/net_namespace.c
===
--- linux-pcs7.git.orig/net/core/net_namespace.c
+++ linux-pcs7.git/net/core/net_namespace.c
@@ -192,6 +192,9 @@ out_undo:
ops_free_list(ops, net_exit_list);
 
rcu_barrier();
+#ifdef CONFIG_VE
+   put_ve(net-owner_ve);
+#endif
goto out;
 }
 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] mm/tswap/tcache: enable tcache and tswap by default

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit a1cd5a98145e5032cad97a0bf15e3e0904fad8d0
Author: Vladimir Davydov vdavy...@parallels.com
Date:   Mon May 18 17:00:04 2015 +0400

mm/tswap/tcache: enable tcache and tswap by default

    We use both of them, so enable tcache and tswap by default.

In order to disable them add appropriate kernel boot options:
tcache.enabled=0
tswap.enabled=0

https://jira.sw.ru/browse/PSBM-31757
https://jira.sw.ru/browse/PSBM-32063

Signed-off-by: Vladimir Davydov vdavy...@parallels.com
---
 mm/tcache.c | 2 +-
 mm/tswap.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/tcache.c b/mm/tcache.c
index bc740f0..e83ad05 100644
--- a/mm/tcache.c
+++ b/mm/tcache.c
@@ -125,7 +125,7 @@ static struct tcache_lru *tcache_lru_node;
  */
 
 /* Enable/disable tcache backend (set at boot time) */
-static bool tcache_enabled __read_mostly;
+static bool tcache_enabled __read_mostly = true;
 module_param_named(enabled, tcache_enabled, bool, 0444);
 
 /* Enable/disable populating the cache */
diff --git a/mm/tswap.c b/mm/tswap.c
index c4effa3..4b792cd 100644
--- a/mm/tswap.c
+++ b/mm/tswap.c
@@ -27,7 +27,7 @@ struct tswap_lru {
 static struct tswap_lru *tswap_lru_node;
 
 /* Enable/disable tswap backend (set at boot time) */
-static bool tswap_enabled __read_mostly;
+static bool tswap_enabled __read_mostly = true;
 module_param_named(enabled, tswap_enabled, bool, 0444);
 
 /* Enable/disable populating the cache */
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH rh7] ve: cgroups -- Allow to attach non-self into ve cgroups, v2

2015-05-18 Thread Pavel Emelyanov

On 05/18/2015 03:52 PM, Cyrill Gorcunov wrote:
 On Mon, May 18, 2015 at 11:21:40AM +0300, Konstantin Khorenko wrote:

 Is this true that without these checks a single thread of a multithread 
 process can enter CT?
 If no - where is the check for this case?
 If yes - let's prohibit this.
 
 An update is attached: either the task we're attaching should be a
 single-threaded task,
 or all threads should be moved at once (which, as far as I understand, is
 prepared
 by the caller code).
 

Looks OK

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH rh7 3/5] bc: sysinfo: remove dead code

2015-05-18 Thread Vladimir Davydov

If meminfo_val != VE_MEMINFO_DEFAULT in bc_fill_sysinfo, it equals
VE_MEMINFO_SYSTEM, in which case we return from bc_fill_sysinfo
immediately.

Signed-off-by: Vladimir Davydov vdavy...@parallels.com
---
 kernel/bc/vm_pages.c |   22 --
 1 file changed, 4 insertions(+), 18 deletions(-)

diff --git a/kernel/bc/vm_pages.c b/kernel/bc/vm_pages.c
index 7f5eece57aa7..b3d0dd09f8cf 100644
--- a/kernel/bc/vm_pages.c
+++ b/kernel/bc/vm_pages.c
@@ -202,18 +202,8 @@ static int bc_fill_sysinfo(struct user_beancounter *ub,
total = physpages.limit;
used = physpages.held;
 
-   if (total == UB_MAXVALUE) {
-   if (meminfo_val == VE_MEMINFO_DEFAULT)
-   total = totalram;
-   else {
-   total = min(meminfo_val, totalram);
-   used = __get_beancounter_usage_percpu(ub, 
UB_PRIVVMPAGES);
-   if (glob_ve_meminfo) {
-   ub_update_resources(ub);
-   used = ub-ub_parms[UB_OOMGUARPAGES].held;
-   }
-   }
-   }
+   if (total == UB_MAXVALUE)
+   total = totalram;
 
si-totalram = total;
si-freeram = (total  used ? total - used : 0);
@@ -221,12 +211,8 @@ static int bc_fill_sysinfo(struct user_beancounter *ub,
total = swappages.limit;
used = swappages.held;
 
-   if (total == UB_MAXVALUE) {
-   if (meminfo_val == VE_MEMINFO_DEFAULT)
-   total = totalswap;
-   else
-   total = 0;
-   }
+   if (total == UB_MAXVALUE)
+   total = totalswap;
 
si-totalswap = total;
si-freeswap = (total  used ? total - used : 0);
-- 
1.7.10.4

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH rh7 1/5] ub: remove CONFIG_BC_RSS_ACCOUNTING

2015-05-18 Thread Vladimir Davydov

There's no point in it, because w/o it beancounters are useless. Plus,
it isn't actually used throughout the code, because rss accounting is up
to memcg now. So, just make CONFIG_BEANCOUNTERS depend on memcg and
remove the option.

Also, remove dependency on CONFIG_CGROUP_HUGETLB, because we don't
actually require it.

Signed-off-by: Vladimir Davydov vdavy...@parallels.com
---
 config.OpenVZ |1 -
 kernel/bc/Kconfig |   19 ---
 kernel/bc/proc.c  |8 
 3 files changed, 4 insertions(+), 24 deletions(-)

diff --git a/config.OpenVZ b/config.OpenVZ
index 73614cfd13e7..93f8d6ff4a22 100644
--- a/config.OpenVZ
+++ b/config.OpenVZ
@@ -5300,7 +5300,6 @@ CONFIG_VZ_IOLIMIT=m
 # User resources
 #
 CONFIG_BEANCOUNTERS=y
-CONFIG_BC_RSS_ACCOUNTING=y
 CONFIG_BC_IO_ACCOUNTING=y
 CONFIG_BC_IO_PRIORITY=y
 CONFIG_BC_PROC=y
diff --git a/kernel/bc/Kconfig b/kernel/bc/Kconfig
index 4b8156690c46..a3379f949d3c 100644
--- a/kernel/bc/Kconfig
+++ b/kernel/bc/Kconfig
@@ -12,6 +12,10 @@ config BEANCOUNTERS
bool Enable user resource accounting
default y
select CGROUPS
+   select MEMCG
+   select MEMCG_KMEM
+   select MEMCG_SWAP if SWAP
+   select MEMCG_SWAP_ENABLED if SWAP
help 
   This patch provides accounting and allows to configure
   limits for user's consumption of exhaustible system resources.
@@ -26,21 +30,6 @@ config BEANCOUNTERS
   per-process basis.  Per-process accounting doesn't prevent malicious
   users from spawning a lot of resource-consuming processes.
 
-config BC_RSS_ACCOUNTING
-   bool Account physical memory usage
-   default y
-   depends on BEANCOUNTERS
-   select RESOURCE_COUNTERS
-   select MEMCG
-   select MEMCG_KMEM
-   select MEMCG_SWAP if SWAP
-   select MEMCG_SWAP_ENABLED if SWAP
-   select CGROUP_HUGETLB if HUGETLBFS
-   help
-  This allows to estimate per beancounter physical memory usage.
-  Implemented alghorithm accounts shared pages of memory as well,
-  dividing them by number of beancounter which use the page.
-
 config BC_IO_ACCOUNTING
bool Account file I/O
default y
diff --git a/kernel/bc/proc.c b/kernel/bc/proc.c
index dd33d44a2cb0..af6a610a3e08 100644
--- a/kernel/bc/proc.c
+++ b/kernel/bc/proc.c
@@ -198,12 +198,7 @@ static struct bc_proc_entry bc_meminfo_entry = {
.u.show = bc_proc_meminfo_show,
 };
 
-#ifdef CONFIG_BC_RSS_ACCOUNTING
-
-#include linux/memcontrol.h
-
 #define K(x) ((x)  (PAGE_SHIFT - 10))
-
 static int bc_proc_nodeinfo_show(struct seq_file *f, void *v)
 {
int nid;
@@ -241,7 +236,6 @@ static struct bc_proc_entry bc_nodeinfo_entry = {
.name = nodeinfo,
.u.show = bc_proc_nodeinfo_show,
 };
-#endif
 
 #if 0
 
@@ -931,9 +925,7 @@ static int __init ub_init_proc(void)
 // bc_register_proc_entry(bc_dcacheinfo_entry);
bc_register_proc_root_entry(bc_all_resources_entry);
bc_register_proc_entry(bc_meminfo_entry);
-#ifdef CONFIG_BC_RSS_ACCOUNTING
bc_register_proc_entry(bc_nodeinfo_entry);
-#endif
 
entry = proc_create(user_beancounters,
S_IRUSR|S_ISVTX, NULL, ub_file_operations);
-- 
1.7.10.4

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH rh7 4/5] proc: fix oom_score output

2015-05-18 Thread Vladimir Davydov

oom_badness now returns absolute badness, not per mille. So we have to
revert the chunk of PCS6 code that doesn't know that. Note, I use the
global totalpages rather than per ub as it used to be, because ub's oom
killer doesn't work anyway for now and will be reimplemented in the
scope of the memory cgroup. Then I'll change it to per-memcg value.

Signed-off-by: Vladimir Davydov vdavy...@parallels.com
---
 fs/proc/base.c |   12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 12d9ea1eca6d..79ee3c875e76 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -449,15 +449,15 @@ static const struct file_operations 
proc_cpuset_operations = {
 
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
-   int points = 0;
+   unsigned long totalpages = totalram_pages + total_swap_pages;
+   unsigned long points = 0;
 
read_lock(tasklist_lock);
-   if (pid_alive(task)) {
-   points = oom_badness(task, NULL, NULL, 
ub_oom_total_pages(get_exec_ub()));
-   points = clamp(points, 0, 1000);
-   }
+   if (pid_alive(task))
+   points = oom_badness(task, NULL, NULL, totalpages) *
+   1000 / totalpages;
read_unlock(tasklist_lock);
-   return sprintf(buffer, %d\n, points);
+   return sprintf(buffer, %lu\n, points);
 }
 
 struct limit_names {
-- 
1.7.10.4

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH rh7 0/5] ub: remove some dead code

2015-05-18 Thread Vladimir Davydov

Vladimir Davydov (5):
  ub: remove CONFIG_BC_RSS_ACCOUNTING
  ub: get rid of dcache accounting related stuff
  bc: sysinfo: remove dead code
  proc: fix oom_score output
  bc: zap oom kill related stuff

 arch/x86/mm/fault.c  |2 -
 config.OpenVZ|1 -
 drivers/tty/sysrq.c  |3 -
 fs/namei.c   |1 -
 fs/proc/base.c   |   14 +--
 include/bc/beancounter.h |   14 ---
 include/bc/dcache.h  |   18 ---
 include/bc/oom_kill.h|   19 ---
 include/bc/vmpages.h |2 -
 include/linux/mm_types.h |2 -
 include/linux/oom.h  |   13 ---
 kernel/bc/Kconfig|   19 +--
 kernel/bc/beancounter.c  |9 --
 kernel/bc/dcache.c   |  269 --
 kernel/bc/oom_kill.c |  289 --
 kernel/bc/proc.c |   13 ---
 kernel/bc/statd.c|3 -
 kernel/bc/vm_pages.c |   73 +---
 kernel/exit.c|1 -
 kernel/fork.c|5 -
 kernel/ve/vecalls.c  |2 -
 mm/page_alloc.c  |4 -
 22 files changed, 16 insertions(+), 760 deletions(-)
 delete mode 100644 include/bc/dcache.h
 delete mode 100644 include/bc/oom_kill.h
 delete mode 100644 kernel/bc/dcache.c
 delete mode 100644 kernel/bc/oom_kill.c

-- 
1.7.10.4

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ve/cgroups: fake num_cgroups in /proc/cgroups output

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 213b5800cbf1e1f36efaab61f2f49ea198bdb1e8
Author: Vasily Averin v...@odin.com
Date:   Mon May 18 16:32:55 2015 +0400

ve/cgroups: fake num_cgroups in /proc/cgroups output

Like in rh6-based kernels,
/proc/cgroups output inside container will show 1 in 'num_cgroups' column.

https://jira.sw.ru/browse/PSBM-33400

Signed-off-by: Vasily Averin v...@openvz.org

khorenko@:
    This is done in order to prevent people from trying to guess the
    number of Containers running on a Hardware Node,
    because even if the guess is correct, it gives no useful info,
    but people can easily come to wrong conclusions.
---
 kernel/cgroup.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f881f69..f897042 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4815,6 +4815,8 @@ out:
return retval;
 }
 
+#define _cg_virtualized(x) ((ve_is_super(get_exec_env())) ? (x) : 1)
+
 /* Display information about each subsystem and each hierarchy */
 static int proc_cgroupstats_show(struct seq_file *m, void *v)
 {
@@ -4829,11 +4831,14 @@ static int proc_cgroupstats_show(struct seq_file *m, 
void *v)
mutex_lock(cgroup_mutex);
for (i = 0; i  CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
+   int num;
+
if (ss == NULL)
continue;
+   num = _cg_virtualized(ss-root-number_of_cgroups);
seq_printf(m, %s\t%d\t%d\t%d\n,
   ss-name, ss-root-hierarchy_id,
-  ss-root-number_of_cgroups, !ss-disabled);
+  num, !ss-disabled);
}
mutex_unlock(cgroup_mutex);
return 0;
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH rh7] ve: device cgroup -- Implement devcgroup_seq_show_ve

2015-05-18 Thread Vladimir Davydov

On Mon, May 18, 2015 at 01:22:22PM +0300, Cyrill Gorcunov wrote:
 In PCS7 cgroups are configured from user space, so there is
 no longer a connection from ve to the device cgroup via css, as
 there was in PCS6. Instead we should open the device cgroup explicitly.
 
 https://jira.sw.ru/browse/PSBM-33555
 
 Signed-off-by: Cyrill Gorcunov gorcu...@odin.com
 CC: Vladimir Davydov vdavy...@odin.com
 CC: Konstantin Khorenko khore...@odin.com
 CC: Andrey Vagin ava...@odin.com

Reviewed-by: Vladimir Davydov vdavy...@parallels.com
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ve/net/printk: net_veboth_ratelimited introduced

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 16d8b1d984f26100bf006ed93fcd47642401dd26
Author: Vasily Averin v...@odin.com
Date:   Mon May 18 12:29:44 2015 +0400

ve/net/printk: net_veboth_ratelimited introduced

    net_veboth_ratelimited is required to save net-ratelimited messages
    into both the host and the container dmesg buffers.

Signed-off-by:  Vasily Averin v...@openvz.org
Acked-by: Kirill Tkhai ktk...@odin.com
---
 include/linux/net.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/net.h b/include/linux/net.h
index d7b2205..7e59abe 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -249,6 +249,8 @@ do {
\
net_ratelimited_function(pr_debug, fmt, ##__VA_ARGS__)
 #define net_velog_ratelimited(fmt, ...)\
net_ratelimited_function(ve_printk, VE_LOG, fmt, ##__VA_ARGS__)
+#define net_veboth_ratelimited(fmt, ...)   \
+   net_ratelimited_function(ve_printk, VE_LOG_BOTH, fmt, ##__VA_ARGS__)
 
 
 #define net_random()   prandom_u32()
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [RFC rh7] ve: cgroups -- Allow to attach non-self into ve cgroups

2015-05-18 Thread Konstantin Khorenko

On 05/14/2015 07:52 PM, Cyrill Gorcunov wrote:
 In vzctl/libvzctl bundle we restore container like
 
  - create ve/$ctid cgroup
  - move self into this cgroup
  - run criu from inside
 
 So that kernel code passes ve_can_attach test. In turn for
 our P.Haul project (which is managing live migration) the
 situation is different -- it opens ve/$ctid but moves
 criu service pid instead (so that the service will
 start restore procedure). Which leads to situation
 where ve_can_attach fails with -EINVAL.
 
 Reported-by: Nikita Spiridonov nspirido...@odin.com
 Signed-off-by: Cyrill Gorcunov gorcu...@odin.com
 CC: Vladimir Davydov vdavy...@odin.com
 CC: Konstantin Khorenko khore...@odin.com
 CC: Pavel Emelyanov xe...@odin.com
 CC: Andrey Vagin ava...@odin.com
 ---
 
 Guys, could you please take a look, especially from
 security POV, is it safe to remove all these checks?
 
  kernel/ve/ve.c |   31 +--
  1 file changed, 13 insertions(+), 18 deletions(-)
 
 Index: linux-pcs7.git/kernel/ve/ve.c
 ===
 --- linux-pcs7.git.orig/kernel/ve/ve.c
 +++ linux-pcs7.git/kernel/ve/ve.c
 @@ -750,13 +750,6 @@ static void ve_destroy(struct cgroup *cg
  static int ve_can_attach(struct cgroup *cg, struct cgroup_taskset *tset)
  {
   struct ve_struct *ve = cgroup_ve(cg);
 - struct task_struct *task = current;
 -
 - if (cgroup_taskset_size(tset) != 1 ||
 - cgroup_taskset_first(tset) != task ||
 - !thread_group_leader(task) ||
 - !thread_group_empty(task))
 - return -EINVAL;

Is it true that without these checks a single thread of a multithreaded process
can enter a CT?
If no - where is the check for this case?
If yes - let's prohibit this.

   if (ve-is_locked)
   return -EBUSY;
 @@ -775,20 +768,22 @@ static int ve_can_attach(struct cgroup *
  static void ve_attach(struct cgroup *cg, struct cgroup_taskset *tset)
  {
   struct ve_struct *ve = cgroup_ve(cg);
 - struct task_struct *tsk = current;
 -
 - /* this probihibts ptracing of task entered to VE from host system */
 - if (ve-is_running  tsk-mm)
 - tsk-mm-vps_dumpable = VD_VE_ENTER_TASK;
 + struct task_struct *tsk;
  
 - /* Drop OOM protection. */
 - tsk-signal-oom_score_adj = 0;
 - tsk-signal-oom_score_adj_min = 0;
 + cgroup_taskset_for_each(tsk, cg, tset) {
 + /* this probihibts ptracing of task entered to VE from host 
 system */
 + if (ve-is_running  tsk-mm)
 + tsk-mm-vps_dumpable = VD_VE_ENTER_TASK;
 +
 + /* Drop OOM protection. */
 + tsk-signal-oom_score_adj = 0;
 + tsk-signal-oom_score_adj_min = 0;
  
 - /* Leave parent exec domain */
 - tsk-parent_exec_id--;
 + /* Leave parent exec domain */
 + tsk-parent_exec_id--;
  
 - tsk-task_ve = ve;
 + tsk-task_ve = ve;
 + }
  }
  
  static int ve_state_read(struct cgroup *cg, struct cftype *cft,
 ___
 Devel mailing list
 Devel@openvz.org
 https://lists.openvz.org/mailman/listinfo/devel
 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ve/netfilter: ve_printk for nf_conntrack: table full

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 8782918c418820d5127afa4a5db74c9b3eac3b82
Author: Vasily Averin v...@odin.com
Date:   Mon May 18 12:29:57 2015 +0400

ve/netfilter: ve_printk for nf_conntrack: table full

port of diff-ve-printk-conntrack-tables-full from rh6-based kernels

    The "nf_conntrack: table full, dropping packet" message
    should be visible both in the CT and on the HN, and
    should contain the CTID for readability.

https://bugzilla.openvz.org/show_bug.cgi?id=2940

Signed-off-by: Vasily Averin v...@openvz.org
Acked-by: Kirill Tkhai ktk...@odin.com
---
 net/netfilter/nf_conntrack_core.c   | 4 +++-
 net/netfilter/nf_conntrack_expect.c | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_conntrack_core.c 
b/net/netfilter/nf_conntrack_core.c
index 495b859..017c755 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -696,7 +696,9 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
unlikely(atomic_read(net-ct.count)  ct_max)) {
if (!early_drop(net, hash_bucket(hash, net))) {
atomic_dec(net-ct.count);
-   net_warn_ratelimited(nf_conntrack: table full, 
dropping packet\n);
+   net_veboth_ratelimited(KERN_WARNING VE%u: 
+   nf_conntrack table full, 
dropping packet\n,
+   net-owner_ve-veid);
return ERR_PTR(-ENOMEM);
}
}
diff --git a/net/netfilter/nf_conntrack_expect.c 
b/net/netfilter/nf_conntrack_expect.c
index d80db92..bfa95fd 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -408,7 +408,9 @@ static inline int __nf_ct_expect_check(struct 
nf_conntrack_expect *expect)
}
 
if (net-ct.expect_count = init_net.ct.expect_max) {
-   net_warn_ratelimited(nf_conntrack: expectation table full\n);
+   net_veboth_ratelimited(KERN_WARNING VE%u 
+   nf_conntrack: expectation table 
full\n,
+   net-owner_ve-veid);
ret = -EMFILE;
}
 out:
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH rh7] ve: cgroups -- Allow to attach non-self into ve cgroups, v2

2015-05-18 Thread Vladimir Davydov

On Mon, May 18, 2015 at 07:42:50PM +0300, Cyrill Gorcunov wrote:
 On Mon, May 18, 2015 at 07:34:45PM +0300, Vladimir Davydov wrote:

 /*
   +  * We either moving the whole group of threads,
   +  * either a single thread process.
   +  */
   + if (cgroup_taskset_size(tset) == 1) {
  
  != ?
  
   + task = cgroup_taskset_first(tset);
   + if (!thread_group_leader(task)  !thread_group_empty(task))
   + return -EINVAL;
 
 No, ==. The thing is that the kernel takes care of multithreaded
 tasks and groups all threads into the array. In turn, when a task
 is attached via pid (i.e. ve/ctid/tasks), the kernel simply looks
 up the task, puts it into an array and passes it to us. So it's our
 duty to check whether only one task has been passed and, if so,
 to check that it's not a thread from some multithreaded
 application.
 

OK, I see, thanks.

But if we are attaching one thread which is thread_group_leader, we will
not fail even if the thread group is not empty and other threads are not
moved, will we?
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH rh7] ve: cgroups -- Allow to attach non-self into ve cgroups, v3

2015-05-18 Thread Cyrill Gorcunov

On Mon, May 18, 2015 at 08:41:33PM +0300, Vladimir Davydov wrote:
 
 OK, I see, thanks.
 
 But if we are attaching one thread which is thread_group_leader, we will
 not fail even if the thread group is not empty and other threads are not
 moved, will we?

Yeah ;) It should be OR here. Thank you!
From: Cyrill Gorcunov gorcu...@odin.com
Subject: ve: cgroups -- Allow to attach non-self into ve cgroups

In vzctl/libvzctl bundle we restore container like

 - create ve/$ctid cgroup
 - move self into this cgroup
 - run criu from inside

So that kernel code passes ve_can_attach test. In turn for
our P.Haul project (which is managing live migration) the
situation is different -- it opens ve/$ctid but moves
criu service pid instead (so that the service will
start restore procedure). Which leads to situation
where ve_can_attach fails with -EINVAL.

Basically we need to

1) Check that if a task is being attached to a
   VE cgroup, it is a single-threaded task.

2) In case of a multithreaded task, all threads should be
   moved in one pass (this is actually prepared by the
   cgroup_attach_task caller).

3) If the VE is stopping or starting, only kernel
   threads can attach.

Reported-by: Nikita Spiridonov nspirido...@odin.com
Signed-off-by: Cyrill Gorcunov gorcu...@odin.com
CC: Vladimir Davydov vdavy...@odin.com
CC: Konstantin Khorenko khore...@odin.com
CC: Pavel Emelyanov xe...@odin.com
CC: Andrey Vagin ava...@odin.com
---
 kernel/ve/ve.c |   53 +++--
 1 file changed, 31 insertions(+), 22 deletions(-)

Index: linux-pcs7.git/kernel/ve/ve.c
===
--- linux-pcs7.git.orig/kernel/ve/ve.c
+++ linux-pcs7.git/kernel/ve/ve.c
@@ -775,24 +775,31 @@ static void ve_destroy(struct cgroup *cg
 static int ve_can_attach(struct cgroup *cg, struct cgroup_taskset *tset)
 {
struct ve_struct *ve = cgroup_ve(cg);
-   struct task_struct *task = current;
-
-   if (cgroup_taskset_size(tset) != 1 ||
-   cgroup_taskset_first(tset) != task ||
-   !thread_group_leader(task) ||
-   !thread_group_empty(task))
-   return -EINVAL;
+   struct task_struct *task;
 
if (ve-is_locked)
return -EBUSY;
 
/*
+* We either moving the whole group of threads,
+* either a single thread process.
+*/
+   if (cgroup_taskset_size(tset) == 1) {
+   task = cgroup_taskset_first(tset);
+   if (!thread_group_leader(task) || !thread_group_empty(task))
+   return -EINVAL;
+   }
+
+   /*
 * Forbid userspace tasks to enter during starting or stopping.
-* Permit attaching kernel threads and init task for this containers.
+* Permit attaching kernel threads for this containers.
 */
-   if (!ve-is_running  (ve-ve_ns || nr_threads_ve(ve)) 
-   !(task-flags  PF_KTHREAD))
-   return -EPIPE;
+   if (!ve-is_running  (ve-ve_ns || nr_threads_ve(ve))) {
+   cgroup_taskset_for_each(task, cg, tset) {
+   if (!(task-flags  PF_KTHREAD))
+   return -EPIPE;
+   }
+   }
 
return 0;
 }
@@ -800,20 +807,22 @@ static int ve_can_attach(struct cgroup *
 static void ve_attach(struct cgroup *cg, struct cgroup_taskset *tset)
 {
struct ve_struct *ve = cgroup_ve(cg);
-   struct task_struct *tsk = current;
-
-   /* this probihibts ptracing of task entered to VE from host system */
-   if (ve-is_running  tsk-mm)
-   tsk-mm-vps_dumpable = VD_VE_ENTER_TASK;
+   struct task_struct *task;
 
-   /* Drop OOM protection. */
-   tsk-signal-oom_score_adj = 0;
-   tsk-signal-oom_score_adj_min = 0;
+   cgroup_taskset_for_each(task, cg, tset) {
+   /* this probihibts ptracing of task entered to VE from host 
system */
+   if (ve-is_running  task-mm)
+   task-mm-vps_dumpable = VD_VE_ENTER_TASK;
+
+   /* Drop OOM protection. */
+   task-signal-oom_score_adj = 0;
+   task-signal-oom_score_adj_min = 0;
 
-   /* Leave parent exec domain */
-   tsk-parent_exec_id--;
+   /* Leave parent exec domain */
+   task-parent_exec_id--;
 
-   tsk-task_ve = ve;
+   task-task_ve = ve;
+   }
 }
 
 static int ve_state_read(struct cgroup *cg, struct cftype *cft,
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH rh7] ve: cgroups -- Allow to attach non-self into ve cgroups, v2

2015-05-18 Thread Cyrill Gorcunov

On Mon, May 18, 2015 at 07:34:45PM +0300, Vladimir Davydov wrote:
   
  /*
  +* We either moving the whole group of threads,
  +* either a single thread process.
  +*/
  +   if (cgroup_taskset_size(tset) == 1) {
 
 != ?
 
  +   task = cgroup_taskset_first(tset);
  +   if (!thread_group_leader(task)  !thread_group_empty(task))
  +   return -EINVAL;

No, ==. The thing is that the kernel takes care of multithreaded
tasks and groups all threads into the array. In turn, when a task
is attached via pid (i.e. ve/ctid/tasks), the kernel simply looks
up the task, puts it into an array and passes it to us. So it's our
duty to check whether only one task has been passed and, if so,
to check that it's not a thread from some multithreaded
application.
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH rh7] ve: device cgroup -- Implement devcgroup_seq_show_ve

2015-05-18 Thread Cyrill Gorcunov

On Mon, May 18, 2015 at 07:43:40PM +0300, Cyrill Gorcunov wrote:
  
  For uuid-named cgroups ve->veid != cgroup name. You should use ve->name
  instead. Please fix.
 
 Oh, i forgot about this new approach with uuid containers. Sure will do,
 thank you!

Attached.
From: Cyrill Gorcunov gorcu...@odin.com
Subject: ve: device cgroup -- Implement devcgroup_seq_show_ve

In PCS7 cgroups are configured from user space, so there is
no longer a connection from ve to the device cgroup via css, as
there was in PCS6. Instead we should open the device cgroup explicitly.

https://jira.sw.ru/browse/PSBM-33555

v2 (by vdavydov@):
 - use ve::ve_name because we're switching to UUID based containers

Signed-off-by: Cyrill Gorcunov gorcu...@odin.com
CC: Vladimir Davydov vdavy...@odin.com
CC: Konstantin Khorenko khore...@odin.com
CC: Andrey Vagin ava...@odin.com
---
 include/linux/device_cgroup.h |3 ++-
 kernel/ve/vecalls.c   |2 +-
 security/device_cgroup.c  |   14 +++---
 3 files changed, 14 insertions(+), 5 deletions(-)

Index: linux-pcs7.git/include/linux/device_cgroup.h
===
--- linux-pcs7.git.orig/include/linux/device_cgroup.h
+++ linux-pcs7.git/include/linux/device_cgroup.h
@@ -19,7 +19,8 @@ extern int devcgroup_device_visible(umod
 struct cgroup;
 int devcgroup_default_perms_ve(struct cgroup *cgroup);
 int devcgroup_set_perms_ve(struct cgroup *cgroup, unsigned, dev_t, unsigned);
-int devcgroup_seq_show_ve(struct cgroup *cgroup, unsigned veid, struct 
seq_file *m);
+struct ve_struct;
+int devcgroup_seq_show_ve(struct cgroup *devices_root, struct ve_struct *ve, 
struct seq_file *m);
 
 #else
 static inline int devcgroup_inode_permission(struct inode *inode, int mask)
Index: linux-pcs7.git/kernel/ve/vecalls.c
===
--- linux-pcs7.git.orig/kernel/ve/vecalls.c
+++ linux-pcs7.git/kernel/ve/vecalls.c
@@ -891,7 +891,7 @@ static int devperms_seq_show(struct seq_
if (ve_is_super(ve))
seq_printf(m, %10u b 016 *:*\n%10u c 006 *:*\n, 0, 0);
else
-   devcgroup_seq_show_ve(ve-css.cgroup, ve-veid, m);
+   devcgroup_seq_show_ve(devices_root, ve, m);
 
return 0;
 }
Index: linux-pcs7.git/security/device_cgroup.c
===
--- linux-pcs7.git.orig/security/device_cgroup.c
+++ linux-pcs7.git/security/device_cgroup.c
@@ -17,6 +17,7 @@
 #include linux/major.h
 #include linux/module.h
 #include linux/capability.h
+#include linux/ve.h
 
 #define ACC_MKNOD 1
 #define ACC_READ  2
@@ -1091,10 +1092,16 @@ int devcgroup_set_perms_ve(struct cgroup
 }
 EXPORT_SYMBOL(devcgroup_set_perms_ve);
 
-int devcgroup_seq_show_ve(struct cgroup *cgroup, unsigned veid, struct 
seq_file *m)
+int devcgroup_seq_show_ve(struct cgroup *devices_root, struct ve_struct *ve, 
struct seq_file *m)
 {
-   struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup);
struct dev_exception_item *wh;
+   struct dev_cgroup *devcgroup;
+   struct cgroup *cgroup;
+
+   cgroup = cgroup_kernel_open(devices_root, 0, ve_name(ve));
+   if (IS_ERR(cgroup))
+   return PTR_ERR(cgroup);
+   devcgroup = cgroup_to_devcgroup(cgroup);
 
rcu_read_lock();
list_for_each_entry_rcu(wh, devcgroup-exceptions, list) {
@@ -1112,12 +1119,13 @@ int devcgroup_seq_show_ve(struct cgroup
perm |= S_IXOTH;
 
seq_printf(m, %10u %c %03o %s:%s\n,
-   veid,
+   ve-veid,
type_to_char(wh-type),
perm, maj, min);
}
rcu_read_unlock();
 
+   cgroup_kernel_close(cgroup);
return 0;
 }
 EXPORT_SYMBOL(devcgroup_seq_show_ve);
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH rh7] ve: device cgroup -- Implement devcgroup_seq_show_ve

2015-05-18 Thread Cyrill Gorcunov

On Mon, May 18, 2015 at 07:33:41PM +0300, Vladimir Davydov wrote:
 On Mon, May 18, 2015 at 01:22:22PM +0300, Cyrill Gorcunov wrote:
  --- linux-pcs7.git.orig/security/device_cgroup.c
  +++ linux-pcs7.git/security/device_cgroup.c
  @@ -1091,10 +1091,16 @@ int devcgroup_set_perms_ve(struct cgroup
   }
   EXPORT_SYMBOL(devcgroup_set_perms_ve);
   
  -int devcgroup_seq_show_ve(struct cgroup *cgroup, unsigned veid, struct 
  seq_file *m)
  +int devcgroup_seq_show_ve(struct cgroup *devices_root, envid_t veid, 
  struct seq_file *m)
   {
  -   struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup);
  struct dev_exception_item *wh;
  +   struct dev_cgroup *devcgroup;
  +   struct cgroup *cgroup;
  +
  +   cgroup = ve_cgroup_open(devices_root, 0, veid);
 
 For uuid-named cgroups ve-veid != cgroup name. You should use ve-name
 instead. Please fix.

Oh, i forgot about this new approach with uuid containers. Sure will do,
thank you!
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH rh7] ve: device cgroup -- Implement devcgroup_seq_show_ve

2015-05-18 Thread Vladimir Davydov

On Mon, May 18, 2015 at 01:22:22PM +0300, Cyrill Gorcunov wrote:
 --- linux-pcs7.git.orig/security/device_cgroup.c
 +++ linux-pcs7.git/security/device_cgroup.c
 @@ -1091,10 +1091,16 @@ int devcgroup_set_perms_ve(struct cgroup
  }
  EXPORT_SYMBOL(devcgroup_set_perms_ve);
  
 -int devcgroup_seq_show_ve(struct cgroup *cgroup, unsigned veid, struct 
 seq_file *m)
 +int devcgroup_seq_show_ve(struct cgroup *devices_root, envid_t veid, struct 
 seq_file *m)
  {
 - struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup);
   struct dev_exception_item *wh;
 + struct dev_cgroup *devcgroup;
 + struct cgroup *cgroup;
 +
 + cgroup = ve_cgroup_open(devices_root, 0, veid);

For uuid-named cgroups ve-veid != cgroup name. You should use ve-name
instead. Please fix.

 + if (IS_ERR(cgroup))
 + return PTR_ERR(cgroup);
 + devcgroup = cgroup_to_devcgroup(cgroup);
  
   rcu_read_lock();
   list_for_each_entry_rcu(wh, devcgroup-exceptions, list) {
 @@ -1118,6 +1124,7 @@ int devcgroup_seq_show_ve(struct cgroup
   }
   rcu_read_unlock();
  
 + cgroup_kernel_close(cgroup);
   return 0;
  }
  EXPORT_SYMBOL(devcgroup_seq_show_ve);
 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH rh7] ve: cgroups -- Allow to attach non-self into ve cgroups, v2

2015-05-18 Thread Vladimir Davydov

On Mon, May 18, 2015 at 03:52:35PM +0300, Cyrill Gorcunov wrote:
 --- linux-pcs7.git.orig/kernel/ve/ve.c
 +++ linux-pcs7.git/kernel/ve/ve.c
 @@ -750,24 +750,31 @@ static void ve_destroy(struct cgroup *cg
  static int ve_can_attach(struct cgroup *cg, struct cgroup_taskset *tset)
  {
   struct ve_struct *ve = cgroup_ve(cg);
 - struct task_struct *task = current;
 -
 - if (cgroup_taskset_size(tset) != 1 ||
 - cgroup_taskset_first(tset) != task ||
 - !thread_group_leader(task) ||
 - !thread_group_empty(task))
 - return -EINVAL;
 + struct task_struct *task;
  
   if (ve-is_locked)
   return -EBUSY;
  
   /*
 +  * We either moving the whole group of threads,
 +  * either a single thread process.
 +  */
 + if (cgroup_taskset_size(tset) == 1) {

!= ?

 + task = cgroup_taskset_first(tset);
 + if (!thread_group_leader(task)  !thread_group_empty(task))
 + return -EINVAL;
 + }
 +
 + /*
* Forbid userspace tasks to enter during starting or stopping.
 -  * Permit attaching kernel threads and init task for this containers.
 +  * Permit attaching kernel threads for this containers.
*/
 - if (!ve-is_running  (ve-ve_ns || nr_threads_ve(ve)) 
 - !(task-flags  PF_KTHREAD))
 - return -EPIPE;
 + if (!ve-is_running  (ve-ve_ns || nr_threads_ve(ve))) {
 + cgroup_taskset_for_each(task, cg, tset) {
 + if (!(task-flags  PF_KTHREAD))
 + return -EPIPE;
 + }
 + }
  
   return 0;
  }
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH rh7] ve: device cgroup -- Implement devcgroup_seq_show_ve

2015-05-18 Thread Vladimir Davydov

On Mon, May 18, 2015 at 08:04:27PM +0300, Cyrill Gorcunov wrote:
 From: Cyrill Gorcunov gorcu...@odin.com
 Subject: ve: device cgroup -- Implement devcgroup_seq_show_ve
 
 In PCS7 cgroups are configured from user space, so there is
 no longer connection from ve to device cgroup via css as
 it was in PCS6. Instead we should open device cgroup explicitly.
 
 https://jira.sw.ru/browse/PSBM-33555
 
 v2 (by vdavydov@):
  - use ve::ve_name because we're switching to UUID based containers
 
 Signed-off-by: Cyrill Gorcunov gorcu...@odin.com
 CC: Vladimir Davydov vdavy...@odin.com
 CC: Konstantin Khorenko khore...@odin.com
 CC: Andrey Vagin ava...@odin.com

Reviewed-by: Vladimir Davydov vdavy...@parallels.com
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH] properly charge and uncharge shmem

2015-05-18 Thread panda

From: Andrew Perepechko pa...@cloudlinux.com

Currently, shmem_lock immediately and
unconditionally uncharges what it has
just charged for a lock request.

This, indeed, causes a double uncharge
with something like the following:

  shmid = shmget(12345, 8192, IPC_CREAT | 0666);
  rc = shmctl(shmid, SHM_LOCK, NULL);
  shmctl(shmid, IPC_RMID, 0);

with the following in the kernel log:

[  455.815025] Uncharging too much 2 h 0, res lockedpages ub 0

Signed-off-by: Andrew Perepechko pa...@cloudlinux.com
---
 mm/shmem.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index a6b3e30..d09a230 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1359,11 +1359,13 @@ int shmem_lock(struct file *file, int lock, struct 
user_struct *user)
mapping_set_unevictable(file-f_mapping);
}
if (!lock  (info-flags  VM_LOCKED)  user) {
+   ub_lockedshm_uncharge(info, inode-i_size);
user_shm_unlock(inode-i_size, user);
info-flags = ~VM_LOCKED;
mapping_clear_unevictable(file-f_mapping);
}
-   retval = 0;
+   spin_unlock(info-lock);
+   return 0;
 
 out_nomem:
ub_lockedshm_uncharge(info, inode-i_size);
-- 
1.9.1

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ve/cgroups: Allow to attach non-self into ve cgroups, v3

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 729323172bc760a2daf4d790a5bffc74ec10c04d
Author: Cyrill Gorcunov gorcu...@odin.com
Date:   Tue May 19 00:43:44 2015 +0400

ve/cgroups: Allow to attach non-self into ve cgroups, v3

In vzctl/libvzctl bundle we restore container like

 - create ve/$ctid cgroup
 - move self into this cgroup
 - run criu from inside

So that kernel code passes ve_can_attach test. In turn for
our P.Haul project (which is managing live migration) the
situation is different -- it opens ve/$ctid but moves
criu service pid instead (so that the service will
start restore procedure). Which leads to situation
where ve_can_attach fails with -EINVAL.

Basically we need to

1) Check that in case if task is getting attached to
   VE cgroup it should be a single threaded task.

2) In case of multithread task all threads should be
   moved in one pass (this actually prepared by
   cgroup_attach_task caller).

3) In case if VE is stopping or starting only kernel
   threads can attach.

khorenko@:
Check for thread_group_empty(task) is enough to be sure
the task is single-threaded.

https://jira.sw.ru/browse/PSBM-33561

Reported-by: Nikita Spiridonov nspirido...@odin.com
Signed-off-by: Cyrill Gorcunov gorcu...@odin.com

CC: Vladimir Davydov vdavy...@odin.com
CC: Konstantin Khorenko khore...@odin.com
CC: Pavel Emelyanov xe...@odin.com
CC: Andrey Vagin ava...@odin.com
---
 kernel/ve/ve.c | 51 ++-
 1 file changed, 30 insertions(+), 21 deletions(-)

diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index e598d15..cf7c848 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -775,24 +775,31 @@ static void ve_destroy(struct cgroup *cg)
 static int ve_can_attach(struct cgroup *cg, struct cgroup_taskset *tset)
 {
struct ve_struct *ve = cgroup_ve(cg);
-   struct task_struct *task = current;
-
-   if (cgroup_taskset_size(tset) != 1 ||
-   cgroup_taskset_first(tset) != task ||
-   !thread_group_leader(task) ||
-   !thread_group_empty(task))
-   return -EINVAL;
+   struct task_struct *task;
 
if (ve-is_locked)
return -EBUSY;
 
/*
+* We either moving the whole group of threads,
+* either a single thread process.
+*/
+   if (cgroup_taskset_size(tset) == 1) {
+   task = cgroup_taskset_first(tset);
+   if (!thread_group_empty(task))
+   return -EINVAL;
+   }
+
+   /*
 * Forbid userspace tasks to enter during starting or stopping.
-* Permit attaching kernel threads and init task for this containers.
+* Permit attaching kernel threads for this containers.
 */
-   if (!ve-is_running  (ve-ve_ns || nr_threads_ve(ve)) 
-   !(task-flags  PF_KTHREAD))
-   return -EPIPE;
+   if (!ve-is_running  (ve-ve_ns || nr_threads_ve(ve))) {
+   cgroup_taskset_for_each(task, cg, tset) {
+   if (!(task-flags  PF_KTHREAD))
+   return -EPIPE;
+   }
+   }
 
return 0;
 }
@@ -800,20 +807,22 @@ static int ve_can_attach(struct cgroup *cg, struct 
cgroup_taskset *tset)
 static void ve_attach(struct cgroup *cg, struct cgroup_taskset *tset)
 {
struct ve_struct *ve = cgroup_ve(cg);
-   struct task_struct *tsk = current;
+   struct task_struct *task;
 
-   /* this probihibts ptracing of task entered to VE from host system */
-   if (ve-is_running  tsk-mm)
-   tsk-mm-vps_dumpable = VD_VE_ENTER_TASK;
+   cgroup_taskset_for_each(task, cg, tset) {
+   /* this probihibts ptracing of task entered to VE from host 
system */
+   if (ve-is_running  task-mm)
+   task-mm-vps_dumpable = VD_VE_ENTER_TASK;
 
-   /* Drop OOM protection. */
-   tsk-signal-oom_score_adj = 0;
-   tsk-signal-oom_score_adj_min = 0;
+   /* Drop OOM protection. */
+   task-signal-oom_score_adj = 0;
+   task-signal-oom_score_adj_min = 0;
 
-   /* Leave parent exec domain */
-   tsk-parent_exec_id--;
+   /* Leave parent exec domain */
+   task-parent_exec_id--;
 
-   tsk-task_ve = ve;
+   task-task_ve = ve;
+   }
 }
 
 static int ve_state_read(struct cgroup *cg, struct cftype *cft,
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: check new size of block device on ioctl(GROW)

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 0385f754e9f680c7d5095ae981fe29c1b6e7323a
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:26:55 2015 +0400

ploop: check new size of block device on ioctl(GROW)

Return error if userspace attempts to grow block device above limits
imposed by ploop1 formats.

https://jira.sw.ru/browse/PSBM-21027

Signed-off-by: Maxim Patlasov mpatla...@parallels.com
---
 drivers/block/ploop/fmt_ploop1.c   |  4 
 drivers/block/ploop/ploop1_image.h | 13 +
 2 files changed, 17 insertions(+)

diff --git a/drivers/block/ploop/fmt_ploop1.c b/drivers/block/ploop/fmt_ploop1.c
index 624bdc1..fb12c30 100644
--- a/drivers/block/ploop/fmt_ploop1.c
+++ b/drivers/block/ploop/fmt_ploop1.c
@@ -458,6 +458,10 @@ ploop1_prepare_grow(struct ploop_delta * delta, u64 
*new_size, int *reloc)
if (*new_size  ((1  delta-cluster_log) - 1))
return -EINVAL;
 
+   if (*new_size  ploop1_max_size(1  delta-plo-cluster_log,
+   delta-plo-fmt_version))
+   return -EFBIG;
+
vh = (struct ploop_pvd_header *)page_address(ph-dyn_page);
n_present  = le32_to_cpu(vh-m_FirstBlockOffset)  log;
BUG_ON (!n_present);
diff --git a/drivers/block/ploop/ploop1_image.h 
b/drivers/block/ploop/ploop1_image.h
index 337c05b..c4efe87 100644
--- a/drivers/block/ploop/ploop1_image.h
+++ b/drivers/block/ploop/ploop1_image.h
@@ -247,6 +247,19 @@ ploop1_version(struct ploop_pvd_header *vh)
return -1;
 }
 
+static inline __u64
+ploop1_max_size(__u32 blocksize, int version)
+{
+   switch (version) {
+   case PLOOP_FMT_V1:
+   return (__u32)-1;
+   case PLOOP_FMT_V2:
+   return 0xUL * blocksize;
+   }
+
+   return 0;
+}
+
 #ifdef __KERNEL__
 static inline u64
 get_SizeInSectors_from_le(struct ploop_pvd_header *vh, int version)
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: fix a race condition on relocation of blocks

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit a762247cf8ff0b2ec0ba6e8a9742f7a5e38a8b15
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:02 2015 +0400

ploop: fix a race condition on relocation of blocks

map_release() is not atomic, because it calls atomic_read
and atomic_dec_and_test. Looks like it was designed to be
called under plo->lock.

https://jira.sw.ru/browse/PSBM-23905

Signed-off-by: Andrey Vagin ava...@openvz.org

Acked-by: Maxim Patlasov mpatla...@parallels.com
---
 drivers/block/ploop/dev.c | 6 ++
 drivers/block/ploop/map.c | 6 ++
 2 files changed, 12 insertions(+)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 353fb35..e3422d8 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -1471,12 +1471,14 @@ static int prepare_merge_req(struct ploop_request * 
preq)
return res;
 
 drop_map:
+   spin_lock_irq(plo-lock);
map_release(preq-trans_map);
preq-trans_map = NULL;
if (preq-map) {
map_release(preq-map);
preq-map = NULL;
}
+   spin_unlock_irq(plo-lock);
return 1;
 }
 
@@ -1688,8 +1690,10 @@ ploop_entry_reloc_a_req(struct ploop_request *preq, 
iblock_t *iblk)
if (*clu = MAP_MAX_IND(preq))
break;
 
+   spin_lock_irq(plo-lock);
map_release(preq-map);
preq-map = NULL;
+   spin_unlock_irq(plo-lock);
}
 
if (*clu = plo-map.max_index) {
@@ -1814,8 +1818,10 @@ static int discard_get_index(struct ploop_request *preq)
preq-iblock = 0;
 
if (preq-map) {
+   spin_lock_irq(plo-lock);
map_release(preq-map);
preq-map = NULL;
+   spin_unlock_irq(plo-lock);
}
 
return 0;
diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c
index 5f50f81..2e971cd 100644
--- a/drivers/block/ploop/map.c
+++ b/drivers/block/ploop/map.c
@@ -145,6 +145,10 @@ static void flush_lru_buffer(struct ploop_map * map)
map-lru_buffer_ptr = 0;
 }
 
+/*
+ * map_release() must be called under plo-lock, because
+ * The pair atomic_read  atomic_dec_and_test is not atomic.
+ */
 void map_release(struct map_node * m)
 {
struct ploop_map * map = m-parent;
@@ -1026,9 +1030,11 @@ static void map_wb_complete_post_process(struct 
ploop_map *map,
}
 
if (test_bit(PLOOP_REQ_RELOC_S, preq-state)) {
+   spin_lock_irq(plo-lock);
del_lockout(preq);
map_release(preq-map);
preq-map = NULL;
+   spin_unlock_irq(plo-lock);
 
requeue_req(preq, PLOOP_E_RELOC_COMPLETE);
return;
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: prioritize BAT operations

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit d742aa564de94c3816a9d3a7991adb00d23678d4
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:04 2015 +0400

ploop: prioritize BAT operations

Ploop uses ->read_page and ->write_page methods of pio_direct to read/write
index table. These operations are rare and usually someone is blocked on 
them.
Let's give them a priority by setting SYNCIO flag.

Signed-off-by: Maxim Patlasov mpatla...@parallels.com
---
 drivers/block/ploop/io_direct.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index c18d2f0..e5eb66a 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -1432,7 +1432,7 @@ static void
 dio_read_page(struct ploop_io * io, struct ploop_request * preq,
  struct page * page, sector_t sec)
 {
-   dio_io_page(io, READ, preq, page, sec);
+   dio_io_page(io, READ | REQ_SYNC, preq, page, sec);
 }
 
 static void
@@ -1444,7 +1444,8 @@ dio_write_page(struct ploop_io * io, struct ploop_request 
* preq,
return;
}
 
-   dio_io_page(io, WRITE | (fua ? REQ_FUA : 0), preq, page, sec);
+   dio_io_page(io, WRITE | (fua ? REQ_FUA : 0) | REQ_SYNC,
+   preq, page, sec);
 }
 
 static int
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: make manual abort transition verbose

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 9a5fe498a7a1d9c1ecf4001c0766f325f1139079
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:09 2015 +0400

ploop: make manual abort transition verbose

Signed-off-by: Dmitry Monakhov dmonak...@openvz.org
---
 drivers/block/ploop/sysfs.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/block/ploop/sysfs.c b/drivers/block/ploop/sysfs.c
index 3ef53ac..07a4829 100644
--- a/drivers/block/ploop/sysfs.c
+++ b/drivers/block/ploop/sysfs.c
@@ -326,6 +326,9 @@ static u32 show_aborted(struct ploop_device * plo)
 
 static int store_aborted(struct ploop_device * plo, u32 val)
 {
+   printk(KERN_INFO ploop: Force %s aborted state for ploop%d\n,
+  val ? set : clear, plo-index);
+
if (val)
set_bit(PLOOP_S_ABORT, plo-state);
else
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: warning on disk full condition

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit b6eb7575242d5e266d231ed53a4f7e03e47b2a68
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:10 2015 +0400

ploop: warning on disk full condition

People complain that it's not always obvious why an app in CT gets
-ENOSPC while there remains some space on host filesystem.

The patch adds time ratelimited printk about disk full condition.
Maximal rate is 1 per hour.

https://bugzilla.openvz.org/show_bug.cgi?id=3045

Signed-off-by: Maxim Patlasov mpatla...@parallels.com
---
 drivers/block/ploop/dev.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 9aaab4a..ab99724 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -3533,8 +3533,18 @@ static int ploop_bd_full(struct backing_dev_info *bdi, 
long long nr, int root)
 
current-journal_info = NULL;
ret = sb-s_op-statfs(F_DENTRY(file), buf);
-   if (ret || buf.f_bfree * buf.f_bsize  reserved + nr)
+   if (ret || buf.f_bfree * buf.f_bsize  reserved + nr) {
+   static unsigned long full_warn_time;
+
+   if (printk_timed_ratelimit(full_warn_time, 60*60*HZ))
+   printk(KERN_WARNING
+  ploop%d: host disk is almost full 
+  (%llu  %llu); CT sees -ENOSPC !\n,
+  plo-index, buf.f_bfree * buf.f_bsize,
+  reserved + nr);
+
rc = 1;
+   }
 
fput(file);
current-journal_info = jctx;
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: fix busyloop on secondary discard bio

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit a5678140dd8f793272b5e562e81e27e2a249e4fd
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:11 2015 +0400

ploop: fix busyloop on secondary discard bio

After diff-ploop-add-a-separate-queue-for-discard-bio-s, ploop_thread()
skips processing previously queued discard bio-s if any discard bio is
already under processing (fbd->fbd_dbl is not empty).

ploop_wait() must take care of such a case, otherwise a busyloop may
happen: ploop_thread() believes that it has to go to sleep because all
incoming queues are empty except plo->bio_discard_list, which cannot be
processed right now, and calls ploop_wait(); the latter returns immediately
because plo->bio_discard_list is not empty and hence needs processing.

The patch also fixes a trivial bug in discard bio accounting:

ploop_bio_queue() is called for all bio-s including discard bio-s and
it decrements bio_qlen unconditionally. This is incorrect: it has to
decrement either bio_qlen or bio_discard_qlen depending on the type of
bio.

https://jira.sw.ru/browse/PSBM-30451
https://bugzilla.openvz.org/show_bug.cgi?id=3124

Signed-off-by: Maxim Patlasov mpatla...@parallels.com

Acked-by: Andrew Vagin ava...@parallels.com
---
 drivers/block/ploop/dev.c  |  9 +++--
 drivers/block/ploop/freeblks.c | 12 
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index e2ff0aa..ac0f28f 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -551,7 +551,11 @@ ploop_bio_queue(struct ploop_device * plo, struct bio * 
bio,
 
__TRACE(A %p %u\n, preq, preq-req_cluster);
 
-   plo-bio_qlen--;
+   if (unlikely(bio-bi_rw  REQ_DISCARD))
+   plo-bio_discard_qlen--;
+   else
+   plo-bio_qlen--;
+
ploop_entry_add(plo, preq);
 
if (bio-bi_size  !(bio-bi_rw  REQ_DISCARD))
@@ -2563,7 +2567,8 @@ static void ploop_wait(struct ploop_device * plo, int 
once, struct blk_plug *plu
 !plo-active_reqs))
break;
} else if (plo-bio_head ||
-   !bio_list_empty(plo-bio_discard_list)) {
+   (!bio_list_empty(plo-bio_discard_list) 
+!ploop_discard_is_inprogress(plo-fbd))) {
/* ready_queue and entry_queue are empty, but
 * bio list not. Obviously, we'd like to process
 * bio_list instead of sleeping */
diff --git a/drivers/block/ploop/freeblks.c b/drivers/block/ploop/freeblks.c
index cf48d3a..89108c7 100644
--- a/drivers/block/ploop/freeblks.c
+++ b/drivers/block/ploop/freeblks.c
@@ -696,20 +696,24 @@ int ploop_fb_get_free_block(struct ploop_freeblks_desc 
*fbd,
 
 static void fbd_complete_bio(struct ploop_freeblks_desc *fbd, int err)
 {
+   struct ploop_device *plo = fbd-plo;
unsigned int nr_completed = 0;
 
while (fbd-fbd_dbl.head) {
struct bio * bio = fbd-fbd_dbl.head;
fbd-fbd_dbl.head = bio-bi_next;
bio-bi_next = NULL;
-   BIO_ENDIO(fbd-plo-queue, bio, err);
+   BIO_ENDIO(plo-queue, bio, err);
nr_completed++;
}
fbd-fbd_dbl.tail = NULL;
 
-   spin_lock_irq(fbd-plo-lock);
-   fbd-plo-bio_total -= nr_completed;
-   spin_unlock_irq(fbd-plo-lock);
+   spin_lock_irq(plo-lock);
+   plo-bio_total -= nr_completed;
+   if (!bio_list_empty(plo-bio_discard_list) 
+   waitqueue_active(plo-waitq))
+   wake_up_interruptible(plo-waitq);
+   spin_unlock_irq(plo-lock);
 }
 
 void ploop_fb_reinit(struct ploop_freeblks_desc *fbd, int err)
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: mark reloc reqs to force FUA before write of relocated data

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 622b02378d190968a9ad04f5e8161a1574a1d2df
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:15 2015 +0400

ploop: mark reloc reqs to force FUA before write of relocated data

Series description:

During relocation of ploop clusters (resize/balloon) we need to FUA/fsync
the image file after the following operations:
 a) new data block written
 b) BAT update
 c) nullify old data block for BAT grow. We already nullify the old data
block in the format module - complete_grow callback.

This patch forces fsync(kaio), FUA(direct) of reloc write I/O to image
by marking such reloc reqs(A|S) with appropriate flags. Kaio/direct modules
are tuned by the patch to force fsync/FUA if these flags are set. This code
does FUA/fsync only for cases a) and b), while c) is already implemented.

Also the patch fixes inconsistent bio list FUA processing in direct module.
The problem is that for a bunch of bios we only set FUA on the last bio.
It's possible in case of a power outage that the last bio will be stored and
the previous ones will not, because they are stored only in cache at the time
of power failure. To solve the problem this patch marks the last bio as
FLUSH|FUA if there is more than one bio in the list.

Moreover, for KAIO, if fsync is possible at the BAT update stage we do that,
like we did in the direct case, instead of 2 fsync's. For the direct case, if
we are going to make FUA at BAT update only (an optimization trick that
already exists), then we need to mark req to FLUSH previously written
(without FUA) data.

Performance:
Overall (includes EXT4 resize up to 16T) resize performance degraded by 5%
in terms of time.

https://jira.sw.ru/browse/PSBM-31222
https://jira.sw.ru/browse/PSBM-31225
https://jira.sw.ru/browse/PSBM-31321

Signed-off-by: Andrey Smetanin asmeta...@parallels.com

Andrey Smetanin (7):
  ploop: define struct ploop_request->state flags to force pre FLUSH
before write IO and FUA/fsync at I/O complete
  ploop: mark reloc reqs to force FUA/fsync(kaio) for index update I/O
  ploop: mark reloc reqs to force FUA before write of relocated data
  ploop: direct: to support truly FLUSH/FUA of req we need mark first
bio FLUSH, write all bios and mark last bio as FLUSH/FUA
  ploop: added ploop_req_delay_fua_possible() func that detects possible
delaying of upcoming FUA to index update stage. This function will
be lately used in direct/kaio code to detect and delay FUA
  ploop: make image fsync at I/O complete if it's required by FUA/fsync
force flag or by req->req_rw
  ploop: do preflush or postfua according force FUA/flush flags, and
delay FUA if possible but add force FLUSH to req if so

This patch description:
Need to force FUA/fsync of relocated data write for consistent resize.

Signed-off-by: Andrey Smetanin asmeta...@parallels.com

Reviewed-by: Andrew Vagin ava...@parallels.com
---
 drivers/block/ploop/dev.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index ac0f28f..bd5fe37 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -2434,6 +2434,9 @@ restart:
top_delta = ploop_top_delta(plo);
sbl.head = sbl.tail = preq-aux_bio;
 
+   /* Relocated data write required sync before BAT updatee */
+   set_bit(PLOOP_REQ_FORCE_FUA, preq-state);
+
if (test_bit(PLOOP_REQ_RELOC_S, preq-state)) {
preq-eng_state = PLOOP_E_DATA_WBI;
plo-st.bio_out++;
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ms/memcg/proc: add kpagecgroup file

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 3e48c113a59f801934292fc89e6915b1d8a341a7
Author: Vladimir Davydov vdavy...@parallels.com
Date:   Tue May 19 08:23:48 2015 +0400

ms/memcg/proc: add kpagecgroup file

Patchset description: idle memory tracking

This patch set backports

  https://lkml.org/lkml/2015/5/12/449

which is required by vcmmd.

It is not yet clear if the original patch set will be accepted upstream
as is, there still may be changes. However, I hope the user API will be
preserved. If it is not, we will have to fix this in our kernel too.

https://jira.sw.ru/browse/PSBM-32460

Vladimir Davydov (3):
  memcg: add page_cgroup_ino helper
  proc: add kpagecgroup file
  proc: add kpageidle file

===
This patch description:

/proc/kpagecgroup contains a 64-bit inode number of the memory cgroup
each page is charged to, indexed by PFN. Having this information is
useful for estimating a cgroup working set size.

The file is present if CONFIG_PROC_PAGE_MONITOR && CONFIG_MEMCG.

Signed-off-by: Vladimir Davydov vdavy...@parallels.com
---
 Documentation/vm/pagemap.txt |  6 -
 fs/proc/Kconfig  |  5 +++--
 fs/proc/page.c   | 53 
 3 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
index fd7c3cf..e37cff9 100644
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -5,7 +5,7 @@ pagemap is a new (as of 2.6.25) set of interfaces in the kernel 
that allow
 userspace programs to examine the page tables and related information by
 reading files in /proc.
 
-There are three components to pagemap:
+There are four components to pagemap:
 
  * /proc/pid/pagemap.  This file lets a userspace process find out which
physical frame each virtual page is mapped to.  It contains one 64-bit
@@ -63,6 +63,10 @@ There are three components to pagemap:
 21. KSM
 22. THP
 
+ * /proc/kpagecgroup.  This file contains a 64-bit inode number of the
+   memory cgroup each page is charged to, indexed by PFN. Only available when
+   CONFIG_MEMCG is set.
+
 Short descriptions to the page flags:
 
  0. LOCKED
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 15af622..e8ed22d 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -65,5 +65,6 @@ config PROC_PAGE_MONITOR
help
  Various /proc files exist to monitor process memory utilization:
  /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
- /proc/kpagecount, and /proc/kpageflags. Disabling these
-  interfaces will reduce the size of the kernel by approximately 4kb.
+ /proc/kpagecount, /proc/kpageflags, and /proc/kpagecgroup.
+ Disabling these interfaces will reduce the size of the kernel
+ by approximately 4kb.
diff --git a/fs/proc/page.c b/fs/proc/page.c
index cab84b6..c9cbed3 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -8,6 +8,7 @@
 #include linux/proc_fs.h
 #include linux/seq_file.h
 #include linux/hugetlb.h
+#include linux/memcontrol.h
 #include linux/kernel-page-flags.h
 #include asm/uaccess.h
 #include internal.h
@@ -213,10 +214,62 @@ static const struct file_operations 
proc_kpageflags_operations = {
.read = kpageflags_read,
 };
 
+#ifdef CONFIG_MEMCG
+static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
+   size_t count, loff_t *ppos)
+{
+   u64 __user *out = (u64 __user *)buf;
+   struct page *ppage;
+   unsigned long src = *ppos;
+   unsigned long pfn;
+   ssize_t ret = 0;
+   u64 ino;
+
+   pfn = src / KPMSIZE;
+   count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
+   if (src  KPMMASK || count  KPMMASK)
+   return -EINVAL;
+
+   while (count  0) {
+   if (pfn_valid(pfn))
+   ppage = pfn_to_page(pfn);
+   else
+   ppage = NULL;
+
+   if (ppage)
+   ino = page_cgroup_ino(ppage);
+   else
+   ino = 0;
+
+   if (put_user(ino, out)) {
+   ret = -EFAULT;
+   break;
+   }
+
+   pfn++;
+   out++;
+   count -= KPMSIZE;
+   }
+
+   *ppos += (char __user *)out - buf;
+   if (!ret)
+   ret = (char __user *)out - buf;
+   return ret;
+}
+
+static const struct file_operations proc_kpagecgroup_operations = {
+   .llseek = mem_lseek,
+   .read = kpagecgroup_read,
+};
+#endif /* CONFIG_MEMCG */
+
 static int __init proc_page_init(void)
 {
proc_create(kpagecount, S_IRUSR,

[Devel] [PATCH RHEL7 COMMIT] ploop: prevent dangerous ploop-umount

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 8854414d2d97abd7ab86d4c9d1c74d9b2fc04c3c
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:26:56 2015 +0400

ploop: prevent dangerous ploop-umount

    Umounting ploop device if inner fs is still mounted on it leads to
    numerous complaints in kernel logs like:

    VFS: Busy inodes after unmount. sb = 880108987000, fs type = ext4, sb count = 2, sb->s_root = /

and is not what user expected. The patch adds some protection from dummy
userspace mistakes: do not allow to stop ploop device (this is the first 
step
of ploop-umount) if user uses /dev/ploopNp1 for ioctl, or if someone (inner 
fs)
is still using the device.

https://jira.sw.ru/browse/PSBM-21474

Signed-off-by: Maxim Patlasov mpatla...@parallels.com
---
 drivers/block/ploop/dev.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 5a3a5ec..2f4928d 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -3548,6 +3548,20 @@ static int ploop_stop(struct ploop_device * plo, struct 
block_device *bdev)
struct ploop_delta * delta;
int cnt;
 
+   if (bdev != bdev-bd_contains) {
+   if (printk_ratelimit())
+   printk(KERN_INFO stop ploop%d failed (wrong bdev)\n,
+  plo-index);
+   return -ENODEV;
+   }
+
+   if (bdev-bd_contains-bd_holders) {
+   if (printk_ratelimit())
+   printk(KERN_INFO stop ploop%d failed (holders=%d)\n,
+  plo-index, bdev-bd_contains-bd_holders);
+   return -EBUSY;
+   }
+
if (!test_bit(PLOOP_S_RUNNING, plo-state))
return -EINVAL;
 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: fix iblk-to-sector calculations

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 96f009d1061c9e1ec9b6c7699eef565bcd44f26a
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:26:59 2015 +0400

ploop: fix iblk-to-sector calculations

    iblk stands for image-file block number. Its size is the same as u32. The
    size of 'sector' is the same as long. While converting the former to the
    latter like this: sec = iblk << shift, we must always cast 'iblk' to long.
    And we actually do in most cases. The patch fixes a place in io_direct
    module where it was forgotten.

https://jira.sw.ru/browse/PSBM-22961

Signed-off-by: Maxim Patlasov mpatla...@parallels.com
---
 drivers/block/ploop/io_direct.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index ab74849..56b9f37 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -119,8 +119,8 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq,
goto out_em_err;
 
if (write  em-block_start == BLOCK_UNINIT) {
-   sector_t end = (iblk + 1)  preq-plo-cluster_log;
-   sec = iblk  preq-plo-cluster_log;
+   sector_t end = (sector_t)(iblk + 1)  preq-plo-cluster_log;
+   sec = (sector_t)iblk  preq-plo-cluster_log;
 
if (em-start = sec)
sec = em-end;
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: reverse order of fdatawait and fsync fop

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 0ac13b3ba07b42573f151c21d9727a2cbcd415d1
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:00 2015 +0400

ploop: reverse order of fdatawait and fsync fop

    dio_fsync_thread must call filemap_fdatawait() before file->f_op->fsync().
Otherwise:

  8,06   82 0.003095587 12328  D  WS 441706496 + 512 
[ploop19054]
  8,06   83 0.003103726 12328  D  WS 441707008 + 512 
[ploop19054]
  8,06   84 0.003108627 12328  D  WS 441707520 + 512 
[ploop19054]
  8,06   85 0.003113176 12328  D  WS 441708032 + 512 
[ploop19054]
  ...
  8,06  102 0.003149386  1299  D  WS 3950526248 + 24 
[jbd2/dm-1-8]
  ...
  8,06  103 0.003305550 0  C  WS 441706496 + 512 [0]
  8,06  104 0.003458057 0  C  WS 441707008 + 512 [0]
  8,06  105 0.003608325 0  C  WS 441707520 + 512 [0]
  8,06  106 0.003758297 0  C  WS 441708032 + 512 [0]
  8,06  107 0.003794543 0  C  WS 3950526248 + 24 [0]

And if the node crashes (or reboot happens) after last dispatch, journal 
data
may come to the disk while user bulk data -- not. The result would be ploop
image corruption.

The patch re-arranges the sequence of calls to make it safe and natural (the
same way as in vfs_fsync_range()).

Signed-off-by: Maxim Patlasov mpatla...@parallels.com
---
 drivers/block/ploop/io_direct.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index babc940..c18d2f0 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -735,14 +735,13 @@ static int dio_fsync_thread(void * data)
spin_unlock_irq(plo-lock);
 
/* filemap_fdatawrite() has been made already */
+   filemap_fdatawait(io-files.mapping);
 
err = 0;
if (io-files.file-f_op-fsync)
err = io-files.file-f_op-FOP_FSYNC(io-files.file,
  0);
 
-   filemap_fdatawait(io-files.mapping);
-
/* Do we need to invalidate page cache? Not really,
 * because we use it only to create full new pages,
 * which we overwrite completely. Probably, we should
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: support 4K block-size of host block-device

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 5e02bd5942dd5cfd66f5b4096e966ae9b134b5ea
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:01 2015 +0400

ploop: support 4K block-size of host block-device

Avoid 512-bytes reads/writes. They were used by 'expanded' format module
to get and save format header. Let's use 4K reads/writes instead.

Customer's problem:

 [root@pcstest10 ~]# ploop mount /vz3/test.hdd
 add delta dev=/dev/ploop19025 img=/vz3/test.hdd (rw)
 Can't add image /vz3/test.hdd: Input/output error
 [root@pcstest10 ~]#

 Right after trying to mount the image the kernel throws the following:

 [1564044.775584] sd 13:0:0:0: [sde] Bad block number requested

 The block size of this device is not 512 as for other direct attached
 disks. It is 4096 and the device is an iSCSI target.

https://jira.sw.ru/browse/PSBM-21989

Signed-off-by: Maxim Patlasov mpatla...@parallels.com
---
 drivers/block/ploop/fmt_ploop1.c | 28 ++--
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/drivers/block/ploop/fmt_ploop1.c b/drivers/block/ploop/fmt_ploop1.c
index fb12c30..5ce6915 100644
--- a/drivers/block/ploop/fmt_ploop1.c
+++ b/drivers/block/ploop/fmt_ploop1.c
@@ -78,7 +78,7 @@ static int ploop1_stop(struct ploop_delta * delta)
 
vh = (struct ploop_pvd_header *)page_address(ph-dyn_page);
 
-   err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 512, 0, 0);
+   err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 4096, 0, 0);
if (err)
return err;
 
@@ -90,7 +90,7 @@ static int ploop1_stop(struct ploop_delta * delta)
 
vh-m_DiskInUse = 0;
 
-   err = delta-io.ops-sync_write(delta-io, ph-dyn_page, 512, 0, 0);
+   err = delta-io.ops-sync_write(delta-io, ph-dyn_page, 4096, 0, 0);
if (err)
return err;
 
@@ -128,7 +128,7 @@ ploop1_open(struct ploop_delta * delta)
goto out_err;
 
/* IO engine is ready. */
-   err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 512, 0, 0);
+   err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 4096, 0, 0);
if (err)
goto out_err;
 
@@ -168,7 +168,7 @@ ploop1_open(struct ploop_delta * delta)
 
if (!(delta-flags  PLOOP_FMT_RDONLY)) {
vh-m_DiskInUse = cpu_to_le32(SIGNATURE_DISK_IN_USE);
-   err = delta-io.ops-sync_write(delta-io, ph-dyn_page, 512, 
0, 0);
+   err = delta-io.ops-sync_write(delta-io, ph-dyn_page, 4096, 
0, 0);
if (err)
goto out_err;
}
@@ -198,7 +198,7 @@ ploop1_refresh(struct ploop_delta * delta)
 
vh = (struct ploop_pvd_header *)page_address(ph-dyn_page);
 
-   err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 512, 0, 0);
+   err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 4096, 0, 0);
if (err)
return err;
 
@@ -266,7 +266,7 @@ ploop1_sync(struct ploop_delta * delta)
if (err)
return err;
 
-   err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 512, 0, 0);
+   err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 4096, 0, 0);
if (err)
return err;
 
@@ -279,7 +279,7 @@ ploop1_sync(struct ploop_delta * delta)
vh-m_Flags = cpu_to_le32(vh-m_Flags);
}
 
-   err = delta-io.ops-sync_write(delta-io, ph-dyn_page, 512, 0, 0);
+   err = delta-io.ops-sync_write(delta-io, ph-dyn_page, 4096, 0, 0);
if (err)
return err;
 
@@ -312,7 +312,7 @@ ploop1_complete_snapshot(struct ploop_delta * delta, struct 
ploop_snapdata * sd)
if (err)
goto out;
 
-   err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 512, 0, 0);
+   err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 4096, 0, 0);
if (err)
goto out;
 
@@ -335,7 +335,7 @@ ploop1_complete_snapshot(struct ploop_delta * delta, struct 
ploop_snapdata * sd)
 * remain valid.
 */
 
-   err = delta-io.ops-sync_write(delta-io, ph-dyn_page, 512, 0, 0);
+   err = delta-io.ops-sync_write(delta-io, ph-dyn_page, 4096, 0, 0);
if (err)
goto out;
 
@@ -367,7 +367,7 @@ ploop1_prepare_merge(struct ploop_delta * delta, struct 
ploop_snapdata * sd)
 
vh = (struct ploop_pvd_header *)page_address(ph-dyn_page);
 
-   err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 512, 0, 0);
+   err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 4096, 0, 0);
if (err)
return err;
 
@@ -403,7 +403,7 @@ ploop1_start_merge(struct ploop_delta * delta, struct 
ploop_snapdata * sd)
return -EIO;
}
 
-   err = delta-io.ops-sync_read(delta-io, ph-dyn_page, 512, 0, 0);
+   err

[Devel] [PATCH RHEL7 COMMIT] ploop: bug on bad fiemap (v2)

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit e3b634ed036e618d74643faaa478dc3951c2f781
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:05 2015 +0400

ploop: bug on bad fiemap (v2)

Based on crash analysis, one of extents from ploop em-tree is bad:

883fe6230ae0
  start = 19380224
  end = 19447808
  block_start = 0
  refs = {
counter = 1
  }

    ploop never calculates em->block_start other than by direct assigning:

     em->block_start = fi_extent.fe_physical >> 9;

The patch attempts to catch erroneous (zero) output immediately after
fiemap call.

Changed in v2:
 - WARN_ON (instead of BUG_ON) for delalloc extents

https://jira.sw.ru/browse/PSBM-26762

Signed-off-by: Maxim Patlasov mpatla...@parallels.com
---
 drivers/block/ploop/io_direct_map.c | 23 ++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/drivers/block/ploop/io_direct_map.c 
b/drivers/block/ploop/io_direct_map.c
index b3cb04d..b9a0ce9 100644
--- a/drivers/block/ploop/io_direct_map.c
+++ b/drivers/block/ploop/io_direct_map.c
@@ -641,6 +641,7 @@ static struct extent_map *__map_extent_bmap(struct ploop_io 
*io,
 {
struct extent_map_tree *tree = io-files.em_tree;
struct inode *inode = mapping-host;
+   loff_t start_off = (loff_t)start  9;
struct extent_map *em;
struct fiemap_extent_info fieinfo;
struct fiemap_extent fi_extent;
@@ -681,6 +682,25 @@ again:
old_fs = get_fs();
set_fs(KERNEL_DS);
ret = inode-i_op-fiemap(inode, fieinfo, start  9, 1);
+
+   /* chase for PSBM-26762: em-block_start == 0 */
+   if (!ret  fieinfo.fi_extents_mapped == 1 
+   !(fi_extent.fe_flags  FIEMAP_EXTENT_UNWRITTEN) 
+   (fi_extent.fe_physical  9) == 0) {
+   /* see how ext4_fill_fiemap_extents() implemented */
+   if (!(fi_extent.fe_flags  FIEMAP_EXTENT_DELALLOC)) {
+   printk(bad fiemap(%ld,%ld) on inode=%p fieinfo=%p
+i_size=%lld\n, start, len, inode, fieinfo,
+   i_size_read(inode));
+   BUG();
+   }
+   /* complain about delalloc case -- ploop always fallocate
+   * before buffered write */
+   WARN(1, ploop%d: delalloc extent [%lld,%lld] for [%lld,%ld];
+i_size=%lld\n, io-plo-index, fi_extent.fe_logical,
+   fi_extent.fe_length, start_off, len  9, 
i_size_read(inode));
+   ret = -ENOENT;
+   }
set_fs(old_fs);
 
if (ret) {
@@ -808,9 +828,10 @@ void trim_extent_mappings(struct extent_map_tree *tree, 
sector_t start)
 
while ((em = lookup_extent_mapping(tree, start, ((sector_t)(-1ULL)) - 
start))) {
remove_extent_mapping(tree, em);
+   WARN_ON(atomic_read(em-refs) != 2);
/* once for us */
extent_put(em);
-   /* _XXX_ This cannot be correct in the case of concurrent 
lookups */
+   /* No concurrent lookups due to ploop_quiesce(). See WARN_ON 
above */
/* once for the tree */
extent_put(em);
}
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: put top-delta back if merge failed

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit ee2968cd8321728956effea19e98959befec32d0
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:07 2015 +0400

ploop: put top-delta back if merge failed

    Before merge, we move top-delta to a temporary plo->trans_map list. Since
    then, it's not present in the main plo->map list anymore. If merge failed,
    we must put it back to plo->map. Otherwise the delta will be lost forever
    (visible in /sys/block/ploop*/pdelta/*, but not accessible from ploop).

https://jira.sw.ru/browse/PSBM-25252

Signed-off-by: Maxim Patlasov mpatla...@parallels.com

Acked-by: Pavel Emelyanov xe...@parallels.com
---
 drivers/block/ploop/dev.c | 49 +++
 1 file changed, 28 insertions(+), 21 deletions(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index d2a9eb4..2e6302f 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -3280,6 +3280,26 @@ static void ploop_update_fmt_version(struct ploop_device 
* plo)
}
 }
 
+static void ploop_merge_cleanup(struct ploop_device * plo,
+   struct ploop_map * map,
+   struct ploop_delta * delta, int err)
+{
+   ploop_quiesce(plo);
+   mutex_lock(plo-sysfs_mutex);
+   list_del(delta-list);
+
+   if (err)
+   list_add(delta-list, plo-map.delta_list);
+   else
+   ploop_update_fmt_version(plo);
+
+   plo-trans_map = NULL;
+   plo-maintenance_type = PLOOP_MNTN_OFF;
+   mutex_unlock(plo-sysfs_mutex);
+   ploop_map_destroy(map);
+   ploop_relax(plo);
+}
+
 static int ploop_merge(struct ploop_device * plo)
 {
int err;
@@ -3368,32 +3388,19 @@ already:
if (test_bit(PLOOP_S_ABORT, plo-state)) {
printk(KERN_WARNING merge for ploop%d failed (state ABORT)\n,
   plo-index);
-   plo-trans_map = NULL;
-   plo-maintenance_type = PLOOP_MNTN_OFF;
err = -EIO;
-   goto out;
}
 
-   ploop_quiesce(plo);
-   mutex_lock(plo-sysfs_mutex);
-   plo-trans_map = NULL;
-   plo-maintenance_type = PLOOP_MNTN_OFF;
-   list_del(delta-list);
-   ploop_update_fmt_version(plo);
-   mutex_unlock(plo-sysfs_mutex);
-   ploop_map_destroy(map);
-   ploop_relax(plo);
+   ploop_merge_cleanup(plo, map, delta, err);
 
-   kfree(map);
-
-   kobject_del(delta-kobj);
-   kobject_put(plo-kobj);
-
-   delta-ops-stop(delta);
-   delta-ops-destroy(delta);
-   kobject_put(delta-kobj);
-   return 0;
+   if (!err) {
+   kobject_del(delta-kobj);
+   kobject_put(plo-kobj);
 
+   delta-ops-stop(delta);
+   delta-ops-destroy(delta);
+   kobject_put(delta-kobj);
+   }
 out:
kfree(map);
return err;
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: added ploop_req_delay_fua_possible() func that detects possible delaying of upcoming FUA to index update stage

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit abc95cfc45bd725c7aba2f7697e322413ae5725a
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:12 2015 +0400

ploop: added ploop_req_delay_fua_possible() func that detects possible 
delaying of upcoming FUA to index update stage

    During relocation of ploop clusters (resize/balloon) we need to FUA/fsync
    the image file after such operations:
     a) new data block written
     b) BAT update
     c) nullify old data block for BAT grow. We already do this for the
    nullified old data block at the format module - complete_grow callback.

    This patch forces fsync(kaio), FUA(direct) of reloc write I/O to image
    by marking such reloc reqs(A|S) with appropriate flags. Kaio/direct modules
    are tuned by the patch to force fsync/FUA if these flags are set. This code
    does FUA/fsync only for a) and b) cases, while c) is already implemented.

    Also the patch fixes inconsistent bio list FUA processing in direct module.
    The problem is that for a bunch of bios we only set FUA at the last bio.
    It's possible in case of power outage that the last bio will be stored and
    the previous ones are not, because they are stored only in cache at the
    time of power failure. To solve the problem this patch marks the last bio
    as FLUSH|FUA if there is more than one bio in the list.

Moreover for KAIO if fsync possible at BAT update stage we do that like we
did in direct case instead of 2 fsync's. For direct case if we going to make
FUA at BAT update only(optimization trick that already exists) then we need
to mark req to FLUSH previously written(without FUA) data.

Performance:
Overall(includes EXT4 resize upto 16T) resize performance degradated by -5% 
of
time.

https://jira.sw.ru/browse/PSBM-31222
https://jira.sw.ru/browse/PSBM-31225
https://jira.sw.ru/browse/PSBM-31321

Signed-off-by: Andrey Smetanin asmeta...@parallels.com

Andrey Smetanin (7):
      ploop: define struct ploop_request->state flags to force pre FLUSH
        before write IO and FUA/fsync at I/O complete
  ploop: mark reloc reqs to force FUA/fsync(kaio) for index update I/O
  ploop: mark reloc reqs to force FUA before write of relocated data
  ploop: direct: to support truly FLUSH/FUA of req we need mark first
bio FLUSH, write all bios and mark last bio as FLUSH/FUA
  ploop: added ploop_req_delay_fua_possible() func that detects possible
delaying of upcoming FUA to index update stage. This function will
be lately used in direct/kaio code to detect and delay FUA
      ploop: make image fsync at I/O complete if it's required by FUA/fsync
        force flag or by req->req_rw
  ploop: do preflush or postfua according force FUA/flush flags, and
delay FUA if possible but add force FLUSH to req if so

This patch description:

This function will be lately used in direct/kaio code to detect and delay 
FUA.

https://jira.sw.ru/browse/PSBM-31222
https://jira.sw.ru/browse/PSBM-31225
https://jira.sw.ru/browse/PSBM-31321

Signed-off-by: Andrey Smetanin asmeta...@parallels.com

Reviewed-by: Andrew Vagin ava...@parallels.com
---
 include/linux/ploop/ploop.h | 17 +
 1 file changed, 17 insertions(+)

diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h
index eacd36a..d8b83a6 100644
--- a/include/linux/ploop/ploop.h
+++ b/include/linux/ploop/ploop.h
@@ -577,6 +577,23 @@ void ploop_fail_request(struct ploop_request * preq, int 
err);
 void ploop_preq_drop(struct ploop_device * plo, struct list_head *drop_list,
  int keep_locked);
 
+
+static inline int ploop_req_delay_fua_possible(unsigned long rw,
+   struct ploop_request *preq)
+{
+   int delay_fua = 0;
+
+   /* In case of eng_state != COMPLETE, we'll do FUA in
+* ploop_index_update(). Otherwise, we should post
+* fua.
+*/
+   if (rw  REQ_FUA) {
+   if (preq-eng_state != PLOOP_E_COMPLETE)
+   delay_fua = 1;
+   }
+   return delay_fua;
+}
+
 static inline void ploop_set_error(struct ploop_request * preq, int err)
 {
if (!preq-error) {
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: force FUA of nullified blocks for BAT grow

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 051a2a154c0e040d7d15ab9a3b56b77d9de021b3
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:16 2015 +0400

ploop: force FUA of nullified blocks for BAT grow

    Previously we thought that nullified blocks were synced at the format
    driver by the image fsync before the header BAT size grow update.
    But we write this data directly into the underlying device,
    bypassing EXT4, by using the extent map tree
    (see dio_submit()). So fsync of the EXT4 image doesn't help us.
    We need to force sync of nullified blocks. This patch does
    it by marking preq via PLOOP_REQ_FORCE_FUA flag.

https://jira.sw.ru/browse/PSBM-31969

Signed-off-by: Andrey Smetanin asmeta...@parallels.com

Acked-by: Andrew Vagin ava...@parallels.com
---
 drivers/block/ploop/map.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c
index 67e2852..8ea67e9 100644
--- a/drivers/block/ploop/map.c
+++ b/drivers/block/ploop/map.c
@@ -1056,10 +1056,14 @@ static void map_wb_complete_post_process(struct 
ploop_map *map,
   0, PAGE_SIZE);
 
/*
-* FUA of this data occures at format driver -complete_grow() by
-* all image sync. After that header size increased to use this
-* cluster as BAT cluster.
+* Lately we think we does sync of nullified blocks at format
+* driver by image fsync before header update.
+* But we write this data directly into underlying device
+* bypassing EXT4 by usage of extent map tree
+* (see dio_submit()). So fsync of EXT4 image doesnt help us.
+* We need to force sync of nullified blocks.
 */
+   set_bit(PLOOP_REQ_FORCE_FUA, preq-state);
top_delta-io.ops-submit(top_delta-io, preq, preq-req_rw,
  sbl, preq-iblock, 1plo-cluster_log);
 }
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] vzctl PRE_CREATE hook

2015-05-18 Thread Kir Kolyshkin


(I previously replied to Nikolay only -- re-sending with devel@ included)

On 05/12/2015 02:36 AM, Nikolay Tenev wrote:

Hello devs,

In my project I wanted to make every OpenVZ container to use for a 
private directory (VE_PRIVATE) separated block device (HDD partition, 
lvm volume, NFS share, etc.). To use wrapper script over vzctl was one 
option, but PRE_CREATE hook, in which to create, mkfs and mount LVM 
volume would be even better.


So, I'm not a developer, but using code from POST_CREATE hook I was 
able to create the PRE_CREATE one, which can be used as the other 
hooks e.g.


add in /etc/vz/dists/ default
PRE_CREATE = precreate.sh

and during
vzctl --create ...

it will call /etc/vz/dists/scripts/precreate.sh with VEID as argument


Nope. These scripts are per-distribution scripts, i.e. they are targeted for
various distro-specific things, such as setting IP addresses etc.

What you need is a global script, not dependent on CT distro. I suggest
a precreate.sh script similar to prestart.sh one (for details, see commit
https://github.com/kolyshkin/vzctl/commit/0807ef4)



Currently I have a patch to vzctl master branch which implements this 
PRE_CREATE hook and I'm ready to share it.


So my questions are:
- Do you find this for interesting and/or useful?
- If 'yes', what is the right way to send this patch: here, by email; 
or to create pull request in git repo?


The best way would be to redo as advised above and send a patch to 
devel@ list.


Thanks,
  Kir.



Best regards!

Nikolay Tenev



___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ms/memcg: add page_cgroup_ino helper

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 77c59afe2b55a1dd631c3b8a6d3763eff8d09941
Author: Vladimir Davydov vdavy...@parallels.com
Date:   Tue May 19 08:23:31 2015 +0400

ms/memcg: add page_cgroup_ino helper

Patchset description: idle memory tracking

This patch set backports

  https://lkml.org/lkml/2015/5/12/449

which is required by vcmmd.

It is not yet clear if the original patch set will be accepted upstream
as is, there still may be changes. However, I hope the user API will be
preserved. If it is not, we will have to fix this in our kernel too.

https://jira.sw.ru/browse/PSBM-32460

Vladimir Davydov (3):
  memcg: add page_cgroup_ino helper
  proc: add kpagecgroup file
  proc: add kpageidle file

===
This patch description:

    Hwpoison allows to filter pages by memory cgroup ino. To achieve that, it
calls try_get_mem_cgroup_from_page(), then mem_cgroup_css(), and finally
extracts the inode number from the cgroup returned. This looks bulky.
Since in the next patch I need to get the ino of the memory cgroup a
page is charged to too, in this patch I introduce the page_cgroup_ino()
helper.

    Note that page_cgroup_ino() only considers those pages that are charged
    to mem_cgroup->res (i.e. page_cgroup->mem_cgroup != NULL), and for
    others it returns 0, while try_get_mem_cgroup_from_page(), used by hwpoison
    before, may extract the cgroup from a swapcache readahead page too.
Ignoring swapcache readahead pages allows to call page_cgroup_ino() on
unlocked pages, which is nice. Hwpoison users will hardly see any
difference.

Signed-off-by: Vladimir Davydov vdavy...@parallels.com
---
 include/linux/memcontrol.h |  3 +++
 mm/hwpoison-inject.c   |  3 ---
 mm/memcontrol.c| 22 ++
 mm/memory-failure.c| 18 +-
 4 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 675b4c5..5507be5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -200,6 +200,9 @@ void mem_cgroup_split_huge_fixup(struct page *head);
 bool mem_cgroup_bad_page_check(struct page *page);
 void mem_cgroup_print_bad_page(struct page *page);
 #endif
+
+unsigned long page_cgroup_ino(struct page *page);
+
 #else /* CONFIG_MEMCG */
 struct mem_cgroup;
 
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 3a61efc..bd580f8 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -44,12 +44,9 @@ static int hwpoison_inject(void *data, u64 val)
/*
 * do a racy check with elevated page count, to make sure PG_hwpoison
 * will only be set for the targeted owner (or on a free page).
-* We temporarily take page lock for try_get_mem_cgroup_from_page().
 * memory_failure() will redo the check reliably inside page lock.
 */
-   lock_page(hpage);
err = hwpoison_filter(hpage);
-   unlock_page(hpage);
if (err)
return 0;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e772a06..9dda309 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2877,6 +2877,28 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct 
page *page)
return memcg;
 }
 
+/**
+ * page_cgroup_ino - return inode number of page's memcg
+ * @page: the page
+ *
+ * Look up the memory cgroup @page is charged to and return its inode number.
+ * It is safe to call this function without taking a reference to the page.
+ */
+unsigned long page_cgroup_ino(struct page *page)
+{
+   struct mem_cgroup *memcg;
+   struct page_cgroup *pc;
+   unsigned long ino = 0;
+
+   pc = lookup_page_cgroup(page);
+   lock_page_cgroup(pc);
+   memcg = pc-mem_cgroup;
+   if (PageCgroupUsed(pc)  memcg)
+   ino = memcg-css.cgroup-dentry-d_inode-i_ino;
+   unlock_page_cgroup(pc);
+   return ino;
+}
+
 static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
   struct page *page,
   unsigned int nr_pages,
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 06f8d308..b3b1a2d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -133,26 +133,10 @@ u64 hwpoison_filter_memcg;
 EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
 static int hwpoison_filter_task(struct page *p)
 {
-   struct mem_cgroup *mem;
-   struct cgroup_subsys_state *css;
-   unsigned long ino;
-
if (!hwpoison_filter_memcg)
return 0;
 
-   mem = try_get_mem_cgroup_from_page(p);
-   if (!mem)
-   return -EINVAL;
-
-   css = mem_cgroup_css(mem);
-   /* root_mem_cgroup has NULL dentries */
-

[Devel] [PATCH RHEL7 COMMIT] ms/mm/proc: add kpageidle file

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 35dcabf891ce1931294c5bf3d98e1203ff656432
Author: Vladimir Davydov vdavy...@parallels.com
Date:   Tue May 19 08:23:57 2015 +0400

ms/mm/proc: add kpageidle file

Patchset description: idle memory tracking

This patch set backports

  https://lkml.org/lkml/2015/5/12/449

which is required by vcmmd.

It is not yet clear if the original patch set will be accepted upstream
as is, there still may be changes. However, I hope the user API will be
preserved. If it is not, we will have to fix this in our kernel too.

https://jira.sw.ru/browse/PSBM-32460

Vladimir Davydov (3):
  memcg: add page_cgroup_ino helper
  proc: add kpagecgroup file
  proc: add kpageidle file

===
This patch description:

Knowing the portion of memory that is not used by a certain application
or memory cgroup (idle memory) can be useful for partitioning the system
efficiently, e.g. by setting memory cgroup limits appropriately.
Currently, the only means to estimate the amount of idle memory provided
by the kernel is /proc/PID/{clear_refs,smaps}: the user can clear the
access bit for all pages mapped to a particular process by writing 1 to
clear_refs, wait for some time, and then count smaps:Referenced.
However, this method has two serious shortcomings:

 - it does not count unmapped file pages
 - it affects the reclaimer logic

To overcome these drawbacks, this patch introduces two new page flags,
Idle and Young, and a new proc file, /proc/kpageidle. A page's Idle flag
can only be set from userspace by setting bit in /proc/kpageidle at the
offset corresponding to the page, and it is cleared whenever the page is
accessed either through page tables (it is cleared in page_referenced()
in this case) or using the read(2) system call (mark_page_accessed()).
Thus by setting the Idle flag for pages of a particular workload, which
can be found e.g. by reading /proc/PID/pagemap, waiting for some time to
let the workload access its working set, and then reading the kpageidle
file, one can estimate the amount of pages that are not used by the
workload.

The Young page flag is used to avoid interference with the memory
reclaimer. A page's Young flag is set whenever the Access bit of a page
table entry pointing to the page is cleared by writing to kpageidle. If
page_referenced() is called on a Young page, it will add 1 to its return
value, therefore concealing the fact that the Access bit was cleared.

Note, since there is no room for extra page flags on 32 bit, this
feature uses extended page flags when compiled on 32 bit.

(on RH7 page ext is not available so make it depend on 64 bit)
Signed-off-by: Vladimir Davydov vdavy...@parallels.com
---
 Documentation/vm/pagemap.txt |  12 +++-
 fs/proc/page.c   | 168 +++
 fs/proc/task_mmu.c   |   3 +-
 include/linux/mm.h   |  50 +
 include/linux/page-flags.h   |   9 +++
 mm/Kconfig   |  12 
 mm/page_alloc.c  |   4 ++
 mm/rmap.c|   9 +++
 mm/swap.c|   2 +
 9 files changed, 267 insertions(+), 2 deletions(-)

diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
index e37cff9..a4fe9b2 100644
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -5,7 +5,7 @@ pagemap is a new (as of 2.6.25) set of interfaces in the kernel 
that allow
 userspace programs to examine the page tables and related information by
 reading files in /proc.
 
-There are four components to pagemap:
+There are five components to pagemap:
 
  * /proc/pid/pagemap.  This file lets a userspace process find out which
physical frame each virtual page is mapped to.  It contains one 64-bit
@@ -67,6 +67,16 @@ There are four components to pagemap:
memory cgroup each page is charged to, indexed by PFN. Only available when
CONFIG_MEMCG is set.
 
+ * /proc/kpageidle.  This file implements a bitmap where each bit corresponds
+   to a page, indexed by PFN. When the bit is set, the corresponding page is
+   idle. A page is considered idle if it has not been accessed since it was
+   marked idle. To mark a page idle one should set the bit corresponding to the
+   page by writing to the file. A value written to the file is OR-ed with the
+   current bitmap value. Only user memory pages can be marked idle, for other
+   page types input is silently ignored. Writing to this file beyond max PFN
+   results in the ENXIO error. Only available when CONFIG_IDLE_PAGE_TRACKING is
+   set.
+
 Short descriptions to the page flags:
 
  0.

[Devel] [PATCH RHEL7 COMMIT] ploop: prevent disclosure 4 bytes of the stack kernel

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit c25ed54c1a19bc8c11fcc472c3e4869c210eca97
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:26:57 2015 +0400

ploop: prevent disclosure 4 bytes of the stack kernel

 Information leak (4 bytes of kernel stack) in the ploop_getdevice_ioc function.

  217401 +static int ploop_getdevice_ioc(unsigned long arg)
  217402 +{
  217403 +   int err;
  217404 +   int index = 0;
  217405 +   struct rb_node *n;
  217406 +   struct ploop_getdevice_ctl ctl;
  217407 +
  217408 +   mutex_lock(ploop_devices_mutex);
  217409 +   for (n = rb_first(ploop_devices_tree); n; n = 
rb_next(n), index++) {
  217410 +   struct ploop_device *plo;
  217411 +   plo = rb_entry(n, struct ploop_device, 
link);
  217412 +   if (plo-index != index || 
list_empty(plo-map.delta_list))
  217413 +   break;
  217414 +   }
  217415 +   mutex_unlock(ploop_devices_mutex);
  217416 +
  217417 +   ctl.minor = index  PLOOP_PART_SHIFT;
  217418 +   if (ctl.minor  ~MINORMASK)
  217419 +   return -ERANGE;
  217420 +   err = copy_to_user((void*)arg, ctl, sizeof(ctl));
  217421 +   return err;
  217422 +}

 The ploop_getdevice_ioc() function copies the ploop_getdevice_ctl
structure to user space but initializes only the 'minor' field, so it
is possible to disclose 4 bytes of the kernel stack via the
uninitialized '__mbz1' field.

 Below is the 'ploop_getdevice_ctl' structure:

 3772915 +struct ploop_getdevice_ctl
 3772916 +{
 3772917 +   __u32   minor;
 3772918 +   __u32   __mbz1;
 3772919 +} __attribute__ ((aligned (8)));

Signed-off-by: Andrey Vagin ava...@openvz.org

Reported-by: Jonathan Salwan (Sysdream Security Laboratory) 
jonathan.sal...@gmail.com
---
 drivers/block/ploop/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 2f4928d..8556af2 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -4277,7 +4277,7 @@ static int ploop_getdevice_ioc(unsigned long arg)
int err;
int index = 0;
struct rb_node *n;
-   struct ploop_getdevice_ctl ctl;
+   struct ploop_getdevice_ctl ctl = {};
 
mutex_lock(ploop_devices_mutex);
for (n = rb_first(ploop_devices_tree); n; n = rb_next(n), index++) {
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: skip writes of zeroes to unallocated blocks by default

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 6051dc5f6e200cef2011e2174d1c3b76280fe75f
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:01 2015 +0400

ploop: skip writes of zeroes to unallocated blocks by default

Reading from unallocated blocks returns zeroes =>
we can safely skip writes of zeroes to unallocated blocks.

As a lot of tests do dd if=/dev/zero ..., this optimization is valuable.

Feature enabled, test results:
  [root@p2 ~]# echo 1 > /sys/block/ploop37803/ptune/check_zeros
  [root@p2 ~]# dd if=/dev/zero of=/mnt/sb-io-test bs=1M count=1k oflag=dsync
  1024+0 records in
  1024+0 records out
  1073741824 bytes (1.1 GB) copied, 1.58975 s, 675 MB/s

The impact on CPU utilization is negligible.

https://jira.sw.ru/browse/PSBM-22506
https://jira.sw.ru/browse/PSBM-22381

Signed-off-by: Konstantin Khorenko khore...@parallels.com

Acked-by: Maxim V. Patlasov mpatla...@parallels.com
---
 include/linux/ploop/ploop.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h
index d295cba..434789e 100644
--- a/include/linux/ploop/ploop.h
+++ b/include/linux/ploop/ploop.h
@@ -323,6 +323,7 @@ struct ploop_tunable
 .congestion_low_watermark = DEFAULT_PLOOP_MAXRQ/2, \
 .pass_flushes = 1, \
 .pass_fuas = 1, \
+.check_zeros = 1, \
 .max_active_requests = DEFAULT_PLOOP_BATCH_ENTRY_QLEN / 2, }
 
 struct ploop_stats
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: fix spurious hole complains

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit e4c1ce43241df81fad73953200d887c6a402d82f
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:07 2015 +0400

ploop: fix spurious hole complains

Spurious complaints were triggered by the fiemap-ahead logic of the
pio_direct module.
Fix it by suppressing complaints if a fiemap beyond EOF failed. Also print
more details about a hole.

Signed-off-by: Maxim Patlasov mpatla...@parallels.com

Acked-by: Andrew Vagin ava...@parallels.com
---
 drivers/block/ploop/io_direct_map.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/block/ploop/io_direct_map.c 
b/drivers/block/ploop/io_direct_map.c
index b9a0ce9..c1d889b 100644
--- a/drivers/block/ploop/io_direct_map.c
+++ b/drivers/block/ploop/io_direct_map.c
@@ -681,7 +681,7 @@ again:
 
old_fs = get_fs();
set_fs(KERNEL_DS);
-   ret = inode-i_op-fiemap(inode, fieinfo, start  9, 1);
+   ret = inode-i_op-fiemap(inode, fieinfo, start_off, 1);
 
/* chase for PSBM-26762: em-block_start == 0 */
if (!ret  fieinfo.fi_extents_mapped == 1 
@@ -709,8 +709,11 @@ again:
}
 
if (fieinfo.fi_extents_mapped != 1) {
-   ploop_msg_once(io-plo, a hole in image file detected (%d),
-  fieinfo.fi_extents_mapped);
+   if (start_off  i_size_read(inode))
+   ploop_msg_once(io-plo, a hole in image file detected
+   (mapped=%d i_size=%llu off=%llu),
+  fieinfo.fi_extents_mapped,
+  i_size_read(inode), start_off);
extent_put(em);
return ERR_PTR(-EINVAL);
}
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: notify blktrace about bio completions

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 3ebe6f8f4178ebf89b3aff5b064657e1e9615dce
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:11 2015 +0400

ploop: notify blktrace about bio completions

Signed-off-by: Andrey Smetanin asmeta...@virtuozzo.com
---
 drivers/block/ploop/dev.c  | 14 --
 drivers/block/ploop/freeblks.c |  4 +++-
 include/linux/ploop/compat.h   |  6 +-
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 225c2ab..e2ff0aa 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -13,6 +13,8 @@
 #include linux/ve.h
 #include asm/uaccess.h
 
+#include trace/events/block.h
+
 #include linux/ploop/ploop.h
 #include ploop_events.h
 #include freeblks.h
@@ -518,7 +520,7 @@ ploop_bio_queue(struct ploop_device * plo, struct bio * bio,
bio-bi_bdev = plo-bdev;
clear_bit(BIO_BDEV_REUSED, bio-bi_flags);
}
-   BIO_ENDIO(bio, err);
+   BIO_ENDIO(plo-queue, bio, err);
list_add(preq-list, plo-free_list);
plo-bio_qlen--;
plo-bio_discard_qlen--;
@@ -591,7 +593,7 @@ DEFINE_BIO_CB(ploop_fast_end_io)
 
plo = orig-bi_bdev-bd_disk-private_data;
 
-   BIO_ENDIO(orig, err);
+   BIO_ENDIO(plo-queue, orig, err);
 
/* End of fast bio wakes up main process only when this could
 * mean exit from ATTENTION state.
@@ -800,13 +802,13 @@ static void ploop_make_request(struct request_queue *q, 
struct bio *bio)
 * marked as FLUSH, otherwise just warn and complete. */
if (!(bio-bi_rw  REQ_FLUSH)) {
WARN_ON(1);
-   BIO_ENDIO(bio, 0);
+   BIO_ENDIO(q, bio, 0);
return;
}
/* useless to pass this bio further */
if (!plo-tune.pass_flushes) {
ploop_acc_ff_in(plo, bio-bi_rw);
-   BIO_ENDIO(bio, 0);
+   BIO_ENDIO(q, bio, 0);
return;
}
}
@@ -862,7 +864,7 @@ static void ploop_make_request(struct request_queue *q, 
struct bio *bio)
plo-bio_total--;
spin_unlock_irq(plo-lock);
 
-   BIO_ENDIO(bio, -EIO);
+   BIO_ENDIO(q, bio, -EIO);
if (nbio)
bio_put(nbio);
return;
@@ -1208,7 +1210,7 @@ static void ploop_complete_request(struct ploop_request * 
preq)
struct bio * bio = preq-bl.head;
preq-bl.head = bio-bi_next;
bio-bi_next = NULL;
-   BIO_ENDIO(bio, preq-error);
+   BIO_ENDIO(plo-queue, bio, preq-error);
nr_completed++;
}
preq-bl.tail = NULL;
diff --git a/drivers/block/ploop/freeblks.c b/drivers/block/ploop/freeblks.c
index 569cb94..cf48d3a 100644
--- a/drivers/block/ploop/freeblks.c
+++ b/drivers/block/ploop/freeblks.c
@@ -8,6 +8,8 @@
 #include linux/buffer_head.h
 #include linux/kthread.h
 
+#include trace/events/block.h
+
 #include linux/ploop/ploop.h
 #include freeblks.h
 
@@ -700,7 +702,7 @@ static void fbd_complete_bio(struct ploop_freeblks_desc 
*fbd, int err)
struct bio * bio = fbd-fbd_dbl.head;
fbd-fbd_dbl.head = bio-bi_next;
bio-bi_next = NULL;
-   BIO_ENDIO(bio, err);
+   BIO_ENDIO(fbd-plo-queue, bio, err);
nr_completed++;
}
fbd-fbd_dbl.tail = NULL;
diff --git a/include/linux/ploop/compat.h b/include/linux/ploop/compat.h
index ace8ec1..03c3ae3 100644
--- a/include/linux/ploop/compat.h
+++ b/include/linux/ploop/compat.h
@@ -44,7 +44,11 @@ static void func(struct bio *bio, int err) {
 
 #define END_BIO_CB(func)  }
 
-#define BIO_ENDIO(_bio, _err)  bio_endio(_bio, _err)
+#define BIO_ENDIO(_queue, _bio, _err)  \
+   do {\
+   trace_block_bio_complete((_queue), (_bio), (_err)); \
+   bio_endio((_bio), (_err));  \
+   } while (0);
 
 #define F_DENTRY(file) (file)-f_path.dentry
 #define F_MNT(file)(file)-f_path.mnt
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: add a separate queue for discard bio-s (v2)

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit fa6f3b8595f13c13eebd452bc0947754ac249c2c
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:10 2015 +0400

ploop: add a separate queue for discard bio-s (v2)

When I added support for discard requests, process_bio_queue was
called from ploop_thread, so I used ploop_quiesce and ploop_relax for
synchronization. Now it is called from ploop_make_request too,
so that synchronization doesn't work any more.

The race was added by
diff-ploop-converting-bio-into-ploop-request-in-function-ploop_make_request.

This patch adds a separate queue for discard requests, which is handled
only from ploop_thread(). In addition, we get the ability to postpone
discard bio-s while we are handling others, so we will not fail if a bio
is received while another one is being processed. In the future this will
allow us to handle more than one bio concurrently.

v2: fix comments from Maxim
 Also, ploop_preq_drop() and ploop_complete_request() must wake up
 ploop-thread if !bio_list_empty(&plo->bio_discard_list) as well.

https://jira.sw.ru/browse/PSBM-27676

Note that this is a plain (no logic changes) port to RHEL7 of Andrew Vagin's
original patch (RHEL6).

Signed-off-by: Andrew Vagin ava...@openvz.org
---
 drivers/block/ploop/dev.c  | 54 ++
 drivers/block/ploop/freeblks.c |  5 
 drivers/block/ploop/freeblks.h |  1 +
 drivers/block/ploop/sysfs.c|  6 +
 include/linux/ploop/ploop.h|  2 ++
 5 files changed, 58 insertions(+), 10 deletions(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index ab99724..225c2ab 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -117,8 +117,9 @@ static void mitigation_timeout(unsigned long data)
spin_lock_irq(plo-lock);
if (test_bit(PLOOP_S_WAIT_PROCESS, plo-state) 
(!list_empty(plo-entry_queue) ||
-(plo-bio_head  !list_empty(plo-free_list))) 
-   waitqueue_active(plo-waitq))
+((plo-bio_head  !bio_list_empty(plo-bio_discard_list)) 
+   !list_empty(plo-free_list))) 
+   waitqueue_active(plo-waitq))
wake_up_interruptible(plo-waitq);
spin_unlock_irq(plo-lock);
 }
@@ -237,7 +238,8 @@ void ploop_preq_drop(struct ploop_device * plo, struct 
list_head *drop_list,
if (waitqueue_active(plo-req_waitq))
wake_up(plo-req_waitq);
else if (test_bit(PLOOP_S_WAIT_PROCESS, plo-state) 
-   waitqueue_active(plo-waitq)  plo-bio_head)
+   waitqueue_active(plo-waitq) 
+   (plo-bio_head || !bio_list_empty(plo-bio_discard_list)))
wake_up_interruptible(plo-waitq);
 
ploop_uncongest(plo);
@@ -519,6 +521,7 @@ ploop_bio_queue(struct ploop_device * plo, struct bio * bio,
BIO_ENDIO(bio, err);
list_add(preq-list, plo-free_list);
plo-bio_qlen--;
+   plo-bio_discard_qlen--;
plo-bio_total--;
return;
}
@@ -756,6 +759,28 @@ static void ploop_unplug(struct blk_plug_cb *cb, bool 
from_schedule)
kfree(cb);
 }
 
+static void
+process_discard_bio_queue(struct ploop_device *plo, struct list_head 
*drop_list)
+{
+   bool discard = test_bit(PLOOP_S_DISCARD, plo-state);
+   while (!list_empty(plo-free_list)) {
+   struct bio *tmp;
+
+   /* Only one discard bio can be handled concurrently */
+   if (discard  ploop_discard_is_inprogress(plo-fbd))
+   return;
+
+   tmp = bio_list_pop(plo-bio_discard_list);
+   if (tmp == NULL)
+   break;
+
+   /* If PLOOP_S_DISCARD isn't set, ploop_bio_queue
+* will complete it with a proper error.
+*/
+   ploop_bio_queue(plo, tmp, drop_list);
+   }
+}
+
 static void ploop_make_request(struct request_queue *q, struct bio *bio)
 {
struct bio * nbio;
@@ -843,6 +868,12 @@ static void ploop_make_request(struct request_queue *q, 
struct bio *bio)
return;
}
 
+   if (bio-bi_rw  REQ_DISCARD) {
+   bio_list_add(plo-bio_discard_list, bio);
+   plo-bio_discard_qlen++;
+   goto queued;
+   }
+
/* Write tracking in fast path does not work at the moment. */
if (unlikely(test_bit(PLOOP_S_TRACK, plo-state) 
 (bio-bi_rw  WRITE)))
@@ -864,9 +895,6 @@ static void ploop_make_request(struct request_queue *q, 
struct bio *bio)
if (unlikely(nbio == NULL))
goto queue;
 
-   if (bio-bi_rw  REQ_DISCARD)
-

[Devel] [PATCH RHEL7 COMMIT] ploop: define struct ploop_request-state flags to force pre FLUSH before write IO and FUA/fsync at I/O complete

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit ebf1008ff2e19354244317140b41ae3c2854f74b
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:13 2015 +0400

ploop: define struct ploop_request-state flags to force pre FLUSH before 
write IO and FUA/fsync at I/O complete

Series description:

During relocation of ploop clusters (resize/balloon) we need to FUA/fsync
the image file after the following operations:
 a) a new data block is written
 b) BAT update
 c) nullifying the old data block for BAT grow. This nullification of the
old data block is already done in the format module's complete_grow
callback.

This patch forces fsync (kaio) or FUA (direct) of reloc write I/O to the
image by marking such reloc reqs (A|S) with appropriate flags. The
kaio/direct modules are tuned by the patch to force fsync/FUA if these
flags are set. This code does FUA/fsync only for cases a) and b), while
c) is already implemented.

The patch also fixes inconsistent bio list FUA processing in the direct
module. The problem is that for a bunch of bios we only set FUA on the last
bio. In case of a power outage it is possible that the last bio will be
stored and the previous ones will not, because they are held only in the
cache at the time of the power failure.
To solve the problem, this patch marks the last bio as FLUSH|FUA if there is
more than one bio in the list.

Moreover, for KAIO, if fsync is possible at the BAT update stage, we do it
there, as we did in the direct case, instead of doing 2 fsyncs. For the
direct case, if we are going to do FUA at BAT update only (an optimization
trick that already exists), then we need to mark the req to FLUSH the
previously written (without FUA) data.

Performance:
Overall resize performance (including EXT4 resize up to 16T) degraded by 5%
in terms of time.

https://jira.sw.ru/browse/PSBM-31222
https://jira.sw.ru/browse/PSBM-31225
https://jira.sw.ru/browse/PSBM-31321

Signed-off-by: Andrey Smetanin asmeta...@parallels.com

Andrey Smetanin (7):
  ploop: define struct ploop_request-state flags to force pre FLUSH
before write IO and FUA/fsync at I/O complete
  ploop: mark reloc reqs to force FUA/fsync(kaio) for index update I/O
  ploop: mark reloc reqs to force FUA before write of relocated data
  ploop: direct: to support truly FLUSH/FUA of req we need mark first
bio FLUSH, write all bios and mark last bio as FLUSH/FUA
  ploop: added ploop_req_delay_fua_possible() func that detects possible
delaying of upcoming FUA to index update stage. This function will
be lately used in direct/kaio code to detect and delay FUA
  ploop: make image fsync at I/O complete if it's required by FUA/fsync
force flag or by req-req_rw
  ploop: do preflush or postfua according force FUA/flush flags, and
delay FUA if possible but add force FLUSH to req if so

This patch description:
These defines are needed to force FUA/FLUSH/fsync in the direct/kaio modules.

https://jira.sw.ru/browse/PSBM-31222
https://jira.sw.ru/browse/PSBM-31225
https://jira.sw.ru/browse/PSBM-31321

Signed-off-by: Andrey Smetanin asmeta...@parallels.com

Reviewed-by: Andrew Vagin ava...@parallels.com
---
 include/linux/ploop/ploop.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h
index d8b83a6..73280e0 100644
--- a/include/linux/ploop/ploop.h
+++ b/include/linux/ploop/ploop.h
@@ -456,6 +456,9 @@ enum
PLOOP_REQ_ZERO,
PLOOP_REQ_DISCARD,
PLOOP_REQ_RSYNC,
+   PLOOP_REQ_FORCE_FUA,/*force fua of req write I/O by engine */
+   PLOOP_REQ_FORCE_FLUSH,  /*force flush by engine */
+   PLOOP_REQ_KAIO_FSYNC,   /*force image fsync by KAIO module */
 };
 
 enum
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: direct: to support truly FLUSH/FUA of req we need mark first bio FLUSH, write all bios and mark last bio as FLUSH/FUA

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 822c64967450485c2a26b9cfbf388d85ad022781
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:13 2015 +0400

ploop: direct: to support truly FLUSH/FUA of req we need mark first bio 
FLUSH, write all bios and mark last bio as FLUSH/FUA

Series description:

During relocation of ploop clusters (resize/balloon) we need to FUA/fsync
the image file after the following operations:
 a) a new data block is written
 b) BAT update
 c) nullifying the old data block for BAT grow. This nullification of the
old data block is already done in the format module's complete_grow
callback.

This patch forces fsync (kaio) or FUA (direct) of reloc write I/O to the
image by marking such reloc reqs (A|S) with appropriate flags. The
kaio/direct modules are tuned by the patch to force fsync/FUA if these
flags are set. This code does FUA/fsync only for cases a) and b), while
c) is already implemented.

The patch also fixes inconsistent bio list FUA processing in the direct
module. The problem is that for a bunch of bios we only set FUA on the last
bio. In case of a power outage it is possible that the last bio will be
stored and the previous ones will not, because they are held only in the
cache at the time of the power failure.
To solve the problem, this patch marks the last bio as FLUSH|FUA if there is
more than one bio in the list.

Moreover, for KAIO, if fsync is possible at the BAT update stage, we do it
there, as we did in the direct case, instead of doing 2 fsyncs. For the
direct case, if we are going to do FUA at BAT update only (an optimization
trick that already exists), then we need to mark the req to FLUSH the
previously written (without FUA) data.

Performance:
Overall resize performance (including EXT4 resize up to 16T) degraded by 5%
in terms of time.

https://jira.sw.ru/browse/PSBM-31222
https://jira.sw.ru/browse/PSBM-31225
https://jira.sw.ru/browse/PSBM-31321

Signed-off-by: Andrey Smetanin asmeta...@parallels.com

Andrey Smetanin (7):
  ploop: define struct ploop_request-state flags to force pre FLUSH
before write IO and FUA/fsync at I/O complete
  ploop: mark reloc reqs to force FUA/fsync(kaio) for index update I/O
  ploop: mark reloc reqs to force FUA before write of relocated data
  ploop: direct: to support truly FLUSH/FUA of req we need mark first
bio FLUSH, write all bios and mark last bio as FLUSH/FUA
  ploop: added ploop_req_delay_fua_possible() func that detects possible
delaying of upcoming FUA to index update stage. This function will
be lately used in direct/kaio code to detect and delay FUA
  ploop: make image fsync at I/O complete if it's required by FUA/fsync
force flag or by req-req_rw
  ploop: do preflush or postfua according force FUA/flush flags, and
delay FUA if possible but add force FLUSH to req if so

This patch description:
The patch fixes inconsistent bio list FUA processing in the direct module.
The problem is that for a bunch of bios we only set FUA on the last bio. In
case of a power outage it is possible that the last bio will be stored and
the previous ones will not, because they are held only in the cache at the
time of the power failure.
To solve the problem, this patch marks the last bio as FLUSH|FUA if there is
more than one bio in the list.

https://jira.sw.ru/browse/PSBM-31222
https://jira.sw.ru/browse/PSBM-31225
https://jira.sw.ru/browse/PSBM-31321

Signed-off-by: Andrey Smetanin asmeta...@parallels.com

Reviewed-by: Andrew Vagin ava...@parallels.com
---
 drivers/block/ploop/io_direct.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index 5e2e078..2e81d81 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -85,6 +85,7 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq,
int preflush;
int postfua = 0;
int write = !!(rw  REQ_WRITE);
+   int bio_num;
 
trace_submit(preq);
 
@@ -215,6 +216,7 @@ flush_bio:
}
extent_put(em);
 
+   bio_num = 0;
while (bl.head) {
struct bio * b = bl.head;
unsigned long rw2 = rw;
@@ -230,10 +232,11 @@ flush_bio:
preflush = 0;
}
if (unlikely(postfua  !bl.head))
-   rw2 |= REQ_FUA;
+   rw2 |= (REQ_FUA | ((bio_num) ? REQ_FLUSH : 0));
 
ploop_acc_ff_out(preq-plo, rw2 | b-bi_rw);
submit_bio(rw2  ~(bl.head ? REQ_SYNC : 0), b);
+   bio_num++;
}
 
ploop_complete_io_request(preq);
@@ -1341,9 +1344,12 @@ dio_io_page(struct ploop_io * io, unsigned long rw,
int err;
int off;
int postfua;
+   int

[Devel] [PATCH RHEL7 COMMIT] ploop: added printk of function, line, backtrace before ploop_set_error

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit a24f999555ce0caf879864e9b623908830e54232
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:17 2015 +0400

ploop: added printk of function, line, backtrace before ploop_set_error

There are several bugs where we only see messages like:
ploop_set_error=-7 on ploop28808. Sometimes such information is
insufficient to quickly find why and where an error in ploop happens.
This patch adds an extra printk of the function, line and stack backtrace
in case of a ploop request failure.

Signed-off-by: Andrey Smetanin asmeta...@virtuozzo.com
---
 drivers/block/ploop/dev.c| 32 +++-
 drivers/block/ploop/fmt_ploop1.c |  2 +-
 drivers/block/ploop/io_direct.c  | 20 ++--
 drivers/block/ploop/io_kaio.c| 16 
 drivers/block/ploop/map.c| 12 ++--
 include/linux/ploop/ploop.h  | 36 +++-
 6 files changed, 79 insertions(+), 39 deletions(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index bd5fe37..f780618 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -1331,7 +1331,7 @@ void ploop_fail_request(struct ploop_request * preq, int 
err)
 {
struct ploop_device * plo = preq-plo;
 
-   ploop_set_error(preq, err);
+   ploop_req_set_error(preq, err);
 
spin_lock_irq(plo-lock);
if (err == -ENOSPC) {
@@ -1351,13 +1351,19 @@ void ploop_fail_immediate(struct ploop_request * preq, 
int err)
 {
struct ploop_device * plo = preq-plo;
 
-   ploop_set_error(preq, err);
+   ploop_req_set_error(preq, err);
 
set_bit(PLOOP_S_ABORT, plo-state);
preq-eng_state = PLOOP_E_COMPLETE;
ploop_complete_request(preq);
 }
 
+#define PLOOP_REQ_FAIL_IMMEDIATE(preq, err)\
+   do {\
+   PLOOP_REQ_TRACE_ERROR(preq, err);   \
+   ploop_fail_immediate(preq, err);\
+   } while (0);
+
 void ploop_complete_io_state(struct ploop_request * preq)
 {
struct ploop_device * plo = preq-plo;
@@ -1577,7 +1583,7 @@ ploop_reloc_sched_read(struct ploop_request *preq, 
iblock_t iblk)
 
if (!preq-aux_bio ||
fill_bio(plo, preq-aux_bio, preq-req_cluster)) {
-   ploop_fail_immediate(preq, -ENOMEM);
+   PLOOP_REQ_FAIL_IMMEDIATE(preq, -ENOMEM);
return;
}
}
@@ -2064,7 +2070,7 @@ restart:
 
if (!preq-aux_bio ||
fill_bio(plo, preq-aux_bio, preq-req_cluster)) {
-   ploop_fail_immediate(preq, -ENOMEM);
+   PLOOP_REQ_FAIL_IMMEDIATE(preq, -ENOMEM);
return;
}
 
@@ -2170,7 +2176,7 @@ delta_io:
 
if (!preq-aux_bio ||
fill_bio(plo, preq-aux_bio, 
preq-req_cluster)) {
-   ploop_fail_immediate(preq, -ENOMEM);
+   PLOOP_REQ_FAIL_IMMEDIATE(preq, -ENOMEM);
return;
}
spin_lock_irq(plo-lock);
@@ -2225,7 +2231,7 @@ delta_io:
return;
 
 error:
-   ploop_fail_immediate(preq, err);
+   PLOOP_REQ_FAIL_IMMEDIATE(preq, err);
 }
 
 static void ploop_req_state_process(struct ploop_request * preq)
@@ -2271,7 +2277,7 @@ restart:
if (preq-error ||
((preq-req_rw  REQ_WRITE) 
 test_bit(PLOOP_S_ABORT, plo-state))) {
-   ploop_fail_immediate(preq, preq-error ? : -EIO);
+   PLOOP_REQ_FAIL_IMMEDIATE(preq, preq-error ? : -EIO);
break;
}
 
@@ -2346,7 +2352,7 @@ restart:
 */
if (preq-error ||
test_bit(PLOOP_S_ABORT, plo-state)) {
-   ploop_fail_immediate(preq, preq-error ? : -EIO);
+   PLOOP_REQ_FAIL_IMMEDIATE(preq, preq-error ? : -EIO);
break;
}
 
@@ -2386,7 +2392,7 @@ restart:
 
if (!preq-aux_bio ||
fill_bio(plo, preq-aux_bio, preq-req_cluster)) {
-   ploop_fail_immediate(preq, -ENOMEM);
+   PLOOP_REQ_FAIL_IMMEDIATE(preq, -ENOMEM);
break;
}
 
@@ -2425,7 +2431,7 @@ restart:
 
if (preq-error ||
test_bit(PLOOP_S_ABORT, plo-state)) {
-   ploop_fail_immediate(preq, preq-error ? : -EIO);
+

[Devel] [PATCH RHEL7 COMMIT] ploop: fix accounting ploop_io_images_size

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 09fe781813a8c4e74ad6f9621cf26acb705424f4
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:26:54 2015 +0400

ploop: fix accounting ploop_io_images_size

ploop_io_images_size is a global counter supposed to be the total sum of
io->size for all io structs. However, the actual size of an image can be
changed by userspace, e.g. when userspace grows the lower delta for a merge
operation. This means that by the time ploop_dio_close() is called, the
actual size may differ slightly from the size we initially accounted.

The patch fixes the problem by accurate accounting of image sizes: we can
subtract from ploop_io_images_size only as many bytes as we actually added
earlier.

Another fix is for growing the lower delta in userspace: we try to catch up
with changes made by userspace when the kernel merge starts and
ploop_dio_upgrade is being called.

https://jira.sw.ru/browse/PSBM-19906

Signed-off-by: Maxim V. Patlasov mpatla...@parallels.com
---
 drivers/block/ploop/io_direct.c | 12 +---
 drivers/block/ploop/io_direct_map.c | 20 +++-
 drivers/block/ploop/io_direct_map.h |  6 +++---
 3 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index 8d716ca..cbb7edc 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -852,7 +852,7 @@ static void dio_destroy(struct ploop_io * io)
if (io-files.em_tree) {
io-files.em_tree = NULL;
mutex_lock(io-files.inode-i_mutex);
-   ploop_dio_close(io-files.mapping, delta-flags  
PLOOP_FMT_RDONLY);
+   ploop_dio_close(io, delta-flags  PLOOP_FMT_RDONLY);
(void)dio_invalidate_cache(io-files.mapping, 
io-files.bdev);
mutex_unlock(io-files.inode-i_mutex);
}
@@ -910,7 +910,7 @@ static int dio_open(struct ploop_io * io)
dio_fsync(file);
 
mutex_lock(io-files.inode-i_mutex);
-   em_tree = ploop_dio_open(io-files.file, (delta-flags  
PLOOP_FMT_RDONLY));
+   em_tree = ploop_dio_open(io, (delta-flags  PLOOP_FMT_RDONLY));
err = PTR_ERR(em_tree);
if (IS_ERR(em_tree))
goto out;
@@ -920,7 +920,7 @@ static int dio_open(struct ploop_io * io)
err = dio_invalidate_cache(io-files.mapping, io-files.bdev);
if (err) {
io-files.em_tree = NULL;
-   ploop_dio_close(io-files.mapping, 0);
+   ploop_dio_close(io, 0);
goto out;
}
 
@@ -930,7 +930,7 @@ static int dio_open(struct ploop_io * io)
  delta-plo-index);
if (io-fsync_thread == NULL) {
io-files.em_tree = NULL;
-   ploop_dio_close(io-files.mapping, 0);
+   ploop_dio_close(io, 0);
goto out;
}
wake_up_process(io-fsync_thread);
@@ -938,8 +938,6 @@ static int dio_open(struct ploop_io * io)
 
 out:
mutex_unlock(io-files.inode-i_mutex);
-   if (!err)
-   io-size = i_size_read(io-files.inode);
return err;
 }
 
@@ -1644,7 +1642,7 @@ static int dio_prepare_merge(struct ploop_io * io, struct 
ploop_snapdata *sd)
return err;
}
 
-   err = ploop_dio_upgrade(io-files.mapping);
+   err = ploop_dio_upgrade(io);
if (err) {
mutex_unlock(io-files.inode-i_mutex);
fput(file);
diff --git a/drivers/block/ploop/io_direct_map.c 
b/drivers/block/ploop/io_direct_map.c
index 62984bf..2ddf93a 100644
--- a/drivers/block/ploop/io_direct_map.c
+++ b/drivers/block/ploop/io_direct_map.c
@@ -52,10 +52,11 @@ extern atomic_long_t ploop_io_images_size;
  */
 
 struct extent_map_tree *
-ploop_dio_open(struct file * file, int rdonly)
+ploop_dio_open(struct ploop_io * io, int rdonly)
 {
int err;
struct ploop_mapping *m, *pm;
+   struct file * file = io-files.file;
struct address_space * mapping = file-f_mapping;
 
pm = kzalloc(sizeof(struct ploop_mapping), GFP_KERNEL);
@@ -100,7 +101,8 @@ out_unlock:
pm-readers = rdonly ? 1 : -1;
list_add(pm-list, ploop_mappings);
mapping-host-i_flags |= S_SWAPFILE;
-   atomic_long_add(i_size_read(mapping-host), ploop_io_images_size);
+   io-size = i_size_read(mapping-host);
+   atomic_long_add(io-size, ploop_io_images_size);
 
pm-saved_gfp_mask = mapping_gfp_mask(mapping);
mapping_set_gfp_mask(mapping,
@@ -125,8 +127,9 @@ out_unlock:
 }
 
 int
-ploop_dio_close(struct address_space * mapping, int rdonly)
+ploop_dio_close(struct ploop_io *

[Devel] [PATCH RHEL7 COMMIT] ploop: fix dio_fsync wait

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 732676a0eafac636b18f060f06b79312e0ad829d
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:26:55 2015 +0400

ploop: fix dio_fsync wait

We MUST wait for writeback to finish before calling file->f_op->fsync.

https://jira.sw.ru/browse/PSBM-18049

Signed-off-by: Dmitry Monakhov dmonak...@openvz.org

Acked-by: Maxim V. Patlasov mpatla...@parallels.com
---
 drivers/block/ploop/io_direct.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index cbb7edc..a9910ba 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -774,16 +774,13 @@ static int dio_fsync(struct file * file)
int err, ret;
struct address_space *mapping = file-f_mapping;
 
-   ret = filemap_fdatawrite(mapping);
+   ret = filemap_write_and_wait(mapping);
err = 0;
if (file-f_op  file-f_op-fsync) {
err = file-f_op-FOP_FSYNC(file, 0);
if (!ret)
ret = err;
}
-   err = filemap_fdatawait(mapping);
-   if (!ret)
-   ret = err;
return ret;
 }
 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: rework accounting images_size

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 102acd98b5d6ddf9a310a0c0a54e21d6903d23ab
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:26:58 2015 +0400

ploop: rework accounting images_size

The way we kept ploop_io_images_size up-to-date (it should always be
equal to the total number of bytes of all loaded image files) was very prone
to errors: the first delta loaded kept the actual io->size, then the backup
delta was initialized with io->size=0; after that, if the first delta was
unloaded before the second, unloading the second delta led to subtracting
io->size=0 from ploop_io_images_size. This is obviously incorrect.

The patch makes the accounting much more straightforward: the size of image
is actually the property of mapping, not a delta (because several deltas
may point to the same mapping). So, let's keep actual 'size' in 
ploop_mapping
structure and let every delta point to it.

No extra locking is needed because the image is either opened by several
devices read-only, or by one and only one device read-write.

https://jira.sw.ru/browse/PSBM-20432

Signed-off-by: Maxim Patlasov mpatla...@parallels.com
---
 drivers/block/ploop/io_direct.c | 10 +-
 drivers/block/ploop/io_direct_map.c | 16 ++--
 include/linux/ploop/ploop.h |  2 +-
 3 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index 17dbf6c..ab74849 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -31,7 +31,7 @@
 int max_extent_map_pages __read_mostly;
 int min_extent_map_entries __read_mostly;
 
-/* total sum of io-size for all io structs */
+/* total sum of m-size for all ploop_mapping structs */
 atomic_long_t ploop_io_images_size = ATOMIC_LONG_INIT(0);
 
 /* Direct IO from/to file.
@@ -436,8 +436,8 @@ try_again:
mutex_unlock(io-files.inode-i_mutex);
 
new_size = i_size_read(io-files.inode);
-   atomic_long_add(new_size - io-size, ploop_io_images_size);
-   io-size = new_size;
+   atomic_long_add(new_size - *io-size_ptr, ploop_io_images_size);
+   *io-size_ptr = new_size;
 
if (!err)
err = filemap_fdatawrite(io-files.mapping);
@@ -1684,8 +1684,8 @@ static int dio_truncate(struct ploop_io * io, struct file 
* file,
mutex_unlock(io-files.inode-i_mutex);
 
new_size = i_size_read(io-files.inode);
-   atomic_long_sub(io-size - new_size, ploop_io_images_size);
-   io-size = new_size;
+   atomic_long_sub(*io-size_ptr - new_size, ploop_io_images_size);
+   *io-size_ptr = new_size;
 
if (!err)
err = dio_fsync(file);
diff --git a/drivers/block/ploop/io_direct_map.c 
b/drivers/block/ploop/io_direct_map.c
index 6b0886c..b3cb04d 100644
--- a/drivers/block/ploop/io_direct_map.c
+++ b/drivers/block/ploop/io_direct_map.c
@@ -31,6 +31,7 @@ struct ploop_mapping
struct address_space* mapping;
int readers;
unsigned long   saved_gfp_mask;
+   loff_t  size;
 
struct extent_map_tree  extent_root;
 };
@@ -81,6 +82,8 @@ out_unlock:
spin_unlock(ploop_mappings_lock);
if (pm)
kfree(pm);
+   if (!err)
+   io-size_ptr = m-size;
return err ? ERR_PTR(err) : m-extent_root;
}
}
@@ -101,8 +104,9 @@ out_unlock:
pm-readers = rdonly ? 1 : -1;
list_add(pm-list, ploop_mappings);
mapping-host-i_flags |= S_SWAPFILE;
-   io-size = i_size_read(mapping-host);
-   atomic_long_add(io-size, ploop_io_images_size);
+   io-size_ptr = pm-size;
+   *io-size_ptr = i_size_read(mapping-host);
+   atomic_long_add(*io-size_ptr, ploop_io_images_size);
 
pm-saved_gfp_mask = mapping_gfp_mask(mapping);
mapping_set_gfp_mask(mapping,
@@ -143,9 +147,9 @@ ploop_dio_close(struct ploop_io * io, int rdonly)
}
 
if (m-readers == 0) {
-   atomic_long_sub(io-size,
+   atomic_long_sub(*io-size_ptr,
ploop_io_images_size);
-   io-size = 0;
+   *io-size_ptr = 0;
mapping-host-i_flags = ~S_SWAPFILE;
list_del(m-list);
pm = m;
@@ -191,9 +195,9 @@ int ploop_dio_upgrade(struct ploop_io * io)
err = -EBUSY;
if (m-readers == 1) {
loff_t new_size = i_size_read(io-files.inode);
-

[Devel] [PATCH RHEL7 COMMIT] ploop: fix race in ploop_tracker_init()

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit df915f10c4c348fb40ab7fded3ae860b715d7103
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:03 2015 +0400

ploop: fix race in ploop_tracker_init()

ploop_tracker_init() may acquire current alloc_head only after quiescing
ploop. Otherwise a race is possible:

1) we acquire an alloc_head:
e.end = (u64)ploop_top_delta(plo)->io.alloc_head << (plo->cluster_log + 9);

2) then the alloc_head is advanced due to submit_alloc writes

3) we turn write tracker ON: set_bit(PLOOP_S_TRACK, plo-state).

The result is disastrous: the 1st iteration of userspace vzmigrate won't copy
the blocks allocated in step 2) because we reported the old e.end; and then
vzmigrate also won't copy those blocks later because they were allocated while
the write tracker was off.

https://jira.sw.ru/browse/PSBM-22993

Signed-off-by: Maxim Patlasov mpatla...@parallels.com
---
 drivers/block/ploop/tracker.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/block/ploop/tracker.c b/drivers/block/ploop/tracker.c
index 5dbb7c9..3210006 100644
--- a/drivers/block/ploop/tracker.c
+++ b/drivers/block/ploop/tracker.c
@@ -101,12 +101,15 @@ int ploop_tracker_init(struct ploop_device * plo, 
unsigned long arg)
if (list_empty(plo-map.delta_list))
return -ENOENT;
 
+   ploop_quiesce(plo);
+
e.start = 0;
e.end = (u64)ploop_top_delta(plo)-io.alloc_head  (plo-cluster_log + 
9);
-   if (copy_to_user((void*)arg, e, sizeof(struct ploop_track_extent)))
+   if (copy_to_user((void*)arg, e, sizeof(struct ploop_track_extent))) {
+   ploop_relax(plo);
return -EFAULT;
+   }
 
-   ploop_quiesce(plo);
set_bit(PLOOP_S_TRACK, plo-state);
plo-maintenance_type = PLOOP_MNTN_TRACK;
plo-track_end = 0;
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: add ioctl to limit size of top delta (v2)

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit fd12acccf76e1a56f073322414cd43cbffa598ba
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:04 2015 +0400

ploop: add ioctl to limit size of top delta (v2)

A customer created an online backup => backup.tib file.  Most probably the
.tib file is inconsistent and in order to correctly access data inside
we need to replay the journal.

=> We have to provide the container with a bundle of the .tib file (read-only)
and a tiny read-writable ploop delta - the journal replayed data will be stored
there.

A cunning customer can notice that the delta is writable and fill it
with his own data - unlimited.

=> we need the ability to limit the ploop delta max size.

https://jira.sw.ru/browse/PSBM-22002

v2: move declaration of PLOOP_IOC_MAX_DELTA_SIZE

Signed-off-by: Andrew Vagin ava...@openvz.org

Acked-by: Maxim V. Patlasov mpatla...@parallels.com
---
 drivers/block/ploop/dev.c| 20 
 drivers/block/ploop/fmt_ploop1.c |  5 +
 include/linux/ploop/ploop.h  |  2 ++
 include/linux/ploop/ploop_if.h   |  3 +++
 4 files changed, 30 insertions(+)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 0124349..33f8442 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -2770,6 +2770,7 @@ init_delta(struct ploop_device * plo, struct ploop_ctl * 
ctl, int level)
delta-plo = plo;
delta-ops = ops;
delta-flags = ctl-pctl_flags  PLOOP_FMT_FLAGS;
+   delta-max_delta_size = ULLONG_MAX;
 
KOBJECT_INIT(delta-kobj, ploop_delta_ktype);
return delta;
@@ -2780,6 +2781,22 @@ out_err:
 }
 
 
+static int ploop_set_max_delta_size(struct ploop_device *plo, unsigned long 
arg)
+{
+   struct ploop_delta * top_delta = ploop_top_delta(plo);
+   u64 max_delta_size;
+
+   if (copy_from_user(max_delta_size, (void*)arg, sizeof(u64)))
+   return -EFAULT;
+
+   if (top_delta == NULL)
+   return -EINVAL;
+
+   top_delta-max_delta_size = max_delta_size;
+
+   return 0;
+}
+
 static int ploop_add_delta(struct ploop_device * plo, unsigned long arg)
 {
int err;
@@ -4419,6 +4436,9 @@ static int ploop_ioctl(struct block_device *bdev, fmode_t 
fmode, unsigned int cm
case PLOOP_IOC_DISCARD_WAIT:
err = ploop_discard_wait_ioc(plo);
break;
+   case PLOOP_IOC_MAX_DELTA_SIZE:
+   err = ploop_set_max_delta_size(plo, arg);
+   break;
default:
err = -EINVAL;
}
diff --git a/drivers/block/ploop/fmt_ploop1.c b/drivers/block/ploop/fmt_ploop1.c
index 5ce6915..585f6ce 100644
--- a/drivers/block/ploop/fmt_ploop1.c
+++ b/drivers/block/ploop/fmt_ploop1.c
@@ -222,6 +222,11 @@ static void
 ploop1_allocate(struct ploop_delta * delta, struct ploop_request * preq,
struct bio_list * sbl, unsigned int size)
 {
+   if (delta-io.alloc_head =
+   (delta-max_delta_size  delta-cluster_log)) {
+   ploop_fail_request(preq, -E2BIG);
+   return;
+   }
delta-io.ops-submit_alloc(delta-io, preq, sbl, size);
 }
 
diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h
index 434789e..ae3dbfc 100644
--- a/include/linux/ploop/ploop.h
+++ b/include/linux/ploop/ploop.h
@@ -286,6 +286,8 @@ struct ploop_delta
struct ploop_delta_ops  *ops;
 
struct kobject  kobj;
+
+   u64 max_delta_size; /* in sectors */
 };
 
 struct ploop_tunable
diff --git a/include/linux/ploop/ploop_if.h b/include/linux/ploop/ploop_if.h
index 45b74fc..aacddb3 100644
--- a/include/linux/ploop/ploop_if.h
+++ b/include/linux/ploop/ploop_if.h
@@ -299,6 +299,9 @@ struct ploop_track_extent
 /* Filter extents with sizes less than arg */
 #define PLOOP_IOC_FBFILTER _IOR(PLOOPCTLTYPE, 27, unsigned long)
 
+/* Set maximum size for the top delta . */
+#define PLOOP_IOC_MAX_DELTA_SIZE _IOW(PLOOPCTLTYPE, 28, __u64)
+
 /* Events exposed via /sys/block/ploopN/pstate/event */
 #define PLOOP_EVENT_ABORTED1
 #define PLOOP_EVENT_STOPPED2
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: expose open_count to sysfs

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 97ba7085978ee4245beaf43c5543cb230094189d
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:06 2015 +0400

ploop: expose open_count to sysfs

The patch shows the number of opened instances of a ploop-device (plo->open_count)
in /sys/block/ploopN/pstate/open_count. This will allow userspace to decide
whether a ploop-device is used by someone (e.g. backup) by scanning the
open_count for all ploop-devices.

https://jira.sw.ru/browse/PSBM-24754

Signed-off-by: Maxim Patlasov mpatla...@parallels.com
---
 drivers/block/ploop/sysfs.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/block/ploop/sysfs.c b/drivers/block/ploop/sysfs.c
index 5c31826..3ef53ac 100644
--- a/drivers/block/ploop/sysfs.c
+++ b/drivers/block/ploop/sysfs.c
@@ -383,6 +383,11 @@ static u32 show_event(struct ploop_device * plo)
return ret;
 }
 
+static u32 show_open_count(struct ploop_device * plo)
+{
+   return atomic_read(plo-open_count);
+}
+
 static ssize_t print_cookie(struct ploop_device * plo, char * page)
 {
return sprintf(page, %s\n, plo-cookie);
@@ -466,6 +471,7 @@ static struct attribute *state_attributes[] = {
_A(top),
_A(event),
_A3(cookie),
+   _A(open_count),
NULL
 };
 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ploop: ensure non-empty delta list on running ploop

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit ace664d9412efd2fa814851b0be55751dc9ed13a
Author: Andrey Smetanin asmeta...@virtuozzo.com
Date:   Tue May 19 08:27:05 2015 +0400

ploop: ensure non-empty delta list on running ploop

The only allowed use-case for delta removal on running ploop is the merge
operation. But merge never removes base delta. Hence, we can prohibit
base delta removal if ploop is in RUNNING state. This resolves the following
issue: buggy userspace removes all deltas from a running ploop, but leaves
ploop in RUNNING state making any further ploop-mount impossible.

https://jira.sw.ru/browse/PSBM-25102

Signed-off-by: Maxim Patlasov mpatla...@parallels.com

Acked-by: Pavel Emelyanov xe...@parallels.com
---
 drivers/block/ploop/dev.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 33f8442..d2a9eb4 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -3173,6 +3173,12 @@ static int ploop_del_delta(struct ploop_device * plo, 
unsigned long arg)
if (plo-maintenance_type != PLOOP_MNTN_OFF)
return -EBUSY;
 
+   if (level == 0  test_bit(PLOOP_S_RUNNING, plo-state)) {
+   printk(KERN_INFO Can't del base delta on running ploop%d\n,
+  plo-index);
+   return -EBUSY;
+   }
+
delta = find_delta(plo, level);
 
if (delta == NULL)
@@ -3197,6 +3203,8 @@ static int ploop_del_delta(struct ploop_device * plo, 
unsigned long arg)
delta-ops-stop(delta);
delta-ops-destroy(delta);
kobject_put(delta-kobj);
+   BUG_ON(test_bit(PLOOP_S_RUNNING, plo-state) 
+  list_empty(plo-map.delta_list));
return 0;
 }
 
@@ -3573,6 +3581,7 @@ static int ploop_start(struct ploop_device * plo, struct 
block_device *bdev)
 
wake_up_process(plo-thread);
set_bit(PLOOP_S_RUNNING, plo-state);
+   BUG_ON(list_empty(plo-map.delta_list));
return 0;
 
 out_err:
@@ -3605,8 +3614,11 @@ static int ploop_stop(struct ploop_device * plo, struct 
block_device *bdev)
if (!test_bit(PLOOP_S_RUNNING, plo-state))
return -EINVAL;
 
-   if (list_empty(plo-map.delta_list))
+   if (list_empty(plo-map.delta_list)) {
+   printk(KERN_INFO stop ploop%d failed (no deltas)\n,
+  plo-index);
return -ENOENT;
+   }
 
cnt = atomic_read(plo-open_count);
if (cnt  1) {
@@ -3755,6 +3767,7 @@ static int ploop_clear(struct ploop_device * plo, struct 
block_device * bdev)
plo-maintenance_type = PLOOP_MNTN_OFF;
plo-bd_size = 0;
plo-state = (1  PLOOP_S_CHANGED);
+   BUG_ON(test_bit(PLOOP_S_RUNNING, plo-state));
return 0;
 }
 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RHEL7 COMMIT] ve: device cgroup -- Implement devcgroup_seq_show_ve

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit bc411f061cc8878edb65db52b8e58ab2fa218186
Author: Cyrill Gorcunov gorcu...@odin.com
Date:   Tue May 19 00:43:32 2015 +0400

ve: device cgroup -- Implement devcgroup_seq_show_ve

In PCS7 cgroups are configured from user space, so there is
no longer a connection from the ve to the device cgroup via css as
there was in PCS6. Instead we should open the device cgroup explicitly.

https://jira.sw.ru/browse/PSBM-33555

v2 (by vdavydov@):
 - use ve::ve_name because we're switching to UUID based containers

Signed-off-by: Cyrill Gorcunov gorcu...@odin.com
Reviewed-by: Vladimir Davydov vdavy...@parallels.com

CC: Konstantin Khorenko khore...@odin.com
CC: Andrey Vagin ava...@odin.com
---
 include/linux/device_cgroup.h |  3 ++-
 kernel/ve/vecalls.c   |  2 +-
 security/device_cgroup.c  | 14 +++---
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h
index bc58c4c..32588bb 100644
--- a/include/linux/device_cgroup.h
+++ b/include/linux/device_cgroup.h
@@ -19,7 +19,8 @@ extern int devcgroup_device_visible(umode_t mode, int major,
 struct cgroup;
 int devcgroup_default_perms_ve(struct cgroup *cgroup);
 int devcgroup_set_perms_ve(struct cgroup *cgroup, unsigned, dev_t, unsigned);
-int devcgroup_seq_show_ve(struct cgroup *cgroup, unsigned veid, struct 
seq_file *m);
+struct ve_struct;
+int devcgroup_seq_show_ve(struct cgroup *devices_root, struct ve_struct *ve, 
struct seq_file *m);
 
 #else
 static inline int devcgroup_inode_permission(struct inode *inode, int mask)
diff --git a/kernel/ve/vecalls.c b/kernel/ve/vecalls.c
index 7c574b3..2613a1e 100644
--- a/kernel/ve/vecalls.c
+++ b/kernel/ve/vecalls.c
@@ -891,7 +891,7 @@ static int devperms_seq_show(struct seq_file *m, void *v)
if (ve_is_super(ve))
seq_printf(m, %10u b 016 *:*\n%10u c 006 *:*\n, 0, 0);
else
-   devcgroup_seq_show_ve(ve-css.cgroup, ve-veid, m);
+   devcgroup_seq_show_ve(devices_root, ve, m);
 
return 0;
 }
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 31024f7..33a9883 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -17,6 +17,7 @@
 #include linux/major.h
 #include linux/module.h
 #include linux/capability.h
+#include linux/ve.h
 
 #define ACC_MKNOD 1
 #define ACC_READ  2
@@ -1091,10 +1092,16 @@ int devcgroup_set_perms_ve(struct cgroup *cgroup,
 }
 EXPORT_SYMBOL(devcgroup_set_perms_ve);
 
-int devcgroup_seq_show_ve(struct cgroup *cgroup, unsigned veid, struct 
seq_file *m)
+int devcgroup_seq_show_ve(struct cgroup *devices_root, struct ve_struct *ve, 
struct seq_file *m)
 {
-   struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup);
struct dev_exception_item *wh;
+   struct dev_cgroup *devcgroup;
+   struct cgroup *cgroup;
+
+   cgroup = cgroup_kernel_open(devices_root, 0, ve_name(ve));
+   if (IS_ERR(cgroup))
+   return PTR_ERR(cgroup);
+   devcgroup = cgroup_to_devcgroup(cgroup);
 
rcu_read_lock();
list_for_each_entry_rcu(wh, devcgroup-exceptions, list) {
@@ -1112,12 +1119,13 @@ int devcgroup_seq_show_ve(struct cgroup *cgroup, 
unsigned veid, struct seq_file
perm |= S_IXOTH;
 
seq_printf(m, %10u %c %03o %s:%s\n,
-   veid,
+   ve-veid,
type_to_char(wh-type),
perm, maj, min);
}
rcu_read_unlock();
 
+   cgroup_kernel_close(cgroup);
return 0;
 }
 EXPORT_SYMBOL(devcgroup_seq_show_ve);
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH rh7] scripts: Delete generated binary files from kernel tree.

2015-05-18 Thread Kirill Tkhai

Signed-off-by: Kirill Tkhai ktk...@odin.com
---
 scripts/basic/fixdep |  Bin
 scripts/kconfig/conf |  Bin
 2 files changed, 0 insertions(+), 0 deletions(-)
 delete mode 100755 scripts/basic/fixdep
 delete mode 100755 scripts/kconfig/conf

diff --git a/scripts/basic/fixdep b/scripts/basic/fixdep
deleted file mode 100755
index 
2d8a408aef61f56512dc8ac61eb828492789022b..
GIT binary patch
literal 0
HcmV?d1

literal 13875
zcmeHOdvH@%dOwm47-J;yaN-aGmBxY5`@hwHeezd8Lp^ICS;m$hEL#tc@*sq$|ML
zOae7#Q5C0jH{G3ZI-6HX?CaWsNfO|e-U*kscw+1YN=vfEIW5DAmUn+I{S0rTv
zdyaG^teO3z{g30({m%D0-|M{ZM`PI-x{mM!ep|tRgAcyDJ~8fS7h(ztiWp6WaeUX
z*qv-5kO}y?cnMJC+Vm#EoqUED*!FTuf}QsHC7kTNIKhqnWQp8qU0}=Tq0}GtG5}f
zqL#XAU%rJF^99D%{(LNQ4vSc6r)_~5lngLa##TNqSBMCaJ6+)s22l!fun*1(+Z
z5$5673?*4g8{XGjDL=cEv!E`Xku;y)A3=}e{C$%{2yPVq^0mu|sGpQWeYok+m|g_
z*wMP6BNS9Ua-4n*@9(DoYAmz5wADdC)u^@H*-(29Tjj?znSa7@+cB{qDWPw_jG4
zy_l$}pZ?Nl1fp#_mezzCE4KhqKw61CG2+mD(`x3cJ0RrIXr-5rqyFxSasiYRD
zgU`={_vXQ=-sAc0VwJ$gIDLl_vOL=3Aln^Zh8p7c=4sg9r2A+wVh=E3jCgVXdH
zFP=yq{JA`MV;;N)_%wD0yPz~9$C#Xtk0VcbG6!cE$1OB$m$AU^pw2Y!!+BNZ(`5
zNF*F#$VXXcr@xCu^+-n`#5x0=EuF-*hGRPCy1H4XzoR4E!ZfW#@9qj{+cn}K0voI|
z79w`Yugw{W@#g6$$DA*3uD23WWYCfm9q!I0jTR@cKF;a!0UH==12pa#u|u66o@
zkd6cb5Z%!k4hdLeYkl7O)!HKG;iThNjkg8xs+PMI?f-!EC$`vy%`DN@W-bz(*^
zIF)AZ8G*gXWzlJEMVsZADb1NOg_$or!+dZnNym(=_M6;*)Oz)i=~$$lZ6=C@2`
zqQ52qpm=0bOt@HLc~ON4FOb5ZRVJL)ADL=QIQfzWx}ZrGN~qVv=ct372Dz${sP{
zXrOdzHsQr-l(BXbZf_O6JBDH-)+LtLFu%|gxk_6V+TyQ-Gm=B;j+)E_c*oW|BW_
z!p-yf6%#(gB!9$)7mf7aT6}KJYJ;sTrE-)6E@$jFtv9;pO72`to95Q5jCZ;rS%XQ
zoA1HZKG%gD@$Dp$x-f`ew3qbW-OyzleAmGN}R1-$#4_@kcm+H}Nz?QinM|i+CCu
zse_!KPCN~X)E?k6Hh}S)x~)$@iYWd7A-8Ht^K-sSTX}n0V^)l#BCMh^MYjRdN0z
z@zljBh4cSOJaui#!TFyPPhFZ~oPU#edMr}4FL0hLOgY0H_Ca5Kmo~I-4}iKi}0
z4RHR)#8X$LjS}Z;;D;Lhrw^r0%_X2;APf|3=WOA6~!4-upTDp0PrE?_+3JHL;~g
zjgP2_osRyGX1bUfFa6s=(WU#vg!dfzVszx;-!BBl6(;s)kNJlk*~g1J;_5fO`kx
zgZ2ZXFsa5b#?SOqS0yL#0{ej@1SjJDz*2T-q=w6M)jCG{_bnkeJ8l!O0w89I;O@?
zl+No_}S#W5D?5q$u}Spe=k{2l6!}!F4Y5%ZyQt-rDJy?v_##7QiYwofzT0=k!yJ
z8GgI}`v8+aG9W8KCn5fi5Kol8gDySUHG?bNj-q~wDcOgdnwa`LL{7{DaFT-TAB9-*
zk9kpD0zQ*l(h6G*g9rTH?8n46os5h8lqPzXI;W{jxAwOUk_RbCI*pWz`g8MEK
z)C^fjcjQ0o*lR_-Nn?ir@A;fs}#firt4#4pSB%5{%@FYo7DwB1wpn)cE_mtlI{U
z!#ba6pKj}?qOlA1xqApZPMnlKHCh09HU1Oz)Rl)+%Q^M!(HN9Zkf7T8bN$n8Dmsws
zUljU#f$pM#$wYpLo@jdx^%`2|EEJ{hFs`k_!ng#)VPCA5{oBI{qWkY`K+_uy=
zpn^K4q6T$0!O-hYW$OA^;c-?_P?Q1V`6-L*qcr@5@HMU$*NxKY{Kezz`ktQ8
zwz;acA!|q^Z2o)O7BxgKhS?9%R1e-s0ZuwDlqca!^dHjgSx_V-*Iv8s^=uNCF
zS9`}|#lxjs9uC#S+~lAC4!(QO$40zo3(H}Td9gHk_kP7Hn_28)#rV+AD;D;^M$%G
z{*fC0AA8Ro19ck=l1=7%L(k`}~D6}@ETR=sFtvAy?uG;n*)SbViNVwb$}Yb%%A
z_eFsD;-9H~ua{qdC$;i3HGU;INDC=q=IpJX5Rx;%##0sQPDeK-**`mO-t0#NbVOU
zZ$LTqmmzrl2^?o5h_WDl0zuzJ#XK)3(@pEjjF|#sfnehXDRUU0uQWz7vwR8Ui*^f
zz|tS1s6TE%F9V|XfQ@uWd;zFKEwBK1%8Tp=mccTndXd`KRKCF;5WFbqvd-@ulJgL
z-w-T#5)kK0foUCBLXxoKPsX}_I=-?dO*hCJb#~Uwfh2S5l?+d8)qf)uzAue^*D
zs_gq;fpuT}GB$1TrYPGKTwB7glHl{DELvAgyH}vv4TPZjvwp2P8a%WJIc#;
zKH*L@usT{5V{iE4!|IByyE^boS^OClmw07%s`w7gWdCo4qpO!CFAt6cH*hKy9
z{n%!N$yEEvH?I|{MX||yzSQi!Pg3wbr!B+3@FgfJGS{sLj~ffKA-#)Mv*N_K8cLC
zZPnJf%VN7G09Qu66*dRQJW=}-g5`K;T-PHU4pOHpGEWCG?`vIg(6J4ECCam*I?
zHun5ZIEpKGxZlY+yKMFKcwvbQ)D;(@o*WR5B6jA77XB-7{HE|#d?V^{;{twVriuD
zYda9t$?Ea*qX7_LS5f^idHyYVi-FV`HgRnAuqB9;chMoZnj)OzGPxgMg%NLWHa
z_8m0_+FuxfZy@czhh1pThW3ejq+)*s?9pr$Cx$A-$2+{qf^LHNSI!!zr@PmkE8g
z^w?3@^~FC(J_w240k7S2RjvHB+H-xP{rSn520b56svdB^e)_Lw@2x$(v($dFsbCTf
zH69NPw?WmEzC0ff!ls;$k1yM5^K6NRT~MIlB}EPZ#H*~vag}=nUua6orF?sl^
zx9=H8auWREE(ap^eY6a!eccXf$oLH!rfTw?#8{c(U4a9H=uHA+(Jx+C+;ta2QeM6
z=#_A9n1KM-c$SjGtKMmC-r+Kx%b@A^v2{g@{(ZniIqn%0)AstXwkAHXF+)Q?G?cy
zd+*nCvtqk79I-tE0!28VHH5K2%;DqxZ=%JLhpqul68bu^-Ow;P?OEj7oc6i
zAWSQsvEIhV+*{E0?#r#zq^}!ee(}YQLyUvEg-tdSNVSRdG*xw0=ffCe)bu)%!m3M
zX~Y-5;)`GNz{c3jU#UHV7PWRbc7*XO=Gt$MflkwuAg$5)y+5!hZMb7~9jZVhxT
zSGxRXh7l5d@GbtSi$qwj({Fj+QP9=D{g$G9G6hu`L+dp|pfUZNVK%Td*UrT;XOF
zr1FYg!H(7zf236jhFUset$~GxAR0~Di3Xnvgxf0c+*hg0t5cRJ;fS;5{At{s5Rh
zg#yvL(2SncEdNN%~Agj_%PI!m57;~u2NPSfg;n=*@{;fmMcr^phR0P(!TQ!$t2
z`Ug~_jD+D=R$xaU6o~lsKh$(Ob=6aobAjXX=z^=s%;Vx1iejGCfQg8co^!Y9+X
zb7vsJqTyJiC7{iZD$DDX`O)HH2FFpsZ!KmDKm}BW$)CV!nZ+8+w8n=$n|%$kP|A)
z+bfl=tayHOIXXaCtAybUv%Hw0k3n|c{mG3l|W~g-i_uXKU90modqjgptPF*;FGa2
zdKXcM0l5-u(PBTfx78A$}C}66jUXCM-uCC?5hn3OW-QCCgJAnPs=ba%ah;qJx%6
z4#HS=YgI3NI0Vr$-!43c*sj%HdLZH_gzvUT;yNdR(hi0v~Z*lqEWMT^m}cx
z*j-0Jz0L8Q)om-=U*NGRJ%xc{TiF_$quy5Jn*!8rbF8(|!-@~Tr|}zso}Zp}p!cWN
zdRy7^1@$(iuW*g6;*WHZB_dxdTcd4lh)b}-(4_bEwITZB=et1!N!#9T;}g!^{Kt
zy@lTZ;{9ui7oPT096beVZAUEDw-Co_o5L-{dCXry;i5G0-%Qf2J+uP{u{`@O8!;y
z-)M7njOUxTw+H@;upDL2bTWD?-D-hbEpV#^ZneOz7P!{Fw^5*LS?1(hig;SEW
z#9-NA{Woech8sb%83$v|Dhzmuij~}F+VK1Yah;r_#L*CAt@(^x2KEFwsL+7m_2
z-w6gU-^mYH4IZ~`JbfT2-GfjnGYBgcRFVRjQJQ9-YdxO)bc)z?)@m;ZV+~OqKoGw
z{)pgZ+^Ue1@A{GJU;UJlnHq}NO4}a=ZN5wqJF~6|2vW2v2z{$QuOO(L9Yoqak7hx
z-X`e1f-Vzuji6fuZ4-2-p!)^=j-W3KdQ8wWg8ov_%Yt4LbYijaFDM1GcJ=DzN(H_o
zS1L=KHO@uKqN?g8Rn?2Cl!}dkRzye2EL$rStVo6Xm_XX-wvur4B9Spcp3{tx|lN*
z)ov=y?24`-yygip_p@f43BjSg00L+T)RKoYZ2?A=oph9x+PrfD;XdLs^W5vPhuA

[Devel] [PATCH RHEL7 COMMIT] ve/devmnt: Introduce ve::devmnt list

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 16a628a25414e4cca83d7aa8574fbe22f3e8f500
Author: Kirill Tkhai ktk...@odin.com
Date:   Mon May 18 12:52:09 2015 +0400

ve/devmnt: Introduce ve::devmnt list

1)Porting patch ve: mount option list by Maxim Patlasov:

The patch adds new fields to ve_struct: devmnt_list and devmnt_mutex.
devmnt_list is the head of list of ve_devmnt structs. Each host block device
visible from CT can have no more than one struct ve_devmnt linked in
ve-devmnt_list. If ve_devmnt is present, it can be found by 'dev' field.

Each ve_devmnt struct may bear two strings: hidden and allowed options.
hidden_options will be automatically added to CT-user-supplied mount options
after checking allowed_options. Only options listed in allowed_options are
allowed.

devmnt_mutex is to protect operations on the list of ve_devmnt structs.

2)Porting patch vecalls: VE_CONFIGURE_MOUNT_OPTIONS by Maxim Patlasov.

Reworking the interface using cgroups. Each CT now has a file:

[ve_cgroup_mnt_pnt]/[CTID]/ve.mount_opts

for configuring permissions for a block device. Below is an example
permissions line:

0 major:minor;1 balloon_ino=12,pfcache_csum,pfcache=/vz/pfcache;2 
barrier=1

Here, major:minor is a device, '1' starts comma-separated list of
hidden options, and '2' is allowed ones.

https://jira.sw.ru/browse/PSBM-32273

Signed-off-by: Kirill Tkhai ktk...@odin.com
Acked-by: Maxim Patlasov mpatla...@openvz.org
---
 include/linux/ve.h |  11 
 kernel/ve/ve.c | 151 +
 2 files changed, 162 insertions(+)

diff --git a/include/linux/ve.h b/include/linux/ve.h
index 03e90e4..8b70dbe 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -123,12 +123,23 @@ struct ve_struct {
struct net  *ve_netns;
struct mutexsync_mutex;
 
+   struct list_headdevmnt_list;
+   struct mutexdevmnt_mutex;
+
struct kmapset_key  ve_sysfs_perms;
 #if IS_ENABLED(CONFIG_DEVTMPFS)
struct path devtmpfs_root;
 #endif
 };
 
+struct ve_devmnt {
+   struct list_headlink;
+
+   dev_t   dev;
+   char*allowed_options;
+   char*hidden_options; /* balloon_ino, etc. */
+};
+
 #define VE_MEMINFO_DEFAULT  1   /* default behaviour */
 #define VE_MEMINFO_SYSTEM   0   /* disable meminfo virtualization */
 
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 609ea75..6ab409f 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -40,6 +40,7 @@
 #include linux/task_work.h
 #include linux/tty.h
 #include linux/console.h
+#include linux/ctype.h
 
 #include uapi/linux/vzcalluser.h
 #include linux/venet.h
@@ -710,6 +711,8 @@ do_init:
mutex_init(ve-sync_mutex);
INIT_LIST_HEAD(ve-devices);
INIT_LIST_HEAD(ve-ve_list);
+   INIT_LIST_HEAD(ve-devmnt_list);
+   mutex_init(ve-devmnt_mutex);
kmapset_init_key(ve-ve_sysfs_perms);
 
return ve-css;
@@ -734,11 +737,33 @@ static void ve_offline(struct cgroup *cg)
veid_free(ve-veid);
 }
 
+static void ve_devmnt_free(struct ve_devmnt *devmnt)
+{
+   if (!devmnt)
+   return;
+
+   kfree(devmnt-allowed_options);
+   kfree(devmnt-hidden_options);
+   kfree(devmnt);
+}
+
+static void free_ve_devmnts(struct ve_struct *ve)
+{
+   while (!list_empty(ve-devmnt_list)) {
+   struct ve_devmnt *devmnt;
+
+   devmnt = list_first_entry(ve-devmnt_list, struct ve_devmnt, 
link);
+   list_del(devmnt-link);
+   ve_devmnt_free(devmnt);
+   }
+}
+
 static void ve_destroy(struct cgroup *cg)
 {
struct ve_struct *ve = cgroup_ve(cg);
 
kmapset_unlink(ve-ve_sysfs_perms, ve_sysfs_perms);
+   free_ve_devmnts(ve);
 
ve_log_destroy(ve);
kfree(ve-binfmt_misc);
@@ -886,6 +911,127 @@ static int ve_legacy_veid_read(struct cgroup *cg, struct 
cftype *cft,
return seq_printf(m, %u\n, cgroup_ve(cg)-veid);
 }
 
+/*
+ * 'data' for VE_CONFIGURE_MOUNT_OPTIONS is a zero-terminated string
+ * consisting of substrings separated by MNTOPT_DELIM.
+ */
+#define MNTOPT_DELIM ';'
+
+/*
+ * Each substring has the form of type comma-separated-list-of-options
+ * where types are:
+ */
+enum {
+   MNTOPT_DEVICE = 0,
+   MNTOPT_HIDDEN = 1,
+   MNTOPT_ALLOWED = 2,
+};
+
+/*
+ * 'ptr' points to the first character of buffer to parse
+ * 'endp' points to the last character of buffer to parse
+ */
+static int ve_parse_mount_options(const char *ptr, const char *endp,
+ struct ve_devmnt *devmnt)
+{
+   while (*ptr) {
+   const char *delim =

[Devel] [PATCH RHEL7 COMMIT] ve/uts_ns: Implement cgroup interface to configure ve's os_release

2015-05-18 Thread Konstantin Khorenko

The commit is pushed to branch-rh7-3.10.0-123.1.2-ovz and will appear at 
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.1
--
commit 3c5b30b0520f4ca33608737b25aef6c45c05fb71
Author: Kirill Tkhai ktk...@odin.com
Date:   Mon May 18 13:52:48 2015 +0400

ve/uts_ns: Implement cgroup interface to configure ve's os_release

It is similar to the VZCTL_VE_CONFIGURE ioctl in PCS6.

Note: max_write_len is __NEW_UTS_LEN + 1, because I want to allow

echo ... > ve.os_release, which adds a trailing '\n' to the string
(see man echo for details).

Extra symbol will be cut in ve_os_release_write().

https://jira.sw.ru/browse/PSBM-32273

Signed-off-by: Kirill Tkhai ktk...@odin.com
Reviewed-by: Cyrill Gorcunov gorcu...@odin.com
---
 kernel/ve/ve.c | 55 +++
 1 file changed, 55 insertions(+)

diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 6ab409f..e598d15 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -1032,6 +1032,54 @@ static int ve_mount_opts_write(struct cgroup *cg, struct 
cftype *cft,
return 0;
 }
 
+static int ve_os_release_read(struct cgroup *cg, struct cftype *cft,
+ struct seq_file *m)
+{
+   struct ve_struct *ve = cgroup_ve(cg);
+   int ret = 0;
+
+   down_read(ve-op_sem);
+
+   if (!ve-ve_ns) {
+   ret = -ENOENT;
+   goto up_opsem;
+   }
+
+   down_read(uts_sem);
+   seq_puts(m, ve-ve_ns-uts_ns-name.release);
+   seq_putc(m, '\n');
+   up_read(uts_sem);
+up_opsem:
+   up_read(ve-op_sem);
+
+   return ret;
+}
+
+static int ve_os_release_write(struct cgroup *cg, struct cftype *cft,
+  const char *buffer)
+{
+   struct ve_struct *ve = cgroup_ve(cg);
+   char *release;
+   int ret = 0;
+
+   down_read(ve-op_sem);
+
+   if (!ve-ve_ns) {
+   ret = -ENOENT;
+   goto up_opsem;
+   }
+
+   down_write(uts_sem);
+   release = ve-ve_ns-uts_ns-name.release;
+   strncpy(release, buffer, __NEW_UTS_LEN);
+   release[__NEW_UTS_LEN] = '\0';
+   up_write(uts_sem);
+up_opsem:
+   up_read(ve-op_sem);
+
+   return ret;
+}
+
 static struct cftype ve_cftypes[] = {
{
.name = state,
@@ -1049,6 +1097,13 @@ static struct cftype ve_cftypes[] = {
.flags = CFTYPE_NOT_ON_ROOT,
.write_string = ve_mount_opts_write,
},
+   {
+   .name = os_release,
+   .max_write_len = __NEW_UTS_LEN + 1,
+   .flags = CFTYPE_NOT_ON_ROOT,
+   .read_seq_string = ve_os_release_read,
+   .write_string = ve_os_release_write,
+   },
{ }
 };
 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH rh7] ve: device cgroup -- Implement devcgroup_seq_show_ve

2015-05-18 Thread Cyrill Gorcunov

In PCS7 cgroups are configured from user space, so there is
no longer a connection from ve to the device cgroup via css as
there was in PCS6. Instead we should open the device cgroup explicitly.

https://jira.sw.ru/browse/PSBM-33555

Signed-off-by: Cyrill Gorcunov gorcu...@odin.com
CC: Vladimir Davydov vdavy...@odin.com
CC: Konstantin Khorenko khore...@odin.com
CC: Andrey Vagin ava...@odin.com
---
 include/linux/device_cgroup.h |2 +-
 kernel/ve/vecalls.c   |2 +-
 security/device_cgroup.c  |   11 +--
 3 files changed, 11 insertions(+), 4 deletions(-)

Index: linux-pcs7.git/include/linux/device_cgroup.h
===
--- linux-pcs7.git.orig/include/linux/device_cgroup.h
+++ linux-pcs7.git/include/linux/device_cgroup.h
@@ -19,7 +19,7 @@ extern int devcgroup_device_visible(umod
 struct cgroup;
 int devcgroup_default_perms_ve(struct cgroup *cgroup);
 int devcgroup_set_perms_ve(struct cgroup *cgroup, unsigned, dev_t, unsigned);
-int devcgroup_seq_show_ve(struct cgroup *cgroup, unsigned veid, struct 
seq_file *m);
+int devcgroup_seq_show_ve(struct cgroup *devices_root, envid_t veid, struct 
seq_file *m);
 
 #else
 static inline int devcgroup_inode_permission(struct inode *inode, int mask)
Index: linux-pcs7.git/kernel/ve/vecalls.c
===
--- linux-pcs7.git.orig/kernel/ve/vecalls.c
+++ linux-pcs7.git/kernel/ve/vecalls.c
@@ -891,7 +891,7 @@ static int devperms_seq_show(struct seq_
if (ve_is_super(ve))
seq_printf(m, %10u b 016 *:*\n%10u c 006 *:*\n, 0, 0);
else
-   devcgroup_seq_show_ve(ve-css.cgroup, ve-veid, m);
+   devcgroup_seq_show_ve(devices_root, ve-veid, m);
 
return 0;
 }
Index: linux-pcs7.git/security/device_cgroup.c
===
--- linux-pcs7.git.orig/security/device_cgroup.c
+++ linux-pcs7.git/security/device_cgroup.c
@@ -1091,10 +1091,16 @@ int devcgroup_set_perms_ve(struct cgroup
 }
 EXPORT_SYMBOL(devcgroup_set_perms_ve);
 
-int devcgroup_seq_show_ve(struct cgroup *cgroup, unsigned veid, struct 
seq_file *m)
+int devcgroup_seq_show_ve(struct cgroup *devices_root, envid_t veid, struct 
seq_file *m)
 {
-   struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup);
struct dev_exception_item *wh;
+   struct dev_cgroup *devcgroup;
+   struct cgroup *cgroup;
+
+   cgroup = ve_cgroup_open(devices_root, 0, veid);
+   if (IS_ERR(cgroup))
+   return PTR_ERR(cgroup);
+   devcgroup = cgroup_to_devcgroup(cgroup);
 
rcu_read_lock();
list_for_each_entry_rcu(wh, devcgroup-exceptions, list) {
@@ -1118,6 +1124,7 @@ int devcgroup_seq_show_ve(struct cgroup
}
rcu_read_unlock();
 
+   cgroup_kernel_close(cgroup);
return 0;
 }
 EXPORT_SYMBOL(devcgroup_seq_show_ve);
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH rh7 v2 3/3] proc: add kpageidle file

2015-05-18 Thread Vladimir Davydov

Knowing the portion of memory that is not used by a certain application
or memory cgroup (idle memory) can be useful for partitioning the system
efficiently, e.g. by setting memory cgroup limits appropriately.
Currently, the only means to estimate the amount of idle memory provided
by the kernel is /proc/PID/{clear_refs,smaps}: the user can clear the
access bit for all pages mapped to a particular process by writing 1 to
clear_refs, wait for some time, and then count smaps:Referenced.
However, this method has two serious shortcomings:

 - it does not count unmapped file pages
 - it affects the reclaimer logic

To overcome these drawbacks, this patch introduces two new page flags,
Idle and Young, and a new proc file, /proc/kpageidle. A page's Idle flag
can only be set from userspace by setting a bit in /proc/kpageidle at the
offset corresponding to the page, and it is cleared whenever the page is
accessed either through page tables (it is cleared in page_referenced()
in this case) or using the read(2) system call (mark_page_accessed()).
Thus by setting the Idle flag for pages of a particular workload, which
can be found e.g. by reading /proc/PID/pagemap, waiting for some time to
let the workload access its working set, and then reading the kpageidle
file, one can estimate the amount of pages that are not used by the
workload.

The Young page flag is used to avoid interference with the memory
reclaimer. A page's Young flag is set whenever the Access bit of a page
table entry pointing to the page is cleared by writing to kpageidle. If
page_referenced() is called on a Young page, it will add 1 to its return
value, therefore concealing the fact that the Access bit was cleared.

Note, since there is no room for extra page flags on 32 bit, this
feature uses extended page flags when compiled on 32 bit.

(on RH7 page ext is not available so make it depend on 64 bit)
Signed-off-by: Vladimir Davydov vdavy...@parallels.com
---
 Documentation/vm/pagemap.txt |   12 ++-
 fs/proc/page.c   |  168 ++
 fs/proc/task_mmu.c   |3 +-
 include/linux/mm.h   |   50 +
 include/linux/page-flags.h   |9 +++
 mm/Kconfig   |   12 +++
 mm/page_alloc.c  |4 +
 mm/rmap.c|9 +++
 mm/swap.c|2 +
 9 files changed, 267 insertions(+), 2 deletions(-)

diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
index e37cff950ae8..a4fe9b25a6c9 100644
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -5,7 +5,7 @@ pagemap is a new (as of 2.6.25) set of interfaces in the kernel 
that allow
 userspace programs to examine the page tables and related information by
 reading files in /proc.
 
-There are four components to pagemap:
+There are five components to pagemap:
 
  * /proc/pid/pagemap.  This file lets a userspace process find out which
physical frame each virtual page is mapped to.  It contains one 64-bit
@@ -67,6 +67,16 @@ There are four components to pagemap:
memory cgroup each page is charged to, indexed by PFN. Only available when
CONFIG_MEMCG is set.
 
+ * /proc/kpageidle.  This file implements a bitmap where each bit corresponds
+   to a page, indexed by PFN. When the bit is set, the corresponding page is
+   idle. A page is considered idle if it has not been accessed since it was
+   marked idle. To mark a page idle one should set the bit corresponding to the
+   page by writing to the file. A value written to the file is OR-ed with the
+   current bitmap value. Only user memory pages can be marked idle, for other
+   page types input is silently ignored. Writing to this file beyond max PFN
+   results in the ENXIO error. Only available when CONFIG_IDLE_PAGE_TRACKING is
+   set.
+
 Short descriptions to the page flags:
 
  0. LOCKED
diff --git a/fs/proc/page.c b/fs/proc/page.c
index c9cbed32be43..49aebd2e3596 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -15,6 +15,7 @@
 
 #define KPMSIZE sizeof(u64)
 #define KPMMASK (KPMSIZE - 1)
+#define KPMBITS (KPMSIZE * BITS_PER_BYTE)
 
 /* /proc/kpagecount - an array exposing page counts
  *
@@ -263,6 +264,169 @@ static const struct file_operations 
proc_kpagecgroup_operations = {
 };
 #endif /* CONFIG_MEMCG */
 
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+/*
+ * Idle page tracking only considers user memory pages, for other types of
+ * pages the idle flag is always unset and an attempt to set it is silently
+ * ignored.
+ *
+ * We treat a page as a user memory page if it is on an LRU list, because it is
+ * always safe to pass such a page to page_referenced(), which is essential for
+ * idle page tracking. With such an indicator of user pages we can skip
+ * isolated pages, but since there are not usually many of them, it will hardly
+ * affect the overall result.
+ *
+ * This function tries to get a user memory page by pfn as described above.
+ */
+static struct page

[Devel] [PATCH rh7 v2 0/3] idle memory tracking

2015-05-18 Thread Vladimir Davydov

This patch set backports

  https://lkml.org/lkml/2015/5/12/449

which is required by vcmmd.

It is not yet clear if the original patch set will be accepted upstream
as is, there still may be changes. However, I hope the user API will be
preserved. If it is not, we will have to fix this in our kernel too.

https://jira.sw.ru/browse/PSBM-32460

Vladimir Davydov (3):
  memcg: add page_cgroup_ino helper
  proc: add kpagecgroup file
  proc: add kpageidle file

 Documentation/vm/pagemap.txt |   16 ++-
 fs/proc/Kconfig  |5 +-
 fs/proc/page.c   |  221 ++
 fs/proc/task_mmu.c   |3 +-
 include/linux/memcontrol.h   |3 +
 include/linux/mm.h   |   50 ++
 include/linux/page-flags.h   |9 ++
 mm/Kconfig   |   12 +++
 mm/hwpoison-inject.c |3 -
 mm/memcontrol.c  |   22 +
 mm/memory-failure.c  |   18 +---
 mm/page_alloc.c  |4 +
 mm/rmap.c|9 ++
 mm/swap.c|2 +
 14 files changed, 353 insertions(+), 24 deletions(-)

-- 
1.7.10.4

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH rh7 v2 2/3] proc: add kpagecgroup file

2015-05-18 Thread Vladimir Davydov

/proc/kpagecgroup contains a 64-bit inode number of the memory cgroup
each page is charged to, indexed by PFN. Having this information is
useful for estimating a cgroup working set size.

The file is present if CONFIG_PROC_PAGE_MONITOR && CONFIG_MEMCG.

Signed-off-by: Vladimir Davydov vdavy...@parallels.com
---
 Documentation/vm/pagemap.txt |6 -
 fs/proc/Kconfig  |5 ++--
 fs/proc/page.c   |   53 ++
 3 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
index fd7c3cfddd8e..e37cff950ae8 100644
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -5,7 +5,7 @@ pagemap is a new (as of 2.6.25) set of interfaces in the kernel 
that allow
 userspace programs to examine the page tables and related information by
 reading files in /proc.
 
-There are three components to pagemap:
+There are four components to pagemap:
 
  * /proc/pid/pagemap.  This file lets a userspace process find out which
physical frame each virtual page is mapped to.  It contains one 64-bit
@@ -63,6 +63,10 @@ There are three components to pagemap:
 21. KSM
 22. THP
 
+ * /proc/kpagecgroup.  This file contains a 64-bit inode number of the
+   memory cgroup each page is charged to, indexed by PFN. Only available when
+   CONFIG_MEMCG is set.
+
 Short descriptions to the page flags:
 
  0. LOCKED
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 15af6222f8a4..e8ed22d2ba5b 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -65,5 +65,6 @@ config PROC_PAGE_MONITOR
help
  Various /proc files exist to monitor process memory utilization:
  /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
- /proc/kpagecount, and /proc/kpageflags. Disabling these
-  interfaces will reduce the size of the kernel by approximately 4kb.
+ /proc/kpagecount, /proc/kpageflags, and /proc/kpagecgroup.
+ Disabling these interfaces will reduce the size of the kernel
+ by approximately 4kb.
diff --git a/fs/proc/page.c b/fs/proc/page.c
index cab84b6272ed..c9cbed32be43 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -8,6 +8,7 @@
 #include linux/proc_fs.h
 #include linux/seq_file.h
 #include linux/hugetlb.h
+#include linux/memcontrol.h
 #include linux/kernel-page-flags.h
 #include asm/uaccess.h
 #include internal.h
@@ -213,10 +214,62 @@ static const struct file_operations 
proc_kpageflags_operations = {
.read = kpageflags_read,
 };
 
+#ifdef CONFIG_MEMCG
+static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
+   size_t count, loff_t *ppos)
+{
+   u64 __user *out = (u64 __user *)buf;
+   struct page *ppage;
+   unsigned long src = *ppos;
+   unsigned long pfn;
+   ssize_t ret = 0;
+   u64 ino;
+
+   pfn = src / KPMSIZE;
+   count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
+   if (src  KPMMASK || count  KPMMASK)
+   return -EINVAL;
+
+   while (count  0) {
+   if (pfn_valid(pfn))
+   ppage = pfn_to_page(pfn);
+   else
+   ppage = NULL;
+
+   if (ppage)
+   ino = page_cgroup_ino(ppage);
+   else
+   ino = 0;
+
+   if (put_user(ino, out)) {
+   ret = -EFAULT;
+   break;
+   }
+
+   pfn++;
+   out++;
+   count -= KPMSIZE;
+   }
+
+   *ppos += (char __user *)out - buf;
+   if (!ret)
+   ret = (char __user *)out - buf;
+   return ret;
+}
+
+static const struct file_operations proc_kpagecgroup_operations = {
+   .llseek = mem_lseek,
+   .read = kpagecgroup_read,
+};
+#endif /* CONFIG_MEMCG */
+
 static int __init proc_page_init(void)
 {
proc_create(kpagecount, S_IRUSR, NULL, proc_kpagecount_operations);
proc_create(kpageflags, S_IRUSR, NULL, proc_kpageflags_operations);
+#ifdef CONFIG_MEMCG
+   proc_create(kpagecgroup, S_IRUSR, NULL, proc_kpagecgroup_operations);
+#endif
return 0;
 }
 module_init(proc_page_init);
-- 
1.7.10.4

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

65 matches

Mail list logo