Re: [Devel] [PATCH rh7 9/9] Charge kmem allocations accounted to UBC in PCS6 to memcg

2016-04-27 Thread Vladimir Davydov
Please ignore this last patch. I'll resend it.
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 8½/9] vmalloc: add v[mz]alloc_account helpers

2016-04-27 Thread Vladimir Davydov
Same as v[mz]alloc, but accounted to kmemcg. Will be used later.

Signed-off-by: Vladimir Davydov 
---
 include/linux/vmalloc.h |  2 ++
 mm/vmalloc.c| 14 ++
 2 files changed, 16 insertions(+)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 97b987fedec0..6ea82cf30dc1 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -68,6 +68,8 @@ static inline void vmalloc_init(void)
 
 extern void *vmalloc(unsigned long size);
 extern void *vzalloc(unsigned long size);
+extern void *vmalloc_account(unsigned long size);
+extern void *vzalloc_account(unsigned long size);
 extern void *vmalloc_user(unsigned long size);
 extern void *vmalloc_node(unsigned long size, int node);
 extern void *vzalloc_node(unsigned long size, int node);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 32d3744a9c46..a8c2b283699a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1782,6 +1782,20 @@ void *vzalloc(unsigned long size)
 }
 EXPORT_SYMBOL(vzalloc);
 
+void *vmalloc_account(unsigned long size)
+{
+   return __vmalloc_node_flags(size, NUMA_NO_NODE,
+   GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM);
+}
+EXPORT_SYMBOL(vmalloc_account);
+
+void *vzalloc_account(unsigned long size)
+{
+   return __vmalloc_node_flags(size, NUMA_NO_NODE,
+   GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO);
+}
+EXPORT_SYMBOL(vzalloc_account);
+
 /**
  * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
  * @size: allocation size
-- 
2.1.4

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v2 9/9] Charge kmem allocations accounted to UBC in PCS6 to memcg

2016-04-27 Thread Vladimir Davydov
Signed-off-by: Vladimir Davydov 
---
Changes in v2:
 - use v[mz]alloc_account instead of __vmalloc + GFP_KERNEL_ACCOUNT
 - use __get_free_kmem_pages instead of alloc_kmem_pages + page_address
 - free pages allocated with __get_free_kmem_pages with free_kmem_pages
 - export __get_free_kmem_pages and free_kmem_pages functions

 arch/x86/kernel/ldt.c |  8 
 drivers/tty/tty_io.c  |  4 ++--
 fs/fcntl.c|  2 +-
 fs/locks.c|  2 +-
 fs/namespace.c|  4 ++--
 fs/nfsd/vfs.c |  2 +-
 fs/pipe.c |  2 +-
 fs/select.c   |  4 ++--
 fs/seq_file.c | 10 +-
 ipc/msgutil.c |  4 ++--
 ipc/sem.c |  6 +++---
 ipc/util.c|  4 ++--
 kernel/fairsched.c|  2 +-
 kernel/posix-timers.c |  2 +-
 mm/page_alloc.c   |  3 +++
 net/8021q/vlan.c  |  2 +-
 net/core/dev.c| 10 +-
 net/core/fib_rules.c  |  4 ++--
 net/core/filter.c |  2 +-
 net/core/scm.c|  4 ++--
 net/core/sock.c   |  6 +++---
 net/ipv4/devinet.c|  2 +-
 net/ipv4/fib_trie.c   |  4 ++--
 net/ipv4/netfilter/ip_tables.c|  6 +++---
 net/ipv4/tcp.c|  2 +-
 net/ipv6/addrconf.c   |  2 +-
 net/ipv6/ip6_fib.c|  2 +-
 net/ipv6/route.c  |  2 +-
 net/netfilter/ipvs/ip_vs_conn.c   |  2 +-
 net/netfilter/nf_conntrack_core.c |  8 
 net/netfilter/x_tables.c  |  4 ++--
 net/packet/af_packet.c| 14 +++---
 32 files changed, 69 insertions(+), 66 deletions(-)

diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 79aa97d8fe4c..4a6c8fee47f2 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -42,9 +42,9 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int 
reload)
mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
(~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
-   newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
+   newldt = vmalloc_account(mincount * LDT_ENTRY_SIZE);
else
-   newldt = (void *)__get_free_page(GFP_KERNEL);
+   newldt = (void *)__get_free_kmem_pages(GFP_KERNEL_ACCOUNT, 0);
 
if (!newldt)
return -ENOMEM;
@@ -83,7 +83,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int 
reload)
if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
vfree(oldldt);
else
-   put_page(virt_to_page(oldldt));
+   __free_kmem_pages(virt_to_page(oldldt), 0);
}
return 0;
 }
@@ -138,7 +138,7 @@ void destroy_context(struct mm_struct *mm)
if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
vfree(mm->context.ldt);
else
-   put_page(virt_to_page(mm->context.ldt));
+   __free_kmem_pages(virt_to_page(mm->context.ldt), 0);
mm->context.size = 0;
}
 }
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index e4b03bb2a0a1..91ffc65a1ec4 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -167,7 +167,7 @@ static void proc_set_tty(struct task_struct *tsk, struct 
tty_struct *tty);
 
 struct tty_struct *alloc_tty_struct(void)
 {
-   return kzalloc(sizeof(struct tty_struct), GFP_KERNEL);
+   return kzalloc(sizeof(struct tty_struct), GFP_KERNEL_ACCOUNT);
 }
 
 /**
@@ -1512,7 +1512,7 @@ void tty_free_termios(struct tty_struct *tty)
/* Stash the termios data */
tp = tty->driver->termios[idx];
if (tp == NULL) {
-   tp = kmalloc(sizeof(struct ktermios), GFP_KERNEL);
+   tp = kmalloc(sizeof(struct ktermios), GFP_KERNEL_ACCOUNT);
if (tp == NULL) {
pr_warn("tty: no memory to save termios state.\n");
return;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index cfa349cccda2..8e8e40e8be0b 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -783,7 +783,7 @@ static int __init fcntl_init(void)
));
 
fasync_cache = kmem_cache_create("fasync_cache",
-   sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
+   sizeof(struct fasync_struct), 0, SLAB_PANIC | SLAB_ACCOUNT, 
NULL);
return 0;
 }
 
diff --git a/fs/locks.c b/fs/locks.c
index 93c097bd7af4..ad89993d9ecb 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2650,7 +2650,7 @@ static int __init filelock_init(void)
int i;
 
filelock_cache = kmem_cache_create("file_lock_cache",
-   sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
+   sizeo

[Devel] [PATCH rh7] meminfo: show size of tcache and tswap

2016-04-27 Thread Vladimir Davydov
Although these counters are already available via
/sys/module/{tcache,tswap}/parameters/nr_pages, having them in
/proc/meminfo will come in handy, e.g. because meminfo is already
gathered by perf tests, while those counters are not.

Signed-off-by: Vladimir Davydov 
---
 fs/proc/meminfo.c | 15 +++
 mm/tswap.c|  5 +
 2 files changed, 20 insertions(+)

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 8e9077fea885..40db369116b4 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -78,6 +78,9 @@ static int meminfo_proc_show_mi(struct seq_file *m, struct 
meminfo *mi)
 #ifdef CONFIG_TCACHE
 extern unsigned long get_nr_tcache_pages(void);
 #endif
+#ifdef CONFIG_TSWAP
+extern unsigned long get_nr_tswap_pages(void);
+#endif
 
 int meminfo_proc_show_ub(struct seq_file *m, void *v,
struct user_beancounter *ub, unsigned long meminfo_val)
@@ -215,6 +218,12 @@ int meminfo_proc_show_ub(struct seq_file *m, void *v,
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
"AnonHugePages:  %8lu kB\n"
 #endif
+#ifdef CONFIG_TCACHE
+   "Tcache: %8lu kB\n"
+#endif
+#ifdef CONFIG_TSWAP
+   "Tswap:  %8lu kB\n"
+#endif
,
K(i.totalram),
K(i.freeram),
@@ -276,6 +285,12 @@ int meminfo_proc_show_ub(struct seq_file *m, void *v,
,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
   HPAGE_PMD_NR)
 #endif
+#ifdef CONFIG_TCACHE
+   ,K(get_nr_tcache_pages())
+#endif
+#ifdef CONFIG_TSWAP
+   ,K(get_nr_tswap_pages())
+#endif
);
 
hugetlb_report_meminfo(m);
diff --git a/mm/tswap.c b/mm/tswap.c
index 20c952b4716c..258728519364 100644
--- a/mm/tswap.c
+++ b/mm/tswap.c
@@ -38,6 +38,11 @@ module_param_named(active, tswap_active, bool, 0644);
 static unsigned long tswap_nr_pages;
 module_param_named(nr_pages, tswap_nr_pages, ulong, 0444);
 
+unsigned long get_nr_tswap_pages(void)
+{
+   return tswap_nr_pages;
+}
+
 static void tswap_lru_add(struct page *page)
 {
struct tswap_lru *lru = &tswap_lru_node[page_to_nid(page)];
-- 
2.1.4

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 1/2] mm: memcontrol: add swapin/out stats

2016-04-27 Thread Vladimir Davydov
This patch adds pswpin and pswpout fields to memory.stat file. These
fields show the number of pages swapped in and out respectively for the
given memory cgroup. We assume a page was swapped in/out even if there
was no actual I/O (frontswap).

https://jira.sw.ru/browse/PSBM-46104

Signed-off-by: Vladimir Davydov 
---
 mm/memcontrol.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f617423a3ade..29716f465ffb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -112,6 +112,8 @@ static const char * const mem_cgroup_stat_names[] = {
 enum mem_cgroup_events_index {
MEM_CGROUP_EVENTS_PGPGIN,   /* # of pages paged in */
MEM_CGROUP_EVENTS_PGPGOUT,  /* # of pages paged out */
+   MEM_CGROUP_EVENTS_PSWPIN,   /* # of pages swapped in */
+   MEM_CGROUP_EVENTS_PSWPOUT,  /* # of pages swapped out */
MEM_CGROUP_EVENTS_PGFAULT,  /* # of page-faults */
MEM_CGROUP_EVENTS_PGMAJFAULT,   /* # of major page-faults */
MEM_CGROUP_EVENTS_NSTATS,
@@ -120,6 +122,8 @@ enum mem_cgroup_events_index {
 static const char * const mem_cgroup_events_names[] = {
"pgpgin",
"pgpgout",
+   "pswpin",
+   "pswpout",
"pgfault",
"pgmajfault",
 };
@@ -3945,6 +3949,8 @@ __mem_cgroup_commit_charge_swapin(struct page *page, 
struct mem_cgroup *memcg,
swp_entry_t ent = {.val = page_private(page)};
mem_cgroup_uncharge_swap(ent);
}
+
+   this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PSWPIN]);
 }
 
 void mem_cgroup_commit_charge_swapin(struct page *page,
@@ -4234,6 +4240,9 @@ mem_cgroup_uncharge_swapcache(struct page *page, 
swp_entry_t ent, bool swapout)
 */
if (do_swap_account && swapout && memcg)
swap_cgroup_record(ent, css_id(&memcg->css));
+
+   if (swapout && memcg)
+   this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PSWPOUT]);
 }
 #endif
 
-- 
2.1.4

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 2/2] ub: show swapin/out in /proc/bc/<id>/vmaux

2016-04-27 Thread Vladimir Davydov
Required for backward compatibility. The values are received from the
associated memory cgroup.

https://jira.sw.ru/browse/PSBM-46104

Signed-off-by: Vladimir Davydov 
---
 include/bc/beancounter.h | 3 +++
 kernel/bc/vm_pages.c | 3 +++
 mm/memcontrol.c  | 8 
 3 files changed, 14 insertions(+)

diff --git a/include/bc/beancounter.h b/include/bc/beancounter.h
index d004afa5be3c..8316cc4861a3 100644
--- a/include/bc/beancounter.h
+++ b/include/bc/beancounter.h
@@ -105,6 +105,9 @@ struct user_beancounter {
atomic_long_t   wb_requests;
atomic_long_t   wb_sectors;
 
+   unsigned long   swapin;
+   unsigned long   swapout;
+
void*iolimit;
 
/* resources statistic and settings */
diff --git a/kernel/bc/vm_pages.c b/kernel/bc/vm_pages.c
index 7529899538a9..b46da98fece2 100644
--- a/kernel/bc/vm_pages.c
+++ b/kernel/bc/vm_pages.c
@@ -253,6 +253,9 @@ static int bc_vmaux_show(struct seq_file *f, void *v)
 
ub_sync_memcg(ub);
 
+   seq_printf(f, bc_proc_lu_fmt, "swapin", ub->swapin);
+   seq_printf(f, bc_proc_lu_fmt, "swapout", ub->swapout);
+
seq_printf(f, bc_proc_lu_fmt, "ram", ub->ub_parms[UB_PHYSPAGES].held);
 
return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 29716f465ffb..61c395b7c4ed 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5440,6 +5440,7 @@ static int mem_cgroup_move_charge_write(struct cgroup 
*cgrp,
 void mem_cgroup_sync_beancounter(struct mem_cgroup *memcg,
 struct user_beancounter *ub)
 {
+   struct mem_cgroup *mi;
unsigned long long lim, held, maxheld;
volatile struct ubparm *k, *d, *p, *s, *o;
 
@@ -5493,6 +5494,13 @@ void mem_cgroup_sync_beancounter(struct mem_cgroup 
*memcg,
lim = lim >= RESOURCE_MAX ? UB_MAXVALUE :
min_t(unsigned long long, lim >> PAGE_SHIFT, UB_MAXVALUE);
o->barrier = o->limit = lim;
+
+   ub->swapin = 0;
+   ub->swapout = 0;
+   for_each_mem_cgroup_tree(mi, memcg) {
+   ub->swapin += mem_cgroup_read_events(mi, 
MEM_CGROUP_EVENTS_PSWPIN);
+   ub->swapout += mem_cgroup_read_events(mi, 
MEM_CGROUP_EVENTS_PSWPOUT);
+   }
 }
 
 int mem_cgroup_apply_beancounter(struct mem_cgroup *memcg,
-- 
2.1.4

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7] oom: fix NULL ptr deref on oom if memory cgroup is disabled

2016-04-27 Thread Vladimir Davydov
mem_cgroup_iter and try_get_mem_cgroup_from_mm return NULL in this case;
handle this properly.

https://jira.sw.ru/browse/PSBM-43328

Signed-off-by: Vladimir Davydov 
---
 include/linux/memcontrol.h |  5 +++--
 mm/memcontrol.c|  4 +++-
 mm/oom_kill.c  | 20 +++-
 3 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d90f6c77dc69..743fb0b6f621 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -31,6 +31,8 @@ struct mm_struct;
 struct kmem_cache;
 struct oom_context;
 
+extern struct oom_context global_oom_ctx;
+
 /* Stats that can be updated by kernel. */
 enum mem_cgroup_page_stat_item {
MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */
@@ -392,8 +394,7 @@ mem_cgroup_update_lru_size(struct lruvec *lruvec, enum 
lru_list lru,
 static inline struct oom_context *
 mem_cgroup_oom_context(struct mem_cgroup *memcg)
 {
-   extern struct oom_context oom_ctx;
-   return &oom_ctx;
+   return &global_oom_ctx;
 }
 
 static inline unsigned long mem_cgroup_overdraft(struct mem_cgroup *memcg)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 61c395b7c4ed..fa66d1128cfb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1699,6 +1699,8 @@ void mem_cgroup_note_oom_kill(struct mem_cgroup 
*root_memcg,
 
 struct oom_context *mem_cgroup_oom_context(struct mem_cgroup *memcg)
 {
+   if (mem_cgroup_disabled())
+   return &global_oom_ctx;
if (!memcg)
memcg = root_mem_cgroup;
return &memcg->oom_ctx;
@@ -1708,7 +1710,7 @@ unsigned long mem_cgroup_overdraft(struct mem_cgroup 
*memcg)
 {
unsigned long long guarantee, usage;
 
-   if (mem_cgroup_is_root(memcg))
+   if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg))
return 0;
 
guarantee = ACCESS_ONCE(memcg->oom_guarantee);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2402fcceda6e..7a328e8c3204 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -51,12 +51,10 @@ static DEFINE_SPINLOCK(oom_context_lock);
 #define OOM_BASE_RAGE  -10
 #define OOM_MAX_RAGE   20
 
-#ifndef CONFIG_MEMCG
-struct oom_context oom_ctx = {
+struct oom_context global_oom_ctx = {
.rage   = OOM_BASE_RAGE,
-   .waitq  = __WAIT_QUEUE_HEAD_INITIALIZER(oom_ctx.waitq),
+   .waitq  = __WAIT_QUEUE_HEAD_INITIALIZER(global_oom_ctx.waitq),
 };
-#endif
 
 void init_oom_context(struct oom_context *ctx)
 {
@@ -187,7 +185,8 @@ static unsigned long mm_overdraft(struct mm_struct *mm)
memcg = try_get_mem_cgroup_from_mm(mm);
ctx = mem_cgroup_oom_context(memcg);
overdraft = ctx->overdraft;
-   mem_cgroup_put(memcg);
+   if (memcg)
+   mem_cgroup_put(memcg);
 
return overdraft;
 }
@@ -497,7 +496,8 @@ void mark_oom_victim(struct task_struct *tsk)
ctx->marked = true;
}
spin_unlock(&oom_context_lock);
-   mem_cgroup_put(memcg);
+   if (memcg)
+   mem_cgroup_put(memcg);
 }
 
 /**
@@ -608,7 +608,7 @@ bool oom_trylock(struct mem_cgroup *memcg)
 * information will be used in oom_badness.
 */
ctx->overdraft = mem_cgroup_overdraft(iter);
-   parent = parent_mem_cgroup(iter);
+   parent = iter ? parent_mem_cgroup(iter) : NULL;
if (parent && iter != memcg)
ctx->overdraft = max(ctx->overdraft,
mem_cgroup_oom_context(parent)->overdraft);
@@ -645,7 +645,8 @@ void oom_unlock(struct mem_cgroup *memcg)
 * on it for the victim to exit below.
 */
victim_memcg = iter;
-   mem_cgroup_get(iter);
+   if (iter)
+   mem_cgroup_get(iter);
 
mem_cgroup_iter_break(memcg, iter);
break;
@@ -695,7 +696,8 @@ void oom_unlock(struct mem_cgroup *memcg)
 */
ctx = mem_cgroup_oom_context(victim_memcg);
__wait_oom_context(ctx);
-   mem_cgroup_put(victim_memcg);
+   if (victim_memcg)
+   mem_cgroup_put(victim_memcg);
 }
 
 /*
-- 
2.1.4

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] ploop: force journal commit after dio_post_submit

2016-04-27 Thread Dmitry Monakhov
Once we have converted an extent to initialized, it can be part of an
uncompleted journal transaction, so we have to force a transaction commit
at some point. The easiest way to do this is to perform an unconditional fsync.
https://jira.sw.ru/browse/PSBM-45326

TODO: This case and others can be optimized by deferring the fsync, but that
      is the subject of another patch.

Signed-off-by: Dmitry Monakhov 
---
 drivers/block/ploop/io_direct.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index 8032999..5a2e12a 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -523,6 +523,8 @@ dio_post_submit(struct ploop_io *io, struct ploop_request * 
preq)
err = io->files.file->f_op->fallocate(io->files.file,
  FALLOC_FL_CONVERT_UNWRITTEN,
  (loff_t)sec << 9, clu_siz);
+   if (!err)
+   err = io->files.file->f_op->FOP_FSYNC(io->files.file, 0);
file_end_write(io->files.file);
if (err) {
PLOOP_REQ_SET_ERROR(preq, err);
-- 
1.8.3.1

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel