[Devel] [PATCH] fs-writeback: add endless writeback debug

2017-08-25 Thread Dmitry Monakhov
https://jira.sw.ru/browse/PSBM-69587
Signed-off-by: Dmitry Monakhov 
---
 fs/fs-writeback.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f34ae6c..9df1573 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -787,11 +787,15 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
 {
unsigned long start_time = jiffies;
long wrote = 0;
-
+   int trace = 0;
+   
while (!list_empty(&wb->b_io)) {
struct inode *inode = wb_inode(wb->b_io.prev);
struct super_block *sb = inode->i_sb;
 
+   if (time_is_before_jiffies(start_time + 15 * HZ))
+   trace = 1;
+
if (!grab_super_passive(sb)) {
/*
 * grab_super_passive() may fail consistently due to
@@ -799,6 +803,9 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
 * requeue_io() to avoid busy retrying the inode/sb.
 */
redirty_tail(inode, wb);
+   if (trace)
+   printk("%s:%d writeback is taking too long ino:%ld sb(%p):%s\n",
+  __FUNCTION__, __LINE__, inode->i_ino, sb, sb->s_id);
continue;
}
wrote += writeback_sb_inodes(sb, wb, work);
@@ -890,6 +897,7 @@ static long wb_writeback(struct bdi_writeback *wb,
unsigned long oldest_jif;
struct inode *inode;
long progress;
+   int trace = 0;
 
oldest_jif = jiffies;
work->older_than_this = &oldest_jif;
@@ -902,6 +910,9 @@ static long wb_writeback(struct bdi_writeback *wb,
if (work->nr_pages <= 0)
break;
 
+   if (time_is_before_jiffies(wb_start + 15 * HZ))
+   trace = 1;
+
/*
 * Background writeout and kupdate-style writeback may
 * run forever. Stop them if there is other work to do
@@ -973,6 +984,10 @@ static long wb_writeback(struct bdi_writeback *wb,
inode = wb_inode(wb->b_more_io.prev);
spin_lock(&inode->i_lock);
spin_unlock(&wb->list_lock);
+   if (trace)
+   printk("%s:%d writeback is taking too long ino:%ld st:%ld sb(%p):%s\n",
+  __FUNCTION__, __LINE__, inode->i_ino,
+  inode->i_state, inode->i_sb, inode->i_sb->s_id);
/* This function drops i_lock... */
inode_sleep_on_writeback(inode);
spin_lock(&wb->list_lock);
-- 
1.8.3.1


[Devel] [PATCH rh7 2/2] mm/memcg: reclaim only kmem if kmem limit reached.

2017-08-25 Thread Andrey Ryabinin
If the kmem limit on a memcg is reached, we go into memory reclaim
and reclaim everything we can, including page cache and anon.
Reclaiming page cache or anon won't help, since only kmem usage
needs to be lowered. This patch fixes the problem by avoiding
non-kmem reclaim when the kmem limit is hit.
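
The gating can be pictured with a small standalone sketch (illustrative C
only, not code from this patch; RECLAIM_KMEM stands in for the
MEM_CGROUP_RECLAIM_KMEM flag added below, and the two counters are
hypothetical):

    #define RECLAIM_KMEM (1 << 2)   /* stand-in for MEM_CGROUP_RECLAIM_KMEM */

    struct memcg_sketch {
            unsigned long lru_pages;    /* page cache + anon */
            unsigned long kmem_pages;   /* kernel objects: dcache, icache, ... */
    };

    static unsigned long reclaim_sketch(struct memcg_sketch *mc,
                                        unsigned long want, int flags)
    {
            unsigned long got = 0, n;

            /* LRU reclaim cannot lower kmem usage, so skip it entirely
             * when the kmem limit is what triggered this reclaim. */
            if (!(flags & RECLAIM_KMEM)) {
                    n = want < mc->lru_pages ? want : mc->lru_pages;
                    mc->lru_pages -= n;
                    got += n;
            }

            /* Kernel-object (slab) reclaim always runs. */
            n = want < mc->kmem_pages ? want : mc->kmem_pages;
            mc->kmem_pages -= n;
            got += n;

            return got;
    }

With the flag set, all of the pressure lands on the kmem side (dcache,
icache and the other slab caches), which is the only thing that can bring
kmem usage back under the limit.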

https://jira.sw.ru/browse/PSBM-69226
Signed-off-by: Andrey Ryabinin 
---
 include/linux/memcontrol.h | 10 ++
 include/linux/swap.h   |  2 +-
 mm/memcontrol.c| 30 --
 mm/vmscan.c| 31 ---
 4 files changed, 51 insertions(+), 22 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1a52e58ab7de..1d6bc80c4c90 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -45,6 +45,16 @@ struct mem_cgroup_reclaim_cookie {
unsigned int generation;
 };
 
+/*
+ * Reclaim flags for mem_cgroup_hierarchical_reclaim
+ */
+#define MEM_CGROUP_RECLAIM_NOSWAP_BIT  0x0
+#define MEM_CGROUP_RECLAIM_NOSWAP  (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
+#define MEM_CGROUP_RECLAIM_SHRINK_BIT  0x1
+#define MEM_CGROUP_RECLAIM_SHRINK  (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
+#define MEM_CGROUP_RECLAIM_KMEM_BIT0x2
+#define MEM_CGROUP_RECLAIM_KMEM    (1 << MEM_CGROUP_RECLAIM_KMEM_BIT)
+
 #ifdef CONFIG_MEMCG
 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
  gfp_t gfp_mask, struct mem_cgroup **memcgp);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index bd162f9bef0d..bd47451ec95a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -324,7 +324,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
  unsigned long nr_pages,
- gfp_t gfp_mask, bool noswap);
+ gfp_t gfp_mask, int flags);
 extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
struct zone *zone,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 97824e281d7a..f9a5f3819a31 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -511,16 +511,6 @@ enum res_type {
 #define OOM_CONTROL(0)
 
 /*
- * Reclaim flags for mem_cgroup_hierarchical_reclaim
- */
-#define MEM_CGROUP_RECLAIM_NOSWAP_BIT  0x0
-#define MEM_CGROUP_RECLAIM_NOSWAP  (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
-#define MEM_CGROUP_RECLAIM_SHRINK_BIT  0x1
-#define MEM_CGROUP_RECLAIM_SHRINK  (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
-#define MEM_CGROUP_RECLAIM_KMEM_BIT0x2
-#define MEM_CGROUP_RECLAIM_KMEM    (1 << MEM_CGROUP_RECLAIM_KMEM_BIT)
-
-/*
  * The memcg_create_mutex will be held whenever a new cgroup is created.
  * As a consequence, any change that needs to protect against new child cgroups
  * appearing has to hold it as well.
@@ -2137,7 +2127,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
if (loop)
drain_all_stock_async(memcg);
total += try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
- gfp_mask, noswap);
+ gfp_mask, flags);
if (test_thread_flag(TIF_MEMDIE) ||
fatal_signal_pending(current))
return 1;
@@ -2150,6 +2140,16 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
break;
if (mem_cgroup_margin(memcg, flags & MEM_CGROUP_RECLAIM_KMEM))
break;
+
+   /*
+* Try harder to reclaim dcache. dcache reclaim may
+* temporarily fail due to dcache->dlock being held
+* by someone else. We must try harder to avoid premature
+* slab allocation failures.
+*/
+   if (flags & MEM_CGROUP_RECLAIM_KMEM &&
+   page_counter_read(&memcg->dcache))
+   continue;
/*
 * If nothing was reclaimed after two attempts, there
 * may be no reclaimable pages in this hierarchy.
@@ -2778,11 +2778,13 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool kmem_charge
struct mem_cgroup *mem_over_limit;
struct page_counter *counter;
unsigned long nr_reclaimed;
-   unsigned long flags = 0;
+   unsigned long flags;
 
if (mem_cgroup_is_root(memcg))
goto done;
 retry:
+   flags = 0;
+
if (consume_stock(memcg, nr_pages)) {
if (!kmem_charge)

[Devel] [PATCH rh7 1/2] mm: use sc->priority for slab shrink targets

2017-08-25 Thread Andrey Ryabinin
From: Josef Bacik 

Previously we were using the ratio of the number of LRU pages scanned to
the number of eligible LRU pages to determine the number of slab objects
to scan.  The problem with this is that these two things have nothing to
do with each other, so in slab-heavy workloads where there is little to
no page cache we can end up with a very low number of pages scanned.
This means that we reclaim next to no slab pages and waste a lot of time
reclaiming small amounts of space.

Instead, use sc->priority in the same way we use it to determine scan
amounts for the LRUs.  This generally equates to pages.  Consider the
following:

slab_pages = (nr_objects * object_size) / PAGE_SIZE

What we would like to do is

scan = slab_pages >> sc->priority

but we don't know the number of slab pages each shrinker controls, only
the objects.  However say that theoretically we knew how many pages a
shrinker controlled, we'd still have to convert this to objects, which
would look like the following

scan = shrinker_pages >> sc->priority
scan_objects = (PAGE_SIZE / object_size) * scan

or written another way

scan_objects = (shrinker_pages >> sc->priority) *
(PAGE_SIZE / object_size)

which can thus be written

scan_objects = ((shrinker_pages * PAGE_SIZE) / object_size) >>
sc->priority

which is just

scan_objects = nr_objects >> sc->priority

We don't need to know exactly how many pages each shrinker represents;
its object count is all the information we need.  Making this change
allows us to place an appropriate amount of pressure on the shrinker
pools relative to their size.
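
In code, the per-shrinker scan target therefore reduces to the shifted
object count (a standalone sketch of the arithmetic derived above, not the
actual do_shrink_slab() hunk, which also applies batching and per-shrinker
scaling on top):

    /* scan_objects = nr_objects >> sc->priority */
    static unsigned long scan_target(unsigned long nr_objects, int priority)
    {
            return nr_objects >> priority;
    }

With the default DEF_PRIORITY of 12, a cache holding 1 << 20 objects starts
with a target of 256 objects per pass, and that target doubles each time
the reclaim priority drops by one.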

Signed-off-by: Josef Bacik 

https://jira.sw.ru/browse/PSBM-69226
Signed-off-by: Andrey Ryabinin 
---
 include/trace/events/vmscan.h | 23 ++
 mm/vmscan.c   | 44 ---
 2 files changed, 22 insertions(+), 45 deletions(-)

diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 132a985aba8b..d98fb0ab1831 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -181,23 +181,22 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_re
 
 TRACE_EVENT(mm_shrink_slab_start,
TP_PROTO(struct shrinker *shr, struct shrink_control *sc,
-   long nr_objects_to_shrink, unsigned long pgs_scanned,
-   unsigned long lru_pgs, unsigned long cache_items,
-   unsigned long long delta, unsigned long total_scan),
+   long nr_objects_to_shrink, unsigned long cache_items,
+   unsigned long long delta, unsigned long total_scan,
+   int priority),
 
-   TP_ARGS(shr, sc, nr_objects_to_shrink, pgs_scanned, lru_pgs,
-   cache_items, delta, total_scan),
+   TP_ARGS(shr, sc, nr_objects_to_shrink, cache_items, delta, total_scan,
+   priority),
 
TP_STRUCT__entry(
__field(struct shrinker *, shr)
__field(void *, shrink)
__field(long, nr_objects_to_shrink)
__field(gfp_t, gfp_flags)
-   __field(unsigned long, pgs_scanned)
-   __field(unsigned long, lru_pgs)
__field(unsigned long, cache_items)
__field(unsigned long long, delta)
__field(unsigned long, total_scan)
+   __field(int, priority)
),
 
TP_fast_assign(
@@ -205,23 +204,21 @@ TRACE_EVENT(mm_shrink_slab_start,
__entry->shrink = shr->scan_objects;
__entry->nr_objects_to_shrink = nr_objects_to_shrink;
__entry->gfp_flags = sc->gfp_mask;
-   __entry->pgs_scanned = pgs_scanned;
-   __entry->lru_pgs = lru_pgs;
__entry->cache_items = cache_items;
__entry->delta = delta;
__entry->total_scan = total_scan;
+   __entry->priority = priority;
),
 
-   TP_printk("%pF %p: objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld",
+   TP_printk("%pF %p: objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d",
__entry->shrink,
__entry->shr,
__entry->nr_objects_to_shrink,
show_gfp_flags(__entry->gfp_flags),
-   __entry->pgs_scanned,
-   __entry->lru_pgs,
__entry->cache_items,
__entry->delta,
-   __entry->total_scan)
+   __entry->total_scan,
+   __entry->priority)
 );
 
 TRACE_EVENT(mm_shrink_slab_end,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b9e77c303fcd..277bd37bd430 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -223,9 +223,7 @@ EXPORT_SYMBOL(unregister_shrinker);
 #define SHRINK_BATCH 128
 
 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
-   st