[Devel] [PATCH rh7 1/3] ms/ext4: move handling of list of shrinkable inodes into extent status code

2018-04-13 Thread Konstantin Khorenko
From: Jan Kara 

Currently callers adding extents to extent status tree were responsible
for adding the inode to the list of inodes with freeable extents. This
is error prone and puts list handling in unnecessarily many places.

Just add inode to the list automatically when the first non-delay extent
is added to the tree and remove inode from the list when the last
non-delay extent is removed.

Signed-off-by: Jan Kara 
Signed-off-by: Theodore Ts'o 
(cherry picked from commit b0dea4c1651f3cdb6d17604fa473e72cb74cdc6b)

https://jira.sw.ru/browse/PSBM-83335

We do face a situation when all (32) cpus on a node contend on sbi->s_es_lock
shrinking extents on a single superblock, and
shrinking extents goes very slowly (180 sec on average!).

crash> struct ext4_sb_info 0x882fcb7ca800 -p

  s_es_nr_inode = 3173832,
  s_es_stats = {
es_stats_shrunk = 70,
es_stats_cache_hits = 35182748,
es_stats_cache_misses = 2622931,
es_stats_scan_time = 182642303461,
es_stats_max_scan_time = 276290979674,

This patch should help a bit because it decreases sbi->s_es_nr_inode right
in __es_shrink() as a side effect, thus cpus which come later to __es_shrink()
will loop through fewer cycles.

Signed-off-by: Konstantin Khorenko 
---
 fs/ext4/extents.c|  2 --
 fs/ext4/extents_status.c | 10 ++
 fs/ext4/extents_status.h |  2 --
 fs/ext4/inode.c  |  2 --
 fs/ext4/ioctl.c  |  2 --
 fs/ext4/super.c  |  1 -
 6 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a8675aea44ad..ccbb952482e8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4658,7 +4658,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode 
*inode,
 
trace_ext4_ext_map_blocks_exit(inode, flags, map,
   err ? err : allocated);
-   ext4_es_list_add(inode);
return err ? err : allocated;
 }
 
@@ -5248,7 +5247,6 @@ int ext4_fiemap(struct inode *inode, struct 
fiemap_extent_info *fieinfo,
error = ext4_fill_fiemap_extents(inode, start_blk,
 len_blks, fieinfo);
}
-   ext4_es_list_add(inode);
return error;
 }
 
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index d632a3e43994..77f44d382aa5 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -297,7 +297,7 @@ void ext4_es_find_delayed_extent_range(struct inode *inode,
trace_ext4_es_find_delayed_extent_range_exit(inode, es);
 }
 
-void ext4_es_list_add(struct inode *inode)
+static void ext4_es_list_add(struct inode *inode)
 {
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -313,7 +313,7 @@ void ext4_es_list_add(struct inode *inode)
spin_unlock(&sbi->s_es_lock);
 }
 
-void ext4_es_list_del(struct inode *inode)
+static void ext4_es_list_del(struct inode *inode)
 {
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -343,7 +343,8 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, 
ext4_lblk_t len,
 * We don't count delayed extent because we never try to reclaim them
 */
if (!ext4_es_is_delayed(es)) {
-   EXT4_I(inode)->i_es_shk_nr++;
+   if (!EXT4_I(inode)->i_es_shk_nr++)
+   ext4_es_list_add(inode);
percpu_counter_inc(&EXT4_SB(inode->i_sb)->
s_es_stats.es_stats_shk_cnt);
}
@@ -362,7 +363,8 @@ static void ext4_es_free_extent(struct inode *inode, struct 
extent_status *es)
/* Decrease the shrink counter when this es is not delayed */
if (!ext4_es_is_delayed(es)) {
BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
-   EXT4_I(inode)->i_es_shk_nr--;
+   if (!--EXT4_I(inode)->i_es_shk_nr)
+   ext4_es_list_del(inode);
percpu_counter_dec(&EXT4_SB(inode->i_sb)->
s_es_stats.es_stats_shk_cnt);
}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 0e6a33e81e5f..b0b78b95f481 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -150,7 +150,5 @@ static inline void ext4_es_store_pblock_status(struct 
extent_status *es,
 
 extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
 extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
-extern void ext4_es_list_add(struct inode *inode);
-extern void ext4_es_list_del(struct inode *inode);
 
 #endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1853dccc88c7..e633c707b119 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -500,7 +500,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
-

[Devel] [PATCH rh7 0/3] ext4: speedup shrinking non-delay extents

2018-04-13 Thread Konstantin Khorenko
We faced a situation when all (32) cpus on a node contend on sbi->s_es_lock
shrinking extents on a single superblock, and
shrinking extents goes very slowly (180 sec on average!).

crash> struct ext4_sb_info 0x882fcb7ca800 -p

  s_es_nr_inode = 3173832,
  s_es_stats = {
es_stats_shrunk = 70,
es_stats_cache_hits = 35182748,
es_stats_cache_misses = 2622931,
es_stats_scan_time = 182642303461,
es_stats_max_scan_time = 276290979674,

This patchset speeds up parallel shrink a bit.
If we find out this is not enough, the next step is to limit the number of shrinkers
working on a single superblock in parallel.

https://jira.sw.ru/browse/PSBM-83335

Jan Kara (1):
  ms/ext4: move handling of list of shrinkable inodes into extent status
code

Konstantin Khorenko (1):
  ext4: don't iterate over sbi->s_es_list more than the number of
elements

Waiman Long (1):
  ext4: Make cache hits/misses per-cpu counts

 fs/ext4/extents.c|  2 --
 fs/ext4/extents_status.c | 56 +---
 fs/ext4/extents_status.h |  6 ++
 fs/ext4/inode.c  |  2 --
 fs/ext4/ioctl.c  |  2 --
 fs/ext4/super.c  |  1 -
 6 files changed, 45 insertions(+), 24 deletions(-)

-- 
2.15.1

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 2/3] ext4: Make cache hits/misses per-cpu counts

2018-04-13 Thread Konstantin Khorenko
From: Waiman Long 

This patch changes the es_stats_cache_hits and es_stats_cache_misses
statistics counts to percpu counters to reduce cacheline contention
issues when multiple threads are trying to update those counts
simultaneously.

With a 38-threads fio I/O test with 2 shared files (on DAX-mount
NVDIMM) running on a 4-socket Haswell-EX server with 4.6-rc1 kernel,
the aggregated bandwidths before and after the patch were:

  Test  W/O patch   With patch  % change
    -   --  
  Read-only 16499MB/s   17215MB/s+4.3%
  Read-write 4361MB/s4794MB/s+9.9%

Signed-off-by: Waiman Long 

The patch is not in mainstream yet, but was ack-ed already:
https://lkml.org/lkml/2016/4/29/584

This patch does not improve the performance of __es_shrink() itself,
but generally improves the fastpath.

Was found while digging:
https://jira.sw.ru/browse/PSBM-83335

Signed-off-by: Konstantin Khorenko 
---
 fs/ext4/extents_status.c | 38 +-
 fs/ext4/extents_status.h |  4 ++--
 2 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 77f44d382aa5..a3b9c480ec20 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -769,6 +769,15 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t 
lblk,
write_unlock(&EXT4_I(inode)->i_es_lock);
 }
 
+/*
+ * For pure statistics count, use a large batch size to make sure that
+ * it does percpu update as much as possible.
+ */
+static inline void ext4_es_stats_inc(struct percpu_counter *fbc)
+{
+   __percpu_counter_add(fbc, 1, (1 << 30));
+}
+
 /*
  * ext4_es_lookup_extent() looks up an extent in extent status tree.
  *
@@ -823,9 +832,9 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t 
lblk,
es->es_lblk = es1->es_lblk;
es->es_len = es1->es_len;
es->es_pblk = es1->es_pblk;
-   stats->es_stats_cache_hits++;
+   ext4_es_stats_inc(&stats->es_stats_cache_hits);
} else {
-   stats->es_stats_cache_misses++;
+   ext4_es_stats_inc(&stats->es_stats_cache_misses);
}
 
read_unlock(&EXT4_I(inode)->i_es_lock);
@@ -1122,9 +1131,9 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file 
*seq, void *v)
seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
   percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
   percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
-   seq_printf(seq, "  %lu/%lu cache hits/misses\n",
-  es_stats->es_stats_cache_hits,
-  es_stats->es_stats_cache_misses);
+   seq_printf(seq, "  %lld/%lld cache hits/misses\n",
+  percpu_counter_sum_positive(&es_stats->es_stats_cache_hits),
+  
percpu_counter_sum_positive(&es_stats->es_stats_cache_misses));
if (inode_cnt)
seq_printf(seq, "  %d inodes on list\n", inode_cnt);
 
@@ -1188,8 +1197,6 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
sbi->s_es_nr_inode = 0;
spin_lock_init(&sbi->s_es_lock);
sbi->s_es_stats.es_stats_shrunk = 0;
-   sbi->s_es_stats.es_stats_cache_hits = 0;
-   sbi->s_es_stats.es_stats_cache_misses = 0;
sbi->s_es_stats.es_stats_scan_time = 0;
sbi->s_es_stats.es_stats_max_scan_time = 0;
err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, 
GFP_KERNEL);
@@ -1199,19 +1206,30 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
if (err)
goto err1;
 
+   err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0, 
GFP_KERNEL);
+   if (err)
+   goto err2;
+
+   err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0, 
GFP_KERNEL);
+   if (err)
+   goto err3;
+
sbi->s_es_shrinker.scan_objects = ext4_es_scan;
sbi->s_es_shrinker.count_objects = ext4_es_count;
sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
err = register_shrinker(&sbi->s_es_shrinker);
if (err)
-   goto err2;
+   goto err4;
 
if (sbi->s_proc)
proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
 &ext4_es_seq_shrinker_info_fops, sbi);
 
return 0;
-
+err4:
+   percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
+err3:
+   percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
 err2:
percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
 err1:
@@ -1225,6 +1243,8 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
remove_proc_entry("es_shrinker_info", sbi->s_proc);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
+

[Devel] [PATCH rh7 3/3] ext4: don't iterate over sbi->s_es_list more than the number of elements

2018-04-13 Thread Konstantin Khorenko
If there are several shrinkers working on a single sbi, there can easily be a
situation when neighbor shrinkers have reclaimed a bunch of extents and thus a
bunch of inodes from the s_es_list, but we don't honor this and iterate over
sbi->s_es_list the number of times equal to the initial number of inodes in
s_es_list.

Before each iteration, check if we are going to iterate more than the number of
inodes in the list and adjust nr_to_walk accordingly.

https://jira.sw.ru/browse/PSBM-83335

Signed-off-by: Konstantin Khorenko 
---
 fs/ext4/extents_status.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index a3b9c480ec20..ed1f63eef74c 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -998,6 +998,14 @@ static int __es_shrink(struct ext4_sb_info *sbi, int 
nr_to_scan,
spin_unlock(&sbi->s_es_lock);
goto out;
}
+   /*
+* Another shrinker can remove a bunch of extents in parallel,
+* we don't have to iterate more than the current number of
+* inodes in the list.
+*/
+   if (nr_to_walk > sbi->s_es_nr_inode)
+   nr_to_walk = sbi->s_es_nr_inode;
+
ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
  i_es_list);
/* Move the inode to the tail */
-- 
2.15.1

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH rh7 0/3] ext4: speedup shrinking non-delay extents

2018-04-13 Thread Dmitry Monakhov
Konstantin Khorenko  writes:

> We faced a situation when all (32) cpus on a node content on sbi->s_es_lock
> shrinking extents on a single superblock and
> shrinking extents goes very slow (180 sec in average!).
>
> crash> struct ext4_sb_info 0x882fcb7ca800 -p
>
>   s_es_nr_inode = 3173832,
>   s_es_stats = {
> es_stats_shrunk = 70,
> es_stats_cache_hits = 35182748,
> es_stats_cache_misses = 2622931,
> es_stats_scan_time = 182642303461,
> es_stats_max_scan_time = 276290979674,
>
> This patchset speeds up parallel shrink a bit.
> If we findout this is not enough, next step is to limit the number of 
> shrinkers
> working on a single superslock in parallel.
>
> https://jira.sw.ru/browse/PSBM-83335
>
> Jan Kara (1):
>   ms/ext4: move handling of list of shrinkable inodes into extent status
> code
>
> Konstantin Khorenko (1):
>   ext4: don't iterate over sbi->s_es_list more than the number of
> elements
>
> Waiman Long (1):
>   ext4: Make cache hits/misses per-cpu counts
ACK.
>
>  fs/ext4/extents.c|  2 --
>  fs/ext4/extents_status.c | 56 
> +---
>  fs/ext4/extents_status.h |  6 ++
>  fs/ext4/inode.c  |  2 --
>  fs/ext4/ioctl.c  |  2 --
>  fs/ext4/super.c  |  1 -
>  6 files changed, 45 insertions(+), 24 deletions(-)
>
> -- 
> 2.15.1


signature.asc
Description: PGP signature
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH] net: Change number of netlink repair

2018-04-13 Thread Kirill Tkhai
Mainstream has NETLINK_EXT_ACK 11, which is used by fresh
iproute utils. We don't want these utils to switch the socket
into repair mode.

https://jira.sw.ru/browse/PSBM-83415

Signed-off-by: Kirill Tkhai 
---
 include/uapi/linux/netlink.h |3 ++-
 net/netlink/af_netlink.c |2 +-
 net/netlink/af_netlink.h |3 ++-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index 56ddadf14e0e..5d202cc19705 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -111,7 +111,8 @@ struct nlmsgerr {
 #define NETLINK_LISTEN_ALL_NSID8
 #define NETLINK_LIST_MEMBERSHIPS   9
 #define NETLINK_CAP_ACK10
-#define NETLINK_REPAIR 11
+
+#define NETLINK_REPAIR2127
 
 struct nl_pktinfo {
__u32   group;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 513597d267eb..0d02a287ea79 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2192,7 +2192,7 @@ static int netlink_setsockopt(struct socket *sock, int 
level, int optname,
return -EFAULT;
 
switch (optname) {
-   case NETLINK_REPAIR:
+   case NETLINK_REPAIR2:
if (val)
nlk->flags |= NETLINK_F_REPAIR;
else
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index 041b5da8bd5d..07bac2ff1054 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -11,7 +11,8 @@
 #define NETLINK_F_RECV_NO_ENOBUFS  0x8
 #define NETLINK_F_LISTEN_ALL_NSID  0x10
 #define NETLINK_F_CAP_ACK  0x20
-#define NETLINK_F_REPAIR   0x40
+
+#define NETLINK_F_REPAIR   0x8000
 
 #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8)
 #define NLGRPLONGS(x)  (NLGRPSZ(x)/sizeof(unsigned long))

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RHEL7 COMMIT] net: Change number of netlink repair

2018-04-13 Thread Konstantin Khorenko
The commit is pushed to "branch-rh7-3.10.0-693.21.1.vz7.46.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.21.1.vz7.46.5
-->
commit 9cdd731a629a0c83ae8864759944aa62d9c8bd5b
Author: Kirill Tkhai 
Date:   Fri Apr 13 19:18:44 2018 +0300

net: Change number of netlink repair

Mainstream has NETLINK_EXT_ACK 11, which is used by fresh
iproute utils. We don't want these utils to switch the socket
into repair mode.

https://jira.sw.ru/browse/PSBM-83415

Signed-off-by: Kirill Tkhai 
Acked-by: Konstantin Khorenko 
---
 include/uapi/linux/netlink.h | 3 ++-
 net/netlink/af_netlink.c | 2 +-
 net/netlink/af_netlink.h | 3 ++-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index 56ddadf14e0e..5d202cc19705 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -111,7 +111,8 @@ struct nlmsgerr {
 #define NETLINK_LISTEN_ALL_NSID8
 #define NETLINK_LIST_MEMBERSHIPS   9
 #define NETLINK_CAP_ACK10
-#define NETLINK_REPAIR 11
+
+#define NETLINK_REPAIR2127
 
 struct nl_pktinfo {
__u32   group;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 513597d267eb..0d02a287ea79 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2192,7 +2192,7 @@ static int netlink_setsockopt(struct socket *sock, int 
level, int optname,
return -EFAULT;
 
switch (optname) {
-   case NETLINK_REPAIR:
+   case NETLINK_REPAIR2:
if (val)
nlk->flags |= NETLINK_F_REPAIR;
else
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index 041b5da8bd5d..07bac2ff1054 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -11,7 +11,8 @@
 #define NETLINK_F_RECV_NO_ENOBUFS  0x8
 #define NETLINK_F_LISTEN_ALL_NSID  0x10
 #define NETLINK_F_CAP_ACK  0x20
-#define NETLINK_F_REPAIR   0x40
+
+#define NETLINK_F_REPAIR   0x8000
 
 #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8)
 #define NLGRPLONGS(x)  (NLGRPSZ(x)/sizeof(unsigned long))
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RHEL7 COMMIT] ext4: Make cache hits/misses per-cpu counts

2018-04-13 Thread Konstantin Khorenko
The commit is pushed to "branch-rh7-3.10.0-693.21.1.vz7.46.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.21.1.vz7.46.5
-->
commit e92c4dde4105d1f583f5d83f5c29e08f21f00203
Author: Waiman Long 
Date:   Fri Apr 13 19:22:22 2018 +0300

ext4: Make cache hits/misses per-cpu counts

This patch changes the es_stats_cache_hits and es_stats_cache_misses
statistics counts to percpu counters to reduce cacheline contention
issues when multiple threads are trying to update those counts
simultaneously.

With a 38-threads fio I/O test with 2 shared files (on DAX-mount
NVDIMM) running on a 4-socket Haswell-EX server with 4.6-rc1 kernel,
the aggregated bandwidths before and after the patch were:

  Test  W/O patch   With patch  % change
    -   --  
  Read-only 16499MB/s   17215MB/s+4.3%
  Read-write 4361MB/s4794MB/s+9.9%

Signed-off-by: Waiman Long 

The patch is not in mainstream yet, but was ack-ed already:
https://lkml.org/lkml/2016/4/29/584

This patch does not improve the performance of __es_shrink() itself,
but generally improves the fastpath.

Was found while digging:
https://jira.sw.ru/browse/PSBM-83335

Signed-off-by: Konstantin Khorenko 
Acked-by: Dmitry Monakhov 
---
 fs/ext4/extents_status.c | 38 +-
 fs/ext4/extents_status.h |  4 ++--
 2 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 77f44d382aa5..a3b9c480ec20 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -769,6 +769,15 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t 
lblk,
write_unlock(&EXT4_I(inode)->i_es_lock);
 }
 
+/*
+ * For pure statistics count, use a large batch size to make sure that
+ * it does percpu update as much as possible.
+ */
+static inline void ext4_es_stats_inc(struct percpu_counter *fbc)
+{
+   __percpu_counter_add(fbc, 1, (1 << 30));
+}
+
 /*
  * ext4_es_lookup_extent() looks up an extent in extent status tree.
  *
@@ -823,9 +832,9 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t 
lblk,
es->es_lblk = es1->es_lblk;
es->es_len = es1->es_len;
es->es_pblk = es1->es_pblk;
-   stats->es_stats_cache_hits++;
+   ext4_es_stats_inc(&stats->es_stats_cache_hits);
} else {
-   stats->es_stats_cache_misses++;
+   ext4_es_stats_inc(&stats->es_stats_cache_misses);
}
 
read_unlock(&EXT4_I(inode)->i_es_lock);
@@ -1122,9 +1131,9 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file 
*seq, void *v)
seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
   percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
   percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
-   seq_printf(seq, "  %lu/%lu cache hits/misses\n",
-  es_stats->es_stats_cache_hits,
-  es_stats->es_stats_cache_misses);
+   seq_printf(seq, "  %lld/%lld cache hits/misses\n",
+  percpu_counter_sum_positive(&es_stats->es_stats_cache_hits),
+  
percpu_counter_sum_positive(&es_stats->es_stats_cache_misses));
if (inode_cnt)
seq_printf(seq, "  %d inodes on list\n", inode_cnt);
 
@@ -1188,8 +1197,6 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
sbi->s_es_nr_inode = 0;
spin_lock_init(&sbi->s_es_lock);
sbi->s_es_stats.es_stats_shrunk = 0;
-   sbi->s_es_stats.es_stats_cache_hits = 0;
-   sbi->s_es_stats.es_stats_cache_misses = 0;
sbi->s_es_stats.es_stats_scan_time = 0;
sbi->s_es_stats.es_stats_max_scan_time = 0;
err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, 
GFP_KERNEL);
@@ -1199,19 +1206,30 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
if (err)
goto err1;
 
+   err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0, 
GFP_KERNEL);
+   if (err)
+   goto err2;
+
+   err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0, 
GFP_KERNEL);
+   if (err)
+   goto err3;
+
sbi->s_es_shrinker.scan_objects = ext4_es_scan;
sbi->s_es_shrinker.count_objects = ext4_es_count;
sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
err = register_shrinker(&sbi->s_es_shrinker);
if (err)
-   goto err2;
+   goto err4;
 
if (sbi->s_proc)
proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
 &ext4_es_seq_shrinker_info_fops, sbi);
 
return 0;
-
+err4:
+   percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_m

[Devel] [PATCH RHEL7 COMMIT] ext4: don't iterate over sbi->s_es_list more than the number of elements

2018-04-13 Thread Konstantin Khorenko
The commit is pushed to "branch-rh7-3.10.0-693.21.1.vz7.46.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.21.1.vz7.46.5
-->
commit 98ec64d9082be9d0b5594a0ecd98021fa995ba82
Author: Konstantin Khorenko 
Date:   Fri Apr 13 19:22:22 2018 +0300

ext4: don't iterate over sbi->s_es_list more than the number of elements

If there are several shrinkers working on a single sbi, there can easily be a
situation when neighbor shrinkers have reclaimed a bunch of extents and thus a
bunch of inodes from the s_es_list, but we don't honor this and iterate over
sbi->s_es_list the number of times equal to the initial number of inodes in
s_es_list.

Before each iteration, check if we are going to iterate more than the 
number of
inodes in the list and adjust nr_to_walk accordingly.

https://jira.sw.ru/browse/PSBM-83335

Signed-off-by: Konstantin Khorenko 
Acked-by: Dmitry Monakhov 
---
 fs/ext4/extents_status.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index a3b9c480ec20..ed1f63eef74c 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -998,6 +998,14 @@ static int __es_shrink(struct ext4_sb_info *sbi, int 
nr_to_scan,
spin_unlock(&sbi->s_es_lock);
goto out;
}
+   /*
+* Another shrinker can remove a bunch of extents in parallel,
+* we don't have to iterate more than the current number of
+* inodes in the list.
+*/
+   if (nr_to_walk > sbi->s_es_nr_inode)
+   nr_to_walk = sbi->s_es_nr_inode;
+
ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
  i_es_list);
/* Move the inode to the tail */
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RHEL7 COMMIT] ms/ext4: move handling of list of shrinkable inodes into extent status code

2018-04-13 Thread Konstantin Khorenko
The commit is pushed to "branch-rh7-3.10.0-693.21.1.vz7.46.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.21.1.vz7.46.5
-->
commit 4a45bc07cbf78806045c5b19bf6e2652ce000d23
Author: Jan Kara 
Date:   Fri Apr 13 19:22:21 2018 +0300

ms/ext4: move handling of list of shrinkable inodes into extent status code

Currently callers adding extents to extent status tree were responsible
for adding the inode to the list of inodes with freeable extents. This
is error prone and puts list handling in unnecessarily many places.

Just add inode to the list automatically when the first non-delay extent
is added to the tree and remove inode from the list when the last
non-delay extent is removed.

Signed-off-by: Jan Kara 
Signed-off-by: Theodore Ts'o 
(cherry picked from commit b0dea4c1651f3cdb6d17604fa473e72cb74cdc6b)

https://jira.sw.ru/browse/PSBM-83335

We do face a situation when all (32) cpus on a node contend on
sbi->s_es_lock
shrinking extents on a single superblock, and
shrinking extents goes very slowly (180 sec on average!).

crash> struct ext4_sb_info 0x882fcb7ca800 -p

  s_es_nr_inode = 3173832,
  s_es_stats = {
es_stats_shrunk = 70,
es_stats_cache_hits = 35182748,
es_stats_cache_misses = 2622931,
es_stats_scan_time = 182642303461,
es_stats_max_scan_time = 276290979674,

This patch should help a bit because it decreases sbi->s_es_nr_inode right
in __es_shrink() as a side effect, thus cpus which come later to
__es_shrink()
will loop through fewer cycles.

Signed-off-by: Konstantin Khorenko 
Acked-by: Dmitry Monakhov 
---
 fs/ext4/extents.c|  2 --
 fs/ext4/extents_status.c | 10 ++
 fs/ext4/extents_status.h |  2 --
 fs/ext4/inode.c  |  2 --
 fs/ext4/ioctl.c  |  2 --
 fs/ext4/super.c  |  1 -
 6 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a8675aea44ad..ccbb952482e8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4658,7 +4658,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode 
*inode,
 
trace_ext4_ext_map_blocks_exit(inode, flags, map,
   err ? err : allocated);
-   ext4_es_list_add(inode);
return err ? err : allocated;
 }
 
@@ -5248,7 +5247,6 @@ int ext4_fiemap(struct inode *inode, struct 
fiemap_extent_info *fieinfo,
error = ext4_fill_fiemap_extents(inode, start_blk,
 len_blks, fieinfo);
}
-   ext4_es_list_add(inode);
return error;
 }
 
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index d632a3e43994..77f44d382aa5 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -297,7 +297,7 @@ void ext4_es_find_delayed_extent_range(struct inode *inode,
trace_ext4_es_find_delayed_extent_range_exit(inode, es);
 }
 
-void ext4_es_list_add(struct inode *inode)
+static void ext4_es_list_add(struct inode *inode)
 {
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -313,7 +313,7 @@ void ext4_es_list_add(struct inode *inode)
spin_unlock(&sbi->s_es_lock);
 }
 
-void ext4_es_list_del(struct inode *inode)
+static void ext4_es_list_del(struct inode *inode)
 {
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -343,7 +343,8 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, 
ext4_lblk_t len,
 * We don't count delayed extent because we never try to reclaim them
 */
if (!ext4_es_is_delayed(es)) {
-   EXT4_I(inode)->i_es_shk_nr++;
+   if (!EXT4_I(inode)->i_es_shk_nr++)
+   ext4_es_list_add(inode);
percpu_counter_inc(&EXT4_SB(inode->i_sb)->
s_es_stats.es_stats_shk_cnt);
}
@@ -362,7 +363,8 @@ static void ext4_es_free_extent(struct inode *inode, struct 
extent_status *es)
/* Decrease the shrink counter when this es is not delayed */
if (!ext4_es_is_delayed(es)) {
BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
-   EXT4_I(inode)->i_es_shk_nr--;
+   if (!--EXT4_I(inode)->i_es_shk_nr)
+   ext4_es_list_del(inode);
percpu_counter_dec(&EXT4_SB(inode->i_sb)->
s_es_stats.es_stats_shk_cnt);
}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 0e6a33e81e5f..b0b78b95f481 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -150,7 +150,5 @@ static inline void ext4_es_store_pblock_status(struct 
extent_status *es,
 
 extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
 extern void ext4_es_unregister_sh