[Devel] [PATCH RHEL v2] mm: Reduce access frequency to shrinker_rwsem during shrink_slab

2020-08-20 Thread Valeriy Vdovin
Bug https://jira.sw.ru/browse/PSBM-99181 describes the following problem:
when the kernel has open NFS delegations and the NFS server is not
accessible at the time the NFS shrinker is called, the whole shrinker
list execution gets stuck until the NFS server is back. Besides being a
problem in itself, it also introduces a bigger one: during that hang the
shrinker_rwsem is also held, so no new mounts can be done at that time,
because a new superblock tries to register its own shrinker and also
gets stuck acquiring shrinker_rwsem.

Commit 9e9e35d050955648449498827deb2d43be0564e1 is a workaround for that
problem. It is known that during a single shrinker execution we do not
actually need to hold shrinker_rwsem, so we release and reacquire the
rwsem for each shrinker in the list.

Because of this workaround the shrink_slab function now experiences a
major slowdown, since shrinker_rwsem is accessed twice for each shrinker
in the list. On an idle freshly booted system the shrinker_list can be
iterated up to 1600 times a second, although originally the problem was
local to a single NFS shrinker.

This patch fixes commit 9e9e35d050955648449498827deb2d43be0564e1 so
that, before calling up_read on shrinker_rwsem, we check that this
really is an NFS shrinker by checking the NFS magic in the superblock,
when it is reachable from the shrinker.

https://jira.sw.ru/browse/PSBM-99181

Co-authored-by: Andrey Ryabinin 
Signed-off-by: Valeriy Vdovin 

Changes:
  v2: Added missing 'rwsem_is_contended' check
---
 fs/super.c  |  2 +-
 mm/vmscan.c | 65 ++---
 2 files changed, 50 insertions(+), 17 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index f131d14..1cf377a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -80,7 +80,7 @@ EXPORT_SYMBOL(dcache_is_low);
  * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
  * take a passive reference to the superblock to avoid this from occurring.
  */
-static unsigned long super_cache_scan(struct shrinker *shrink,
+unsigned long super_cache_scan(struct shrinker *shrink,
  struct shrink_control *sc)
 {
struct super_block *sb;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d7082d2..4fa86e7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -453,6 +453,20 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
return freed;
 }
 
+unsigned long super_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc);
+
+static inline bool is_nfs_shrinker(struct shrinker *shrinker)
+{
+   struct super_block *sb = container_of(shrinker,
+   struct super_block, s_shrink);
+
+   if (shrinker->scan_objects == &super_cache_scan)
+   return sb->s_magic == NFS_SUPER_MAGIC;
+
+   return false;
+}
+
 struct shrinker *get_shrinker(struct shrinker *shrinker)
 {
/*
@@ -511,6 +525,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
.memcg = memcg,
};
struct shrinker *shrinker;
+   bool is_nfs;
 
shrinker = idr_find(&shrinker_idr, i);
if (unlikely(!shrinker)) {
@@ -518,6 +533,8 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
continue;
}
 
+   is_nfs = is_nfs_shrinker(shrinker);
+
/*
 * Take a refcnt on a shrinker so that it can't be freed or
 * removed from shrinker_idr (and shrinker_list). These way we
@@ -527,10 +544,16 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 * take too much time to finish (e.g. on nfs). And holding
 * global shrinker_rwsem can block registring and unregistring
 * of shrinkers.
+*
+* The up_read logic should only be executed for nfs shrinker
+* path, because it has proven to hang. For others it should be
+* skipped to reduce performance penalties.
 */
-   if(!get_shrinker(shrinker))
-   continue;
-   up_read(&shrinker_rwsem);
+   if (is_nfs) {
+   if (!get_shrinker(shrinker))
+   continue;
+   up_read(&shrinker_rwsem);
+   }
 
ret = do_shrink_slab(&sc, shrinker, priority);
if (ret == SHRINK_EMPTY) {
@@ -565,14 +588,18 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 * memcg_expand_one_shrinker_map if new shrinkers
 * were registred in the meanwhile.
 */
-   if (!down_read_trylock(&shrinker_rwsem)) {
-   freed = freed ? : 1;
+   if (is_nfs) {
+   if (!down_read_trylock(&shrinker_rwsem)) {
+   freed = freed ? : 1;
+   
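
To summarize the intent of the change, here is a rough, non-authoritative
sketch of the per-shrinker loop body in shrink_slab_memcg() after this
patch, with the rwsem_is_contended() check mentioned in the v2 changelog
re-added for the non-NFS path (names taken from the hunks above; this is
illustrative, not the literal v2 hunk):

	is_nfs = is_nfs_shrinker(shrinker);

	/* Drop shrinker_rwsem only for the NFS shrinker, which may hang. */
	if (is_nfs) {
		if (!get_shrinker(shrinker))
			continue;
		up_read(&shrinker_rwsem);
	}

	ret = do_shrink_slab(&sc, shrinker, priority);
	...

	if (is_nfs) {
		/* Reacquire the rwsem before touching the shrinker map again. */
		if (!down_read_trylock(&shrinker_rwsem)) {
			freed = freed ?: 1;
			put_shrinker(shrinker);
			return freed;
		}
		put_shrinker(shrinker);
		map = memcg_nid_shrinker_map(memcg, nid);
		nr_max = min(shrinker_nr_max, map->nr_max);
	} else if (rwsem_is_contended(&shrinker_rwsem)) {
		/* Somebody is waiting on the rwsem: bail out early. */
		freed = freed ?: 1;
		break;
	}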

[Devel] [PATCH RH7 2/2] ploop: Cleanup in ploop_make_request()

2020-08-20 Thread Kirill Tkhai
BUG_ON(bio->bi_idx) is already checked at the start of this function,
so there is no reason to check it twice.

Signed-off-by: Kirill Tkhai 
---
 drivers/block/ploop/dev.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 25b516cad9cc..36c063b80df3 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -990,7 +990,7 @@ static void ploop_make_request(struct request_queue *q, struct bio *bio)
plo->st.bio_splits++;
 
if (!(bio->bi_rw & REQ_DISCARD))
-   BUG_ON(bio->bi_vcnt != 1 || bio->bi_idx != 0);
+   BUG_ON(bio->bi_vcnt != 1);
 
bp = bio_split(bio, first_sectors);
ploop_make_request(q, &bp->bio1);




[Devel] [PATCH RH7 1/2] ploop: io_kaio: Introduce 4K discard_granularity with 1M clearing indexes when possible

2020-08-20 Thread Kirill Tkhai
1) In case the discard size is less than 1 cluster,
   a small hole is punched.
2) In case the discard request covers the whole cluster,
   the index in the BAT is also cleared.

Since small 4K holes require a 4K discard_granularity,
it becomes impossible to use the block level to make
discard requests 1 cluster aligned. This requires
us to split the requests manually, so the force_split_discard_reqs
parameter was introduced.

See the comments in kaio_queue_settings() for details.

https://jira.sw.ru/browse/PSBM-105347

Signed-off-by: Kirill Tkhai 
---
 drivers/block/ploop/dev.c |   12 
 drivers/block/ploop/io_kaio.c |   36 +++-
 include/linux/ploop/ploop.h   |1 +
 3 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 5f1a0c289a26..25b516cad9cc 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -979,7 +979,8 @@ static void ploop_make_request(struct request_queue *q, struct bio *bio)
 * bio layer assumes that it can prepare single-page bio
 * not depending on any alignment constraints. So be it.
 */
-   if (!(bio->bi_rw & REQ_DISCARD) && bio->bi_size &&
+   if ((!(bio->bi_rw & REQ_DISCARD) || plo->force_split_discard_reqs) &&
+   bio->bi_size &&
(bio->bi_sector >> cluster_log) !=
((bio->bi_sector + (bio->bi_size >> 9) - 1) >> cluster_log)) {
struct bio_pair *bp;
@@ -988,7 +989,8 @@ static void ploop_make_request(struct request_queue *q, struct bio *bio)
 
plo->st.bio_splits++;
 
-   BUG_ON(bio->bi_vcnt != 1 || bio->bi_idx != 0);
+   if (!(bio->bi_rw & REQ_DISCARD))
+   BUG_ON(bio->bi_vcnt != 1 || bio->bi_idx != 0);
 
bp = bio_split(bio, first_sectors);
ploop_make_request(q, &bp->bio1);
@@ -2298,7 +2300,7 @@ static bool ploop_can_issue_discard(struct ploop_device *plo,
if (!list_is_singular(&plo->map.delta_list))
return false;
 
-   return whole_block(plo, preq);
+   return whole_block(plo, preq) || plo->force_split_discard_reqs;
 }
 
 static void
@@ -2568,7 +2570,8 @@ ploop_entry_request(struct ploop_request * preq)
}
preq->iblock = iblk;
if (!(preq->req_rw & REQ_DISCARD) ||
-   (delta->ops->capability & PLOOP_FMT_CAP_IDENTICAL))
+   (delta->ops->capability & PLOOP_FMT_CAP_IDENTICAL) ||
+   !whole_block(plo, preq))
preq->eng_state = PLOOP_E_COMPLETE;
else
preq->eng_state = PLOOP_E_DATA_WBI;
@@ -4205,6 +4208,7 @@ static int ploop_start(struct ploop_device * plo, struct block_device *bdev)
blk_queue_merge_bvec(q, ploop_merge_bvec);
blk_queue_flush(q, REQ_FLUSH);
 
+   plo->force_split_discard_reqs = false;
top_delta->io.ops->queue_settings(&top_delta->io, q);
/* REQ_WRITE_SAME is not supported */
blk_queue_max_write_same_sectors(q, 0);
diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c
index 2052dac43c1a..365e2e3850d4 100644
--- a/drivers/block/ploop/io_kaio.c
+++ b/drivers/block/ploop/io_kaio.c
@@ -1140,7 +1140,41 @@ static void kaio_queue_settings(struct ploop_io * io, struct request_queue * q)
if (inode->i_sb->s_magic == EXT4_SUPER_MAGIC) {
WARN_ON(!kaio_backed_ext4);
blk_queue_stack_limits(q, bdev_get_queue(io->files.bdev));
-   ploop_set_discard_limits(io->plo);
+   /*
+* There is no way to force the block engine to split a request
+* to fit a single cluster when discard granularity is 4K
+* (inherited from the fs block size in blk_queue_stack_limits()).
+* So, ploop_make_request() splits them.
+*/
+   io->plo->force_split_discard_reqs = true;
+   /*
+* Why not (1 << io->plo->cluster_log)?
+* Someone may want to clear indexes in case a request
+* is big enough to fit the whole cluster.
+* In case max_discard_sectors is 1 cluster, a request
+* for [cluster_start - 4K, cluster_start + cluster_size)
+* at block level will be split in two requests:
+*
+* [cluster_start - 4K, cluster_start + cluster_size - 4K)
+* [cluster_start + cluster_size - 4K, cluster_start + cluster_size)
+*
+* Then, ploop_make_request() splits the first of them in two
+* to fit a single cluster, so all three requests will be smaller
+* than 1 cluster, and no index will be cleared.
+   
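
To make the arithmetic in the comment above concrete, here is a small
illustrative userspace program (not part of the patch; it only mimics
the splitting described above, assuming a 1M cluster and a discard
request that starts 4K before a cluster boundary):

/* Illustrative only: mimics the block-layer + ploop_make_request()
 * splitting described above when max_discard_sectors were limited
 * to 1 cluster. */
#include <stdio.h>

#define KB(x) ((unsigned long long)(x) << 10)
#define MB(x) ((unsigned long long)(x) << 20)

int main(void)
{
	unsigned long long cluster = MB(1);        /* assumed cluster size */
	unsigned long long start = MB(1) - KB(4);  /* begins 4K before a cluster */
	unsigned long long len = cluster + KB(4);  /* covers one whole cluster */
	unsigned long long end = start + len;

	/* 1) block layer splits by max_discard_sectors == 1 cluster */
	unsigned long long a_end = start + cluster;
	printf("block layer: [%llu, %llu) and [%llu, %llu)\n",
	       start, a_end, a_end, end);

	/* 2) ploop_make_request() splits the first piece at the cluster boundary */
	unsigned long long boundary = ((start / cluster) + 1) * cluster;
	printf("ploop split: [%llu, %llu) and [%llu, %llu)\n",
	       start, boundary, boundary, a_end);

	/* none of the three resulting pieces covers a full cluster,
	 * so no BAT index would ever be cleared */
	printf("piece sizes: %llu, %llu, %llu (cluster = %llu)\n",
	       boundary - start, a_end - boundary, end - a_end, cluster);
	return 0;
}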

Re: [Devel] [PATCH RHEL7] mm: Reduce access frequency to shrinker_rwsem during shrink_slab

2020-08-20 Thread Andrey Ryabinin



On 8/20/20 11:32 AM, Valeriy Vdovin wrote:

> @@ -565,14 +588,16 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>* memcg_expand_one_shrinker_map if new shrinkers
>* were registred in the meanwhile.
>*/
> - if (!down_read_trylock(&shrinker_rwsem)) {
> - freed = freed ? : 1;
> + if (is_nfs) {
> + if (!down_read_trylock(&shrinker_rwsem)) {
> + freed = freed ? : 1;
> + put_shrinker(shrinker);
> + return freed;
> + }
>   put_shrinker(shrinker);
> - return freed;
> + map = memcg_nid_shrinker_map(memcg, nid);
> + nr_max = min(shrinker_nr_max, map->nr_max);
>   }

Need to add the rwsem_is_contended() check back. It was here before commit 9e9e35d05.

else if (rwsem_is_contended(&shrinker_rwsem)) {
freed = freed ? : 1;
break;
}



> - put_shrinker(shrinker);
> - map = memcg_nid_shrinker_map(memcg, nid);
> - nr_max = min(shrinker_nr_max, map->nr_max);
>   }
>  unlock:
>   up_read(&shrinker_rwsem);


[Devel] [PATCH RHEL7] mm: Reduce access frequency to shrinker_rwsem during shrink_slab

2020-08-20 Thread Valeriy Vdovin
Bug https://jira.sw.ru/browse/PSBM-99181 describes the following problem:
when the kernel has open NFS delegations and the NFS server is not
accessible at the time the NFS shrinker is called, the whole shrinker
list execution gets stuck until the NFS server is back. Besides being a
problem in itself, it also introduces a bigger one: during that hang the
shrinker_rwsem is also held, so no new mounts can be done at that time,
because a new superblock tries to register its own shrinker and also
gets stuck acquiring shrinker_rwsem.

Commit 9e9e35d050955648449498827deb2d43be0564e1 is a workaround for that
problem. It is known that during a single shrinker execution we do not
actually need to hold shrinker_rwsem, so we release and reacquire the
rwsem for each shrinker in the list.

Because of this workaround the shrink_slab function now experiences a
major slowdown, since shrinker_rwsem is accessed twice for each shrinker
in the list. On an idle freshly booted system the shrinker_list can be
iterated up to 1600 times a second, although originally the problem was
local to a single NFS shrinker.

This patch fixes commit 9e9e35d050955648449498827deb2d43be0564e1 so
that, before calling up_read on shrinker_rwsem, we check that this
really is an NFS shrinker by checking the NFS magic in the superblock,
when it is reachable from the shrinker.

https://jira.sw.ru/browse/PSBM-99181

Co-authored-by: Andrey Ryabinin 
Signed-off-by: Valeriy Vdovin 
---
 fs/super.c  |  2 +-
 mm/vmscan.c | 63 +
 2 files changed, 48 insertions(+), 17 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index f131d14..1cf377a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -80,7 +80,7 @@ EXPORT_SYMBOL(dcache_is_low);
  * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
  * take a passive reference to the superblock to avoid this from occurring.
  */
-static unsigned long super_cache_scan(struct shrinker *shrink,
+unsigned long super_cache_scan(struct shrinker *shrink,
  struct shrink_control *sc)
 {
struct super_block *sb;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d7082d2..4fb5d78 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -453,6 +453,20 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
return freed;
 }
 
+unsigned long super_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc);
+
+static inline bool is_nfs_shrinker(struct shrinker *shrinker)
+{
+   struct super_block *sb = container_of(shrinker,
+   struct super_block, s_shrink);
+
+   if (shrinker->scan_objects == &super_cache_scan)
+   return sb->s_magic == NFS_SUPER_MAGIC;
+
+   return false;
+}
+
 struct shrinker *get_shrinker(struct shrinker *shrinker)
 {
/*
@@ -511,6 +525,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
.memcg = memcg,
};
struct shrinker *shrinker;
+   bool is_nfs;
 
shrinker = idr_find(&shrinker_idr, i);
if (unlikely(!shrinker)) {
@@ -518,6 +533,8 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
continue;
}
 
+   is_nfs = is_nfs_shrinker(shrinker);
+
/*
 * Take a refcnt on a shrinker so that it can't be freed or
 * removed from shrinker_idr (and shrinker_list). These way we
@@ -527,10 +544,16 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 * take too much time to finish (e.g. on nfs). And holding
 * global shrinker_rwsem can block registring and unregistring
 * of shrinkers.
+*
+* The up_read logic should only be executed for nfs shrinker
+* path, because it has proven to hang. For others it should be
+* skipped to reduce performance penalties.
 */
-   if(!get_shrinker(shrinker))
-   continue;
-   up_read(&shrinker_rwsem);
+   if (is_nfs) {
+   if (!get_shrinker(shrinker))
+   continue;
+   up_read(&shrinker_rwsem);
+   }
 
ret = do_shrink_slab(&sc, shrinker, priority);
if (ret == SHRINK_EMPTY) {
@@ -565,14 +588,16 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 * memcg_expand_one_shrinker_map if new shrinkers
 * were registred in the meanwhile.
 */
-   if (!down_read_trylock(&shrinker_rwsem)) {
-   freed = freed ? : 1;
+   if (is_nfs) {
+   if (!down_read_trylock(&shrinker_rwsem)) {
+   freed = freed ? : 1;
+   put_shrinker(shrinker);
+