On Mon 2017-02-06 17:49:06, Kent Overstreet wrote:
> On Mon, Feb 06, 2017 at 04:47:24PM -0900, Kent Overstreet wrote:
> > On Mon, Feb 06, 2017 at 01:53:09PM +0100, Pavel Machek wrote:
> > > Still there on v4.9, 36 threads on nokia n900 cellphone.
> > > 
> > > So.. what needs to be done there?
> 
> > But, I just got an idea for how to handle this that might be halfway sane, 
> > maybe
> > I'll try and come up with a patch...
> 
> Ok, here's such a patch, only lightly tested:

I guess it would be nice for me to test it... but what is it against?
I tried after v4.10-rc5 and linux-next, but got rejects in both cases.

Thanks,
                                                                Pavel

> -- >8 --
> Subject: [PATCH] block: Make rescuer threads per request_queue, not per bioset
> 
> Note: this patch is very lightly tested.
> 
> Also, trigger rescuing whenever we block with bios on current->bio_list,
> instead of only when we block in bio_alloc_bioset(). This is more
> correct, and should result in fewer rescuer threads.
> 
> XXX: The current->bio_list plugging needs to be unified with the
> blk_plug mechanism.
> 
> TODO: If we change normal request_queue drivers to handle arbitrary size
> bios by processing requests incrementally, instead of splitting bios,
> then we can get rid of rescuer threads from those devices.
> ---
>  block/bio.c            | 107 
> ++++---------------------------------------------
>  block/blk-core.c       |  58 ++++++++++++++++++++++++---
>  block/blk-sysfs.c      |   2 +
>  include/linux/bio.h    |  16 ++++----
>  include/linux/blkdev.h |  10 +++++
>  include/linux/sched.h  |   2 +-
>  kernel/sched/core.c    |   4 ++
>  7 files changed, 83 insertions(+), 116 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index f3b5786202..9ad54a9b12 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -336,54 +336,6 @@ void bio_chain(struct bio *bio, struct bio *parent)
>  }
>  EXPORT_SYMBOL(bio_chain);
>  
> -static void bio_alloc_rescue(struct work_struct *work)
> -{
> -     struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
> -     struct bio *bio;
> -
> -     while (1) {
> -             spin_lock(&bs->rescue_lock);
> -             bio = bio_list_pop(&bs->rescue_list);
> -             spin_unlock(&bs->rescue_lock);
> -
> -             if (!bio)
> -                     break;
> -
> -             generic_make_request(bio);
> -     }
> -}
> -
> -static void punt_bios_to_rescuer(struct bio_set *bs)
> -{
> -     struct bio_list punt, nopunt;
> -     struct bio *bio;
> -
> -     /*
> -      * In order to guarantee forward progress we must punt only bios that
> -      * were allocated from this bio_set; otherwise, if there was a bio on
> -      * there for a stacking driver higher up in the stack, processing it
> -      * could require allocating bios from this bio_set, and doing that from
> -      * our own rescuer would be bad.
> -      *
> -      * Since bio lists are singly linked, pop them all instead of trying to
> -      * remove from the middle of the list:
> -      */
> -
> -     bio_list_init(&punt);
> -     bio_list_init(&nopunt);
> -
> -     while ((bio = bio_list_pop(current->bio_list)))
> -             bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
> -
> -     *current->bio_list = nopunt;
> -
> -     spin_lock(&bs->rescue_lock);
> -     bio_list_merge(&bs->rescue_list, &punt);
> -     spin_unlock(&bs->rescue_lock);
> -
> -     queue_work(bs->rescue_workqueue, &bs->rescue_work);
> -}
> -
>  /**
>   * bio_alloc_bioset - allocate a bio for I/O
>   * @gfp_mask:   the GFP_ mask given to the slab allocator
> @@ -421,54 +373,27 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
>   */
>  struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set 
> *bs)
>  {
> -     gfp_t saved_gfp = gfp_mask;
>       unsigned front_pad;
>       unsigned inline_vecs;
>       struct bio_vec *bvl = NULL;
>       struct bio *bio;
>       void *p;
>  
> -     if (!bs) {
> -             if (nr_iovecs > UIO_MAXIOV)
> -                     return NULL;
> +     WARN(current->bio_list &&
> +          !current->bio_list->q->rescue_workqueue,
> +          "allocating bio beneath generic_make_request() without rescuer");
>  
> +     if (nr_iovecs > UIO_MAXIOV)
> +             return NULL;
> +
> +     if (!bs) {
>               p = kmalloc(sizeof(struct bio) +
>                           nr_iovecs * sizeof(struct bio_vec),
>                           gfp_mask);
>               front_pad = 0;
>               inline_vecs = nr_iovecs;
>       } else {
> -             /*
> -              * generic_make_request() converts recursion to iteration; this
> -              * means if we're running beneath it, any bios we allocate and
> -              * submit will not be submitted (and thus freed) until after we
> -              * return.
> -              *
> -              * This exposes us to a potential deadlock if we allocate
> -              * multiple bios from the same bio_set() while running
> -              * underneath generic_make_request(). If we were to allocate
> -              * multiple bios (say a stacking block driver that was splitting
> -              * bios), we would deadlock if we exhausted the mempool's
> -              * reserve.
> -              *
> -              * We solve this, and guarantee forward progress, with a rescuer
> -              * workqueue per bio_set. If we go to allocate and there are
> -              * bios on current->bio_list, we first try the allocation
> -              * without __GFP_DIRECT_RECLAIM; if that fails, we punt those
> -              * bios we would be blocking to the rescuer workqueue before
> -              * we retry with the original gfp_flags.
> -              */
> -
> -             if (current->bio_list && !bio_list_empty(current->bio_list))
> -                     gfp_mask &= ~__GFP_DIRECT_RECLAIM;
> -
>               p = mempool_alloc(&bs->bio_pool, gfp_mask);
> -             if (!p && gfp_mask != saved_gfp) {
> -                     punt_bios_to_rescuer(bs);
> -                     gfp_mask = saved_gfp;
> -                     p = mempool_alloc(&bs->bio_pool, gfp_mask);
> -             }
> -
>               front_pad = bs->front_pad;
>               inline_vecs = BIO_INLINE_VECS;
>       }
> @@ -483,12 +408,6 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int 
> nr_iovecs, struct bio_set *bs)
>               unsigned long idx = 0;
>  
>               bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
> -             if (!bvl && gfp_mask != saved_gfp) {
> -                     punt_bios_to_rescuer(bs);
> -                     gfp_mask = saved_gfp;
> -                     bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, 
> &bs->bvec_pool);
> -             }
> -
>               if (unlikely(!bvl))
>                       goto err_free;
>  
> @@ -1938,10 +1857,6 @@ int biovec_init_pool(mempool_t *pool, int pool_entries)
>  
>  void bioset_exit(struct bio_set *bs)
>  {
> -     if (bs->rescue_workqueue)
> -             destroy_workqueue(bs->rescue_workqueue);
> -     bs->rescue_workqueue = NULL;
> -
>       mempool_exit(&bs->bio_pool);
>       mempool_exit(&bs->bvec_pool);
>  
> @@ -1968,10 +1883,6 @@ static int __bioset_init(struct bio_set *bs,
>  
>       bs->front_pad = front_pad;
>  
> -     spin_lock_init(&bs->rescue_lock);
> -     bio_list_init(&bs->rescue_list);
> -     INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
> -
>       bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
>       if (!bs->bio_slab)
>               return -ENOMEM;
> @@ -1983,10 +1894,6 @@ static int __bioset_init(struct bio_set *bs,
>           biovec_init_pool(&bs->bvec_pool, pool_size))
>               goto bad;
>  
> -     bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
> -     if (!bs->rescue_workqueue)
> -             goto bad;
> -
>       return 0;
>  bad:
>       bioset_exit(bs);
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 7e3cfa9c88..f716164cb3 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -48,6 +48,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
>  
>  DEFINE_IDA(blk_queue_ida);
>  
> +static void bio_rescue_work(struct work_struct *);
> +
>  /*
>   * For the allocated request tables
>   */
> @@ -759,11 +761,21 @@ struct request_queue *blk_alloc_queue_node(gfp_t 
> gfp_mask, int node_id)
>                               PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
>               goto fail_bdi;
>  
> -     if (blkcg_init_queue(q))
> +     spin_lock_init(&q->rescue_lock);
> +     bio_list_init(&q->rescue_list);
> +     INIT_WORK(&q->rescue_work, bio_rescue_work);
> +
> +     q->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
> +     if (!q->rescue_workqueue)
>               goto fail_ref;
>  
> +     if (blkcg_init_queue(q))
> +             goto fail_rescue;
> +
>       return q;
>  
> +fail_rescue:
> +     destroy_workqueue(q->rescue_workqueue);
>  fail_ref:
>       percpu_ref_exit(&q->q_usage_counter);
>  fail_bdi:
> @@ -1994,7 +2006,7 @@ generic_make_request_checks(struct bio *bio)
>   */
>  blk_qc_t generic_make_request(struct bio *bio)
>  {
> -     struct bio_list bio_list_on_stack;
> +     struct bio_plug_list bio_list_on_stack;
>       blk_qc_t ret = BLK_QC_T_NONE;
>  
>       if (!generic_make_request_checks(bio))
> @@ -2011,7 +2023,9 @@ blk_qc_t generic_make_request(struct bio *bio)
>        * should be added at the tail
>        */
>       if (current->bio_list) {
> -             bio_list_add(current->bio_list, bio);
> +             WARN(!current->bio_list->q->rescue_workqueue,
> +                  "submitting bio beneath generic_make_request() without 
> rescuer");
> +             bio_list_add(&current->bio_list->bios, bio);
>               goto out;
>       }
>  
> @@ -2030,19 +2044,23 @@ blk_qc_t generic_make_request(struct bio *bio)
>        * bio_list, and call into ->make_request() again.
>        */
>       BUG_ON(bio->bi_next);
> -     bio_list_init(&bio_list_on_stack);
> +     bio_list_init(&bio_list_on_stack.bios);
>       current->bio_list = &bio_list_on_stack;
> +
>       do {
>               struct request_queue *q = bdev_get_queue(bio->bi_bdev);
>  
> +             current->bio_list->q = q;
> +
>               if (likely(blk_queue_enter(q, false) == 0)) {
>                       ret = q->make_request_fn(q, bio);
>  
>                       blk_queue_exit(q);
>  
> -                     bio = bio_list_pop(current->bio_list);
> +                     bio = bio_list_pop(&current->bio_list->bios);
>               } else {
> -                     struct bio *bio_next = bio_list_pop(current->bio_list);
> +                     struct bio *bio_next =
> +                             bio_list_pop(&current->bio_list->bios);
>  
>                       bio_io_error(bio);
>                       bio = bio_next;
> @@ -2055,6 +2073,34 @@ blk_qc_t generic_make_request(struct bio *bio)
>  }
>  EXPORT_SYMBOL(generic_make_request);
>  
> +static void bio_rescue_work(struct work_struct *work)
> +{
> +     struct request_queue *q =
> +             container_of(work, struct request_queue, rescue_work);
> +     struct bio *bio;
> +
> +     while (1) {
> +             spin_lock(&q->rescue_lock);
> +             bio = bio_list_pop(&q->rescue_list);
> +             spin_unlock(&q->rescue_lock);
> +
> +             if (!bio)
> +                     break;
> +
> +             generic_make_request(bio);
> +     }
> +}
> +
> +void blk_punt_blocked_bios(struct bio_plug_list *list)
> +{
> +     spin_lock(&list->q->rescue_lock);
> +     bio_list_merge(&list->q->rescue_list, &list->bios);
> +     bio_list_init(&list->bios);
> +     spin_unlock(&list->q->rescue_lock);
> +
> +     queue_work(list->q->rescue_workqueue, &list->q->rescue_work);
> +}
> +
>  /**
>   * submit_bio - submit a bio to the block device layer for I/O
>   * @bio: The &struct bio which describes the I/O
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 7f27a18cc4..77529238d1 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -660,6 +660,8 @@ static void blk_release_queue(struct kobject *kobj)
>  
>       blk_trace_shutdown(q);
>  
> +     if (q->rescue_workqueue)
> +             destroy_workqueue(q->rescue_workqueue);
>       if (q->bio_split)
>               bioset_free(q->bio_split);
>  
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index 1ffe8e37ae..87eeec7eda 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -658,6 +658,13 @@ static inline struct bio *bio_list_get(struct bio_list 
> *bl)
>       return bio;
>  }
>  
> +struct bio_plug_list {
> +     struct bio_list         bios;
> +     struct request_queue    *q;
> +};
> +
> +void blk_punt_blocked_bios(struct bio_plug_list *);
> +
>  /*
>   * Increment chain count for the bio. Make sure the CHAIN flag update
>   * is visible before the raised count.
> @@ -687,15 +694,6 @@ struct bio_set {
>       mempool_t bio_integrity_pool;
>       mempool_t bvec_integrity_pool;
>  #endif
> -
> -     /*
> -      * Deadlock avoidance for stacking block drivers: see comments in
> -      * bio_alloc_bioset() for details
> -      */
> -     spinlock_t              rescue_lock;
> -     struct bio_list         rescue_list;
> -     struct work_struct      rescue_work;
> -     struct workqueue_struct *rescue_workqueue;
>  };
>  
>  struct biovec_slab {
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index c47c358ba0..f64b886c65 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -476,6 +476,16 @@ struct request_queue {
>       struct bio_set          *bio_split;
>  
>       bool                    mq_sysfs_init_done;
> +
> +     /*
> +      * Deadlock avoidance, to deal with the plugging in
> +      * generic_make_request() that converts recursion to iteration to avoid
> +      * stack overflow:
> +      */
> +     spinlock_t              rescue_lock;
> +     struct bio_list         rescue_list;
> +     struct work_struct      rescue_work;
> +     struct workqueue_struct *rescue_workqueue;
>  };
>  
>  #define QUEUE_FLAG_QUEUED    1       /* uses generic tag queueing */
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 2865d10a28..59df7a1030 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1762,7 +1762,7 @@ struct task_struct {
>       void *journal_info;
>  
>  /* stacked block device info */
> -     struct bio_list *bio_list;
> +     struct bio_plug_list *bio_list;
>  
>  #ifdef CONFIG_BLOCK
>  /* stack plugging */
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index bd39d698cb..23b6290ba1 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3439,6 +3439,10 @@ static inline void sched_submit_work(struct 
> task_struct *tsk)
>  {
>       if (!tsk->state || tsk_is_pi_blocked(tsk))
>               return;
> +
> +     if (tsk->bio_list && !bio_list_empty(&tsk->bio_list->bios))
> +             blk_punt_blocked_bios(tsk->bio_list);
> +
>       /*
>        * If we are going to sleep and we have plugged IO queued,
>        * make sure to submit it to avoid deadlocks.

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) 
http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

Attachment: signature.asc
Description: Digital signature

Reply via email to