Chris-

The patch cannot be applied to the latest drm-intel-nightly directly.
I modified it slightly to make it apply.
The patch helps a lot on HSW, but only a little on BDW.
The test is to transcode 26 streams, which creates 244 threads.

CPU util      | w/o patch  |   w/ patch
----------------------------------------------------------
HSW async 1   |   102%     |     61%
HSW async 5   |   114%     |     46%
BDW async 1   |   116%     |     116%
BDW async 5   |   111%     |     107%

-Zhipeng
> -----Original Message-----
> From: Chris Wilson [mailto:ch...@chris-wilson.co.uk]
> Sent: Saturday, October 31, 2015 6:35 PM
> To: intel-gfx@lists.freedesktop.org
> Cc: Chris Wilson; Rogozhkin, Dmitry V; Gong, Zhipeng
> Subject: [PATCH] RFC drm/i915: Slaughter the thundering i915_wait_request
> herd
> 
> One particularly stressful scenario consists of many independent tasks all
> competing for GPU time and waiting upon the results (e.g. realtime
> transcoding of many, many streams). One bottleneck in particular is that each
> client waits on its own results, but every client is woken up after every
> batchbuffer - hence the thunder of hooves as then every client must do its
> heavyweight dance to read a coherent seqno to see if it is the lucky one.
> Alternatively, we can have one worker responsible for waking after an
> interrupt, checking the seqno and only waking up the clients who are
> complete. The disadvantage is that in the uncontended scenario (i.e. only one
> waiter) we incur an extra context switch in the wakeup path - though that
> should be mitigated somewhat by the busywait we do first before sleeping.
> 
> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
> Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozh...@intel.com>
> Cc: "Gong, Zhipeng" <zhipeng.g...@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |   2 +-
>  drivers/gpu/drm/i915/i915_gem.c         |  92 ++++--------------
>  drivers/gpu/drm/i915/i915_gem_request.h |   6 ++
>  drivers/gpu/drm/i915/intel_lrc.c        |   3 +
>  drivers/gpu/drm/i915/intel_ringbuffer.c | 159
> +++++++++++++++++++++++++++++++-
>  drivers/gpu/drm/i915/intel_ringbuffer.h |   9 ++
>  6 files changed, 196 insertions(+), 75 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 3d4c422b3587..fe0d5ddad49d 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1442,7 +1442,7 @@ struct i915_gpu_error {
>  #define I915_STOP_RING_ALLOW_WARN      (1 << 30)
> 
>       /* For missed irq/seqno simulation. */
> -     unsigned int test_irq_rings;
> +     unsigned long test_irq_rings;
>  };
> 
>  enum modeset_restore {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c
> b/drivers/gpu/drm/i915/i915_gem.c index 29bd5238b824..1a89e7cc76d1
> 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1144,17 +1144,6 @@ i915_gem_check_wedge(unsigned reset_counter,
>       return 0;
>  }
> 
> -static void fake_irq(unsigned long data) -{
> -     wake_up_process((struct task_struct *)data);
> -}
> -
> -static bool missed_irq(struct drm_i915_private *dev_priv,
> -                    struct intel_engine_cs *ring)
> -{
> -     return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
> -}
> -
>  static int __i915_spin_request(struct drm_i915_gem_request *req)  {
>       unsigned long timeout;
> @@ -1199,27 +1188,17 @@ int __i915_wait_request(struct
> drm_i915_gem_request *req,
>                       s64 *timeout,
>                       struct intel_rps_client *rps)
>  {
> -     struct intel_engine_cs *ring = i915_gem_request_get_ring(req);
> -     struct drm_i915_private *dev_priv = req->i915;
> -     const bool irq_test_in_progress =
> -             ACCESS_ONCE(dev_priv->gpu_error.test_irq_rings) &
> intel_ring_flag(ring);
>       DEFINE_WAIT(wait);
> -     unsigned long timeout_expire;
> +     unsigned long timeout_remain;
>       s64 before, now;
>       int ret;
> 
> -     WARN(!intel_irqs_enabled(dev_priv), "IRQs disabled");
> -
> -     if (list_empty(&req->list))
> -             return 0;
> -
>       if (i915_gem_request_completed(req, true))
>               return 0;
> 
> -     timeout_expire = timeout ?
> -             jiffies + nsecs_to_jiffies_timeout((u64)*timeout) : 0;
> +     timeout_remain = timeout ? nsecs_to_jiffies_timeout((u64)*timeout) :
> +0;
> 
> -     intel_rps_boost(dev_priv, rps, req->emitted_jiffies);
> +     intel_rps_boost(req->i915, rps, req->emitted_jiffies);
> 
>       /* Record current time in case interrupted by signal, or wedged */
>       trace_i915_gem_request_wait_begin(req);
> @@ -1230,67 +1209,34 @@ int __i915_wait_request(struct
> drm_i915_gem_request *req,
>       if (ret == 0)
>               goto out;
> 
> -     if (!irq_test_in_progress && WARN_ON(!ring->irq_get(ring))) {
> -             ret = -ENODEV;
> -             goto out;
> -     }
> -
> +     intel_engine_add_wakeup(req);
>       for (;;) {
> -             struct timer_list timer;
> -
> -             prepare_to_wait(&ring->irq_queue, &wait,
> -                             interruptible ? TASK_INTERRUPTIBLE :
> TASK_UNINTERRUPTIBLE);
> +             int state = interruptible ? TASK_INTERRUPTIBLE :
> +TASK_UNINTERRUPTIBLE;
> 
> -             /* We need to check whether any gpu reset happened in between
> -              * the caller grabbing the seqno and now ... */
> -             if (req->reset_counter != 
> i915_reset_counter(&dev_priv->gpu_error))
> {
> -                     /* As we do not requeue the request over a GPU reset,
> -                      * if one does occur we know that the request is
> -                      * effectively complete.
> -                      */
> -                     ret = 0;
> -                     break;
> -             }
> +             prepare_to_wait(&req->wait, &wait, state);
> 
> -             if (i915_gem_request_completed(req, false)) {
> +             if (i915_gem_request_completed(req, true) ||
> +                 req->reset_counter !=
> i915_reset_counter(&req->i915->gpu_error))
> +{
>                       ret = 0;
>                       break;
>               }
> 
> -             if (interruptible && signal_pending(current)) {
> +             if (signal_pending_state(state, current)) {
>                       ret = -ERESTARTSYS;
>                       break;
>               }
> 
> -             if (timeout && time_after_eq(jiffies, timeout_expire)) {
> -                     ret = -ETIME;
> -                     break;
> -             }
> -
> -             i915_queue_hangcheck(dev_priv);
> -
> -             trace_i915_gem_request_wait_sleep(req);
> -
> -             timer.function = NULL;
> -             if (timeout || missed_irq(dev_priv, ring)) {
> -                     unsigned long expire;
> -
> -                     setup_timer_on_stack(&timer, fake_irq, (unsigned
> long)current);
> -                     expire = missed_irq(dev_priv, ring) ? jiffies + 1 : 
> timeout_expire;
> -                     mod_timer(&timer, expire);
> -             }
> -
> -             io_schedule();
> -
> -             if (timer.function) {
> -                     del_singleshot_timer_sync(&timer);
> -                     destroy_timer_on_stack(&timer);
> -             }
> +             if (timeout) {
> +                     timeout_remain = io_schedule_timeout(timeout_remain);
> +                     if (timeout_remain == 0) {
> +                             ret = -ETIME;
> +                             break;
> +                     }
> +             } else
> +                     io_schedule();
>       }
> -     if (!irq_test_in_progress)
> -             ring->irq_put(ring);
> -
> -     finish_wait(&ring->irq_queue, &wait);
> +     finish_wait(&req->wait, &wait);
> +     intel_engine_remove_wakeup(req);
> 
>  out:
>       now = ktime_get_raw_ns();
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h
> b/drivers/gpu/drm/i915/i915_gem_request.h
> index a5e27b7de93a..6fc295d5ba0f 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -27,6 +27,7 @@
> 
>  #include <linux/list.h>
>  #include <linux/kref.h>
> +#include <linux/rbtree.h>
> 
>  struct drm_i915_file_private;
>  struct drm_i915_gem_object;
> @@ -60,6 +61,11 @@ struct drm_i915_gem_request {
>       /** GEM sequence number associated with this request. */
>       uint32_t seqno;
> 
> +     /** List of clients waiting for completion of this request */
> +     wait_queue_head_t wait;
> +     struct rb_node irq_node;
> +     unsigned irq_count;
> +
>       /** Position in the ringbuffer of the request */
>       u32 head, tail, wa_tail;
> 
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c 
> b/drivers/gpu/drm/i915/intel_lrc.c
> index 70ca20ecbff4..4436616c00b8 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -2024,6 +2024,7 @@ static int logical_ring_init(struct drm_device *dev,
> struct intel_engine_cs *rin
>       ring->buffer = NULL;
> 
>       ring->dev = dev;
> +     ring->i915 = to_i915(dev);
>       INIT_LIST_HEAD(&ring->request_list);
>       i915_gem_batch_pool_init(ring, &ring->batch_pool);
>       init_waitqueue_head(&ring->irq_queue);
> @@ -2032,6 +2033,8 @@ static int logical_ring_init(struct drm_device *dev,
> struct intel_engine_cs *rin
>       INIT_LIST_HEAD(&ring->execlist_completed);
>       spin_lock_init(&ring->execlist_lock);
> 
> +     intel_engine_init_wakeup(ring);
> +
>       ret = i915_cmd_parser_init_ring(ring);
>       if (ret)
>               goto error;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c
> b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index f3fea688d2e5..6cb9a0aee833 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -33,6 +33,162 @@
>  #include "i915_trace.h"
>  #include "intel_drv.h"
> 
> +static bool missed_irq(struct intel_engine_cs *engine) {
> +     return test_bit(engine->id,
> +&engine->i915->gpu_error.missed_irq_rings);
> +}
> +
> +static bool __irq_enable(struct intel_engine_cs *engine) {
> +     if (test_bit(engine->id, &engine->i915->gpu_error.test_irq_rings))
> +             return false;
> +
> +     if (!intel_irqs_enabled(engine->i915))
> +             return false;
> +
> +     return engine->irq_get(engine);
> +}
> +
> +static struct drm_i915_gem_request *irq_first(struct intel_engine_cs
> +*engine) {
> +     if (engine->irq_first == NULL) {
> +             struct rb_node *rb;
> +
> +             if (RB_EMPTY_ROOT(&engine->irq_requests))
> +                     return NULL;
> +
> +             rb = rb_first(&engine->irq_requests);
> +             engine->irq_first = rb_entry(rb, struct drm_i915_gem_request,
> irq_node);
> +     }
> +
> +     return engine->irq_first;
> +}
> +
> +static void intel_engine_irq_wakeup(struct work_struct *work) {
> +     struct intel_engine_cs *engine =
> +             container_of(work, struct intel_engine_cs, irq_work);
> +     const bool fake_irq = !__irq_enable(engine);
> +     DEFINE_WAIT(wait);
> +
> +     for (;;) {
> +             struct timer_list timer;
> +             struct drm_i915_gem_request *request;
> +
> +             prepare_to_wait(&engine->irq_queue, &wait,
> TASK_INTERRUPTIBLE);
> +
> +             spin_lock(&engine->irq_lock);
> +             request = irq_first(engine);
> +             while (request) {
> +                     struct rb_node *rb;
> +
> +                     if (request->reset_counter ==
> i915_reset_counter(&engine->i915->gpu_error) &&
> +                         !i915_gem_request_completed(request, false))
> +                             break;
> +
> +                     rb = rb_next(&request->irq_node);
> +                     rb_erase(&request->irq_node, &engine->irq_requests);
> +                     RB_CLEAR_NODE(&request->irq_node);
> +
> +                     wake_up_all(&request->wait);
> +
> +                     request =
> +                             rb ?
> +                             rb_entry(rb, typeof(*request), irq_node) :
> +                             NULL;
> +             }
> +             engine->irq_first = request;
> +             spin_unlock(&engine->irq_lock);
> +             if (request == NULL)
> +                     break;
> +
> +             i915_queue_hangcheck(engine->i915);
> +
> +             timer.function = NULL;
> +             if (fake_irq || missed_irq(engine)) {
> +                     setup_timer_on_stack(&timer,
> +                                          (void (*)(unsigned long))fake_irq,
> +                                          (unsigned long)current);
> +                     mod_timer(&timer, jiffies + 1);
> +             }
> +
> +             /* Unlike the individual clients, we do not want this
> +              * background thread to contribute to the system load,
> +              * i.e. we do not want to use io_schedule() here.
> +              */
> +             schedule();
> +
> +             if (timer.function) {
> +                     del_singleshot_timer_sync(&timer);
> +                     destroy_timer_on_stack(&timer);
> +             }
> +     }
> +     finish_wait(&engine->irq_queue, &wait);
> +     if (!fake_irq)
> +             engine->irq_put(engine);
> +}
> +
> +void intel_engine_init_wakeup(struct intel_engine_cs *engine) {
> +     init_waitqueue_head(&engine->irq_queue);
> +     spin_lock_init(&engine->irq_lock);
> +     INIT_WORK(&engine->irq_work, intel_engine_irq_wakeup); }
> +
> +void intel_engine_add_wakeup(struct drm_i915_gem_request *request) {
> +     struct intel_engine_cs *engine = i915_gem_request_get_ring(request);
> +
> +     spin_lock(&engine->irq_lock);
> +     if (request->irq_count++ == 0) {
> +             struct rb_node **p, *parent;
> +             bool first;
> +
> +             if (RB_EMPTY_ROOT(&engine->irq_requests))
> +                     schedule_work(&engine->irq_work);
> +
> +             init_waitqueue_head(&request->wait);
> +
> +             first = true;
> +             parent = NULL;
> +             p = &engine->irq_requests.rb_node;
> +             while (*p) {
> +                     struct drm_i915_gem_request *__req;
> +
> +                     parent = *p;
> +                     __req = rb_entry(parent, typeof(*__req), irq_node);
> +
> +                     if (i915_seqno_passed(request->seqno, __req->seqno)) {
> +                             p = &parent->rb_right;
> +                             first = false;
> +                     } else
> +                             p = &parent->rb_left;
> +             }
> +             if (first)
> +                     engine->irq_first = request;
> +
> +             rb_link_node(&request->irq_node, parent, p);
> +             rb_insert_color(&request->irq_node, &engine->irq_requests);
> +     }
> +     spin_unlock(&engine->irq_lock);
> +}
> +
> +void intel_engine_remove_wakeup(struct drm_i915_gem_request *request) {
> +     struct intel_engine_cs *engine = i915_gem_request_get_ring(request);
> +
> +     if (RB_EMPTY_NODE(&request->irq_node))
> +             return;
> +
> +     spin_lock(&engine->irq_lock);
> +     if (--request->irq_count == 0 && !RB_EMPTY_NODE(&request->irq_node))
> {
> +             if (engine->irq_first == request)
> +                     engine->irq_first = NULL;
> +             rb_erase(&request->irq_node, &engine->irq_requests);
> +     }
> +     spin_unlock(&engine->irq_lock);
> +}
> +
>  int __intel_ring_space(int head, int tail, int size)  {
>       int space = head - tail;
> @@ -2087,6 +2243,7 @@ static int intel_init_ring_buffer(struct drm_device
> *dev,
>       ring->buffer = ringbuf;
> 
>       ring->dev = dev;
> +     ring->i915 = to_i915(dev);
>       INIT_LIST_HEAD(&ring->request_list);
>       INIT_LIST_HEAD(&ring->execlist_queue);
>       i915_gem_batch_pool_init(ring, &ring->batch_pool); @@ -2095,7
> +2252,7 @@ static int intel_init_ring_buffer(struct drm_device *dev,
>       ringbuf->ring = ring;
>       memset(ring->semaphore.sync_seqno, 0,
> sizeof(ring->semaphore.sync_seqno));
> 
> -     init_waitqueue_head(&ring->irq_queue);
> +     intel_engine_init_wakeup(ring);
> 
>       if (I915_NEED_GFX_HWS(dev)) {
>               ret = init_status_page(ring);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h
> b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 66b7f32fd293..9a98268a55f5 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -160,6 +160,7 @@ struct  intel_engine_cs {  #define LAST_USER_RING
> (VECS + 1)
>       u32             mmio_base;
>       struct          drm_device *dev;
> +     struct drm_i915_private *i915;
>       struct intel_ringbuffer *buffer;
> 
>       /*
> @@ -295,7 +296,11 @@ struct  intel_engine_cs {
> 
>       bool gpu_caches_dirty;
> 
> +     spinlock_t irq_lock;
> +     struct rb_root irq_requests;
> +     struct drm_i915_gem_request *irq_first;
>       wait_queue_head_t irq_queue;
> +     struct work_struct irq_work;
> 
>       struct intel_context *default_context;
>       struct intel_context *last_context;
> @@ -499,4 +504,8 @@ void intel_ring_reserved_space_end(struct
> intel_ringbuffer *ringbuf);
>  /* Legacy ringbuffer specific portion of reservation code: */  int
> intel_ring_reserve_space(struct drm_i915_gem_request *request);
> 
> +void intel_engine_init_wakeup(struct intel_engine_cs *engine); void
> +intel_engine_add_wakeup(struct drm_i915_gem_request *request); void
> +intel_engine_remove_wakeup(struct drm_i915_gem_request *request);
> +
>  #endif /* _INTEL_RINGBUFFER_H_ */
> --
> 2.6.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to