We use a timeline-attached struct dma_fence_work to coalesce dma-fences on eviction. In this mode there is no work callback attached, so nothing needs to run in process context before the fence can be signaled. Similar to how the dma-fence-chain and dma-fence-array containers signal their fences, use irq work for signaling to reduce latency.
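For reference, a minimal sketch of the irq_work signaling pattern relied on here (illustrative only, not part of the patch; the demo_fence names are hypothetical):

#include <linux/dma-fence.h>
#include <linux/irq_work.h>

/* Hypothetical example fence; assumes init_irq_work(&f->irq_work,
 * demo_fence_irq_work) was called when the fence was initialized.
 */
struct demo_fence {
	struct dma_fence base;
	struct irq_work irq_work;
};

/* irq_work callbacks run in interrupt context soon after
 * irq_work_queue(), avoiding the scheduling latency of a workqueue.
 */
static void demo_fence_irq_work(struct irq_work *work)
{
	struct demo_fence *f = container_of(work, typeof(*f), irq_work);

	dma_fence_signal(&f->base);
	dma_fence_put(&f->base);	/* drops the reference taken below */
}

/* Completion path; may itself be called from interrupt context. */
static void demo_fence_complete(struct demo_fence *f)
{
	dma_fence_get(&f->base);
	irq_work_queue(&f->irq_work);
}

Because the signaling callback runs in interrupt context, any lock it takes must be interrupt-safe, which is why the patch below also converts the timeline lock to the _irq/_irqsave spinlock variants.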
Signed-off-by: Thomas Hellström <thomas.hellst...@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_sw_fence_work.c | 36 ++++++++++++++++++-----
 drivers/gpu/drm/i915/i915_sw_fence_work.h |  2 ++
 2 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_sw_fence_work.c b/drivers/gpu/drm/i915/i915_sw_fence_work.c
index 87cdb3158042..4573f537ada4 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence_work.c
+++ b/drivers/gpu/drm/i915/i915_sw_fence_work.c
@@ -32,16 +32,17 @@ void dma_fence_work_timeline_attach(struct dma_fence_work_timeline *tl,
 {
 	struct dma_fence *await;
 
+	might_sleep();
 	if (tl->ops->get)
 		tl->ops->get(tl);
 
-	spin_lock(&tl->lock);
+	spin_lock_irq(&tl->lock);
 	await = tl->last_fence;
 	tl->last_fence = dma_fence_get(&f->dma);
 	f->dma.seqno = tl->seqno++;
 	f->dma.context = tl->context;
 	f->tl = tl;
-	spin_unlock(&tl->lock);
+	spin_unlock_irq(&tl->lock);
 
 	if (await) {
 		__i915_sw_fence_await_dma_fence(&f->chain, await, tl_cb);
@@ -53,13 +54,14 @@ static void dma_fence_work_timeline_detach(struct dma_fence_work *f)
 {
 	struct dma_fence_work_timeline *tl = f->tl;
 	bool put = false;
+	unsigned long irq_flags;
 
-	spin_lock(&tl->lock);
+	spin_lock_irqsave(&tl->lock, irq_flags);
 	if (tl->last_fence == &f->dma) {
 		put = true;
 		tl->last_fence = NULL;
 	}
-	spin_unlock(&tl->lock);
+	spin_unlock_irqrestore(&tl->lock, irq_flags);
 	if (tl->ops->put)
 		tl->ops->put(tl);
 	if (put)
@@ -68,8 +70,6 @@ static void dma_fence_work_timeline_detach(struct dma_fence_work *f)
 
 static void dma_fence_work_complete(struct dma_fence_work *f)
 {
-	dma_fence_signal(&f->dma);
-
 	if (f->ops->release)
 		f->ops->release(f);
 
@@ -79,13 +79,32 @@ static void dma_fence_work_complete(struct dma_fence_work *f)
 	dma_fence_put(&f->dma);
 }
 
+static void dma_fence_work_irq_work(struct irq_work *irq_work)
+{
+	struct dma_fence_work *f = container_of(irq_work, typeof(*f), irq_work);
+
+	dma_fence_signal(&f->dma);
+	if (f->ops->release)
+		/* Note we take the signaled path in dma_fence_work_work() */
+		queue_work(system_unbound_wq, &f->work);
+	else
+		dma_fence_work_complete(f);
+}
+
 static void dma_fence_work_work(struct work_struct *work)
 {
 	struct dma_fence_work *f = container_of(work, typeof(*f), work);
 
+	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &f->dma.flags)) {
+		dma_fence_work_complete(f);
+		return;
+	}
+
 	if (f->ops->work)
 		f->ops->work(f);
 
+	dma_fence_signal(&f->dma);
+
 	dma_fence_work_complete(f);
 }
 
@@ -102,8 +121,10 @@ fence_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 		dma_fence_get(&f->dma);
 		if (test_bit(DMA_FENCE_WORK_IMM, &f->dma.flags))
 			dma_fence_work_work(&f->work);
-		else
+		else if (f->ops->work)
 			queue_work(system_unbound_wq, &f->work);
+		else
+			irq_work_queue(&f->irq_work);
 		break;
 
 	case FENCE_FREE:
@@ -155,6 +176,7 @@ void dma_fence_work_init(struct dma_fence_work *f,
 	dma_fence_init(&f->dma, &fence_ops, &f->lock, 0, 0);
 	i915_sw_fence_init(&f->chain, fence_notify);
 	INIT_WORK(&f->work, dma_fence_work_work);
+	init_irq_work(&f->irq_work, dma_fence_work_irq_work);
 }
 
 int dma_fence_work_chain(struct dma_fence_work *f, struct dma_fence *signal)
diff --git a/drivers/gpu/drm/i915/i915_sw_fence_work.h b/drivers/gpu/drm/i915/i915_sw_fence_work.h
index 6f41ee360133..c412bb4cb288 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence_work.h
+++ b/drivers/gpu/drm/i915/i915_sw_fence_work.h
@@ -8,6 +8,7 @@
 #define I915_SW_FENCE_WORK_H
 
 #include <linux/dma-fence.h>
+#include <linux/irq_work.h>
 #include <linux/spinlock.h>
 #include <linux/workqueue.h>
 
@@ -77,6 +78,7 @@ struct dma_fence_work {
 	struct i915_sw_dma_fence_cb cb;
 
 	struct work_struct work;
+	struct irq_work irq_work;
 	struct dma_fence_work_timeline *tl;
 
-- 
2.31.1