Re: [Intel-gfx] [PATCH v7 5/9] drm/i915: vgpu context submission pv optimization

2019-07-22 Thread Zhang, Xiaolin
On 07/08/2019 06:41 PM, Chris Wilson wrote:
> Quoting Xiaolin Zhang (2019-07-08 02:35:18)
>> +static void pv_submit(struct intel_engine_cs *engine,
>> +  struct i915_request **out,
>> +  struct i915_request **end)
>> +{
>> +   struct intel_engine_execlists * const execlists = &engine->execlists;
>> +   struct i915_virtual_gpu_pv *pv = engine->i915->vgpu.pv;
>> +   struct pv_submission *pv_elsp = pv->pv_elsp[engine->hw_id];
>> +   struct i915_request *rq;
>> +
>> +   u64 descs[2];
>> +   int n, err;
>> +
>> +   memset(descs, 0, sizeof(descs));
>> +   n = 0;
>> +   do {
>> +   rq = *out++;
>> +   descs[n++] = execlists_update_context(rq);
>> +   } while (out != end);
>> +
>> +   for (n = 0; n < execlists_num_ports(execlists); n++)
>> +   pv_elsp->descs[n] = descs[n];
> You can polish this a bit, minor nit.
Sure.
>
>> +   writel(PV_ACTION_ELSP_SUBMISSION, execlists->submit_reg);
>> +
>> +#define done (READ_ONCE(pv_elsp->submitted) == true)
>> +   err = wait_for_us(done, 1);
>> +   if (err)
>> +   err = wait_for(done, 1);
> Strictly, you need to use wait_for_atomic_us() [under a spinlock here],
> and there's no need for a second pass since you are not allowed to sleep
> anyway. So just set the timeout to 1000us.
Sure.
>> +#undef done
>> +
>> +   if (unlikely(err))
>> +   DRM_ERROR("PV (%s) workload submission failed\n", 
>> engine->name);
>> +
>> +   pv_elsp->submitted = false;
> However, that looks solid wrt serialisation of this engine with its
> pv host, without cross-interference (at least in the comms channel).
>
> If you want to get fancy, you should be able to simply not dequeue until
> !pv_elsp->submitted so the wait-for-ack occurs naturally. So long as the
> pv host plays nicely, we should always see submitted acked before the
> request is signaled. Give or take problems with preemption and the pv
> host being a black box that may allow requests to complete and so our
> submission be a no-op (and so not generate an interrupt to allow further
> submission). Indeed, I would strongly recommend you use the delayed ack
> plus jiffie timer to avoid the no-op submission problem.
I will implement this suggestion. thanks your feedback, Chris.
-BRs, Xiaolin
> If you want to prove this in a bunch of mocked-up selftests that provide
> the pv channel on top of the local driver
> -Chris
> ___
> intel-gvt-dev mailing list
> intel-gvt-...@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gvt-dev


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Re: [Intel-gfx] [PATCH v7 5/9] drm/i915: vgpu context submission pv optimization

2019-07-08 Thread Chris Wilson
Quoting Xiaolin Zhang (2019-07-08 02:35:18)
> +static void pv_submit(struct intel_engine_cs *engine,
> +  struct i915_request **out,
> +  struct i915_request **end)
> +{
> +   struct intel_engine_execlists * const execlists = &engine->execlists;
> +   struct i915_virtual_gpu_pv *pv = engine->i915->vgpu.pv;
> +   struct pv_submission *pv_elsp = pv->pv_elsp[engine->hw_id];
> +   struct i915_request *rq;
> +
> +   u64 descs[2];
> +   int n, err;
> +
> +   memset(descs, 0, sizeof(descs));
> +   n = 0;
> +   do {
> +   rq = *out++;
> +   descs[n++] = execlists_update_context(rq);
> +   } while (out != end);
> +
> +   for (n = 0; n < execlists_num_ports(execlists); n++)
> +   pv_elsp->descs[n] = descs[n];

You can polish this a bit, minor nit.

> +   writel(PV_ACTION_ELSP_SUBMISSION, execlists->submit_reg);
> +
> +#define done (READ_ONCE(pv_elsp->submitted) == true)
> +   err = wait_for_us(done, 1);
> +   if (err)
> +   err = wait_for(done, 1);

Strictly, you need to use wait_for_atomic_us() [under a spinlock here],
and there's no need for a second pass since you are not allowed to sleep
anyway. So just set the timeout to 1000us.

> +#undef done
> +
> +   if (unlikely(err))
> +   DRM_ERROR("PV (%s) workload submission failed\n", 
> engine->name);
> +
> +   pv_elsp->submitted = false;

However, that looks solid wrt serialisation of this engine with its
pv host, without cross-interference (at least in the comms channel).

If you want to get fancy, you should be able to simply not dequeue until
!pv_elsp->submitted so the wait-for-ack occurs naturally. So long as the
pv host plays nicely, we should always see submitted acked before the
request is signaled. Give or take problems with preemption and the pv
host being a black box that may allow requests to complete and so our
submission be a no-op (and so not generate an interrupt to allow further
submission). Indeed, I would strongly recommend you use the delayed ack
plus jiffie timer to avoid the no-op submission problem.

If you want to prove this in a bunch of mocked-up selftests that provide
the pv channel on top of the local driver
-Chris
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

[Intel-gfx] [PATCH v7 5/9] drm/i915: vgpu context submission pv optimization

2019-07-07 Thread Xiaolin Zhang
It is a performance optimization to override the actual submission backend
in order to eliminate execlists CSB processing and reduce MMIO trap numbers
for workload submission without context switch interrupts, by talking with
GVT via a PV submission notification mechanism between guest and GVT.

Use PV_SUBMISSION to control this level of pv optimization.

v0: RFC.
v1: rebase.
v2: added pv ops for pv context submission. To maximize code reuse,
introduced 2 more ops (submit_ports & preempt_context) instead of 1 op
(set_default_submission) in engine structure. pv version of
submit_ports and preempt_context implemented.
v3:
1. to reduce more code duplication, refactored the code and replaced the 2 ops
"submit_ports & preempt_context" from v2 with 1 op "write_desc"
in engine structure. pv version of write_desc implemented.
2. added VGT_G2V_ELSP_SUBMIT for g2v pv notification.
v4: implemented pv elsp submission tasklet as the backend workload
submission by talking to GVT with the PV notification mechanism and renamed
VGT_G2V_ELSP_SUBMIT to VGT_G2V_PV_SUBMISIION.
v5: addressed v4 comments from Chris, intel_pv_submission.c added.
v6: addressed v5 comments from Chris, replaced engine id by hw_id.
v7: rebase.

Signed-off-by: Xiaolin Zhang 
---
 drivers/gpu/drm/i915/Makefile  |   2 +-
 drivers/gpu/drm/i915/gt/intel_lrc.c|   8 +-
 drivers/gpu/drm/i915/i915_vgpu.c   |  15 ++-
 drivers/gpu/drm/i915/i915_vgpu.h   |  15 +++
 drivers/gpu/drm/i915/intel_pv_submission.c | 189 +
 5 files changed, 225 insertions(+), 4 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/intel_pv_submission.c

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 5266dbe..6e13f7c 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -244,7 +244,7 @@ i915-$(CONFIG_DRM_I915_SELFTEST) += \
selftests/igt_spinner.o
 
 # virtual gpu code
-i915-y += i915_vgpu.o
+i915-y += i915_vgpu.o intel_pv_submission.o
 
 ifeq ($(CONFIG_DRM_I915_GVT),y)
 i915-y += intel_gvt.o
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c 
b/drivers/gpu/drm/i915/gt/intel_lrc.c
index e1ae139..48a9b28 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -2702,10 +2702,14 @@ void intel_execlists_set_default_submission(struct 
intel_engine_cs *engine)
engine->unpark = NULL;
 
engine->flags |= I915_ENGINE_SUPPORTS_STATS;
-   if (!intel_vgpu_active(engine->i915))
-   engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
+   engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
engine->flags |= I915_ENGINE_HAS_PREEMPTION;
+
+   if (intel_vgpu_active(engine->i915)) {
+   engine->flags &= ~I915_ENGINE_HAS_SEMAPHORES;
+   intel_vgpu_config_pv_caps(engine->i915, PV_SUBMISSION, engine);
+   }
 }
 
 static void execlists_destroy(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/i915_vgpu.c b/drivers/gpu/drm/i915/i915_vgpu.c
index 2aad0b8..c628be8 100644
--- a/drivers/gpu/drm/i915/i915_vgpu.c
+++ b/drivers/gpu/drm/i915/i915_vgpu.c
@@ -97,7 +97,7 @@ void i915_detect_vgpu(struct drm_i915_private *dev_priv)
dev_priv->vgpu.active = true;
 
/* guest driver PV capability */
-   dev_priv->vgpu.pv_caps = PV_PPGTT_UPDATE;
+   dev_priv->vgpu.pv_caps = PV_PPGTT_UPDATE | PV_SUBMISSION;
 
if (!intel_vgpu_check_pv_caps(dev_priv, shared_area)) {
DRM_INFO("Virtual GPU for Intel GVT-g detected.\n");
@@ -380,6 +380,7 @@ void intel_vgpu_config_pv_caps(struct drm_i915_private 
*dev_priv,
enum pv_caps cap, void *data)
 {
struct i915_ppgtt *ppgtt;
+   struct intel_engine_cs *engine;
 
if (!intel_vgpu_enabled_pv_caps(dev_priv, cap))
return;
@@ -390,6 +391,11 @@ void intel_vgpu_config_pv_caps(struct drm_i915_private 
*dev_priv,
ppgtt->vm.insert_entries = gen8_ppgtt_insert_4lvl_pv;
ppgtt->vm.clear_range = gen8_ppgtt_clear_4lvl_pv;
}
+
+   if (cap == PV_SUBMISSION) {
+   engine = (struct intel_engine_cs *)data;
+   vgpu_set_pv_submission(engine);
+   }
 }
 
 /**
@@ -584,6 +590,8 @@ static int intel_vgpu_setup_shared_page(struct 
drm_i915_private *dev_priv,
struct gvt_shared_page *base;
u64 gpa;
u16 ver_maj, ver_min;
+   int i;
+   u32 size;
 
/* We allocate 1 page shared between guest and GVT for data exchange.
 *   ___.
@@ -657,6 +665,11 @@ static int intel_vgpu_setup_shared_page(struct 
drm_i915_private *dev_priv,
pv->notify = intel_vgpu_pv_notify_mmio;
mutex_init(&pv->send_mutex);
 
+   /* setup PV per engine data exchange ptr */
+   size = sizeof(struct pv_submission);
+   for (i = 0; i < PV_MAX_ENGINES_NUM; i++)
+   pv->pv_elsp[i] = (void *)base + PV_E