RE: [RFC PATCH 3/3] drm/i915: Enabling WD Transcoder

2022-04-27 Thread Kandpal, Suraj
++Laurent, Dmitry, Abhinav and Rob
> Adding support for the writeback transcoder to start capturing frames using
> an interrupt mechanism.
> 
> Signed-off-by: Suraj Kandpal 
> ---
>  drivers/gpu/drm/i915/Makefile |   1 +
>  drivers/gpu/drm/i915/display/intel_acpi.c |   1 +
>  drivers/gpu/drm/i915/display/intel_display.c  |  89 +-
>  drivers/gpu/drm/i915/display/intel_display.h  |   9 +
>  .../drm/i915/display/intel_display_types.h|  13 +
>  drivers/gpu/drm/i915/display/intel_dpll.c |   3 +
>  drivers/gpu/drm/i915/display/intel_opregion.c |   3 +
>  drivers/gpu/drm/i915/display/intel_wd.c   | 978 ++
>  drivers/gpu/drm/i915/display/intel_wd.h   |  82 ++
>  drivers/gpu/drm/i915/i915_drv.h   |   2 +
>  drivers/gpu/drm/i915/i915_irq.c   |   8 +-
>  drivers/gpu/drm/i915/i915_pci.c   |   7 +-
>  drivers/gpu/drm/i915/i915_reg.h   | 137 +++
>  13 files changed, 1330 insertions(+), 3 deletions(-)  create mode 100644
> drivers/gpu/drm/i915/display/intel_wd.c
>  create mode 100644 drivers/gpu/drm/i915/display/intel_wd.h
> 
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 087bd9d1b397..5ee32513a945 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -287,6 +287,7 @@ i915-y += \
>   display/intel_vdsc.o \
>   display/intel_vrr.o \
>   display/intel_wb_connector.o\
> + display/intel_wd.o\
>   display/vlv_dsi.o \
>   display/vlv_dsi_pll.o
> 
> diff --git a/drivers/gpu/drm/i915/display/intel_acpi.c
> b/drivers/gpu/drm/i915/display/intel_acpi.c
> index e78430001f07..ae08db164f73 100644
> --- a/drivers/gpu/drm/i915/display/intel_acpi.c
> +++ b/drivers/gpu/drm/i915/display/intel_acpi.c
> @@ -247,6 +247,7 @@ static u32 acpi_display_type(struct intel_connector
> *connector)
>   case DRM_MODE_CONNECTOR_LVDS:
>   case DRM_MODE_CONNECTOR_eDP:
>   case DRM_MODE_CONNECTOR_DSI:
> + case DRM_MODE_CONNECTOR_WRITEBACK:
>   display_type = ACPI_DISPLAY_TYPE_INTERNAL_DIGITAL;
>   break;
>   case DRM_MODE_CONNECTOR_Unknown:
> diff --git a/drivers/gpu/drm/i915/display/intel_display.c
> b/drivers/gpu/drm/i915/display/intel_display.c
> index eb49973621f0..6dedc7921f54 100644
> --- a/drivers/gpu/drm/i915/display/intel_display.c
> +++ b/drivers/gpu/drm/i915/display/intel_display.c
> @@ -111,6 +111,7 @@
>  #include "intel_sprite.h"
>  #include "intel_tc.h"
>  #include "intel_vga.h"
> +#include "intel_wd.h"
>  #include "i9xx_plane.h"
>  #include "skl_scaler.h"
>  #include "skl_universal_plane.h"
> @@ -1544,6 +1545,72 @@ static void
> intel_encoders_update_complete(struct intel_atomic_state *state)
>   }
>  }
> 
> +static void intel_queue_writeback_job(struct intel_atomic_state *state,
> + struct intel_crtc *intel_crtc, struct intel_crtc_state
> *crtc_state) {
> + struct drm_connector_state *new_conn_state;
> + struct drm_connector *connector;
> + struct drm_i915_private *i915 = to_i915(intel_crtc->base.dev);
> + struct intel_wd *intel_wd;
> + struct intel_connector *intel_connector;
> + struct intel_digital_connector_state *intel_conn_state;
> + struct intel_encoder *encoder;
> + int i;
> +
> + for_each_intel_encoder_with_wd(&i915->drm, encoder) {
> + intel_wd = enc_to_intel_wd(encoder);
> +
> + if (intel_wd->wd_crtc != intel_crtc)
> + return;
> +
> + }
> +
> + for_each_new_connector_in_state(&state->base, connector,
> new_conn_state,
> + i) {
> + intel_conn_state =
> to_intel_digital_connector_state(new_conn_state);
> + if (!intel_conn_state->job)
> + continue;
> + intel_connector = to_intel_connector(connector);
> + intel_writeback_queue_job(&intel_connector->wb_conn,
> new_conn_state);
> + drm_dbg_kms(&i915->drm, "queueing writeback job\n");
> + }
> +}
> +
> +static void intel_find_writeback_connector(struct intel_atomic_state
> *state,
> + struct intel_crtc *intel_crtc, struct intel_crtc_state
> *crtc_state) {
> + struct drm_connector_state *new_conn_state;
> + struct drm_connector *connector;
> + struct drm_i915_private *i915 = to_i915(intel_crtc->base.dev);
> + struct intel_wd *intel_wd;
> + struct intel_encoder *encoder;
> + int i;
> +
> + for_each_intel_encoder_with_wd(&i915->drm, encoder) {
> + intel_wd = enc_to_intel_wd(encoder);
> +
> + if (intel_wd->wd_crtc != intel_crtc)
> + return;
> +
> + }
> +
> + for_each_new_connector_in_state(&state->base, connector,
> new_conn_state,
> + i) {
> + struct intel_connector *intel_connector;
> +
> + intel_connector = to_intel_connector(connector);
> + drm_dbg_kms(&i915->drm, "[CONNECTOR:%d:%s]: status:
> %s\n",
> +

RE: [RFC PATCH 2/3] drm/i915: Define WD transcoder for i915

2022-04-27 Thread Kandpal, Suraj
++Laurent, Dmitry, Abhinav and Rob

> -Original Message-
> From: Kandpal, Suraj 
> Sent: Thursday, April 21, 2022 10:38 AM
> To: intel-...@lists.freedesktop.org; dri-devel@lists.freedesktop.org
> Cc: Nikula, Jani ; ville.syrj...@linux.intel.com;
> Murthy, Arun R ; Kandpal, Suraj
> 
> Subject: [RFC PATCH 2/3] drm/i915: Define WD transcoder for i915
> 
> Adding WD types and the WD transcoder to the enum list, and the WD transcoder offsets.
> 
> Signed-off-by: Suraj Kandpal 
> ---
>  drivers/gpu/drm/i915/display/intel_display.h   | 6 ++
>  drivers/gpu/drm/i915/display/intel_display_types.h | 1 +
>  drivers/gpu/drm/i915/i915_reg.h| 2 ++
>  3 files changed, 9 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/display/intel_display.h
> b/drivers/gpu/drm/i915/display/intel_display.h
> index 8513703086b7..8c93a5de8e07 100644
> --- a/drivers/gpu/drm/i915/display/intel_display.h
> +++ b/drivers/gpu/drm/i915/display/intel_display.h
> @@ -119,6 +119,8 @@ enum transcoder {
>   TRANSCODER_DSI_1,
>   TRANSCODER_DSI_A = TRANSCODER_DSI_0,/* legacy DSI */
>   TRANSCODER_DSI_C = TRANSCODER_DSI_1,/* legacy DSI */
> + TRANSCODER_WD_0,
> + TRANSCODER_WD_1,
> 
>   I915_MAX_TRANSCODERS
>  };
> @@ -140,6 +142,10 @@ static inline const char *transcoder_name(enum
> transcoder transcoder)
>   return "DSI A";
>   case TRANSCODER_DSI_C:
>   return "DSI C";
> + case TRANSCODER_WD_0:
> + return "WD 0";
> + case TRANSCODER_WD_1:
> + return "WD 1";
>   default:
>   return "";
>   }
> diff --git a/drivers/gpu/drm/i915/display/intel_display_types.h
> b/drivers/gpu/drm/i915/display/intel_display_types.h
> index 7a96ecba73c0..dcb4ad43cf88 100644
> --- a/drivers/gpu/drm/i915/display/intel_display_types.h
> +++ b/drivers/gpu/drm/i915/display/intel_display_types.h
> @@ -79,6 +79,7 @@ enum intel_output_type {
>   INTEL_OUTPUT_DSI = 9,
>   INTEL_OUTPUT_DDI = 10,
>   INTEL_OUTPUT_DP_MST = 11,
> + INTEL_OUTPUT_WD = 12,
>  };
> 
>  enum hdmi_force_audio {
> diff --git a/drivers/gpu/drm/i915/i915_reg.h
> b/drivers/gpu/drm/i915/i915_reg.h index ddbc7a685a50..6396afd77209
> 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -2023,6 +2023,8 @@
>  #define TRANSCODER_EDP_OFFSET 0x6f000
>  #define TRANSCODER_DSI0_OFFSET   0x6b000
>  #define TRANSCODER_DSI1_OFFSET   0x6b800
> +#define TRANSCODER_WD0_OFFSET0x6e000
> +#define TRANSCODER_WD1_OFFSET0x6e800
> 
>  #define HTOTAL(trans)_MMIO_TRANS2(trans, _HTOTAL_A)
>  #define HBLANK(trans)_MMIO_TRANS2(trans, _HBLANK_A)
> --
> 2.35.1



Re: [Intel-gfx] [PATCH 0/2] Initial GuC firmware release for DG2

2022-04-27 Thread Lucas De Marchi

On Wed, Apr 27, 2022 at 03:14:16PM -0700, John Harrison wrote:

On 4/27/2022 11:24, Timo Aaltonen wrote:

john.c.harri...@intel.com wrote on 27.4.2022 at 19.55:

From: John Harrison 

Add GuC firmware for DG2.

Note that an older version of this patch exists in the CI topic
branch. Hence this set includes a revert of that patch before applying
the new version. When merging, the revert would simply be dropped and
the corresponding patch in the topic branch would also be dropped.

Signed-off-by: John Harrison 


John Harrison (2):
   Revert "drm/i915/dg2: Define GuC firmware version for DG2"
   drm/i915/dg2: Define GuC firmware version for DG2

  drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)



The firmware is not public yet, though? Shouldn't it have been sent 
upstream already? Same complaint applies to DMC.



Not sure about the DMC team, but for i915 we upload the firmware to an 
FDO repo so that the CI system can find it and run the pre-merge 
testing with it. However, we don't send the final pull request for the 
real linux firmware repo until we have merged the i915 patch to 
drm-gt-intel-next and it is definitely going upstream. Otherwise, we 
might end up pushing firmwares to the linux repo that never get used.


we don't want to risk sending a pull request to drm if the firmware is
not in linux-firmware repo yet though, so we need to be careful with
this workflow.

We still have some weeks, which should be sufficient time if it's sent
to linux-firmware asap.

2nd patch pushed to drm-intel-gt-next.
1st patch I removed from topic/core-for-CI.

Thanks
Lucas De Marchi



John.



RE: [RFC PATCH 0/3] i915 writeback private framework

2022-04-27 Thread Kandpal, Suraj
++Laurent, Dmitry, and Abhinav

Hi,
Can you have a look at the private implementation i915 is currently going with
until we can figure out how to work with the drm core?

Regards,
Suraj Kandpal
> A patch series was floated on the drm mailing list which aimed to change the
> drm_connector and drm_encoder fields to pointers in the
> drm_writeback_connector structure. This received a huge pushback from the
> community, but i915 expects each connector present in the drm_device list to
> be an intel_connector, which the drm_writeback framework does not provide.
> [1] https://patchwork.kernel.org/project/dri-devel/patch/20220202081702.22119-1-suraj.kand...@intel.com/
> [2] https://patchwork.kernel.org/project/dri-devel/patch/20220202085429.22261-6-suraj.kand...@intel.com/
> Because we are forced to use a drm_connector which is not embedded in an
> intel_connector, the current drm_writeback framework becomes very unfeasible
> for us, as it would mean a lot of checks in a lot of places to take the above
> issue into account. Since no one had an issue with the encoder field being
> changed into a pointer, it was decided to split the connector and encoder
> pointer changes into two different series. The encoder field changes are
> currently being worked on by Abhinav Kumar:
> [3] https://patchwork.kernel.org/project/dri-devel/list/?series=633565
> In the meantime, for i915 to start using the writeback functionality, we came
> up with an interim solution: our own writeback pipeline, bypassing the one
> provided by drm, which is what these patches do.
> Note: these are temporary patches until we figure out how we can either change
> drm core writeback to work with our intel_connector structure or find a
> different solution which allows us to work with the current drm_writeback
> framework.
> 
> Suraj Kandpal (3):
>   drm/i915: Creating writeback pipeline to bypass drm_writeback
> framework
>   drm/i915: Define WD transcoder for i915
>   drm/i915: Enabling WD Transcoder
> 
>  drivers/gpu/drm/i915/Makefile |   2 +
>  drivers/gpu/drm/i915/display/intel_acpi.c |   1 +
>  drivers/gpu/drm/i915/display/intel_display.c  |  89 +-
> drivers/gpu/drm/i915/display/intel_display.h  |  15 +
>  .../drm/i915/display/intel_display_types.h|  18 +
>  drivers/gpu/drm/i915/display/intel_dpll.c |   3 +
>  drivers/gpu/drm/i915/display/intel_opregion.c |   3 +
>  .../gpu/drm/i915/display/intel_wb_connector.c | 296 ++
> .../gpu/drm/i915/display/intel_wb_connector.h |  99 ++
>  drivers/gpu/drm/i915/display/intel_wd.c   | 978 ++
>  drivers/gpu/drm/i915/display/intel_wd.h   |  82 ++
>  drivers/gpu/drm/i915/i915_drv.h   |   5 +
>  drivers/gpu/drm/i915/i915_irq.c   |   8 +-
>  drivers/gpu/drm/i915/i915_pci.c   |   7 +-
>  drivers/gpu/drm/i915/i915_reg.h   | 139 +++
>  15 files changed, 1742 insertions(+), 3 deletions(-)  create mode 100644
> drivers/gpu/drm/i915/display/intel_wb_connector.c
>  create mode 100644 drivers/gpu/drm/i915/display/intel_wb_connector.h
>  create mode 100644 drivers/gpu/drm/i915/display/intel_wd.c
>  create mode 100644 drivers/gpu/drm/i915/display/intel_wd.h
> 
> --
> 2.35.1
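
To make the structural mismatch described in the cover letter above concrete, here is a
minimal sketch (layouts simplified, not code from these patches): i915 freely downcasts any
connector on the drm_device list to intel_connector, while the core writeback framework
embeds a bare drm_connector of its own.

  /* Simplified sketch of the mismatch; field lists trimmed to the essentials. */
  struct intel_connector {
          struct drm_connector base;      /* i915 wraps every connector like this */
          /* ... i915-specific state ... */
  };

  struct drm_writeback_connector {
          struct drm_connector base;      /* core framework embeds a plain drm_connector */
          struct drm_encoder encoder;
          /* ... writeback job queue, fence context, ... */
  };

  /*
   * i915 code paths walk the drm_device connector list and downcast freely:
   *
   *      struct intel_connector *ic =
   *              container_of(conn, struct intel_connector, base);
   *
   * If conn actually belongs to a drm_writeback_connector, that container_of()
   * yields a bogus pointer, which is why the cover letter proposes a private
   * writeback pipeline (or pointer-based fields in drm_writeback_connector).
   */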



[PATCH v2 3/4] drm/i915/xehp: Add compute engine ABI

2022-04-27 Thread Matt Roper
We're now ready to start exposing compute engines to userspace.

v2:
 - Move kerneldoc for other engine classes to a separate patch.  (Andi)

Cc: Daniele Ceraolo Spurio 
Cc: Tvrtko Ursulin 
Cc: Vinay Belgaumkar 
Cc: Jordan Justen 
Cc: Szymon Morek 
UMD (mesa): https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14395
Signed-off-by: Matt Roper 
---
 drivers/gpu/drm/i915/gt/intel_engine_user.c | 2 +-
 drivers/gpu/drm/i915/i915_drm_client.c  | 1 +
 drivers/gpu/drm/i915/i915_drm_client.h  | 2 +-
 include/uapi/drm/i915_drm.h | 9 +
 4 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c 
b/drivers/gpu/drm/i915/gt/intel_engine_user.c
index 0f6cd96b459f..46a174f8aa00 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_user.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c
@@ -47,7 +47,7 @@ static const u8 uabi_classes[] = {
[COPY_ENGINE_CLASS] = I915_ENGINE_CLASS_COPY,
[VIDEO_DECODE_CLASS] = I915_ENGINE_CLASS_VIDEO,
[VIDEO_ENHANCEMENT_CLASS] = I915_ENGINE_CLASS_VIDEO_ENHANCE,
-   /* TODO: Add COMPUTE_CLASS mapping once ABI is available */
+   [COMPUTE_CLASS] = I915_ENGINE_CLASS_COMPUTE,
 };
 
 static int engine_cmp(void *priv, const struct list_head *A,
diff --git a/drivers/gpu/drm/i915/i915_drm_client.c 
b/drivers/gpu/drm/i915/i915_drm_client.c
index 475a6f824cad..18d38cb59923 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.c
+++ b/drivers/gpu/drm/i915/i915_drm_client.c
@@ -81,6 +81,7 @@ static const char * const uabi_class_names[] = {
[I915_ENGINE_CLASS_COPY] = "copy",
[I915_ENGINE_CLASS_VIDEO] = "video",
[I915_ENGINE_CLASS_VIDEO_ENHANCE] = "video-enhance",
+   [I915_ENGINE_CLASS_COMPUTE] = "compute",
 };
 
 static u64 busy_add(struct i915_gem_context *ctx, unsigned int class)
diff --git a/drivers/gpu/drm/i915/i915_drm_client.h 
b/drivers/gpu/drm/i915/i915_drm_client.h
index 5f5b02b01ba0..f796c5e8e060 100644
--- a/drivers/gpu/drm/i915/i915_drm_client.h
+++ b/drivers/gpu/drm/i915/i915_drm_client.h
@@ -13,7 +13,7 @@
 
 #include "gt/intel_engine_types.h"
 
-#define I915_LAST_UABI_ENGINE_CLASS I915_ENGINE_CLASS_VIDEO_ENHANCE
+#define I915_LAST_UABI_ENGINE_CLASS I915_ENGINE_CLASS_COMPUTE
 
 struct drm_i915_private;
 
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index ec000fc6c879..a2def7b27009 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -203,6 +203,15 @@ enum drm_i915_gem_engine_class {
 */
I915_ENGINE_CLASS_VIDEO_ENHANCE = 3,
 
+   /**
+* @I915_ENGINE_CLASS_COMPUTE:
+*
+* Compute engines support a subset of the instructions available
+* on render engines:  compute engines support Compute (GPGPU) and
+* programmable media workloads, but do not support the 3D pipeline.
+*/
+   I915_ENGINE_CLASS_COMPUTE   = 4,
+
/* Values in this enum should be kept compact. */
 
/**
-- 
2.35.1
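
For readers wondering how userspace consumes the new class, here is a hedged sketch (not
taken from the Mesa MR above): the value plugs into the existing engine-map uapi, for
example when creating a context with an explicit engine list.

  /* Sketch: create a context whose engine map holds only the first CCS engine. */
  #include <stdint.h>
  #include <sys/ioctl.h>
  #include <drm/i915_drm.h>

  static uint32_t create_compute_context(int drm_fd)
  {
          I915_DEFINE_CONTEXT_PARAM_ENGINES(engine_map, 1) = {
                  .engines = {
                          { .engine_class = I915_ENGINE_CLASS_COMPUTE,
                            .engine_instance = 0 },
                  },
          };
          struct drm_i915_gem_context_create_ext_setparam set_engines = {
                  .base = { .name = I915_CONTEXT_CREATE_EXT_SETPARAM },
                  .param = {
                          .param = I915_CONTEXT_PARAM_ENGINES,
                          .size = sizeof(engine_map),
                          .value = (uintptr_t)&engine_map,
                  },
          };
          struct drm_i915_gem_context_create_ext create = {
                  .flags = I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS,
                  .extensions = (uintptr_t)&set_engines,
          };

          if (ioctl(drm_fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE_EXT, &create))
                  return 0;

          /* Execbuf against this context then targets the CCS engine as index 0. */
          return create.ctx_id;
  }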



[PATCH v2 0/4] i915: Turn on compute engine support

2022-04-27 Thread Matt Roper
Now that the necessary GuC-based hardware workarounds have landed, we're
finally ready to actually enable compute engines for use by userspace.
All of the "under-the-hood" heavy lifting already landed a while back in
other series so all that remains now is to add I915_ENGINE_CLASS_COMPUTE
to the uapi enum and add the CCS engines to the engine lists for the
Xe_HP SDV and DG2.

Userspace (Mesa) is linked in the ABI patch.  Existing IGT tests (e.g.,
i915_hangman) provide test coverage for general engine behavior since compute
engines should follow the same general rules as other engines.  We've also
recently added some additional subtests like
igt@gem_reset_stats@shared-reset-domain to cover the user-visible impacts of
the compute engines sharing the same hardware reset domain as the render
engine.

v2:
 - Update TLB invalidation register for compute engines and move it to a
   separate patch since it isn't related to the new uapi.  (Tvrtko,
   Prathap)
 - Move new kerneldoc for pre-existing engine classes to a separate
   patch.  (Andi)
 - Drop the compute UMD merge request link for now because it also
   included some additional multi-tile uapi that we're not ready to
   upstream just yet.  Even if they don't have a disentangled MR ready
   for reference, we still have the Mesa MR as a key userspace consumer.
   (Tvrtko)

Cc: Lucas De Marchi 
Cc: Tvrtko Ursulin 

Daniele Ceraolo Spurio (1):
  drm/i915: Xe_HP SDV and DG2 have up to 4 CCS engines

Matt Roper (3):
  drm/i915/uapi: Add kerneldoc for engine class enum
  drm/i915/xehp: Add register for compute engine's MMIO-based TLB
invalidation
  drm/i915/xehp: Add compute engine ABI

 drivers/gpu/drm/i915/gt/intel_engine_user.c |  2 +-
 drivers/gpu/drm/i915/gt/intel_gt.c  |  1 +
 drivers/gpu/drm/i915/gt/intel_gt_regs.h |  1 +
 drivers/gpu/drm/i915/i915_drm_client.c  |  1 +
 drivers/gpu/drm/i915/i915_drm_client.h  |  2 +-
 drivers/gpu/drm/i915/i915_pci.c |  6 +-
 include/uapi/drm/i915_drm.h | 62 +++--
 7 files changed, 65 insertions(+), 10 deletions(-)

-- 
2.35.1



[PATCH v2 4/4] drm/i915: Xe_HP SDV and DG2 have up to 4 CCS engines

2022-04-27 Thread Matt Roper
From: Daniele Ceraolo Spurio 

Cc: Vinay Belgaumkar 
Signed-off-by: Daniele Ceraolo Spurio 
Signed-off-by: Matt Roper 
Reviewed-by: Matt Roper 
Reviewed-by: Andi Shyti 
---
 drivers/gpu/drm/i915/i915_pci.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
index b60492826478..7739d6c33481 100644
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -1037,7 +1037,8 @@ static const struct intel_device_info xehpsdv_info = {
BIT(RCS0) | BIT(BCS0) |
BIT(VECS0) | BIT(VECS1) | BIT(VECS2) | BIT(VECS3) |
BIT(VCS0) | BIT(VCS1) | BIT(VCS2) | BIT(VCS3) |
-   BIT(VCS4) | BIT(VCS5) | BIT(VCS6) | BIT(VCS7),
+   BIT(VCS4) | BIT(VCS5) | BIT(VCS6) | BIT(VCS7) |
+   BIT(CCS0) | BIT(CCS1) | BIT(CCS2) | BIT(CCS3),
.require_force_probe = 1,
 };
 
@@ -1056,7 +1057,8 @@ static const struct intel_device_info xehpsdv_info = {
.platform_engine_mask = \
BIT(RCS0) | BIT(BCS0) | \
BIT(VECS0) | BIT(VECS1) | \
-   BIT(VCS0) | BIT(VCS2)
+   BIT(VCS0) | BIT(VCS2) | \
+   BIT(CCS0) | BIT(CCS1) | BIT(CCS2) | BIT(CCS3)
 
 static const struct intel_device_info dg2_info = {
DG2_FEATURES,
-- 
2.35.1



[PATCH v2 2/4] drm/i915/xehp: Add register for compute engine's MMIO-based TLB invalidation

2022-04-27 Thread Matt Roper
Compute engines have a separate register that the driver should use to
perform MMIO-based TLB invalidation.

Note that the term "context" in this register's bspec description is
used to refer to the engine instance (in the same way "context" is used
on bspec 46167).

Bspec: 43930
Cc: Prathap Kumar Valsan 
Cc: Tvrtko Ursulin 
Signed-off-by: Matt Roper 
---
 drivers/gpu/drm/i915/gt/intel_gt.c  | 1 +
 drivers/gpu/drm/i915/gt/intel_gt_regs.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c 
b/drivers/gpu/drm/i915/gt/intel_gt.c
index 92394f13b42f..53307ca0eed0 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -1175,6 +1175,7 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
[VIDEO_DECODE_CLASS]= GEN12_VD_TLB_INV_CR,
[VIDEO_ENHANCEMENT_CLASS]   = GEN12_VE_TLB_INV_CR,
[COPY_ENGINE_CLASS] = GEN12_BLT_TLB_INV_CR,
+   [COMPUTE_CLASS] = GEN12_COMPCTX_TLB_INV_CR,
};
struct drm_i915_private *i915 = gt->i915;
struct intel_uncore *uncore = gt->uncore;
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_regs.h 
b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
index a39718a40cc3..a0a49c16babd 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_regs.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
@@ -1007,6 +1007,7 @@
 #define GEN12_VD_TLB_INV_CR_MMIO(0xcedc)
 #define GEN12_VE_TLB_INV_CR_MMIO(0xcee0)
 #define GEN12_BLT_TLB_INV_CR   _MMIO(0xcee4)
+#define GEN12_COMPCTX_TLB_INV_CR   _MMIO(0xcf04)
 
 #define GEN12_MERT_MOD_CTRL_MMIO(0xcf28)
 #define RENDER_MOD_CTRL_MMIO(0xcf2c)
-- 
2.35.1
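
A rough illustration of the "context = engine instance" note above (a sketch under
assumptions, not the actual intel_gt_invalidate_tlbs()/get_reg_and_bit() code): with the
table entry added here, invalidating the TLB for a given compute engine boils down to
writing a bit selected by the engine instance into the class's register.

  /* Sketch: per-engine MMIO TLB invalidation using the class's register. */
  static void example_invalidate_engine_tlb(struct intel_uncore *uncore,
                                            const struct intel_engine_cs *engine,
                                            i915_reg_t reg)
  {
          /*
           * For the Xe_HP compute register (0xcf04), each "context" bit in
           * the bspec description corresponds to an engine instance, so CCS0
           * uses bit 0, CCS1 bit 1, and so on.
           */
          intel_uncore_write_fw(uncore, reg, BIT(engine->instance));
  }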



[PATCH v2 1/4] drm/i915/uapi: Add kerneldoc for engine class enum

2022-04-27 Thread Matt Roper
We'll be adding a new type of engine soon.  Let's document the existing
engine classes first to help make it clear what each type of engine is
used for.

Cc: Andi Shyti 
Signed-off-by: Matt Roper 
---
 include/uapi/drm/i915_drm.h | 53 -
 1 file changed, 47 insertions(+), 6 deletions(-)

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 35ca528803fd..ec000fc6c879 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -154,21 +154,62 @@ enum i915_mocs_table_index {
I915_MOCS_CACHED,
 };
 
-/*
+/**
+ * enum drm_i915_gem_engine_class - uapi engine type enumeration
+ *
  * Different engines serve different roles, and there may be more than one
- * engine serving each role. enum drm_i915_gem_engine_class provides a
- * classification of the role of the engine, which may be used when requesting
- * operations to be performed on a certain subset of engines, or for providing
- * information about that group.
+ * engine serving each role.  This enum provides a classification of the role
+ * of the engine, which may be used when requesting operations to be performed
+ * on a certain subset of engines, or for providing information about that
+ * group.
  */
 enum drm_i915_gem_engine_class {
+   /**
+* @I915_ENGINE_CLASS_RENDER:
+*
+* Render engines support instructions used for 3D, Compute (GPGPU),
+* and programmable media workloads.  These instructions fetch data and
+* dispatch individual work items to threads that operate in parallel.
+* The threads run small programs (called "kernels" or "shaders") on
+* the GPU's execution units (EUs).
+*/
I915_ENGINE_CLASS_RENDER= 0,
+
+   /**
+* @I915_ENGINE_CLASS_COPY:
+*
+* Copy engines (also referred to as "blitters") support instructions
+* that move blocks of data from one location in memory to another,
+* or that fill a specified location of memory with fixed data.
+* Copy engines can perform pre-defined logical or bitwise operations
+* on the source, destination, or pattern data.
+*/
I915_ENGINE_CLASS_COPY  = 1,
+
+   /**
+* @I915_ENGINE_CLASS_VIDEO:
+*
+* Video engines (also referred to as "bit stream decode" (BSD) or
+* "vdbox") support instructions that perform fixed-function media
+* decode and encode.
+*/
I915_ENGINE_CLASS_VIDEO = 2,
+
+   /**
+* @I915_ENGINE_CLASS_VIDEO_ENHANCE:
+*
+* Video enhancement engines (also referred to as "vebox") support
+* instructions related to image enhancement.
+*/
I915_ENGINE_CLASS_VIDEO_ENHANCE = 3,
 
-   /* should be kept compact */
+   /* Values in this enum should be kept compact. */
 
+   /**
+* @I915_ENGINE_CLASS_INVALID:
+*
+* Placeholder value to represent an invalid engine class assignment.
+*/
I915_ENGINE_CLASS_INVALID   = -1
 };
 
-- 
2.35.1



Re: [PATCH 1/2] drm/i915/xehp: Add compute engine ABI

2022-04-27 Thread Matt Roper
On Mon, Apr 25, 2022 at 11:41:36AM +0100, Tvrtko Ursulin wrote:
> 
> On 22/04/2022 20:50, Matt Roper wrote:
> > We're now ready to start exposing compute engines to userspace.
> > 
> > While we're at it, let's extend the kerneldoc description for the other
> > engine types as well.
> > 
> > Cc: Daniele Ceraolo Spurio 
> > Cc: Tvrtko Ursulin 
> > Cc: Vinay Belgaumkar 
> > Cc: Jordan Justen 
> > Cc: Szymon Morek 
> > UMD (mesa): https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14395
> > UMD (compute): https://github.com/intel/compute-runtime/pull/451
> 
> The compute one points to a commit named "Add compute engine class for xehp"
> but content of which seems more about engine query, including the yet
> non-existent distance query (and more)?! I certainly does not appear to be
> adding a definition of I915_ENGINE_CLASS_COMPUTE. This needs clarifying.
> 

Hi Szymon, any updates on the compute UMD merge request here?  Is there
a different merge request we should reference for now that just uses the
I915_ENGINE_CLASS_COMPUTE without also relying on the
DRM_I915_QUERY_DISTANCE_INFO that we aren't upstreaming just yet?

I believe distance info is only useful for multi-tile platforms and
isn't necessary for general use of compute engines on a single tile
platform.

Thanks.


Matt

-- 
Matt Roper
Graphics Software Engineer
VTT-OSGC Platform Enablement
Intel Corporation
(916) 356-2795


[pull] amdgpu, amdkfd drm-fixes-5.18

2022-04-27 Thread Alex Deucher
Hi Dave, Daniel,

Fixes for 5.18.

The following changes since commit b2d229d4ddb17db541098b83524d901257e93845:

  Linux 5.18-rc3 (2022-04-17 13:57:31 -0700)

are available in the Git repository at:

  https://gitlab.freedesktop.org/agd5f/linux.git 
tags/amd-drm-fixes-5.18-2022-04-27

for you to fetch changes up to fb8cc3318e47e1a0ced4025ef614317b541147e7:

  drm/amdgpu: keep mmhub clock gating being enabled during s2idle suspend 
(2022-04-27 17:38:02 -0400)


amd-drm-fixes-5.18-2022-04-27:

amdgpu:
- Runtime pm fix
- DCN memory leak fix in error path
- SI DPM deadlock fix
- S0ix fix

amdkfd:
- GWS fix
- GWS support for CRIU


Alex Deucher (1):
  drm/amdgpu: don't runtime suspend if there are displays attached (v3)

David Yat Sin (2):
  drm/amdkfd: Fix GWS queue count
  drm/amdkfd: CRIU add support for GWS queues

Evan Quan (1):
  drm/amd/pm: fix the deadlock issue observed on SI

Miaoqian Lin (1):
  drm/amd/display: Fix memory leak in dcn21_clock_source_create

Prike Liang (1):
  drm/amdgpu: keep mmhub clock gating being enabled during s2idle suspend

 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 105 ++---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  10 ++
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  |  83 
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h  |   2 +-
 .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |  10 +-
 .../gpu/drm/amd/display/dc/dcn21/dcn21_resource.c  |   1 +
 drivers/gpu/drm/amd/pm/amdgpu_dpm.c|  39 
 drivers/gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c |  10 --
 drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c |  35 ---
 drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c   |  10 --
 10 files changed, 165 insertions(+), 140 deletions(-)


RE: [PATCH v2 0/4] Add Toshiba Visconti AFFINE image processing accelerator driver

2022-04-27 Thread yuji2.ishikawa
Hi Laurent,

Thank you for your comment.

We had never imagined that the affine driver could be a V4L2 driver.
Affine is one of the accelerators in Visconti, and some accelerators
receive/yield non-picture data.
Also, as the original accelerator drivers were implemented for kernel 4.19.x,
we were not aware of the latest V4L2 architecture.
Currently, we assume accelerator drivers are kicked individually, not as part of an
image processing pipeline, so a simple misc driver is a sufficient solution.

Is there any memory-to-memory driver sample/skeleton that would give me a better
understanding?

Best regards,
Yuji
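
As an aside to the question above: the kernel already carries a virtual memory-to-memory
test driver, vim2m.c (under drivers/media/test-drivers/, formerly drivers/media/platform/),
which is commonly used as a skeleton. The heart of such a driver is a v4l2_m2m_ops table
whose device_run() processes one source/destination buffer pair; a heavily trimmed sketch
follows (queue setup, file ops and error handling omitted; the names are hypothetical).

  /* Trimmed sketch of the mem2mem core; see vim2m.c for a complete skeleton. */
  #include <media/v4l2-fh.h>
  #include <media/v4l2-mem2mem.h>
  #include <media/videobuf2-v4l2.h>

  struct affine_m2m_ctx {                 /* hypothetical per-open context */
          struct v4l2_fh fh;
          struct v4l2_m2m_ctx *m2m_ctx;
  };

  static void affine_device_run(void *priv)
  {
          struct affine_m2m_ctx *ctx = priv;
          struct vb2_v4l2_buffer *src, *dst;

          src = v4l2_m2m_next_src_buf(ctx->m2m_ctx);
          dst = v4l2_m2m_next_dst_buf(ctx->m2m_ctx);

          /*
           * Program the accelerator with the src/dst DMA addresses here.
           * The completion interrupt handler then calls v4l2_m2m_buf_done()
           * on both buffers and v4l2_m2m_job_finish() so the framework can
           * schedule the next queued job.
           */
  }

  static const struct v4l2_m2m_ops affine_m2m_ops = {
          .device_run = affine_device_run,
  };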

> -Original Message-
> From: Laurent Pinchart 
> Sent: Thursday, April 28, 2022 7:04 AM
> To: ishikawa yuji(石川 悠司 ○RDC□AITC○EA開)
> 
> Cc: Rob Herring ; iwamatsu nobuhiro(岩松 信洋 □SW
> C◯ACT) ; Sumit Semwal
> ; Christian König ;
> linux-arm-ker...@lists.infradead.org; linux-ker...@vger.kernel.org;
> linux-me...@vger.kernel.org; dri-devel@lists.freedesktop.org;
> linaro-mm-...@lists.linaro.org
> Subject: Re: [PATCH v2 0/4] Add Toshiba Visconti AFFINE image processing
> accelerator driver
> 
> Hi Yuji,
> 
> Thank you for the patch. It's nice to see contributions from Toshiba in the 
> image
> acceleration domain :-)
> 
> I'll start with a high-level question before diving into detailed review. Why 
> is
> this implemented in drivers/soc/ with a custom userspace API, and not as a
> V4L2 memory-to-memory driver ?
> 
> On Wed, Apr 27, 2022 at 10:23:41PM +0900, Yuji Ishikawa wrote:
> > This series is the AFFINE image processing accelerator driver for Toshiba's
> ARM SoC, Visconti[0].
> > This provides DT binding documentation, device driver, MAINTAINER files.
> >
> > The second patch "soc: visconti: Add Toshiba Visconti image processing
> accelerator common source"
> > is commonly used among acclerator drivers (affine, dnn, dspif, pyramid).
> >
> > Best regards,
> > Yuji
> >
> > [0]:
> >
> https://toshiba.semicon-storage.com/ap-en/semiconductor/product/image-
> > recognition-processors-visconti.html
> >
> >   dt-bindings: soc: visconti: Add Toshiba Visconti AFFINE image
> > v1 -> v2:
> >   - No update
> >
> >   soc: visconti: Add Toshiba Visconti image processing accelerator common
> source
> > v1 -> v2:
> >   - apply checkpatch.pl --strict
> >
> >   soc: visconti: Add Toshiba Visconti AFFINE image processing accelerator
> > v1 -> v2:
> >   - apply checkpatch.pl --strict
> >   - rename hwd_AFFINE_ to hwd_affine_
> >
> >   MAINTAINERS: Add entries for Toshiba Visconti AFFINE image processing
> accelerator
> > v1 -> v2:
> >   - No update
> >
> > Change in V2:
> >   - apply checkpatch.pl --strict
> >   - rename hwd_AFFINE_ to hwd_affine_
> >
> > Yuji Ishikawa (4):
> >   dt-bindings: soc: visconti: Add Toshiba Visconti AFFINE image
> > processing accelerator bindings
> >   soc: visconti: Add Toshiba Visconti image processing accelerator
> > common source
> >   soc: visconti: Add Toshiba Visconti AFFINE image processing
> > accelerator
> >   MAINTAINERS: Add entries for Toshiba Visconti AFFINE image processing
> > accelerator
> >
> >  .../soc/visconti/toshiba,visconti-affine.yaml |  53 ++
> >  MAINTAINERS   |   2 +
> >  drivers/soc/Kconfig   |   1 +
> >  drivers/soc/Makefile  |   1 +
> >  drivers/soc/visconti/Kconfig  |   7 +
> >  drivers/soc/visconti/Makefile |   8 +
> >  drivers/soc/visconti/affine/Makefile  |   6 +
> >  drivers/soc/visconti/affine/affine.c  | 451
> ++
> >  drivers/soc/visconti/affine/hwd_affine.c  | 206 
> >  drivers/soc/visconti/affine/hwd_affine.h  |  83 
> >  drivers/soc/visconti/affine/hwd_affine_reg.h  |  45 ++
> >  drivers/soc/visconti/ipa_common.c |  55 +++
> >  drivers/soc/visconti/ipa_common.h |  18 +
> >  drivers/soc/visconti/uapi/affine.h|  87 
> >  drivers/soc/visconti/uapi/ipa.h   |  88 
> >  15 files changed,  insertions(+)
> >  create mode 100644
> > Documentation/devicetree/bindings/soc/visconti/toshiba,visconti-affine
> > .yaml  create mode 100644 drivers/soc/visconti/Kconfig  create mode
> > 100644 drivers/soc/visconti/Makefile  create mode 100644
> > drivers/soc/visconti/affine/Makefile
> >  create mode 100644 drivers/soc/visconti/affine/affine.c
> >  create mode 100644 drivers/soc/visconti/affine/hwd_affine.c
> >  create mode 100644 drivers/soc/visconti/affine/hwd_affine.h
> >  create mode 100644 drivers/soc/visconti/affine/hwd_affine_reg.h
> >  create mode 100644 drivers/soc/visconti/ipa_common.c  create mode
> > 100644 drivers/soc/visconti/ipa_common.h  create mode 100644
> > drivers/soc/visconti/uapi/affine.h
> >  create mode 100644 drivers/soc/visconti/uapi/ipa.h
> 
> --
> Regards,
> 
> Laurent Pinchart


Re: [PATCH v5 6/9] drm: vkms: Refactor the plane composer to accept new formats

2022-04-27 Thread Igor Torrente




On 4/27/22 04:43, Pekka Paalanen wrote:

On Tue, 26 Apr 2022 22:22:22 -0300
Igor Torrente  wrote:


On April 26, 2022 10:03:09 PM GMT-03:00, Igor Torrente 
 wrote:



On 4/25/22 22:54, Igor Torrente wrote:

Hi Pekka,

On 4/25/22 05:10, Pekka Paalanen wrote:

On Sat, 23 Apr 2022 15:53:20 -0300
Igor Torrente  wrote:
   


...


+static void argb_u16_to_XRGB(struct vkms_frame_info *frame_info,
+const struct line_buffer *src_buffer, int y)
+{
+   int x, x_dst = frame_info->dst.x1;
+   u8 *dst_pixels = packed_pixels_addr(frame_info, x_dst, y);
+   struct pixel_argb_u16 *in_pixels = src_buffer->pixels;
+   int x_limit = min_t(size_t, drm_rect_width(&frame_info->dst),
+   src_buffer->n_pixels);
+
+   for (x = 0; x < x_limit; x++, dst_pixels += 4) {
+   dst_pixels[3] = (u8)0xff;


When writing to XRGB, it's not necessary to ensure the X channel has
any sensible value. Anyone reading from XRGB must ignore that value
anyway. So why not write something wacky here, like 0xa1, that is far
enough from both 0x00 or 0xff to not be confused with them even
visually? Also not 0x7f or 0x80 which are close to half of 0xff.

Or, you could save a whole function and just use argb_u16_to_ARGB()
instead, even for XRGB destination.



Right. Maybe I could just leave the channel untouched.


Untouched may not be a good idea. Leaving anything untouched always has
the risk of leaking information through uninitialized memory. Maybe not
in this case because the destination is allocated by userspace already,
but nothing beats being obviously correct.


Makes sense.
   


Whatever you decide here, be prepared for it becoming de-facto kernel
UABI, because it is easy for userspace to (accidentally) rely on the
value, no matter what you pick.


I hope to make the right decision then.


The de-facto UABI seems to be already in place for {A, X}RGB.


"Only XRGB_


If that's only IGT, then you should raise an issue with IGT about this,
to figure out if IGT is wrong by accident or if it is deliberate, and
are we stuck with it.

This is why I would want to fill X with garbage, to make the
expectations clear before the "obvious and logical constant value for X"
makes a mess by making XRGB indistinguishable from ARGB. Then the next
question is, do we need a special function to write out XRGB values, or
can we simply re-use the ARGB function.

Do the tests expect X channel to be filled with 0xff or with the actual
A values? This question will matter when all planes have ARGB
framebuffers and no background color. Then even more questions will
arise about what should actually happen with A values (blending
equation).


I dug into the IGT code a little bit and found that it expects the
channel not to be changed.
It fills all the pixels in the line with a value and calculates the CRC
of the entire buffer, including the alpha.


I will create an issue asking whether this is intended.







I changed it from 0xff to 0xbe and the `writeback-check-output` test started to fail.



Thanks,
pq
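
To make Pekka's suggestion above concrete, here is a sketch based on the quoted
argb_u16_to_XRGB() (the 0xa1 value is only illustrative, and the colour-channel conversion
is assumed to follow the ARGB helper mentioned in the thread): write a deliberately "wacky"
padding byte so an XRGB output can never be mistaken for ARGB.

  /* Sketch: fill the X byte with a value far from 0x00, 0x7f/0x80 and 0xff. */
  static void argb_u16_to_XRGB8888_sketch(struct vkms_frame_info *frame_info,
                                          const struct line_buffer *src_buffer, int y)
  {
          int x, x_dst = frame_info->dst.x1;
          u8 *dst_pixels = packed_pixels_addr(frame_info, x_dst, y);
          struct pixel_argb_u16 *in_pixels = src_buffer->pixels;
          int x_limit = min_t(size_t, drm_rect_width(&frame_info->dst),
                              src_buffer->n_pixels);

          for (x = 0; x < x_limit; x++, dst_pixels += 4) {
                  dst_pixels[3] = 0xa1;   /* X channel: junk on purpose, readers must ignore it */
                  dst_pixels[2] = DIV_ROUND_CLOSEST(in_pixels[x].r, 257);
                  dst_pixels[1] = DIV_ROUND_CLOSEST(in_pixels[x].g, 257);
                  dst_pixels[0] = DIV_ROUND_CLOSEST(in_pixels[x].b, 257);
          }
  }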


Re: [Intel-gfx] [PATCH 1/2] drm/i915/xehp: Add compute engine ABI

2022-04-27 Thread Kumar Valsan, Prathap
On Mon, Apr 25, 2022 at 11:41:36AM +0100, Tvrtko Ursulin wrote:
> 
> On 22/04/2022 20:50, Matt Roper wrote:
> > We're now ready to start exposing compute engines to userspace.
> > 
> > While we're at it, let's extend the kerneldoc description for the other
> > engine types as well.
> > 
> > Cc: Daniele Ceraolo Spurio 
> > Cc: Tvrtko Ursulin 
> > Cc: Vinay Belgaumkar 
> > Cc: Jordan Justen 
> > Cc: Szymon Morek 
> > UMD (mesa): https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14395
> > UMD (compute): https://github.com/intel/compute-runtime/pull/451
> 
> The compute one points to a commit named "Add compute engine class for xehp"
> but the content of which seems more about engine query, including the yet
> non-existent distance query (and more)?! It certainly does not appear to be
> adding a definition of I915_ENGINE_CLASS_COMPUTE. This needs clarifying.
> 
> > Signed-off-by: Matt Roper 
> > ---
> >   drivers/gpu/drm/i915/gt/intel_engine_user.c |  2 +-
> >   drivers/gpu/drm/i915/gt/intel_gt.c  |  1 +
> >   drivers/gpu/drm/i915/i915_drm_client.c  |  1 +
> >   drivers/gpu/drm/i915/i915_drm_client.h  |  2 +-
> >   include/uapi/drm/i915_drm.h | 62 +++--
> >   5 files changed, 60 insertions(+), 8 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c 
> > b/drivers/gpu/drm/i915/gt/intel_engine_user.c
> > index 0f6cd96b459f..46a174f8aa00 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine_user.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c
> > @@ -47,7 +47,7 @@ static const u8 uabi_classes[] = {
> > [COPY_ENGINE_CLASS] = I915_ENGINE_CLASS_COPY,
> > [VIDEO_DECODE_CLASS] = I915_ENGINE_CLASS_VIDEO,
> > [VIDEO_ENHANCEMENT_CLASS] = I915_ENGINE_CLASS_VIDEO_ENHANCE,
> > -   /* TODO: Add COMPUTE_CLASS mapping once ABI is available */
> > +   [COMPUTE_CLASS] = I915_ENGINE_CLASS_COMPUTE,
> >   };
> >   static int engine_cmp(void *priv, const struct list_head *A,
> > diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c 
> > b/drivers/gpu/drm/i915/gt/intel_gt.c
> > index 92394f13b42f..c96e123496a5 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_gt.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
> > @@ -1175,6 +1175,7 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
> > [VIDEO_DECODE_CLASS]= GEN12_VD_TLB_INV_CR,
> > [VIDEO_ENHANCEMENT_CLASS]   = GEN12_VE_TLB_INV_CR,
> > [COPY_ENGINE_CLASS] = GEN12_BLT_TLB_INV_CR,
> > +   [COMPUTE_CLASS] = GEN12_GFX_TLB_INV_CR,
> 
> Do you know what 0xcf04 is?
The MMIO register 0xcf04 is the one we should use for the compute class,
and the "context" bit in 0xcf04 represents the engine instance.

GEN12_GFX_TLB_INV_CR is for the render class.

Thanks,
Prathap
> 
> Or if GEN12_GFX_TLB_INV_CR is correct then I think get_reg_and_bit() might
> need adjusting to always select bit 0 for any compute engine instance. Not
> sure how hardware would behave if value other than '1' would be written into
> 0xced8.
> 
> Regards,
> 
> Tvrtko
> 
> > };
> > struct drm_i915_private *i915 = gt->i915;
> > struct intel_uncore *uncore = gt->uncore;
> > diff --git a/drivers/gpu/drm/i915/i915_drm_client.c 
> > b/drivers/gpu/drm/i915/i915_drm_client.c
> > index 475a6f824cad..18d38cb59923 100644
> > --- a/drivers/gpu/drm/i915/i915_drm_client.c
> > +++ b/drivers/gpu/drm/i915/i915_drm_client.c
> > @@ -81,6 +81,7 @@ static const char * const uabi_class_names[] = {
> > [I915_ENGINE_CLASS_COPY] = "copy",
> > [I915_ENGINE_CLASS_VIDEO] = "video",
> > [I915_ENGINE_CLASS_VIDEO_ENHANCE] = "video-enhance",
> > +   [I915_ENGINE_CLASS_COMPUTE] = "compute",
> >   };
> >   static u64 busy_add(struct i915_gem_context *ctx, unsigned int class)
> > diff --git a/drivers/gpu/drm/i915/i915_drm_client.h 
> > b/drivers/gpu/drm/i915/i915_drm_client.h
> > index 5f5b02b01ba0..f796c5e8e060 100644
> > --- a/drivers/gpu/drm/i915/i915_drm_client.h
> > +++ b/drivers/gpu/drm/i915/i915_drm_client.h
> > @@ -13,7 +13,7 @@
> >   #include "gt/intel_engine_types.h"
> > -#define I915_LAST_UABI_ENGINE_CLASS I915_ENGINE_CLASS_VIDEO_ENHANCE
> > +#define I915_LAST_UABI_ENGINE_CLASS I915_ENGINE_CLASS_COMPUTE
> >   struct drm_i915_private;
> > diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> > index 35ca528803fd..a2def7b27009 100644
> > --- a/include/uapi/drm/i915_drm.h
> > +++ b/include/uapi/drm/i915_drm.h
> > @@ -154,21 +154,71 @@ enum i915_mocs_table_index {
> > I915_MOCS_CACHED,
> >   };
> > -/*
> > +/**
> > + * enum drm_i915_gem_engine_class - uapi engine type enumeration
> > + *
> >* Different engines serve different roles, and there may be more than one
> > - * engine serving each role. enum drm_i915_gem_engine_class provides a
> > - * classification of the role of the engine, which may be used when 
> > requesting
> > - * operations to be performed on a certain subset of engines, or for 
> > providing
> > - * information about that group.
> > 

Re: [PATCH 2/2] drm/i915/dg2: Define GuC firmware version for DG2

2022-04-27 Thread Ceraolo Spurio, Daniele




On 4/27/2022 9:55 AM, john.c.harri...@intel.com wrote:

From: John Harrison 

First release of GuC for DG2.


Reviewed-by: Daniele Ceraolo Spurio 

Daniele



Signed-off-by: John Harrison 
CC: Tomasz Mistat 
CC: Ramalingam C 
CC: Daniele Ceraolo Spurio 
---
  drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c 
b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
index a876d39e6bcf..d078f884b5e3 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
@@ -53,6 +53,7 @@ void intel_uc_fw_change_status(struct intel_uc_fw *uc_fw,
   * firmware as TGL.
   */
  #define INTEL_GUC_FIRMWARE_DEFS(fw_def, guc_def) \
+   fw_def(DG2,  0, guc_def(dg2,  70, 1, 2)) \
fw_def(ALDERLAKE_P,  0, guc_def(adlp, 70, 1, 1)) \
fw_def(ALDERLAKE_S,  0, guc_def(tgl,  70, 1, 1)) \
fw_def(DG1,  0, guc_def(dg1,  70, 1, 1)) \




[PATCH 3/5] drm/i915/xehp: Use separate sseu init function

2022-04-27 Thread Matt Roper
Xe_HP has enough fundamental differences from previous platforms that it
makes sense to use a separate SSEU init function to keep things
straightforward and easy to understand.

Signed-off-by: Matt Roper 
---
 drivers/gpu/drm/i915/gt/intel_sseu.c | 85 
 1 file changed, 48 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_sseu.c 
b/drivers/gpu/drm/i915/gt/intel_sseu.c
index 13387b4024ea..ef66c2b8861a 100644
--- a/drivers/gpu/drm/i915/gt/intel_sseu.c
+++ b/drivers/gpu/drm/i915/gt/intel_sseu.c
@@ -203,13 +203,42 @@ static void gen11_compute_sseu_info(struct sseu_dev_info 
*sseu, u8 s_en,
sseu->eu_total = compute_eu_total(sseu);
 }
 
-static void gen12_sseu_info_init(struct intel_gt *gt)
+static void xehp_sseu_info_init(struct intel_gt *gt)
 {
	struct sseu_dev_info *sseu = &gt->info.sseu;
struct intel_uncore *uncore = gt->uncore;
u32 g_dss_en, c_dss_en = 0;
u16 eu_en = 0;
u8 eu_en_fuse;
+   int eu;
+
+   /*
+* The concept of slice has been removed in Xe_HP.  To be compatible
+* with prior generations, assume a single slice across the entire
+* device. Then calculate out the DSS for each workload type within
+* that software slice.
+*/
+   intel_sseu_set_info(sseu, 1, 32, 16);
+
+   g_dss_en = intel_uncore_read(uncore, GEN12_GT_GEOMETRY_DSS_ENABLE);
+   c_dss_en = intel_uncore_read(uncore, GEN12_GT_COMPUTE_DSS_ENABLE);
+
+   eu_en_fuse = intel_uncore_read(uncore, XEHP_EU_ENABLE) & 
XEHP_EU_ENA_MASK;
+
+   for (eu = 0; eu < sseu->max_eus_per_subslice / 2; eu++)
+   if (eu_en_fuse & BIT(eu))
+   eu_en |= BIT(eu * 2) | BIT(eu * 2 + 1);
+
+   gen11_compute_sseu_info(sseu, 0x1, g_dss_en, c_dss_en, eu_en);
+}
+
+static void gen12_sseu_info_init(struct intel_gt *gt)
+{
+   struct sseu_dev_info *sseu = &gt->info.sseu;
+   struct intel_uncore *uncore = gt->uncore;
+   u32 g_dss_en;
+   u16 eu_en = 0;
+   u8 eu_en_fuse;
u8 s_en;
int eu;
 
@@ -217,43 +246,23 @@ static void gen12_sseu_info_init(struct intel_gt *gt)
 * Gen12 has Dual-Subslices, which behave similarly to 2 gen11 SS.
 * Instead of splitting these, provide userspace with an array
 * of DSS to more closely represent the hardware resource.
-*
-* In addition, the concept of slice has been removed in Xe_HP.
-* To be compatible with prior generations, assume a single slice
-* across the entire device. Then calculate out the DSS for each
-* workload type within that software slice.
 */
-   if (IS_DG2(gt->i915) || IS_XEHPSDV(gt->i915))
-   intel_sseu_set_info(sseu, 1, 32, 16);
-   else
-   intel_sseu_set_info(sseu, 1, 6, 16);
+   intel_sseu_set_info(sseu, 1, 6, 16);
 
-   /*
-* As mentioned above, Xe_HP does not have the concept of a slice.
-* Enable one for software backwards compatibility.
-*/
-   if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 50))
-   s_en = 0x1;
-   else
-   s_en = intel_uncore_read(uncore, GEN11_GT_SLICE_ENABLE) &
-  GEN11_GT_S_ENA_MASK;
+   s_en = intel_uncore_read(uncore, GEN11_GT_SLICE_ENABLE) &
+   GEN11_GT_S_ENA_MASK;
 
g_dss_en = intel_uncore_read(uncore, GEN12_GT_GEOMETRY_DSS_ENABLE);
-   if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 50))
-   c_dss_en = intel_uncore_read(uncore, 
GEN12_GT_COMPUTE_DSS_ENABLE);
 
/* one bit per pair of EUs */
-   if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 50))
-   eu_en_fuse = intel_uncore_read(uncore, XEHP_EU_ENABLE) & 
XEHP_EU_ENA_MASK;
-   else
-   eu_en_fuse = ~(intel_uncore_read(uncore, GEN11_EU_DISABLE) &
-  GEN11_EU_DIS_MASK);
+   eu_en_fuse = ~(intel_uncore_read(uncore, GEN11_EU_DISABLE) &
+  GEN11_EU_DIS_MASK);
 
for (eu = 0; eu < sseu->max_eus_per_subslice / 2; eu++)
if (eu_en_fuse & BIT(eu))
eu_en |= BIT(eu * 2) | BIT(eu * 2 + 1);
 
-   gen11_compute_sseu_info(sseu, s_en, g_dss_en, c_dss_en, eu_en);
+   gen11_compute_sseu_info(sseu, s_en, g_dss_en, 0, eu_en);
 
/* TGL only supports slice-level power gating */
sseu->has_slice_pg = 1;
@@ -608,18 +617,20 @@ void intel_sseu_info_init(struct intel_gt *gt)
 {
struct drm_i915_private *i915 = gt->i915;
 
-   if (IS_HASWELL(i915))
-   hsw_sseu_info_init(gt);
-   else if (IS_CHERRYVIEW(i915))
-   cherryview_sseu_info_init(gt);
-   else if (IS_BROADWELL(i915))
-   bdw_sseu_info_init(gt);
-   else if (GRAPHICS_VER(i915) == 9)
-   gen9_sseu_info_init(gt);
-   else if (GRAPHICS_VER(i915) == 11)
-   gen11_sseu_info_init(gt);
+   if (GRAPHICS_VER_FULL(i915) 

[PATCH 5/5] drm/i915/sseu: Disassociate internal subslice mask representation from uapi

2022-04-27 Thread Matt Roper
Rather than storing subslice masks internally as u8[] (inside the sseu
structure) and u32 (everywhere else), let's move over to using an
intel_sseu_ss_mask_t typedef compatible with the operations in
linux/bitmap.h.  We're soon going to start adding code for a new
platform where subslice masks are spread across two 32-bit registers
(requiring 64 bits to represent), and we expect future platforms will
likely take this even farther, requiring bitmask storage larger than a
simple u64 can hold.

Signed-off-by: Matt Roper 
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c  |   4 +-
 drivers/gpu/drm/i915/gt/intel_engine_cs.c|   2 +-
 drivers/gpu/drm/i915/gt/intel_gt.c   |  14 +-
 drivers/gpu/drm/i915/gt/intel_sseu.c | 197 +++
 drivers/gpu/drm/i915/gt/intel_sseu.h |  48 ++---
 drivers/gpu/drm/i915/gt/intel_sseu_debugfs.c |  28 +--
 drivers/gpu/drm/i915/gt/intel_workarounds.c  |  28 ++-
 drivers/gpu/drm/i915/i915_getparam.c |   2 +-
 drivers/gpu/drm/i915/i915_query.c|   8 +-
 9 files changed, 183 insertions(+), 148 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c 
b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index ab4c5ab28e4d..ea012ee3a8de 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -1901,7 +1901,7 @@ i915_gem_user_to_context_sseu(struct intel_gt *gt,
if (user->slice_mask & ~device->slice_mask)
return -EINVAL;
 
-   if (user->subslice_mask & ~device->subslice_mask[0])
+   if (user->subslice_mask & ~device->subslice_mask.b[0])
return -EINVAL;
 
if (user->max_eus_per_subslice > device->max_eus_per_subslice)
@@ -1915,7 +1915,7 @@ i915_gem_user_to_context_sseu(struct intel_gt *gt,
/* Part specific restrictions. */
if (GRAPHICS_VER(i915) == 11) {
unsigned int hw_s = hweight8(device->slice_mask);
-   unsigned int hw_ss_per_s = hweight8(device->subslice_mask[0]);
+   unsigned int hw_ss_per_s = hweight8(device->subslice_mask.b[0]);
unsigned int req_s = hweight8(context->slice_mask);
unsigned int req_ss = hweight8(context->subslice_mask);
 
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 14c6ddbbfde8..39c09963b3c7 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -610,7 +610,7 @@ static void engine_mask_apply_compute_fuses(struct intel_gt 
*gt)
if (GRAPHICS_VER_FULL(i915) < IP_VER(12, 50))
return;
 
-   ccs_mask = 
intel_slicemask_from_dssmask(intel_sseu_get_compute_subslices(&info->sseu),
+   ccs_mask = 
intel_slicemask_from_dssmask(info->sseu.compute_subslice_mask,
ss_per_ccs);
/*
 * If all DSS in a quadrant are fused off, the corresponding CCS
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c 
b/drivers/gpu/drm/i915/gt/intel_gt.c
index 92394f13b42f..cc03512d59ba 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -133,13 +133,6 @@ static const struct intel_mmio_range 
dg2_lncf_steering_table[] = {
{},
 };
 
-static u16 slicemask(struct intel_gt *gt, int count)
-{
-   u64 dss_mask = intel_sseu_get_subslices(&gt->info.sseu, 0);
-
-   return intel_slicemask_from_dssmask(dss_mask, count);
-}
-
 int intel_gt_init_mmio(struct intel_gt *gt)
 {
struct drm_i915_private *i915 = gt->i915;
@@ -153,11 +146,14 @@ int intel_gt_init_mmio(struct intel_gt *gt)
 * An mslice is unavailable only if both the meml3 for the slice is
 * disabled *and* all of the DSS in the slice (quadrant) are disabled.
 */
-   if (HAS_MSLICES(i915))
+   if (HAS_MSLICES(i915)) {
gt->info.mslice_mask =
-   slicemask(gt, GEN_DSS_PER_MSLICE) |
+   
intel_slicemask_from_dssmask(gt->info.sseu.subslice_mask,
+GEN_DSS_PER_MSLICE);
+   gt->info.mslice_mask |=
(intel_uncore_read(gt->uncore, GEN10_MIRROR_FUSE3) &
 GEN12_MEML3_EN_MASK);
+   }
 
if (IS_DG2(i915)) {
gt->steering_table[MSLICE] = xehpsdv_mslice_steering_table;
diff --git a/drivers/gpu/drm/i915/gt/intel_sseu.c 
b/drivers/gpu/drm/i915/gt/intel_sseu.c
index f7ff6a9f67b0..466505d6bd18 100644
--- a/drivers/gpu/drm/i915/gt/intel_sseu.c
+++ b/drivers/gpu/drm/i915/gt/intel_sseu.c
@@ -28,56 +28,49 @@ void intel_sseu_set_info(struct sseu_dev_info *sseu, u8 
max_slices,
 unsigned int
 intel_sseu_subslice_total(const struct sseu_dev_info *sseu)
 {
-   unsigned int i, total = 0;
-
-   for (i = 0; i < ARRAY_SIZE(sseu->subslice_mask); i++)
-   total += hweight8(sseu->subslice_mask[i]);
-
-   return total;
+   return 

[PATCH 4/5] drm/i915/sseu: Simplify gen11+ SSEU handling

2022-04-27 Thread Matt Roper
Although gen11 and gen12 architectures supported the concept of multiple
slices, in practice all the platforms that were actually designed only
had a single slice (i.e., note the parameters to 'intel_sseu_set_info'
that we pass for each platform).  We can simplify the code slightly by
dropping the multi-slice logic from gen11+ platforms.

Signed-off-by: Matt Roper 
---
 drivers/gpu/drm/i915/gt/intel_sseu.c | 73 ++--
 1 file changed, 36 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_sseu.c 
b/drivers/gpu/drm/i915/gt/intel_sseu.c
index ef66c2b8861a..f7ff6a9f67b0 100644
--- a/drivers/gpu/drm/i915/gt/intel_sseu.c
+++ b/drivers/gpu/drm/i915/gt/intel_sseu.c
@@ -155,48 +155,32 @@ int intel_sseu_copy_eumask_to_user(void __user *to,
return copy_to_user(to, eu_mask, len);
 }
 
-static u32 get_ss_stride_mask(struct sseu_dev_info *sseu, u8 s, u32 ss_en)
-{
-   u32 ss_mask;
-
-   ss_mask = ss_en >> (s * sseu->max_subslices);
-   ss_mask &= GENMASK(sseu->max_subslices - 1, 0);
-
-   return ss_mask;
-}
-
-static void gen11_compute_sseu_info(struct sseu_dev_info *sseu, u8 s_en,
+static void gen11_compute_sseu_info(struct sseu_dev_info *sseu,
u32 g_ss_en, u32 c_ss_en, u16 eu_en)
 {
-   int s;
+   u32 valid_ss_mask = GENMASK(sseu->max_subslices - 1, 0);
 
/* g_ss_en/c_ss_en represent entire subslice mask across all slices */
GEM_BUG_ON(sseu->max_slices * sseu->max_subslices >
   sizeof(g_ss_en) * BITS_PER_BYTE);
 
-   for (s = 0; s < sseu->max_slices; s++) {
-   if ((s_en & BIT(s)) == 0)
-   continue;
+   sseu->slice_mask |= BIT(0);
+
+   /*
+* XeHP introduces the concept of compute vs geometry DSS. To reduce
+* variation between GENs around subslice usage, store a mask for both
+* the geometry and compute enabled masks since userspace will need to
+* be able to query these masks independently.  Also compute a total
+* enabled subslice count for the purposes of selecting subslices to
+* use in a particular GEM context.
+*/
+   intel_sseu_set_subslices(sseu, 0, sseu->compute_subslice_mask,
+c_ss_en & valid_ss_mask);
+   intel_sseu_set_subslices(sseu, 0, sseu->geometry_subslice_mask,
+g_ss_en & valid_ss_mask);
+   intel_sseu_set_subslices(sseu, 0, sseu->subslice_mask,
+(g_ss_en | c_ss_en) & valid_ss_mask);
 
-   sseu->slice_mask |= BIT(s);
-
-   /*
-* XeHP introduces the concept of compute vs geometry DSS. To
-* reduce variation between GENs around subslice usage, store a
-* mask for both the geometry and compute enabled masks since
-* userspace will need to be able to query these masks
-* independently.  Also compute a total enabled subslice count
-* for the purposes of selecting subslices to use in a
-* particular GEM context.
-*/
-   intel_sseu_set_subslices(sseu, s, sseu->compute_subslice_mask,
-get_ss_stride_mask(sseu, s, c_ss_en));
-   intel_sseu_set_subslices(sseu, s, sseu->geometry_subslice_mask,
-get_ss_stride_mask(sseu, s, g_ss_en));
-   intel_sseu_set_subslices(sseu, s, sseu->subslice_mask,
-get_ss_stride_mask(sseu, s,
-   g_ss_en | c_ss_en));
-   }
sseu->has_common_ss_eumask = 1;
sseu->eu_mask[0] = eu_en;
sseu->eu_per_subslice = hweight16(eu_en);
@@ -229,7 +213,7 @@ static void xehp_sseu_info_init(struct intel_gt *gt)
if (eu_en_fuse & BIT(eu))
eu_en |= BIT(eu * 2) | BIT(eu * 2 + 1);
 
-   gen11_compute_sseu_info(sseu, 0x1, g_dss_en, c_dss_en, eu_en);
+   gen11_compute_sseu_info(sseu, g_dss_en, c_dss_en, eu_en);
 }
 
 static void gen12_sseu_info_init(struct intel_gt *gt)
@@ -249,8 +233,15 @@ static void gen12_sseu_info_init(struct intel_gt *gt)
 */
intel_sseu_set_info(sseu, 1, 6, 16);
 
+   /*
+* Although gen12 architecture supported multiple slices, TGL, RKL,
+* DG1, and ADL only had a single slice.
+*/
s_en = intel_uncore_read(uncore, GEN11_GT_SLICE_ENABLE) &
GEN11_GT_S_ENA_MASK;
+   if (s_en != 0x1)
+   drm_dbg(&gt->i915->drm, "Slice mask %#x is not the expected 
0x1!\n",
+   s_en);
 
g_dss_en = intel_uncore_read(uncore, GEN12_GT_GEOMETRY_DSS_ENABLE);
 
@@ -262,7 +253,7 @@ static void gen12_sseu_info_init(struct intel_gt *gt)
if (eu_en_fuse & BIT(eu))
eu_en |= BIT(eu * 2) | BIT(eu 

[PATCH 1/5] drm/i915/sseu: Don't try to store EU mask internally in UAPI format

2022-04-27 Thread Matt Roper
Storing the EU mask internally in the same format the I915_QUERY
topology queries use makes the final copy_to_user() a bit simpler, but
makes the rest of the driver's SSEU more complicated.  Given that modern
platforms (gen11 and beyond) are architecturally guaranteed to have
equivalent EU masks for every subslice, it also wastes quite a bit of
space since we're storing a duplicate copy of the EU mask for every
single subslice where we really only need to store one instance.

Let's add a has_common_ss_eumask flag to the SSEU structure to determine
which type of hardware we're working on.  For the older pre-gen11
platforms the various subslices can have different EU masks so we use an
array of u16[] to store each subslice's copy.  For gen11 and beyond
we'll only use index [0] of the array and not worry about copying the
repeated value, except when converting into uapi form for the I915_QUERY
ioctl.

Signed-off-by: Matt Roper 
---
 drivers/gpu/drm/i915/gt/intel_sseu.c | 66 +---
 drivers/gpu/drm/i915/gt/intel_sseu.h | 21 -
 drivers/gpu/drm/i915/i915_query.c|  8 ++--
 3 files changed, 73 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_sseu.c 
b/drivers/gpu/drm/i915/gt/intel_sseu.c
index 9881a6790574..13387b4024ea 100644
--- a/drivers/gpu/drm/i915/gt/intel_sseu.c
+++ b/drivers/gpu/drm/i915/gt/intel_sseu.c
@@ -91,36 +91,70 @@ static int sseu_eu_idx(const struct sseu_dev_info *sseu, 
int slice,
 static u16 sseu_get_eus(const struct sseu_dev_info *sseu, int slice,
int subslice)
 {
-   int i, offset = sseu_eu_idx(sseu, slice, subslice);
-   u16 eu_mask = 0;
-
-   for (i = 0; i < sseu->eu_stride; i++)
-   eu_mask |=
-   ((u16)sseu->eu_mask[offset + i]) << (i * BITS_PER_BYTE);
+   if (!intel_sseu_has_subslice(sseu, slice, subslice))
+   return 0;
 
-   return eu_mask;
+   if (sseu->has_common_ss_eumask)
+   return sseu->eu_mask[0];
+   else
+   return sseu->eu_mask[slice * sseu->max_subslices + subslice];
 }
 
 static void sseu_set_eus(struct sseu_dev_info *sseu, int slice, int subslice,
 u16 eu_mask)
 {
-   int i, offset = sseu_eu_idx(sseu, slice, subslice);
+   WARN_ON(sseu->has_common_ss_eumask);
+   WARN_ON(sseu->max_eus_per_subslice > sizeof(sseu->eu_mask[0]) * 
BITS_PER_BYTE);
 
-   for (i = 0; i < sseu->eu_stride; i++)
-   sseu->eu_mask[offset + i] =
-   (eu_mask >> (BITS_PER_BYTE * i)) & 0xff;
+   sseu->eu_mask[slice * sseu->max_subslices + subslice] =
+   eu_mask & GENMASK(sseu->max_eus_per_subslice - 1, 0);
 }
 
 static u16 compute_eu_total(const struct sseu_dev_info *sseu)
 {
u16 i, total = 0;
 
+   if (sseu->has_common_ss_eumask)
+   return intel_sseu_subslices_per_slice(sseu, 0) *
+   hweight16(sseu->eu_mask[0]);
+
for (i = 0; i < ARRAY_SIZE(sseu->eu_mask); i++)
-   total += hweight8(sseu->eu_mask[i]);
+   total += hweight16(sseu->eu_mask[i]);
 
return total;
 }
 
+/**
+ * intel_sseu_copy_eumask_to_user - Copy EU mask into a userspace buffer
+ * @to: Pointer to userspace buffer to copy to
+ * @sseu: SSEU structure containing EU mask to copy
+ *
+ * Copies the EU mask to a userspace buffer in the format expected by
+ * the query ioctl's topology queries.
+ *
+ * Returns the result of the copy_to_user() operation.
+ */
+int intel_sseu_copy_eumask_to_user(void __user *to,
+  const struct sseu_dev_info *sseu)
+{
+   u8 eu_mask[GEN_SS_MASK_SIZE * GEN_MAX_EU_STRIDE] = {};
+   int len = sseu->max_slices * sseu->max_subslices * sseu->eu_stride;
+   int s, ss, i;
+
+   for (s = 0; s < sseu->max_slices; s++) {
+   for (ss = 0; ss < sseu->max_subslices; ss++) {
+   int offset = sseu_eu_idx(sseu, s, ss);
+   u16 mask = sseu_get_eus(sseu, s, ss);
+
+   for (i = 0; i < sseu->eu_stride; i++)
+   eu_mask[offset + i] =
+   (mask >> (BITS_PER_BYTE * i)) & 0xff;
+   }
+   }
+
+   return copy_to_user(to, eu_mask, len);
+}
+
 static u32 get_ss_stride_mask(struct sseu_dev_info *sseu, u8 s, u32 ss_en)
 {
u32 ss_mask;
@@ -134,7 +168,7 @@ static u32 get_ss_stride_mask(struct sseu_dev_info *sseu, 
u8 s, u32 ss_en)
 static void gen11_compute_sseu_info(struct sseu_dev_info *sseu, u8 s_en,
u32 g_ss_en, u32 c_ss_en, u16 eu_en)
 {
-   int s, ss;
+   int s;
 
/* g_ss_en/c_ss_en represent entire subslice mask across all slices */
GEM_BUG_ON(sseu->max_slices * sseu->max_subslices >
@@ -162,11 +196,9 @@ static void gen11_compute_sseu_info(struct sseu_dev_info 
*sseu, u8 s_en,
intel_sseu_set_subslices(sseu, 

[PATCH 0/5] i915: SSEU handling updates

2022-04-27 Thread Matt Roper
This series makes a handful of updates to i915's internal handling of
slice/subslice/EU (SSEU) data to handle recent platforms like Xe_HP in a
more natural manner and to prepare for some additional upcoming
platforms we have in the pipeline (the first of which I'll probably
start sending patches for in the next week or two).  One key idea of
this series is that although we have a fixed ABI to convey SSEU data to
userspace (i.e., multiple u8[] arrays with data stored at different
strides), we don't need to use this cumbersome format for the driver's
own internal storage.  As long as we can convert into the uapi form
properly when responding to the I915_QUERY ioctl, it's preferable to use
an internal storage format that's easier for the driver to work with.
Doing so can also save us some storage space on modern platforms since
we don't always need to replicate a bunch of data that's architecturally
guaranteed to be identical.
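
To make the contrast concrete, here is a rough stand-alone sketch
(illustrative structure and function names only -- not the driver's actual
code) of the uapi byte-array layout next to a simpler internal
per-subslice layout:

#include <stdbool.h>
#include <stdint.h>

/*
 * uapi-style layout: the EU mask of (slice, subslice) is stored as
 * eu_stride bytes at a computed offset inside one flat u8[] blob.
 */
struct uapi_layout {
	const uint8_t *data;
	int max_subslices;
	int eu_stride;			/* bytes per EU mask */
};

static uint16_t uapi_get_eus(const struct uapi_layout *l, int s, int ss)
{
	int offset = (s * l->max_subslices + ss) * l->eu_stride;
	uint16_t mask = 0;

	for (int i = 0; i < l->eu_stride; i++)
		mask |= (uint16_t)l->data[offset + i] << (8 * i);

	return mask;
}

/*
 * internal-style layout: one u16 per subslice, plus a flag saying that
 * index [0] is valid for every subslice on newer hardware.
 */
struct internal_layout {
	uint16_t eu_mask[64];
	bool has_common_eu_mask;
};

static uint16_t internal_get_eus(const struct internal_layout *l,
				 int s, int max_subslices, int ss)
{
	return l->has_common_eu_mask ? l->eu_mask[0]
				     : l->eu_mask[s * max_subslices + ss];
}

The strided form then only needs to be reconstructed once, when answering
the query ioctl.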

Another key point here is that Xe_HP platforms today have subslice (DSS)
masks that are 32 bits, which maxes out the storage of a u32.  On future
platforms the architecture design is going to start spreading their DSS
masks over multiple 32-bit fuse registers.  So even for platforms where
the total number of DSS doesn't actually go up, we're going to need
larger storage than just a u32 to express the mask properly.  To
accommodate this, we start storing our subslice mask in a new typedef
that can be processed by the linux/bitmap.h operations.
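
As a rough illustration of what that enables (kernel-context sketch; the
type and macro names are invented for the example and are not necessarily
what the series uses):

#include <linux/bitmap.h>

#define SKETCH_MAX_DSS	128	/* arbitrary cap, wider than a single u32 */

/* A DSS mask that can span several 32-bit fuse registers. */
typedef struct {
	DECLARE_BITMAP(b, SKETCH_MAX_DSS);
} sketch_dss_mask_t;

/* Number of enabled DSS, independent of how many fuse words back it. */
static inline unsigned int sketch_dss_count(const sketch_dss_mask_t *m)
{
	return bitmap_weight(m->b, SKETCH_MAX_DSS);
}

/* Is a particular DSS present? */
static inline bool sketch_dss_present(const sketch_dss_mask_t *m,
				      unsigned int dss)
{
	return test_bit(dss, m->b);
}

The generic helpers (bitmap_weight(), bitmap_and(), for_each_set_bit(),
...) then keep working unchanged as the mask grows past 32 bits.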

Finally, since no userspace for Xe_HP or beyond is using the legacy
I915_GETPARAM ioctl lookups for I915_PARAM_SLICE_MASK and
I915_PARAM_SUBSLICE_MASK (since they've migrated to the more flexible
I915_QUERY ioctl that can return more than a simple u32 value), we take
the opportunity to officially drop support for those GETPARAM lookups on
modern platforms.  Maintaining support for these GETPARAM lookups doesn't
make sense for a number of reasons (a userspace sketch of the replacement
query follows the list below):

 * Traditional slices no longer exist, and newer ideas like gslices,
   cslices, mslices, etc. aren't something userspace needs to query
   since they can be inferred from other information.
 * The GETPARAM ioctl doesn't have a way to distinguish between geometry
   subslice masks and compute subslice masks, which are distinct on
   Xe_HP and beyond.
 * The I915_GETPARAM ioctl is limited to returning a 32-bit value, so
   when subslice masks begin to exceed 32 bits, it simply can't return
   the entire mask.
 * The GETPARAM ioctl doesn't have a way to give sensible information
   for multi-tile devices.
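
For reference, the replacement path from userspace is a two-pass
I915_QUERY topology lookup.  A minimal sketch (error handling trimmed;
field and macro names as found in the i915 uapi header, so double-check
against include/uapi/drm/i915_drm.h):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static void print_topology(int drm_fd)
{
	struct drm_i915_query_item item = {
		.query_id = DRM_I915_QUERY_TOPOLOGY_INFO,
	};
	struct drm_i915_query query = {
		.num_items = 1,
		.items_ptr = (uintptr_t)&item,
	};
	struct drm_i915_query_topology_info *topo;

	/* First pass: the kernel reports the required buffer length. */
	if (ioctl(drm_fd, DRM_IOCTL_I915_QUERY, &query) || item.length <= 0)
		return;

	topo = calloc(1, item.length);
	if (!topo)
		return;
	item.data_ptr = (uintptr_t)topo;

	/* Second pass: the kernel fills in the topology blob. */
	if (ioctl(drm_fd, DRM_IOCTL_I915_QUERY, &query) == 0)
		printf("max_slices=%u max_subslices=%u max_eus_per_subslice=%u\n",
		       topo->max_slices, topo->max_subslices,
		       topo->max_eus_per_subslice);

	free(topo);
}

Unlike the GETPARAM lookups, this interface returns the full strided masks
and can grow (e.g. distinct geometry vs compute queries) without being
squeezed into a single 32-bit value.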


Cc: Tvrtko Ursulin 

Matt Roper (5):
  drm/i915/sseu: Don't try to store EU mask internally in UAPI format
  drm/i915/xehp: Drop GETPARAM lookups of I915_PARAM_[SUB]SLICE_MASK
  drm/i915/xehp: Use separate sseu init function
  drm/i915/sseu: Simplify gen11+ SSEU handling
  drm/i915/sseu: Disassociate internal subslice mask representation from
uapi

 drivers/gpu/drm/i915/gem/i915_gem_context.c  |   4 +-
 drivers/gpu/drm/i915/gt/intel_engine_cs.c|   2 +-
 drivers/gpu/drm/i915/gt/intel_gt.c   |  14 +-
 drivers/gpu/drm/i915/gt/intel_sseu.c | 371 +++
 drivers/gpu/drm/i915/gt/intel_sseu.h |  69 ++--
 drivers/gpu/drm/i915/gt/intel_sseu_debugfs.c |  28 +-
 drivers/gpu/drm/i915/gt/intel_workarounds.c  |  28 +-
 drivers/gpu/drm/i915/i915_getparam.c |  10 +-
 drivers/gpu/drm/i915/i915_query.c|  16 +-
 9 files changed, 323 insertions(+), 219 deletions(-)

-- 
2.35.1



[PATCH 2/5] drm/i915/xehp: Drop GETPARAM lookups of I915_PARAM_[SUB]SLICE_MASK

2022-04-27 Thread Matt Roper
Slice/subslice/EU information should be obtained via the topology
queries provided by the I915_QUERY interface; let's turn off support for
the old GETPARAM lookups on Xe_HP and beyond where we can't return
meaningful values.

The slice mask lookup is meaningless here since Xe_HP doesn't support
traditional slices (and we make no attempt to return the various new
units like gslices, cslices, mslices, etc.).

The subslice mask lookup is even more problematic; given the distinct
masks for geometry vs compute purposes, the combined mask returned here
is likely not what userspace would want to act upon anyway.  The value
is also limited to 32 bits by the nature of the GETPARAM ioctl, which is
sufficient for the initial Xe_HP platforms but unable to convey the
larger masks that will be needed on other upcoming platforms.  Finally,
the value returned here becomes even less meaningful on multi-tile
platforms, where each tile will have its own masks.

Signed-off-by: Matt Roper 
---
 drivers/gpu/drm/i915/i915_getparam.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_getparam.c 
b/drivers/gpu/drm/i915/i915_getparam.c
index c12a0adefda5..ac9767c56619 100644
--- a/drivers/gpu/drm/i915/i915_getparam.c
+++ b/drivers/gpu/drm/i915/i915_getparam.c
@@ -148,11 +148,19 @@ int i915_getparam_ioctl(struct drm_device *dev, void 
*data,
value = intel_engines_has_context_isolation(i915);
break;
case I915_PARAM_SLICE_MASK:
+   /* Not supported from Xe_HP onward; use topology queries */
+   if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50))
+   return -EINVAL;
+
value = sseu->slice_mask;
if (!value)
return -ENODEV;
break;
case I915_PARAM_SUBSLICE_MASK:
+   /* Not supported from Xe_HP onward; use topology queries */
+   if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50))
+   return -EINVAL;
+
/* Only copy bits from the first slice */
	memcpy(&value, sseu->subslice_mask,
   min(sseu->ss_stride, (u8)sizeof(value)));
-- 
2.35.1



[PATCH][next] drm/i915: Fix -Wstringop-overflow warning in call to intel_read_wm_latency()

2022-04-27 Thread Gustavo A. R. Silva
Fix the following -Wstringop-overflow warnings when building with GCC-11:

drivers/gpu/drm/i915/intel_pm.c:3106:9: warning: ‘intel_read_wm_latency’ 
accessing 16 bytes in a region of size 10 [-Wstringop-overflow=]
 3106 | intel_read_wm_latency(dev_priv, dev_priv->wm.pri_latency);
  | ^
drivers/gpu/drm/i915/intel_pm.c:3106:9: note: referencing argument 2 of type 
‘u16 *’ {aka ‘short unsigned int *’}
drivers/gpu/drm/i915/intel_pm.c:2861:13: note: in a call to function 
‘intel_read_wm_latency’
 2861 | static void intel_read_wm_latency(struct drm_i915_private *dev_priv,
  | ^

by removing the over-specified array size from the argument declarations.

It seems that this code is actually safe because the size of the
array depends on the hardware generation, and the function checks
for that.

This helps with the ongoing efforts to globally enable
-Wstringop-overflow.
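
For readers unfamiliar with this class of warning, here is a stand-alone
illustration of the principle (not the i915 code itself; the exact
diagnostics depend on the GCC version and options):

#include <stdint.h>

/*
 * Declaring the parameter as wm[8] tells GCC 11 that the callee may touch
 * 8 elements (16 bytes), so passing a 5-element (10-byte) array draws
 * -Wstringop-overflow= at the call site, much like the report above.
 */
static void read_latency_fixed(uint16_t wm[8], int hw_gen)
{
	int levels = hw_gen >= 9 ? 8 : 5;	/* bounded at runtime */

	for (int i = 0; i < levels; i++)
		wm[i] = 0;
}

/* Dropping the bound removes the 16-byte access assumption. */
static void read_latency_flex(uint16_t wm[], int hw_gen)
{
	int levels = hw_gen >= 9 ? 8 : 5;

	for (int i = 0; i < levels; i++)
		wm[i] = 0;
}

int main(void)
{
	uint16_t pri_latency[5] = { 0 };	/* a "region of size 10" */

	read_latency_fixed(pri_latency, 5);	/* expected to warn on GCC 11 */
	read_latency_flex(pri_latency, 5);	/* expected to be silent */
	return 0;
}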

Link: https://github.com/KSPP/linux/issues/181
Signed-off-by: Gustavo A. R. Silva 
---
 drivers/gpu/drm/i915/intel_pm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index ee0047fdc95d..5735915facc5 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -2862,7 +2862,7 @@ static void ilk_compute_wm_level(const struct 
drm_i915_private *dev_priv,
 }
 
 static void intel_read_wm_latency(struct drm_i915_private *dev_priv,
- u16 wm[8])
+ u16 wm[])
 {
	struct intel_uncore *uncore = &dev_priv->uncore;
 
-- 
2.27.0



[PATCH v2] drm/display: Select DP helper for DRM_DP_AUX_CHARDEV and DRM_DP_CEC

2022-04-27 Thread Javier Martinez Canillas
The DRM_DP_AUX_CHARDEV and DRM_DP_CEC Kconfig symbols enable code that uses
DP helper functions, which are only present if CONFIG_DRM_DISPLAY_DP_HELPER
is also enabled.

But these don't select the DRM_DISPLAY_DP_HELPER symbol, meaning that it
is possible to enable any of them without CONFIG_DRM_DISPLAY_DP_HELPER.

That will lead to the following linking errors with the mentioned config:

  LD  vmlinux.o
  MODPOST vmlinux.symvers
  MODINFO modules.builtin.modinfo
  GEN modules.builtin
  LD  .tmp_vmlinux.kallsyms1
  KSYMS   .tmp_vmlinux.kallsyms1.S
  AS  .tmp_vmlinux.kallsyms1.S
  LD  .tmp_vmlinux.kallsyms2
  KSYMS   .tmp_vmlinux.kallsyms2.S
  AS  .tmp_vmlinux.kallsyms2.S
  LD  vmlinux
  SYSMAP  System.map
  SORTTAB vmlinux
  OBJCOPY arch/arm64/boot/Image
  MODPOST modules-only.symvers
ERROR: modpost: "drm_dp_dpcd_write" 
[drivers/gpu/drm/display/drm_display_helper.ko] undefined!
ERROR: modpost: "drm_dp_read_desc" 
[drivers/gpu/drm/display/drm_display_helper.ko] undefined!
ERROR: modpost: "drm_dp_dpcd_read" 
[drivers/gpu/drm/display/drm_display_helper.ko] undefined!
make[1]: *** [scripts/Makefile.modpost:134: modules-only.symvers] Error 1
make[1]: *** Deleting file 'modules-only.symvers'
make: *** [Makefile:1749: modules] Error 2

Note: It seems this has been an issue for a long time but was made easier
to reproduce after the commit 1e0f66420b13 ("drm/display: Introduce a DRM
display-helper module"). Adding a Fixes: tag just to make sure that this
fix will be picked for stable once the mentioned change also lands there.

Fixes: 1e0f66420b13 ("drm/display: Introduce a DRM display-helper module")
Signed-off-by: Javier Martinez Canillas 
---

Changes in v2:
- Explain better the issue in the change description.
- Only select DRM_DISPLAY_DP_HELPER and not DRM_DISPLAY_HELPER.

 drivers/gpu/drm/display/Kconfig | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/display/Kconfig b/drivers/gpu/drm/display/Kconfig
index f84f1b0cd23f..9fe80c4e555c 100644
--- a/drivers/gpu/drm/display/Kconfig
+++ b/drivers/gpu/drm/display/Kconfig
@@ -32,6 +32,7 @@ config DRM_DISPLAY_HDMI_HELPER
 config DRM_DP_AUX_CHARDEV
bool "DRM DP AUX Interface"
depends on DRM
+   select DRM_DISPLAY_DP_HELPER
help
  Choose this option to enable a /dev/drm_dp_auxN node that allows to
  read and write values to arbitrary DPCD registers on the DP aux
@@ -40,6 +41,7 @@ config DRM_DP_AUX_CHARDEV
 config DRM_DP_CEC
bool "Enable DisplayPort CEC-Tunneling-over-AUX HDMI support"
depends on DRM
+   select DRM_DISPLAY_DP_HELPER
select CEC_CORE
help
  Choose this option if you want to enable HDMI CEC support for
-- 
2.35.1



Re: [Intel-gfx] [PATCH 0/2] Initial GuC firmware release for DG2

2022-04-27 Thread John Harrison

On 4/27/2022 11:24, Timo Aaltonen wrote:

john.c.harri...@intel.com wrote on 27.4.2022 at 19.55:

From: John Harrison 

Add GuC firmware for DG2.

Note that an older version of this patch exists in the CI topic
branch. Hence this set includes a revert of that patch before applying
the new version. When merging, the revert would simply be dropped and
the corresponding patch in the topic branch would also be dropped.

Signed-off-by: John Harrison 


John Harrison (2):
   Revert "drm/i915/dg2: Define GuC firmware version for DG2"
   drm/i915/dg2: Define GuC firmware version for DG2

  drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)



The firmware is not public yet, though? Shouldn't it have been sent 
upstream already? Same complaint applies to DMC.



Not sure about the DMC team, but for i915 we upload the firmware to an 
FDO repo so that the CI system can find it and run the pre-merge testing 
with it. However, we don't send the final pull request for the real 
linux firmware repo until we have merged the i915 patch to 
drm-gt-intel-next and it is definitely going upstream. Otherwise, we 
might end up pushing firmwares to the linux repo that never get used.


John.



Re: [PATCH v2 0/4] Add Toshiba Visconti AFFINE image processing accelerator driver

2022-04-27 Thread Laurent Pinchart
Hi Yuji,

Thank you for the patch. It's nice to see contributions from Toshiba in
the image acceleration domain :-)

I'll start with a high-level question before diving into detailed
review. Why is this implemented in drivers/soc/ with a custom userspace
API, and not as a V4L2 memory-to-memory driver ?

On Wed, Apr 27, 2022 at 10:23:41PM +0900, Yuji Ishikawa wrote:
> This series is the AFFINE image processing accelerator driver for Toshiba's 
> ARM SoC, Visconti[0].
> This provides DT binding documentation, device driver, MAINTAINER files.
> 
> The second patch "soc: visconti: Add Toshiba Visconti image processing 
> accelerator common source"
> is commonly used among accelerator drivers (affine, dnn, dspif, pyramid).
> 
> Best regards,
> Yuji
> 
> [0]: 
> https://toshiba.semicon-storage.com/ap-en/semiconductor/product/image-recognition-processors-visconti.html
>   
>   dt-bindings: soc: visconti: Add Toshiba Visconti AFFINE image
> v1 -> v2:
>   - No update
> 
>   soc: visconti: Add Toshiba Visconti image processing accelerator common 
> source
> v1 -> v2:
>   - apply checkpatch.pl --strict
>   
>   soc: visconti: Add Toshiba Visconti AFFINE image processing accelerator
> v1 -> v2:
>   - apply checkpatch.pl --strict
>   - rename hwd_AFFINE_ to hwd_affine_
> 
>   MAINTAINERS: Add entries for Toshiba Visconti AFFINE image processing 
> accelerator
> v1 -> v2:
>   - No update
> 
> Change in V2:
>   - apply checkpatch.pl --strict
>   - rename hwd_AFFINE_ to hwd_affine_
> 
> Yuji Ishikawa (4):
>   dt-bindings: soc: visconti: Add Toshiba Visconti AFFINE image
> processing accelerator bindings
>   soc: visconti: Add Toshiba Visconti image processing accelerator
> common source
>   soc: visconti: Add Toshiba Visconti AFFINE image processing
> accelerator
>   MAINTAINERS: Add entries for Toshiba Visconti AFFINE image processing
> accelerator
> 
>  .../soc/visconti/toshiba,visconti-affine.yaml |  53 ++
>  MAINTAINERS   |   2 +
>  drivers/soc/Kconfig   |   1 +
>  drivers/soc/Makefile  |   1 +
>  drivers/soc/visconti/Kconfig  |   7 +
>  drivers/soc/visconti/Makefile |   8 +
>  drivers/soc/visconti/affine/Makefile  |   6 +
>  drivers/soc/visconti/affine/affine.c  | 451 ++
>  drivers/soc/visconti/affine/hwd_affine.c  | 206 
>  drivers/soc/visconti/affine/hwd_affine.h  |  83 
>  drivers/soc/visconti/affine/hwd_affine_reg.h  |  45 ++
>  drivers/soc/visconti/ipa_common.c |  55 +++
>  drivers/soc/visconti/ipa_common.h |  18 +
>  drivers/soc/visconti/uapi/affine.h|  87 
>  drivers/soc/visconti/uapi/ipa.h   |  88 
>  15 files changed,  insertions(+)
>  create mode 100644 
> Documentation/devicetree/bindings/soc/visconti/toshiba,visconti-affine.yaml
>  create mode 100644 drivers/soc/visconti/Kconfig
>  create mode 100644 drivers/soc/visconti/Makefile
>  create mode 100644 drivers/soc/visconti/affine/Makefile
>  create mode 100644 drivers/soc/visconti/affine/affine.c
>  create mode 100644 drivers/soc/visconti/affine/hwd_affine.c
>  create mode 100644 drivers/soc/visconti/affine/hwd_affine.h
>  create mode 100644 drivers/soc/visconti/affine/hwd_affine_reg.h
>  create mode 100644 drivers/soc/visconti/ipa_common.c
>  create mode 100644 drivers/soc/visconti/ipa_common.h
>  create mode 100644 drivers/soc/visconti/uapi/affine.h
>  create mode 100644 drivers/soc/visconti/uapi/ipa.h

-- 
Regards,

Laurent Pinchart


Re: [PATCH] drm/display: Select DP helpers for DRM_DP_AUX_CHARDEV and DRM_DP_CEC

2022-04-27 Thread Javier Martinez Canillas
On 4/27/22 22:25, Javier Martinez Canillas wrote:
> The DRM_DP_AUX_CHARDEV and DRM_DP_CEC boolean Kconfig symbols enable code
> that uses DP helper functions, exported by the display-helper module.
> 

[snip]

> @@ -32,6 +32,8 @@ config DRM_DISPLAY_HDMI_HELPER
>  config DRM_DP_AUX_CHARDEV
>   bool "DRM DP AUX Interface"
>   depends on DRM
> + select DRM_DISPLAY_DP_HELPER
> + select DRM_DISPLAY_HELPER

Actually, this is wrong since it will prevent DRM_DISPLAY_HELPER from being
set as a module (it's tristate while this symbol is bool). I now have a better
understanding of the problem and a patch that I believe is the correct one.

I'll post a v2 soon.

-- 
Best regards,

Javier Martinez Canillas
Linux Engineering
Red Hat



Re: How should "max bpc" KMS property work?

2022-04-27 Thread Sebastian Wick
On Wed, Apr 27, 2022 at 5:41 PM Harry Wentland  wrote:
>
>
>
> On 2022-04-27 06:52, Pekka Paalanen wrote:
> > Hi Ville and Alex,
> >
> > thanks for the replies. More below.
> >
> > TL;DR:
> >
> > My take-away from this is that I should slam 'max bpc' to the max by
> > default, and offer a knob for the user in case they want to lower it.
> >
> >
> > On Tue, 26 Apr 2022 20:55:14 +0300
> > Ville Syrjälä  wrote:
> >
> >> On Tue, Apr 26, 2022 at 11:35:02AM +0300, Pekka Paalanen wrote:
> >>> Hi all,
> >>>
> >>> I'm working on setting HDR & WCG video modes in Weston, and I thought
> >>> setting "max bpc" KMS property on the connector would be a good idea.
> >>> I'm confused about how it works though.
> >>>
> >>> I did some digging in 
> >>> https://gitlab.freedesktop.org/wayland/weston/-/issues/612
> >>>
> >>> Summary:
> >>>
> >>> - Apparently the property was originally added as a manual workaround
> >>>   for sink hardware behaving badly with high depth. A simple end user
> >>>   setting for "max bpc" would suffice for this use.
> >>>
> >>> - Drivers will sometimes automatically choose a lower bpc than the "max
> >>>   bpc" value, but never bigger.
> >>>
> >>> - amdgpu seems to (did?) default "max bpc" to 8, meaning that I
> >>>   definitely want to raise it.
> >>
>
> I've wanted to remove the 8 bpc limitations for a while now but it
> looks like we never did for anything other than eDP.
>
> The original problem we solved was that some monitors' default timing
> couldn't be driven at a high bpc. Therefore users were faced with black
> displays. On some displays you also can't drive high refresh rate modes
> with a higher bpc.
>
> >> I've occasionally pondered about doing the same for i915, just to have
> >> the safest default possible. But I'd hate to lose the deep color testing
> >> coverage knowing very few people would in practice raise the limit.
> >> Also the number of systems where deep color doesn't work reliably
> >> (or can't be made to work by not using a crap cable) seems to be quite
> >> low.
> >
> > I think when HDR and WCG get into display servers, setting 'max bpc'
> > will become a standard action.
> >
> > It's a bit moot to e.g. render everything in electrical 10 bit RGB, if
> > the link is just going to squash that into electrical 8 bit RGB, right?
> >
> > So even 10 bit color would require setting 'max bpc' to at least 10 to
> > be able to actually see it, source-side dithering aside.
> >
> >>>
> >>> If I always slam "max bpc" to the highest supported value for that
> >>> property, do I lose more than workarounds for bad sink hardware?
> >>
> >> We don't have any workarounds implemented like this in the kernel.
> >> Or should not have at least. "max bpc" exists purely for the user
> >> to have a say in the matter in addition to whatever the EDID/quirks
> >> say. Ie. if the kernel knows for sure that deep color won't work on
> >> a particular setup then it should just not allow deep color at all
> >> despite what the prop value says.
> >>
> >> So the only danger is fighting with the user's wishes which I guess
> >> you can overcome with some kind of user visible knob.
> >
> > Right, good.
> >
> > Furthermore, as a KMS client cannot make many assumptions about the KMS
> > state it inherits from some other KMS client, it should know and
> > program all possible KMS properties according to its own desires
> > anyway. That, and the DRM master concept make sure that there cannot be
> > any "third party" KMS configuration programs, like V4L2 has.
> >
> >>> Do I lose the ability to set video modes that take too much bandwidth
> >>> at uncapped driver-selected bpc while capping the bpc lower would allow
> >>> me to use those video modes?
> >>>
> >>> Or, are drivers required to choose a lower-than-usual but highest
> >>> usable bpc to make the requested video mode squeeze through the
> >>> connector and link?
> >>
> >> IMO drivers should implement the "reduce bpc until it fits"
> >> fallback. We have that in i915, except for MST where we'd need
> >> to potentially involve multiple streams in the fallback. That
> >> is something we intend to remedy eventually but it's not an
> >> entirely trivial thing to implement so will take some actual
> >> work. ATM we just cap MST to <=8bpc to avoid users getting into
> >> this situation so often.
> >
> > Excellent, but judging from what Alex said, this is also not what
> > amdgpu does. We have two drivers doing different things then?
> > > So with Weston I probably have to document, that if you can't get the
> > video mode you want working, try turning the 'max bpc' knob down and
> > try again.
> >
> > Or, I could cap 'max bpc' based on my framebuffer depth. If I have an
> > electrical 8 bit FB (default in Weston), then there is not much use for
> > having 'max bpc' > 8. This ignores the KMS color pipeline a bit. Does
> > that make sense?
> >
>
> I think both of those options make sense. I'll need to think about the
> automatic fallback if we don't have enough 

Re: [PATCH 2/2] drm/probe-helper: For DP, add 640x480 if all other modes are bad

2022-04-27 Thread Kuogee Hsieh

Tested-by: Kuogee Hsieh 

On 4/26/2022 2:17 PM, Doug Anderson wrote:

Hi,

On Tue, Apr 26, 2022 at 2:11 PM Abhinav Kumar  wrote:



On 4/26/2022 1:26 PM, Doug Anderson wrote:

Hi,

On Tue, Apr 26, 2022 at 12:20 PM Abhinav Kumar
 wrote:

Missed one more comment.

On 4/26/2022 12:16 PM, Abhinav Kumar wrote:

Hi Doug

One minor comment below.

But otherwise, looking at this change this should work for us acc to me.

We will test this out with our equipment and then provide R-b.

Thanks

Abhinav
On 4/26/2022 11:46 AM, Douglas Anderson wrote:

Per the DisplayPort spec, section 5.2.1.2 ("Video Timing Format"), all
detachable sinks shall support 640x480 @60Hz as a fail safe
mode.

A DP compliance test expected us to utilize the above fact when all
modes it presented to the DP source were not achievable. It presented
only modes that would be achievable with more lanes and/or higher
speeds than we had available and expected that when we couldn't do
that then we'd fall back to 640x480 even though it didn't advertise
this size.

In order to pass the compliance test (and also support any users who
might fall into a similar situation with their display), we need to
add 640x480 into the list of modes. However, we don't want to add
640x480 all the time. Despite the fact that the DP spec says all sinks
_shall support_ 640x480, they're not guaranteed to support it
_well_. Continuing to read the spec you can see that the display is
not required to really treat 640x480 equal to all the other modes. It
doesn't need to scale or anything--just display the pixels somehow for
failsafe purposes. It should also be noted that it's not hard to find
a display hooked up via DisplayPort that _doesn't_ support 640x480 at
all. The HP ZR30w screen I'm sitting in front of has a native DP port
and doesn't work at 640x480. I also plugged in a tiny 800x480 HDMI
display via a DP to HDMI adapter and that screen definitely doesn't
support 640x480.

As a compromise solution, let's only add the 640x480 mode if:
* We're on DP.
* All other modes have been pruned.

This acknowledges that 640x480 might not be the best mode to use but,
since sinks are _supposed_ to support it, we will at least fall back
to it if there's nothing else.

Note that we _don't_ add higher resolution modes like 1024x768 in this
case. We only add those modes for a failed EDID read where we have no
idea what's going on. In the case where we've pruned all modes then
instead we only want 640x480 which is the only defined "Fail Safe"
resolution.

This patch originated in response to Kuogee Hsieh's patch [1].

[1]
https://lore.kernel.org/r/1650671124-14030-1-git-send-email-quic_khs...@quicinc.com


Signed-off-by: Douglas Anderson 
---

drivers/gpu/drm/drm_probe_helper.c | 26 +-
1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/drm_probe_helper.c
b/drivers/gpu/drm/drm_probe_helper.c
index 819225629010..90cd46cbfec1 100644
--- a/drivers/gpu/drm/drm_probe_helper.c
+++ b/drivers/gpu/drm/drm_probe_helper.c
@@ -476,7 +476,6 @@ int drm_helper_probe_single_connector_modes(struct
drm_connector *connector,
const struct drm_connector_helper_funcs *connector_funcs =
connector->helper_private;
int count = 0, ret;
-bool verbose_prune = true;
enum drm_connector_status old_status;
struct drm_modeset_acquire_ctx ctx;
@@ -556,8 +555,8 @@ int drm_helper_probe_single_connector_modes(struct
drm_connector *connector,
DRM_DEBUG_KMS("[CONNECTOR:%d:%s] disconnected\n",
connector->base.id, connector->name);
drm_connector_update_edid_property(connector, NULL);
-verbose_prune = false;
-goto prune;
+drm_mode_prune_invalid(dev, &connector->modes, false);
+goto exit;
}
count = (*connector_funcs->get_modes)(connector);
@@ -580,9 +579,26 @@ int
drm_helper_probe_single_connector_modes(struct drm_connector *connector,
}
}
-prune:
-drm_mode_prune_invalid(dev, &connector->modes, verbose_prune);
+drm_mode_prune_invalid(dev, &connector->modes, true);
+/*
+ * Displayport spec section 5.2.1.2 ("Video Timing Format") says
that
+ * all detachable sinks shall support 640x480 @60Hz as a fail safe
+ * mode. If all modes were pruned, perhaps because they need more
+ * lanes or a higher pixel clock than available, at least try to add
+ * in 640x480.
+ */
+if (list_empty(&connector->modes) &&
+connector->connector_type == DRM_MODE_CONNECTOR_DisplayPort) {
+count = drm_add_modes_noedid(connector, 640, 480);
+if (_drm_helper_update_and_validate(connector, maxX, maxY, &ctx)) {
+drm_modeset_backoff(&ctx);
+goto retry;

Do we need another retry here? This will again repeat everything from
get_modes().
The fact that we are hitting this code is because we have already tried
that and this is already a second pass. So I think another retry isn't
needed?

This will help cover the case 

Re: [RFC PATCH] drm/edid: drm_add_modes_noedid() should set lowest resolution as preferred

2022-04-27 Thread Kuogee Hsieh

Tested-by: Kuogee Hsieh 


On 4/26/2022 2:21 PM, Abhinav Kumar wrote:



On 4/26/2022 1:52 PM, Doug Anderson wrote:

Hi,

On Tue, Apr 26, 2022 at 1:46 PM Abhinav Kumar 
 wrote:


On 4/26/2022 1:21 PM, Douglas Anderson wrote:

If we're unable to read the EDID for a display because it's corrupt /
bogus / invalid then we'll add a set of standard modes for the
display. When userspace looks at these modes it doesn't really have a
good concept for which mode to pick and it'll likely pick the highest
resolution one by default. That's probably not ideal because the modes
were purely guesses on the part of the Linux kernel.

Let's instead set 640x480 as the "preferred" mode when we have no 
EDID.


Signed-off-by: Douglas Anderson 


drm_dmt_modes array is sorted but you are also relying on this check to
eliminate the non-60fps modes

5611    if (drm_mode_vrefresh(ptr) > 61)
5612    continue;

I am not sure why we filter out the modes > 61 vrefresh.

If that check will remain this is okay.

If it's not, it's not reliable that the first mode will be 640x480@60


I suspect that the check will remain. I guess I could try to do
something fancier if people want, but I'd be interested in _what_
fancier thing I should do if so. Do we want the rule to remain that we
always prefer 640x480, or do we want to prefer the lowest resolution?
...do we want to prefer 60 Hz or the lowest refresh rate? Do we do
this only for DP (which explicitly calls out 640x480 @60Hz as the best
failsafe) or for everything?

For now, the way it's coded up seems reasonable (to me). It's the
lowest resolution _and_ it's 640x480 just because of the current
values of the table. I suspect that extra lower resolution failsafe
modes won't be added, but we can always change the rules here if/when
they are.

-Doug


Alright, agreed. The way the API is today, I don't see anything getting 
broken with this.


So typically, as per spec, when a preferred mode is not set by the 
sink, the first entry becomes the preferred mode.


This also aligns with that expectation.

So FWIW,

Reviewed-by: Abhinav Kumar 

We will test this one also out with our equipment, then give tested-by 
tags.


Thanks

Abhinav
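
To make the rule being discussed concrete, a kernel-style sketch
(illustrative only -- this is not the actual RFC patch, which adjusts
drm_add_modes_noedid() itself) of marking the no-EDID 640x480@60 entry as
preferred might look like:

/* Sketch: prefer 640x480@60 among the fallback modes added without an EDID. */
static void sketch_prefer_640x480(struct drm_connector *connector)
{
	struct drm_display_mode *mode;

	list_for_each_entry(mode, &connector->probed_modes, head) {
		if (mode->hdisplay == 640 && mode->vdisplay == 480 &&
		    drm_mode_vrefresh(mode) == 60) {
			mode->type |= DRM_MODE_TYPE_PREFERRED;
			break;
		}
	}
}

Userspace that follows the usual convention of picking the first/preferred
entry then lands on the failsafe resolution rather than the largest
guessed one.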



Re: [PATCH 1/4] drm/i915/huc: check HW directly for HuC auth status

2022-04-27 Thread Ceraolo Spurio, Daniele




On 4/26/2022 5:26 PM, Daniele Ceraolo Spurio wrote:

The huc_is_authenticated() return value is based on our SW tracking of
the HuC auth status. However, around suspend/resume and reset this can
go out of sync with the actual HW state, which is why we use
intel_huc_check_status() to look at the actual HW state. Instead of
keeping this duality, just make huc_is_authenticated() return the HW
state and use it everywhere we need to know if HuC is running.

Signed-off-by: Daniele Ceraolo Spurio 
---
  drivers/gpu/drm/i915/gt/uc/intel_huc.c | 23 ++-
  drivers/gpu/drm/i915/gt/uc/intel_huc.h |  5 -
  2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_huc.c 
b/drivers/gpu/drm/i915/gt/uc/intel_huc.c
index 556829de9c172..773020e69589a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_huc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_huc.c
@@ -80,6 +80,18 @@ void intel_huc_fini(struct intel_huc *huc)
	intel_uc_fw_fini(&huc->fw);
  }
  
+static bool huc_is_authenticated(struct intel_huc *huc)

+{
+   struct intel_gt *gt = huc_to_gt(huc);
+   intel_wakeref_t wakeref;
+   u32 status = 0;
+
+   with_intel_runtime_pm(gt->uncore->rpm, wakeref)
+   status = intel_uncore_read(gt->uncore, huc->status.reg);
+
+   return (status & huc->status.mask) == huc->status.value;
+}
+
  /**
   * intel_huc_auth() - Authenticate HuC uCode
   * @huc: intel_huc structure
@@ -96,7 +108,7 @@ int intel_huc_auth(struct intel_huc *huc)
	struct intel_guc *guc = &gt->uc.guc;
int ret;
  
-	GEM_BUG_ON(intel_huc_is_authenticated(huc));

+   GEM_BUG_ON(huc_is_authenticated(huc));


It looks like even on gen9 HuC is surviving the reset, so this BUG_ON is 
now being triggered. I'm going to just change this to a BUG_ON on 
intel_uc_fw_is_running() for now, which would be equivalent to what we 
have right now, and in the meantime I'll follow up with the HuC team to 
see if we can just skip this (and the huc_fw_upload) if HuC shows up as 
already authenticated.


Daniele

  
  	if (!intel_uc_fw_is_loaded(&huc->fw))

return -ENOEXEC;
@@ -150,10 +162,6 @@ int intel_huc_auth(struct intel_huc *huc)
   */
  int intel_huc_check_status(struct intel_huc *huc)
  {
-   struct intel_gt *gt = huc_to_gt(huc);
-   intel_wakeref_t wakeref;
-   u32 status = 0;
-
	switch (__intel_uc_fw_status(&huc->fw)) {
case INTEL_UC_FIRMWARE_NOT_SUPPORTED:
return -ENODEV;
@@ -167,10 +175,7 @@ int intel_huc_check_status(struct intel_huc *huc)
break;
}
  
-	with_intel_runtime_pm(gt->uncore->rpm, wakeref)

-   status = intel_uncore_read(gt->uncore, huc->status.reg);
-
-   return (status & huc->status.mask) == huc->status.value;
+   return huc_is_authenticated(huc);
  }
  
  /**

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_huc.h 
b/drivers/gpu/drm/i915/gt/uc/intel_huc.h
index 73ec670800f2b..77d813840d76c 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_huc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_huc.h
@@ -50,11 +50,6 @@ static inline bool intel_huc_is_used(struct intel_huc *huc)
	return intel_uc_fw_is_available(&huc->fw);
  }
  
-static inline bool intel_huc_is_authenticated(struct intel_huc *huc)

-{
-   return intel_uc_fw_is_running(&huc->fw);
-}
-
  void intel_huc_load_status(struct intel_huc *huc, struct drm_printer *p);
  
  #endif




[PATCH] drm/display: Select DP helpers for DRM_DP_AUX_CHARDEV and DRM_DP_CEC

2022-04-27 Thread Javier Martinez Canillas
The DRM_DP_AUX_CHARDEV and DRM_DP_CEC boolean Kconfig symbols enable code
that uses DP helper functions, exported by the display-helper module.

But these don't select the DRM_DISPLAY_DP_HELPER and DRM_DISPLAY_HELPER
symbols, which would ensure that the functions used are actually present.
This leads to the following linking errors if CONFIG_DRM_DISPLAY_HELPER is
set to m:

  LD  vmlinux.o
  MODPOST vmlinux.symvers
  MODINFO modules.builtin.modinfo
  GEN modules.builtin
  LD  .tmp_vmlinux.kallsyms1
  KSYMS   .tmp_vmlinux.kallsyms1.S
  AS  .tmp_vmlinux.kallsyms1.S
  LD  .tmp_vmlinux.kallsyms2
  KSYMS   .tmp_vmlinux.kallsyms2.S
  AS  .tmp_vmlinux.kallsyms2.S
  LD  vmlinux
  SYSMAP  System.map
  SORTTAB vmlinux
  OBJCOPY arch/arm64/boot/Image
  MODPOST modules-only.symvers
ERROR: modpost: "drm_dp_dpcd_write" 
[drivers/gpu/drm/display/drm_display_helper.ko] undefined!
ERROR: modpost: "drm_dp_read_desc" 
[drivers/gpu/drm/display/drm_display_helper.ko] undefined!
ERROR: modpost: "drm_dp_dpcd_read" 
[drivers/gpu/drm/display/drm_display_helper.ko] undefined!
make[1]: *** [scripts/Makefile.modpost:134: modules-only.symvers] Error 1
make[1]: *** Deleting file 'modules-only.symvers'
make: *** [Makefile:1749: modules] Error 2

Note: It seems this has been an issue for a long time but was made easier
to reproduce after the commit 1e0f66420b13 ("drm/display: Introduce a DRM
display-helper module"). Adding a Fixes: tag just to make sure that this
fix will be picked for stable once the mentioned change also lands there.

Fixes: 1e0f66420b13 ("drm/display: Introduce a DRM display-helper module")
Signed-off-by: Javier Martinez Canillas 
---

 drivers/gpu/drm/display/Kconfig | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/display/Kconfig b/drivers/gpu/drm/display/Kconfig
index f84f1b0cd23f..d7a413a2c6cc 100644
--- a/drivers/gpu/drm/display/Kconfig
+++ b/drivers/gpu/drm/display/Kconfig
@@ -32,6 +32,8 @@ config DRM_DISPLAY_HDMI_HELPER
 config DRM_DP_AUX_CHARDEV
bool "DRM DP AUX Interface"
depends on DRM
+   select DRM_DISPLAY_DP_HELPER
+   select DRM_DISPLAY_HELPER
help
  Choose this option to enable a /dev/drm_dp_auxN node that allows to
  read and write values to arbitrary DPCD registers on the DP aux
@@ -40,6 +42,8 @@ config DRM_DP_AUX_CHARDEV
 config DRM_DP_CEC
bool "Enable DisplayPort CEC-Tunneling-over-AUX HDMI support"
depends on DRM
+   select DRM_DISPLAY_DP_HELPER
+   select DRM_DISPLAY_HELPER
select CEC_CORE
help
  Choose this option if you want to enable HDMI CEC support for
-- 
2.35.1



[RFC PATCH v3 19/19] KVM: x86: nSVM: expose the nested AVIC to the guest

2022-04-27 Thread Maxim Levitsky
This patch enables nested AVIC support and exposes it to the
nested guest.

Signed-off-by: Maxim Levitsky 
---
 arch/x86/kvm/svm/svm.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 099329711ad13..431281ccc40ef 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4087,6 +4087,9 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu 
*vcpu)
if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC);
}
+
+   svm->avic_enabled = enable_apicv && guest_cpuid_has(vcpu, 
X86_FEATURE_AVIC);
+
init_vmcb_after_set_cpuid(vcpu);
 }
 
@@ -4827,6 +4830,9 @@ static __init void svm_set_cpu_caps(void)
if (vgif)
kvm_cpu_cap_set(X86_FEATURE_VGIF);
 
+   if (enable_apicv)
+   kvm_cpu_cap_set(X86_FEATURE_AVIC);
+
/* Nested VM can receive #VMEXIT instead of triggering #GP */
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
-- 
2.26.3



[RFC PATCH v3 18/19] KVM: x86: SVM/nSVM: add optional non strict AVIC doorbell mode

2022-04-27 Thread Maxim Levitsky
By default, peers of a vCPU can send it doorbell messages only when
that vCPU is assigned (loaded on) a physical CPU.

However, when doorbell messages are not allowed, all of the vCPU's
peers get VM exits instead, which is suboptimal when this vCPU is not
halted and is therefore only temporarily not running in guest mode
due to being scheduled out and/or having a userspace VM exit.

In this case peers can't make this vCPU enter guest mode faster,
and thus the VM exits they get don't do anything good.

Therefore this patch introduces a new non-strict mode (disabled by
default, enabled by setting the avic_doorbell_strict kvm_amd module
param to 0) in which, when a vCPU is scheduled out but not halted,
its peers can continue sending doorbell messages to the last physical
CPU where the vCPU was last running.

Security wise, a malicious guest with a compromised guest kernel can,
in this mode, in some cases slow down whatever is running on the last
physical CPU where a vCPU was running, by spamming it with doorbell
messages (hammering on ICR) from another of its vCPUs.

Thus this mode is disabled by default.

However, if the admin policy is to have a 1:1 vCPU/pCPU mapping, this
mode can be useful to avoid VM exits when a vCPU has a userspace VM
exit and the like.

Signed-off-by: Maxim Levitsky 
---
 arch/x86/kvm/svm/avic.c | 16 +---
 arch/x86/kvm/svm/svm.c  | 25 +
 2 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 149df26e17462..4bf0f00f13c12 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -1704,7 +1704,7 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, 
int cpu, bool r)
 
 void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-   u64 entry;
+   u64 old_entry, new_entry;
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -1723,14 +1723,16 @@ void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (kvm_vcpu_is_blocking(vcpu))
return;
 
-   entry = READ_ONCE(*(svm->avic_physical_id_cache));
-   WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+   old_entry = READ_ONCE(*(svm->avic_physical_id_cache));
+   new_entry = old_entry;
 
-   entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
-   entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
-   entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+   new_entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+   new_entry |= (h_physical_id & 
AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
+   new_entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+   if (old_entry != new_entry)
+   WRITE_ONCE(*(svm->avic_physical_id_cache), new_entry);
 
-   WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
 }
 
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index b31bab832360e..099329711ad13 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -191,6 +191,10 @@ module_param(avic, bool, 0444);
 static bool force_avic;
 module_param_unsafe(force_avic, bool, 0444);
 
+static bool avic_doorbell_strict = true;
+module_param(avic_doorbell_strict, bool, 0444);
+
+
 bool __read_mostly dump_invalid_vmcb;
 module_param(dump_invalid_vmcb, bool, 0644);
 
@@ -1402,10 +1406,23 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int 
cpu)
 
 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 {
-   if (kvm_vcpu_apicv_active(vcpu))
-   __avic_vcpu_put(vcpu);
-
-   __nested_avic_put(vcpu);
+   /*
+* Forbid this vCPU's peers to send doorbell messages.
+* Unless non strict doorbell mode is used.
+*
+* In this mode, doorbell messages are forbidden only when a vCPU
+* blocks, since for correctness only in this case it is needed
+* to intercept an IPI to wake up a vCPU.
+*
+* However this reduces the isolation of the guest since flood of
+* spurious doorbell messages can slow a CPU running another task
+* while this vCPU is scheduled out.
+*/
+   if (avic_doorbell_strict) {
+   if (kvm_vcpu_apicv_active(vcpu))
+   __avic_vcpu_put(vcpu);
+   __nested_avic_put(vcpu);
+   }
 
svm_prepare_host_switch(vcpu);
 
-- 
2.26.3



[RFC PATCH v3 17/19] KVM: x86: nSVM: implement nested AVIC doorbell emulation

2022-04-27 Thread Maxim Levitsky
This patch implements the doorbell MSR emulation
for nested AVIC.

Signed-off-by: Maxim Levitsky 
---
 arch/x86/kvm/svm/avic.c | 49 +
 arch/x86/kvm/svm/svm.c  |  2 ++
 arch/x86/kvm/svm/svm.h  |  1 +
 3 files changed, 52 insertions(+)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index e8c53fd77f0b1..149df26e17462 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -1165,6 +1165,55 @@ unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct 
kvm_vcpu *vcpu)
return 0;
 }
 
+int avic_emulate_doorbell_write(struct kvm_vcpu *vcpu, u64 data)
+{
+   int source_l1_apicid = vcpu->vcpu_id;
+   int target_l1_apicid = data & AVIC_DOORBELL_PHYSICAL_ID_MASK;
+   bool target_running, target_nested;
+   struct kvm_vcpu *target;
+   struct vcpu_svm *svm = to_svm(vcpu);
+
+   if (!svm->avic_enabled || (data & ~AVIC_DOORBELL_PHYSICAL_ID_MASK))
+   return 1;
+
+   target = avic_vcpu_by_l1_apicid(vcpu->kvm, target_l1_apicid);
+   if (!target)
+   /* Guest bug: targeting invalid APIC ID. */
+   return 0;
+
+   target_running = READ_ONCE(target->mode) == IN_GUEST_MODE;
+   target_nested = is_guest_mode(target);
+
+   trace_kvm_avic_nested_doorbell(source_l1_apicid, target_l1_apicid,
+  target_nested, target_running);
+
+   /*
+* Target is not in the nested mode, thus the doorbell doesn't affect 
it.
+* If it just became nested after is_guest_mode was checked,
+* it means that it just processed AVIC state and KVM doesn't need
+* to send it another doorbell.
+*/
+   if (!target_nested)
+   return 0;
+
+   /*
+* If the target vCPU is in guest mode, kick the real doorbell.
+* Otherwise KVM needs to try to wake it up if it was sleeping.
+*
+* If the target is no longer in guest mode (it just exited),
+* it will either halt and before that it will notice pending IRR
+* bits, and cancel halting, or it will enter the guest mode again,
+* and notice the IRR bits as well.
+*/
+   if (target_running)
+   wrmsr(MSR_AMD64_SVM_AVIC_DOORBELL,
+ kvm_cpu_get_apicid(READ_ONCE(target->cpu)), 0);
+   else
+   kvm_vcpu_wake_up(target);
+
+   return 0;
+}
+
 static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool 
flat)
 {
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d96a73931d1e5..b31bab832360e 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2772,6 +2772,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr)
u32 ecx = msr->index;
u64 data = msr->data;
switch (ecx) {
+   case MSR_AMD64_SVM_AVIC_DOORBELL:
+   return avic_emulate_doorbell_write(vcpu, data);
case MSR_AMD64_TSC_RATIO:
 
if (!svm->tsc_scaling_enabled) {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 93fd9d6f5fd85..14e2c5c451cad 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -714,6 +714,7 @@ unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct 
kvm_vcpu *vcpu);
 void avic_reload_apic_pages(struct kvm_vcpu *vcpu);
 void avic_free_nested(struct kvm_vcpu *vcpu);
 bool avic_nested_has_interrupt(struct kvm_vcpu *vcpu);
+int avic_emulate_doorbell_write(struct kvm_vcpu *vcpu, u64 data);
 
 struct avic_physid_table *
 avic_physid_shadow_table_get(struct kvm_vcpu *vcpu, gfn_t gfn);
-- 
2.26.3



[RFC PATCH v3 16/19] KVM: x86: nSVM: implement support for nested AVIC vmexits

2022-04-27 Thread Maxim Levitsky
* SVM_EXIT_AVIC_UNACCELERATED_ACCESS is always forwarded to the L1

* SVM_EXIT_AVIC_INCOMPLETE_IPI is hidden from the guest if:

   - is_running was false in the shadow physid page because L1's vCPU
 was scheduled out - in this case, the vCPU is woken up,
 and it will process nested AVIC on the next VM entry

  - an invalid physical address of the avic backing page was present
in the guest's physid page, which KVM translates to a
valid physical address of a dummy page and is_running=false.

If this condition happens,
the AVIC_IPI_FAILURE_INVALID_BACKING_PAGE VM exit is injected to
the nested hypervisor.

* Note that it is possible for an SVM_EXIT_AVIC_INCOMPLETE_IPI
  VM exit to happen due to both host- and guest-related reasons
  at the same time:

  For example, if a broadcast IPI was attempted and some shadow
  physid entries had 'is_running=false' set by the guest,
  and some had it set to false due to scheduled-out L1 vCPUs.

  To support this case, all relevant entries of the guest's physical
  and logical id tables are checked, and both host-related actions
  (e.g. wakeup) and guest VM exit reflection are done.

Signed-off-by: Maxim Levitsky 
---
 arch/x86/kvm/svm/avic.c   | 204 +-
 arch/x86/kvm/svm/nested.c |  14 +++
 2 files changed, 216 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index f13ca1e7b2845..e8c53fd77f0b1 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -917,6 +917,164 @@ static void avic_kick_target_vcpus(struct kvm *kvm, 
struct kvm_lapic *source,
}
 }
 
+static void
+avic_kick_target_vcpu_nested_physical(struct vcpu_svm *svm,
+ int target_l2_apic_id,
+ int *index,
+ bool *invalid_page)
+{
+   u64 gentry, sentry;
+   int target_l1_apicid;
+   struct avic_physid_table *t = svm->nested.l2_physical_id_table;
+
+   if (WARN_ON_ONCE(!t))
+   return;
+
+   /*
+* This shouldn't normally happen because this condition
+* should cause AVIC_IPI_FAILURE_INVALID_TARGET vmexit,
+* however the guest can change the page and trigger this.
+*/
+   if (target_l2_apic_id >= t->nentries)
+   return;
+
+   gentry = t->entries[target_l2_apic_id].gentry;
+   sentry = *t->entries[target_l2_apic_id].sentry;
+
+   /* Same reasoning as above  */
+   if (!(gentry & AVIC_PHYSICAL_ID_ENTRY_VALID_MASK))
+   return;
+
+   /*
+* This races against the guest updating is_running bit.
+*
+* Race itself happens on real hardware as well, and the guest
+* must use the correct means to avoid it.
+*
+* AVIC hardware already set IRR and should have done memory
+* barrier, and then found out that is_running is false
+* in shadow physid table.
+*
+* We are doing another is_running check (in the guest physid table),
+* completing it, thus don't need additional memory barrier.
+*/
+
+   target_l1_apicid = physid_entry_get_apicid(gentry);
+
+   if (target_l1_apicid == -1) {
+
+   /* is_running is false, need to vmexit to the guest */
+   if (*index == -1) {
+   u64 backing_page_phys = 
physid_entry_get_backing_table(sentry);
+
+   *index = target_l2_apic_id;
+   if (backing_page_phys == t->dummy_page_hpa)
+   *invalid_page = true;
+   }
+   } else {
+   /* Wake up the target vCPU and hide the VM exit from the guest 
*/
+   struct kvm_vcpu *target = avic_vcpu_by_l1_apicid(svm->vcpu.kvm, 
target_l1_apicid);
+
+   if (target && target != &svm->vcpu)
+   kvm_vcpu_wake_up(target);
+   }
+
+   trace_kvm_avic_nested_kick_vcpu(svm->vcpu.vcpu_id,
+   target_l2_apic_id,
+   target_l1_apicid);
+}
+
+static void
+avic_kick_target_vcpus_nested_logical(struct vcpu_svm *svm, unsigned long dest,
+ int *index, bool *invalid_page)
+{
+   int logical_id;
+   u8 cluster = 0;
+   u64 *logical_id_table = (u64 *)svm->nested.l2_logical_id_table.hva;
+   int physical_index = -1;
+
+   if (WARN_ON_ONCE(!logical_id_table))
+   return;
+
+   if (nested_avic_get_reg(&svm->vcpu, APIC_DFR) == APIC_DFR_CLUSTER) {
+   if (dest >= 0x40)
+   return;
+   cluster = dest & 0x3C;
+   dest &= 0x3;
+   }
+
+   for_each_set_bit(logical_id, &dest, 8) {
+   int logical_index = cluster | logical_id;
+   u64 log_gentry = logical_id_table[logical_index];
+   int l2_apicid = logid_get_physid(log_gentry);
+
+   /* Should 

[RFC PATCH v3 15/19] KVM: x86: nSVM: add code to reload AVIC physid table when it is invalidated

2022-04-27 Thread Maxim Levitsky
An AVIC table invalidation is not supposed to happen often, and can
only happen when the guest does something suspicious, such as:

  - It places a physid page in a memslot that is enabled/disabled and
memslot flushing happens.

  - It tries to update apic backing page addresses - the guest has no
reason to touch these, and doing so on real hardware will likely
lead to unpredictable results.

  - It writes to reserved bits of a tracked page.

  - It write-floods a physid table while no vCPU is using it
(the page is likely reused at that point to contain something else).

All of the above causes a KVM_REQ_APIC_PAGE_RELOAD request to be raised
on all vCPUs, which kicks them out of guest mode; the first vCPU to
reach the handler will then re-create the entries of the physid page,
and the others will notice this and do nothing.

Signed-off-by: Maxim Levitsky 
---
 arch/x86/kvm/svm/avic.c | 13 +
 arch/x86/kvm/svm/svm.c  |  1 +
 arch/x86/kvm/svm/svm.h  |  1 +
 3 files changed, 15 insertions(+)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index e6ec525a88625..f13ca1e7b2845 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -379,6 +379,7 @@ static void avic_physid_shadow_table_invalidate(struct kvm 
*kvm,
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
 
	lockdep_assert_held(&kvm_svm->avic.tables_lock);
+   kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
avic_physid_shadow_table_erase(kvm, t);
 }
 
@@ -1638,3 +1639,15 @@ bool avic_nested_has_interrupt(struct kvm_vcpu *vcpu)
return true;
return false;
 }
+
+void avic_reload_apic_pages(struct kvm_vcpu *vcpu)
+{
+   struct vcpu_svm *vcpu_svm = to_svm(vcpu);
+   struct avic_physid_table *t = vcpu_svm->nested.l2_physical_id_table;
+
+   int nentries = vcpu_svm->nested.ctl.avic_physical_id &
+   AVIC_PHYSICAL_ID_TABLE_SIZE_MASK;
+
+   if (t && is_guest_mode(vcpu) && nested_avic_in_use(vcpu))
+   avic_physid_shadow_table_sync(vcpu, t, nentries);
+}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index a39bb0b27a51d..d96a73931d1e5 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4677,6 +4677,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.enable_nmi_window = svm_enable_nmi_window,
.enable_irq_window = svm_enable_irq_window,
.update_cr8_intercept = svm_update_cr8_intercept,
+   .reload_apic_pages = avic_reload_apic_pages,
.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
.check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
.apicv_post_state_restore = avic_apicv_post_state_restore,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 17fcc09cf4be1..93fd9d6f5fd85 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -711,6 +711,7 @@ void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
 void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
 void avic_ring_doorbell(struct kvm_vcpu *vcpu);
 unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu);
+void avic_reload_apic_pages(struct kvm_vcpu *vcpu);
 void avic_free_nested(struct kvm_vcpu *vcpu);
 bool avic_nested_has_interrupt(struct kvm_vcpu *vcpu);
 
-- 
2.26.3



[RFC PATCH v3 14/19] KVM: x86: rename .set_apic_access_page_addr to reload_apic_access_page

2022-04-27 Thread Maxim Levitsky
This will be used on SVM to reload the shadow page of the AVIC physid table.

No functional change intended

Signed-off-by: Maxim Levitsky 
---
 arch/x86/include/asm/kvm-x86-ops.h | 2 +-
 arch/x86/include/asm/kvm_host.h| 3 +--
 arch/x86/kvm/vmx/vmx.c | 8 
 arch/x86/kvm/x86.c | 6 +++---
 4 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/kvm-x86-ops.h 
b/arch/x86/include/asm/kvm-x86-ops.h
index 96e4e9842dfc6..997edb7453ac2 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -82,7 +82,7 @@ KVM_X86_OP_OPTIONAL(hwapic_isr_update)
 KVM_X86_OP_OPTIONAL_RET0(guest_apic_has_interrupt)
 KVM_X86_OP_OPTIONAL(load_eoi_exitmap)
 KVM_X86_OP_OPTIONAL(set_virtual_apic_mode)
-KVM_X86_OP_OPTIONAL(set_apic_access_page_addr)
+KVM_X86_OP_OPTIONAL(reload_apic_pages)
 KVM_X86_OP(deliver_interrupt)
 KVM_X86_OP_OPTIONAL(sync_pir_to_irr)
 KVM_X86_OP_OPTIONAL_RET0(set_tss_addr)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fc7df778a3d71..52fa04c3108b1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1436,7 +1436,7 @@ struct kvm_x86_ops {
bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu);
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
-   void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu);
+   void (*reload_apic_pages)(struct kvm_vcpu *vcpu);
void (*deliver_interrupt)(struct kvm_lapic *apic, int delivery_mode,
  int trig_mode, int vector);
int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
@@ -1909,7 +1909,6 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
-
 int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
unsigned long ipi_bitmap_high, u32 min,
unsigned long icr, int op_64_bit);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index cf8581978bce3..7defd31703c61 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6339,7 +6339,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
vmx_update_msr_bitmap_x2apic(vcpu);
 }
 
-static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
+static void vmx_reload_apic_access_page(struct kvm_vcpu *vcpu)
 {
struct page *page;
 
@@ -,7 +,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.enable_irq_window = vmx_enable_irq_window,
.update_cr8_intercept = vmx_update_cr8_intercept,
.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
-   .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
+   .reload_apic_pages = vmx_reload_apic_access_page,
.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
.load_eoi_exitmap = vmx_load_eoi_exitmap,
.apicv_post_state_restore = vmx_apicv_post_state_restore,
@@ -7940,12 +7940,12 @@ static __init int hardware_setup(void)
enable_vnmi = 0;
 
/*
-* set_apic_access_page_addr() is used to reload apic access
+* kvm_vcpu_reload_apic_pages() is used to reload apic access
 * page upon invalidation.  No need to do anything if not
 * using the APIC_ACCESS_ADDR VMCS field.
 */
if (!flexpriority_enabled)
-   vmx_x86_ops.set_apic_access_page_addr = NULL;
+   vmx_x86_ops.reload_apic_pages = NULL;
 
if (!cpu_has_vmx_tpr_shadow())
vmx_x86_ops.update_cr8_intercept = NULL;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d2f73ce87a1e3..ad744ab99734c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9949,12 +9949,12 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm 
*kvm,
kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
 }
 
-static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
+static void kvm_vcpu_reload_apic_pages(struct kvm_vcpu *vcpu)
 {
if (!lapic_in_kernel(vcpu))
return;
 
-   static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
+   static_call_cond(kvm_x86_reload_apic_pages)(vcpu);
 }
 
 void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
@@ -10071,7 +10071,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu))
vcpu_load_eoi_exitmap(vcpu);
if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
-   kvm_vcpu_reload_apic_access_page(vcpu);
+   kvm_vcpu_reload_apic_pages(vcpu);
if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
vcpu->run->exit_reason = 

[RFC PATCH v3 13/19] KVM: x86: nSVM: wire nested AVIC to nested guest entry/exit

2022-04-27 Thread Maxim Levitsky
  * Pass through the guest's AVIC pages that can be passed through:
 - logical id table
 - avic backing page

  * Pass through AVIC's MMIO range
 - the nested guest is responsible for marking it RW
   in its NPT tables.

  * Write-track the physical id page
 - all peers' avic backing pages are pinned
   as long as the shadow table is not invalidated/freed.

  * Cache guest AVIC settings.

  * Add SDM-mandated changes to emulated VM enter/exit.

Note that nested AVIC still can't be enabled, so this
code has no effect yet.

Signed-off-by: Maxim Levitsky 
---
 arch/x86/kvm/svm/avic.c   |  51 ++-
 arch/x86/kvm/svm/nested.c | 127 +-
 arch/x86/kvm/svm/svm.c|   2 +
 arch/x86/kvm/svm/svm.h|  24 +++
 4 files changed, 199 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 34da9fabd5194..e6ec525a88625 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -59,6 +59,18 @@ static inline struct kvm_vcpu *avic_vcpu_by_l1_apicid(struct 
kvm *kvm,
return kvm_get_vcpu_by_id(kvm, l1_apicid);
 }
 
+static u32 nested_avic_get_reg(struct kvm_vcpu *vcpu, int reg_off)
+{
+   struct vcpu_svm *svm = to_svm(vcpu);
+
+   void *nested_apic_regs = svm->nested.l2_apic_access_page.hva;
+
+   if (WARN_ON_ONCE(!nested_apic_regs))
+   return 0;
+
+   return *((u32 *) (nested_apic_regs + reg_off));
+}
+
 static void avic_physid_shadow_entry_set_vcpu(struct kvm *kvm,
  struct avic_physid_table *t,
  int n,
@@ -531,6 +543,20 @@ static void avic_physid_shadow_table_flush_memslot(struct 
kvm *kvm,
	mutex_unlock(&kvm_svm->avic.tables_lock);
 }
 
+void avic_free_nested(struct kvm_vcpu *vcpu)
+{
+   struct avic_physid_table *t;
+   struct vcpu_svm *svm = to_svm(vcpu);
+
+   t = svm->nested.l2_physical_id_table;
+   if (t) {
+   avic_physid_shadow_table_put(vcpu->kvm, t);
+   svm->nested.l2_physical_id_table = NULL;
+   }
+
+	kvm_vcpu_unmap(vcpu, &svm->nested.l2_apic_access_page, true);
+	kvm_vcpu_unmap(vcpu, &svm->nested.l2_logical_id_table, true);
+}
 
 /*
  * This is a wrapper of struct amd_iommu_ir_data.
@@ -586,10 +612,18 @@ void avic_vm_destroy(struct kvm *kvm)
 {
unsigned long flags;
	struct kvm_svm_avic *avic = &to_kvm_svm(kvm)->avic;
+   unsigned long i;
+   struct kvm_vcpu *vcpu;
 
if (!enable_apicv)
return;
 
+   kvm_for_each_vcpu(i, vcpu, kvm) {
+   vcpu_load(vcpu);
+   avic_free_nested(vcpu);
+   vcpu_put(vcpu);
+   }
+
if (avic->logical_id_table_page)
__free_page(avic->logical_id_table_page);
if (avic->physical_id_table_page)
@@ -1501,7 +1535,7 @@ void __nested_avic_load(struct kvm_vcpu *vcpu, int cpu)
if (kvm_vcpu_is_blocking(vcpu))
return;
 
-   if (svm->nested.initialized)
+   if (svm->nested.initialized && svm->avic_enabled)
avic_update_peer_physid_entries(vcpu, cpu);
 }
 
@@ -1511,7 +1545,7 @@ void __nested_avic_put(struct kvm_vcpu *vcpu)
 
lockdep_assert_preemption_disabled();
 
-   if (svm->nested.initialized)
+   if (svm->nested.initialized && svm->avic_enabled)
avic_update_peer_physid_entries(vcpu, -1);
 }
 
@@ -1591,3 +1625,16 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
 
nested_avic_load(vcpu);
 }
+
+bool avic_nested_has_interrupt(struct kvm_vcpu *vcpu)
+{
+   int off;
+
+   if (!nested_avic_in_use(vcpu))
+   return false;
+
+   for (off = 0x10; off < 0x80; off += 0x10)
+   if (nested_avic_get_reg(vcpu, APIC_IRR + off))
+   return true;
+   return false;
+}
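
A side note on the IRR scan in avic_nested_has_interrupt() above: in the
xAPIC register layout the 256-bit Interrupt Request Register is split across
eight 32-bit registers spaced 16 bytes apart, starting at offset 0x200 of the
4 KiB APIC page, so starting the loop at +0x10 skips the register covering
vectors 0-31.  A self-contained userspace sketch of the same kind of scan
(helper names and test values are made up; this is a model, not the kernel
code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define APIC_PAGE_SIZE	4096
#define APIC_IRR	0x200	/* offset of the first IRR register */

static uint32_t apic_read(const uint8_t *apic_page, unsigned int off)
{
	uint32_t val;

	memcpy(&val, apic_page + off, sizeof(val));
	return val;
}

/* true if any request bit is set in IRR1..IRR7 (i.e. vectors >= 32) */
static bool irr_has_interrupt(const uint8_t *apic_page)
{
	unsigned int off;

	for (off = 0x10; off < 0x80; off += 0x10)
		if (apic_read(apic_page, APIC_IRR + off))
			return true;
	return false;
}

int main(void)
{
	uint8_t page[APIC_PAGE_SIZE] = { 0 };

	printf("empty IRR: %d\n", irr_has_interrupt(page));
	page[APIC_IRR + 0x20] = 0x01;	/* pretend vector 64 is pending */
	printf("vector pending: %d\n", irr_has_interrupt(page));
	return 0;
}
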
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index bed5e1692cef0..eb5e9b600e052 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -387,6 +387,14 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu 
*vcpu,
memcpy(to->reserved_sw, from->reserved_sw,
   sizeof(struct hv_enlightenments));
}
+
+   /* copy avic related settings only when it is enabled */
+   if (from->int_ctl & AVIC_ENABLE_MASK) {
+   to->avic_vapic_bar  = from->avic_vapic_bar;
+   to->avic_backing_page   = from->avic_backing_page;
+   to->avic_logical_id = from->avic_logical_id;
+   to->avic_physical_id= from->avic_physical_id;
+   }
 }
 
 void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
@@ -539,6 +547,79 @@ void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat;
 }
 
+
+static bool nested_vmcb02_prepare_avic(struct vcpu_svm *svm)
+{
+   struct vmcb *vmcb02 = 

[RFC PATCH v3 12/19] KVM: x86: nSVM: make nested AVIC physid write tracking be aware of the host scheduling

2022-04-27 Thread Maxim Levitsky
For each vCPU
  - store a linked list of all shadow physical id entries
    which reference it.

  - update those entries when this vCPU is scheduled
    in/out.

  - update this list when the physid tables are modified by
    other means (guest write and/or table sync).

To avoid races against vCPU scheduling, use a spinlock.
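
A minimal userspace model of the scheme described above - each vCPU keeps a
list of the shadow physical id entries that reference it, and the list is
walked under a lock on schedule in/out to rewrite the "is running on host
CPU X" part of every entry.  All names, the hand-rolled list and the bit
layout are illustrative; this is a sketch of the idea, not the kernel code
(build with -pthread):

#include <stdint.h>
#include <stdio.h>
#include <pthread.h>

#define ENTRY_IS_RUNNING	(1ull << 62)
#define ENTRY_HOST_APICID_MASK	0xffull

struct shadow_entry {
	uint64_t sentry;		/* shadow physical-id table entry */
	struct shadow_entry *next;	/* link in the owning vCPU's list */
};

struct vcpu {
	pthread_mutex_t lock;		/* stands in for the kernel spinlock */
	struct shadow_entry *refs;	/* entries that reference this vCPU */
};

/* host_apicid < 0 means the vCPU was scheduled out */
static void update_peer_entries(struct vcpu *v, int host_apicid)
{
	struct shadow_entry *e;

	pthread_mutex_lock(&v->lock);
	for (e = v->refs; e; e = e->next) {
		uint64_t s = e->sentry &
			~(ENTRY_IS_RUNNING | ENTRY_HOST_APICID_MASK);

		if (host_apicid >= 0)
			s |= ENTRY_IS_RUNNING | (uint64_t)host_apicid;
		e->sentry = s;
	}
	pthread_mutex_unlock(&v->lock);
}

int main(void)
{
	struct shadow_entry e1 = { .sentry = 1ull << 63 };
	struct shadow_entry e2 = { .sentry = 1ull << 63 };
	struct vcpu v = { .lock = PTHREAD_MUTEX_INITIALIZER, .refs = &e1 };

	e1.next = &e2;
	update_peer_entries(&v, 5);	/* scheduled in on host APIC ID 5 */
	printf("%#llx\n", (unsigned long long)e1.sentry);
	update_peer_entries(&v, -1);	/* scheduled out */
	printf("%#llx\n", (unsigned long long)e1.sentry);
	return 0;
}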

Signed-off-by: Maxim Levitsky 
---
 arch/x86/kvm/svm/avic.c | 113 +---
 arch/x86/kvm/svm/svm.c  |   7 +++
 arch/x86/kvm/svm/svm.h  |  10 
 3 files changed, 122 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index f462b7e48e3ca..34da9fabd5194 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -67,8 +67,12 @@ static void avic_physid_shadow_entry_set_vcpu(struct kvm 
*kvm,
	struct avic_physid_entry_descr *e = &t->entries[n];
u64 sentry = READ_ONCE(*e->sentry);
u64 old_sentry = sentry;
+   struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
struct kvm_vcpu *new_vcpu = NULL;
int l0_apicid = -1;
+   unsigned long flags;
+
+	raw_spin_lock_irqsave(&kvm_svm->avic.table_entries_lock, flags);
 
WARN_ON(!test_bit(n, t->valid_entires));
 
@@ -79,6 +83,9 @@ static void avic_physid_shadow_entry_set_vcpu(struct kvm *kvm,
new_vcpu = avic_vcpu_by_l1_apicid(kvm, new_l1_apicid);
 
if (new_vcpu)
+		list_add_tail(&e->link, &to_svm(new_vcpu)->nested.physid_ref_entries);
+
+   if (new_vcpu && to_svm(new_vcpu)->nested_avic_active)
l0_apicid = kvm_cpu_get_apicid(new_vcpu->cpu);
 
	physid_entry_set_apicid(&sentry, l0_apicid);
@@ -87,6 +94,8 @@ static void avic_physid_shadow_entry_set_vcpu(struct kvm *kvm,
 
if (sentry != old_sentry)
WRITE_ONCE(*e->sentry, sentry);
+
+	raw_spin_unlock_irqrestore(&kvm_svm->avic.table_entries_lock, flags);
 }
 
 static void avic_physid_shadow_entry_create(struct kvm *kvm,
@@ -131,7 +140,11 @@ static void avic_physid_shadow_entry_remove(struct kvm 
*kvm,
   int n)
 {
	struct avic_physid_entry_descr *e = &t->entries[n];
+   struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
hpa_t backing_page_hpa;
+   unsigned long flags;
+
+	raw_spin_lock_irqsave(&kvm_svm->avic.table_entries_lock, flags);
 
if (!test_and_clear_bit(n, t->valid_entires))
WARN_ON(1);
@@ -147,8 +160,49 @@ static void avic_physid_shadow_entry_remove(struct kvm 
*kvm,
 
e->gentry = 0;
*e->sentry = 0;
+
+	raw_spin_unlock_irqrestore(&kvm_svm->avic.table_entries_lock, flags);
 }
 
+static void avic_update_peer_physid_entries(struct kvm_vcpu *vcpu, int cpu)
+{
+   /*
+* Update all shadow physid tables which contain entries
+* which reference this vCPU with its new physical location
+*/
+   struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+   struct vcpu_svm *vcpu_svm = to_svm(vcpu);
+   struct avic_physid_entry_descr *e;
+   int updated_nentries = 0;
+   int l0_apicid = -1;
+   unsigned long flags;
+   bool new_active = cpu != -1;
+
+   if (cpu != -1)
+   l0_apicid = kvm_cpu_get_apicid(cpu);
+
+	raw_spin_lock_irqsave(&kvm_svm->avic.table_entries_lock, flags);
+
+	list_for_each_entry(e, &vcpu_svm->nested.physid_ref_entries, link) {
+   u64 sentry = READ_ONCE(*e->sentry);
+   u64 old_sentry = sentry;
+
+		physid_entry_set_apicid(&sentry, l0_apicid);
+
+   if (sentry != old_sentry) {
+   updated_nentries++;
+   WRITE_ONCE(*e->sentry, sentry);
+   }
+   }
+
+   if (updated_nentries)
+		trace_kvm_avic_physid_update_vcpu_host(vcpu->vcpu_id,
+						       l0_apicid, updated_nentries);
+
+   vcpu_svm->nested_avic_active = new_active;
+
+	raw_spin_unlock_irqrestore(&kvm_svm->avic.table_entries_lock, flags);
+}
 
 static bool
 avic_physid_shadow_table_setup_write_tracking(struct kvm *kvm,
@@ -603,6 +657,7 @@ int avic_vm_init(struct kvm *kvm)
	hash_add(svm_vm_data_hash, &avic->hnode, avic->vm_id);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
 
+	raw_spin_lock_init(&avic->table_entries_lock);
	mutex_init(&avic->tables_lock);
	INIT_LIST_HEAD(&avic->physid_tables);
 
@@ -1428,9 +1483,51 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu)
 static void avic_vcpu_put(struct kvm_vcpu *vcpu)
 {
preempt_disable();
-
__avic_vcpu_put(vcpu);
+   preempt_enable();
+}
+
 
+void __nested_avic_load(struct kvm_vcpu *vcpu, int cpu)
+{
+   struct vcpu_svm *svm = to_svm(vcpu);
+
+   lockdep_assert_preemption_disabled();
+
+   /*
+* For the same reason as in __avic_vcpu_load there is no
+* need to load nested AVIC when this vCPU is blocking
+*/
+   if (kvm_vcpu_is_blocking(vcpu))
+   return;
+
+   if 

[RFC PATCH v3 11/19] KVM: x86: nSVM: implement shadowing of AVIC's physical id table

2022-04-27 Thread Maxim Levitsky
Implement the shadow physical id table and its
write-tracking code, which will soon be used for nested AVIC.

Signed-off-by: Maxim Levitsky 
---
 arch/x86/kvm/svm/avic.c | 461 +++-
 arch/x86/kvm/svm/svm.h  |  71 +++
 2 files changed, 524 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index e5cbbb97fbab6..f462b7e48e3ca 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -51,6 +51,433 @@ static u32 next_vm_id = 0;
 static bool next_vm_id_wrapped = 0;
 static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
 
+
+static inline struct kvm_vcpu *avic_vcpu_by_l1_apicid(struct kvm *kvm,
+ int l1_apicid)
+{
+   WARN_ON(l1_apicid == -1);
+   return kvm_get_vcpu_by_id(kvm, l1_apicid);
+}
+
+static void avic_physid_shadow_entry_set_vcpu(struct kvm *kvm,
+ struct avic_physid_table *t,
+ int n,
+ int new_l1_apicid)
+{
+	struct avic_physid_entry_descr *e = &t->entries[n];
+   u64 sentry = READ_ONCE(*e->sentry);
+   u64 old_sentry = sentry;
+   struct kvm_vcpu *new_vcpu = NULL;
+   int l0_apicid = -1;
+
+   WARN_ON(!test_bit(n, t->valid_entires));
+
+	if (!list_empty(&e->link))
+		list_del_init(&e->link);
+
+   if (new_l1_apicid != -1)
+   new_vcpu = avic_vcpu_by_l1_apicid(kvm, new_l1_apicid);
+
+   if (new_vcpu)
+   l0_apicid = kvm_cpu_get_apicid(new_vcpu->cpu);
+
+	physid_entry_set_apicid(&sentry, l0_apicid);
+
+   trace_kvm_avic_physid_update_vcpu_guest(new_l1_apicid, l0_apicid);
+
+   if (sentry != old_sentry)
+   WRITE_ONCE(*e->sentry, sentry);
+}
+
+static void avic_physid_shadow_entry_create(struct kvm *kvm,
+   struct avic_physid_table *t,
+   int n,
+   u64 gentry)
+{
+	struct avic_physid_entry_descr *e = &t->entries[n];
+   struct page *backing_page;
+   u64 backing_page_gpa = physid_entry_get_backing_table(gentry);
+   int l1_apic_id = physid_entry_get_apicid(gentry);
+   hpa_t backing_page_hpa;
+   u64 sentry = 0;
+
+
+   if (backing_page_gpa == INVALID_BACKING_PAGE)
+   return;
+
+   /* Pin the APIC backing page */
+   backing_page = gfn_to_page(kvm, gpa_to_gfn(backing_page_gpa));
+
+   if (is_error_page(backing_page))
+   /* Invalid GPA in the guest entry - point to a dummy entry */
+   backing_page_hpa = t->dummy_page_hpa;
+   else
+   backing_page_hpa = page_to_phys(backing_page);
+
+	physid_entry_set_backing_table(&sentry, backing_page_hpa);
+
+   e->gentry = gentry;
+   *e->sentry = sentry;
+
+   if (test_and_set_bit(n, t->valid_entires))
+   WARN_ON(1);
+
+   if (backing_page_hpa != t->dummy_page_hpa)
+   avic_physid_shadow_entry_set_vcpu(kvm, t, n, l1_apic_id);
+}
+
+static void avic_physid_shadow_entry_remove(struct kvm *kvm,
+  struct avic_physid_table *t,
+  int n)
+{
+	struct avic_physid_entry_descr *e = &t->entries[n];
+   hpa_t backing_page_hpa;
+
+   if (!test_and_clear_bit(n, t->valid_entires))
+   WARN_ON(1);
+
+   /* Release the APIC backing page */
+   backing_page_hpa = physid_entry_get_backing_table(*e->sentry);
+
+   if (backing_page_hpa != t->dummy_page_hpa)
+   kvm_release_pfn_dirty(backing_page_hpa >> PAGE_SHIFT);
+
+	if (!list_empty(&e->link))
+		list_del_init(&e->link);
+
+   e->gentry = 0;
+   *e->sentry = 0;
+}
+
+
+static bool
+avic_physid_shadow_table_setup_write_tracking(struct kvm *kvm,
+ struct avic_physid_table *t,
+ bool enable)
+{
+   struct kvm_memory_slot *slot;
+
+	write_lock(&kvm->mmu_lock);
+   slot = gfn_to_memslot(kvm, t->gfn);
+   if (!slot) {
+		write_unlock(&kvm->mmu_lock);
+   return false;
+   }
+
+   if (enable)
+   kvm_slot_page_track_add_page(kvm, slot, t->gfn, 
KVM_PAGE_TRACK_WRITE);
+   else
+   kvm_slot_page_track_remove_page(kvm, slot, t->gfn, 
KVM_PAGE_TRACK_WRITE);
+	write_unlock(&kvm->mmu_lock);
+   return true;
+}
+
+static void
+avic_physid_shadow_table_erase(struct kvm *kvm, struct avic_physid_table *t)
+{
+   int i;
+
+   if (!t->nentries)
+   return;
+
+   avic_physid_shadow_table_setup_write_tracking(kvm, t, false);
+
+   for_each_set_bit(i, t->valid_entires, AVIC_MAX_PHYSICAL_ID_COUNT)
+   avic_physid_shadow_entry_remove(kvm, t, i);
+
+   t->nentries = 0;
+   t->flood_count = 

[RFC PATCH v3 10/19] KVM: x86: nSVM: implement AVIC's physid/logid table access helpers

2022-04-27 Thread Maxim Levitsky
This implements a few helpers for manipulating the AVIC's
physical and logical id table entries.

Signed-off-by: Maxim Levitsky 
---
 arch/x86/kvm/svm/svm.h | 45 ++
 1 file changed, 45 insertions(+)

diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 6fcb164a6ee4a..dfca4c06e2071 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -628,6 +628,51 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
 void avic_ring_doorbell(struct kvm_vcpu *vcpu);
 unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu);
 
+#define INVALID_BACKING_PAGE   (~(u64)0)
+
+static inline u64 physid_entry_get_backing_table(u64 entry)
+{
+   if (!(entry & AVIC_PHYSICAL_ID_ENTRY_VALID_MASK))
+   return INVALID_BACKING_PAGE;
+   return entry & AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK;
+}
+
+static inline int physid_entry_get_apicid(u64 entry)
+{
+   if (!(entry & AVIC_PHYSICAL_ID_ENTRY_VALID_MASK))
+   return -1;
+   if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
+   return -1;
+
+   return entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+}
+
+static inline int logid_get_physid(u64 entry)
+{
+   if (!(entry & AVIC_LOGICAL_ID_ENTRY_VALID_BIT))
+   return -1;
+   return entry & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+}
+
+static inline void physid_entry_set_backing_table(u64 *entry, u64 value)
+{
+   *entry &= ~AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK;
+   *entry |= (AVIC_PHYSICAL_ID_ENTRY_VALID_MASK | value);
+}
+
+static inline void physid_entry_set_apicid(u64 *entry, int value)
+{
+   WARN_ON(!(*entry & AVIC_PHYSICAL_ID_ENTRY_VALID_MASK));
+
+   *entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+
+   if (value == -1)
+   *entry &= ~(AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+   else
+   *entry |= (AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK | value);
+}
+
+
 /* sev.c */
 
 #define GHCB_VERSION_MAX   1ULL
-- 
2.26.3
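
A self-contained round-trip of the helpers added above.  The mask values,
the u64 typedef and main() below are illustrative stand-ins so the example
compiles on its own; only the helper bodies follow the patch:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK	0xffull
#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK	(0xffffffffffull << 12)
#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK		(1ull << 62)
#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK		(1ull << 63)
#define INVALID_BACKING_PAGE				(~(u64)0)

static u64 physid_entry_get_backing_table(u64 entry)
{
	if (!(entry & AVIC_PHYSICAL_ID_ENTRY_VALID_MASK))
		return INVALID_BACKING_PAGE;
	return entry & AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK;
}

static int physid_entry_get_apicid(u64 entry)
{
	if (!(entry & AVIC_PHYSICAL_ID_ENTRY_VALID_MASK) ||
	    !(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
		return -1;
	return entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
}

static void physid_entry_set_backing_table(u64 *entry, u64 value)
{
	*entry &= ~AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK;
	*entry |= AVIC_PHYSICAL_ID_ENTRY_VALID_MASK | value;
}

static void physid_entry_set_apicid(u64 *entry, int value)
{
	assert(*entry & AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);

	*entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
	if (value == -1)
		*entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
	else
		*entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK | (u64)value;
}

int main(void)
{
	u64 entry = 0;

	physid_entry_set_backing_table(&entry, 0x12345000ull);
	physid_entry_set_apicid(&entry, 7);
	printf("backing %#llx, host apic id %d\n",
	       (unsigned long long)physid_entry_get_backing_table(entry),
	       physid_entry_get_apicid(entry));
	physid_entry_set_apicid(&entry, -1);	/* vCPU not running */
	printf("host apic id now %d\n", physid_entry_get_apicid(entry));
	return 0;
}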



[RFC PATCH v3 09/19] KVM: x86: nSVM: add nested AVIC tracepoints

2022-04-27 Thread Maxim Levitsky
This patch adds a few tracepoints that will be used
to debug/profile the nested AVIC.

Signed-off-by: Maxim Levitsky 
---
 arch/x86/kvm/trace.h | 157 ++-
 arch/x86/kvm/x86.c   |  13 
 2 files changed, 169 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index de47625175692..f7ddba5ae06a5 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1385,7 +1385,7 @@ TRACE_EVENT(kvm_apicv_accept_irq,
 );
 
 /*
- * Tracepoint for AMD AVIC
+ * Tracepoints for AMD AVIC
  */
 TRACE_EVENT(kvm_avic_incomplete_ipi,
TP_PROTO(u32 vcpu, u32 icrh, u32 icrl, u32 id, u32 index),
@@ -1479,6 +1479,161 @@ TRACE_EVENT(kvm_avic_kick_vcpu_slowpath,
  __entry->icrh, __entry->icrl, __entry->index)
 );
 
+TRACE_EVENT(kvm_avic_physid_table_alloc,
+   TP_PROTO(u64 gpa),
+   TP_ARGS(gpa),
+
+   TP_STRUCT__entry(
+   __field(u64, gpa)
+   ),
+
+   TP_fast_assign(
+   __entry->gpa = gpa;
+   ),
+
+   TP_printk("table at gpa 0x%llx",
+ __entry->gpa)
+);
+
+
+TRACE_EVENT(kvm_avic_physid_table_free,
+   TP_PROTO(u64 gpa),
+   TP_ARGS(gpa),
+
+   TP_STRUCT__entry(
+   __field(u64, gpa)
+   ),
+
+   TP_fast_assign(
+   __entry->gpa = gpa;
+   ),
+
+   TP_printk("table at gpa 0x%llx",
+ __entry->gpa)
+);
+
+TRACE_EVENT(kvm_avic_physid_table_reload,
+   TP_PROTO(u64 gpa, int nentries, int new_nentires),
+   TP_ARGS(gpa, nentries, new_nentires),
+
+   TP_STRUCT__entry(
+   __field(u64, gpa)
+   __field(int, nentries)
+   __field(int, new_nentires)
+   ),
+
+   TP_fast_assign(
+   __entry->gpa = gpa;
+   __entry->nentries = nentries;
+   __entry->new_nentires = new_nentires;
+   ),
+
+   TP_printk("table at gpa 0x%llx, nentires %d -> %d",
+ __entry->gpa, __entry->nentries, __entry->new_nentires)
+);
+
+TRACE_EVENT(kvm_avic_physid_table_write,
+   TP_PROTO(u64 gpa, int bytes),
+   TP_ARGS(gpa, bytes),
+
+   TP_STRUCT__entry(
+   __field(u64, gpa)
+   __field(int, bytes)
+   ),
+
+   TP_fast_assign(
+   __entry->gpa = gpa;
+   __entry->bytes = bytes;
+   ),
+
+   TP_printk("gpa 0x%llx, write of %d bytes",
+ __entry->gpa, __entry->bytes)
+);
+
+TRACE_EVENT(kvm_avic_physid_update_vcpu_host,
+   TP_PROTO(int vcpu_id, int cpu_id, int n),
+   TP_ARGS(vcpu_id, cpu_id, n),
+
+   TP_STRUCT__entry(
+   __field(int, vcpu_id)
+   __field(int, cpu_id)
+   __field(int, n)
+   ),
+
+   TP_fast_assign(
+   __entry->vcpu_id = vcpu_id;
+   __entry->cpu_id = cpu_id;
+   __entry->n = n;
+   ),
+
+   TP_printk("l1 vcpu %d -> l0 cpu %d (%d entries)",
+ __entry->vcpu_id, __entry->cpu_id, __entry->n)
+);
+
+TRACE_EVENT(kvm_avic_physid_update_vcpu_guest,
+   TP_PROTO(int vcpu_id, int cpu_id),
+   TP_ARGS(vcpu_id, cpu_id),
+
+   TP_STRUCT__entry(
+   __field(int, vcpu_id)
+   __field(int, cpu_id)
+   ),
+
+   TP_fast_assign(
+   __entry->vcpu_id = vcpu_id;
+   __entry->cpu_id = cpu_id;
+   ),
+
+   TP_printk("l1 vcpu %d -> l0 cpu %d",
+ __entry->vcpu_id, __entry->cpu_id)
+);
+
+TRACE_EVENT(kvm_avic_nested_doorbell,
+   TP_PROTO(int source_l1_apicid, int target_l1_apicid, bool 
target_nested,
+   bool target_running),
+   TP_ARGS(source_l1_apicid, target_l1_apicid, target_nested,
+   target_running),
+
+   TP_STRUCT__entry(
+   __field(int, source_l1_apicid)
+   __field(int, target_l1_apicid)
+   __field(bool, target_nested)
+   __field(bool, target_running)
+   ),
+
+   TP_fast_assign(
+   __entry->source_l1_apicid = source_l1_apicid;
+   __entry->target_l1_apicid = target_l1_apicid;
+   __entry->target_nested = target_nested;
+   __entry->target_running = target_running;
+   ),
+
+   TP_printk("source %d target %d (nested: %d, running %d)",
+ __entry->source_l1_apicid, __entry->target_l1_apicid,
+ __entry->target_nested, __entry->target_running)
+);
+
+TRACE_EVENT(kvm_avic_nested_kick_vcpu,
+   TP_PROTO(int source_l1_apic_id, int target_l2_apic_id, int 
target_l1_apic_id),
+   TP_ARGS(source_l1_apic_id, target_l2_apic_id, target_l1_apic_id),
+
+   TP_STRUCT__entry(
+   __field(int, source_l1_apic_id)
+   __field(int, target_l2_apic_id)
+   __field(int, target_l1_apic_id)
+   ),
+
+   TP_fast_assign(

[RFC PATCH v3 08/19] KVM: x86: SVM: move avic state to separate struct

2022-04-27 Thread Maxim Levitsky
This will make the code a bit easier to read when nested AVIC support
is added.

No functional change intended.

Signed-off-by: Maxim Levitsky 
---
 arch/x86/kvm/svm/avic.c | 51 +++--
 arch/x86/kvm/svm/svm.h  | 14 ++-
 2 files changed, 37 insertions(+), 28 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 1102421668a11..e5cbbb97fbab6 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -69,6 +69,8 @@ int avic_ga_log_notifier(u32 ga_tag)
unsigned long flags;
struct kvm_svm *kvm_svm;
struct kvm_vcpu *vcpu = NULL;
+   struct kvm_svm_avic *avic;
+
u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
 
@@ -76,9 +78,13 @@ int avic_ga_log_notifier(u32 ga_tag)
trace_kvm_avic_ga_log(vm_id, vcpu_id);
 
	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
-   hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
-   if (kvm_svm->avic_vm_id != vm_id)
+   hash_for_each_possible(svm_vm_data_hash, avic, hnode, vm_id) {
+
+
+   if (avic->vm_id != vm_id)
continue;
+
+   kvm_svm = container_of(avic, struct kvm_svm, avic);
		vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
break;
}
@@ -98,18 +104,18 @@ int avic_ga_log_notifier(u32 ga_tag)
 void avic_vm_destroy(struct kvm *kvm)
 {
unsigned long flags;
-   struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+	struct kvm_svm_avic *avic = &to_kvm_svm(kvm)->avic;
 
if (!enable_apicv)
return;
 
-   if (kvm_svm->avic_logical_id_table_page)
-   __free_page(kvm_svm->avic_logical_id_table_page);
-   if (kvm_svm->avic_physical_id_table_page)
-   __free_page(kvm_svm->avic_physical_id_table_page);
+   if (avic->logical_id_table_page)
+   __free_page(avic->logical_id_table_page);
+   if (avic->physical_id_table_page)
+   __free_page(avic->physical_id_table_page);
 
	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
-	hash_del(&kvm_svm->hnode);
+	hash_del(&avic->hnode);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
 }
 
@@ -117,10 +123,9 @@ int avic_vm_init(struct kvm *kvm)
 {
unsigned long flags;
int err = -ENOMEM;
-   struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
-   struct kvm_svm *k2;
struct page *p_page;
struct page *l_page;
+	struct kvm_svm_avic *avic = &to_kvm_svm(kvm)->avic;
u32 vm_id;
 
if (!enable_apicv)
@@ -131,14 +136,14 @@ int avic_vm_init(struct kvm *kvm)
if (!p_page)
goto free_avic;
 
-   kvm_svm->avic_physical_id_table_page = p_page;
+   avic->physical_id_table_page = p_page;
 
/* Allocating logical APIC ID table (4KB) */
l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!l_page)
goto free_avic;
 
-   kvm_svm->avic_logical_id_table_page = l_page;
+   avic->logical_id_table_page = l_page;
 
	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
  again:
@@ -149,13 +154,15 @@ int avic_vm_init(struct kvm *kvm)
}
/* Is it still in use? Only possible if wrapped at least once */
if (next_vm_id_wrapped) {
-   hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
-   if (k2->avic_vm_id == vm_id)
+   struct kvm_svm_avic *avic2;
+
+   hash_for_each_possible(svm_vm_data_hash, avic2, hnode, vm_id) {
+   if (avic2->vm_id == vm_id)
goto again;
}
}
-   kvm_svm->avic_vm_id = vm_id;
-	hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
+	avic->vm_id = vm_id;
+	hash_add(svm_vm_data_hash, &avic->hnode, avic->vm_id);
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
 
return 0;
@@ -169,8 +176,8 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
 {
struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
-   phys_addr_t lpa = 
__sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
-   phys_addr_t ppa = 
__sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
+   phys_addr_t lpa = 
__sme_set(page_to_phys(kvm_svm->avic.logical_id_table_page));
+   phys_addr_t ppa = 
__sme_set(page_to_phys(kvm_svm->avic.physical_id_table_page));
 
vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
@@ -193,7 +200,7 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu 
*vcpu,
if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
return NULL;
 
-   avic_physical_id_table = 
page_address(kvm_svm->avic_physical_id_table_page);
+   avic_physical_id_table = 

[RFC PATCH v3 07/19] KVM: x86: mmu: tweak fast path for emulation of access to nested NPT pages

2022-04-27 Thread Maxim Levitsky
If a non-leaf mmu page is write-tracked externally for some reason
(which can in theory happen if it was previously used as a nested AVIC
physid page), then this code will enter an endless loop of page faults:
unprotecting the mmu page will not remove the write tracking, nor will
the write-tracking callback be called, because there is no mmu page at
this address.

Fix this by only invoking the fast path if we succeeded in zapping the
mmu page.

Fixes: 147277540bbc5 ("kvm: svm: Add support for additional SVM NPF error 
codes")
Signed-off-by: Maxim Levitsky 
---
 arch/x86/kvm/mmu/mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 633a3138d68e1..8f77d41e7fd80 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5341,8 +5341,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t 
cr2_or_gpa, u64 error_code,
 */
if (vcpu->arch.mmu->root_role.direct &&
(error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
-   kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
-   return 1;
+   if (kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)))
+   return 1;
}
 
/*
-- 
2.26.3



[RFC PATCH v3 06/19] KVM: x86: mmu: add gfn_in_memslot helper

2022-04-27 Thread Maxim Levitsky
This is a tiny refactoring that makes it a bit cleaner to check
whether a GPA/GFN is within a memslot.

Signed-off-by: Maxim Levitsky 
---
 include/linux/kvm_host.h | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 252ee4a61b58b..12e261559070b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1580,6 +1580,13 @@ int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args);
 
+
+static inline bool gfn_in_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+   return (gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages);
+}
+
+
 /*
  * Returns a pointer to the memslot if it contains gfn.
  * Otherwise returns NULL.
@@ -1590,12 +1597,13 @@ try_get_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
if (!slot)
return NULL;
 
-   if (gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages)
+   if (gfn_in_memslot(slot, gfn))
return slot;
else
return NULL;
 }
 
+
 /*
  * Returns a pointer to the memslot that contains gfn. Otherwise returns NULL.
  *
-- 
2.26.3
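
A standalone illustration of the new helper's semantics; the structure,
PAGE_SHIFT and gpa_to_gfn() below are simplified stand-ins for the kernel
definitions:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
typedef uint64_t gfn_t;
typedef uint64_t gpa_t;

struct kvm_memory_slot {
	gfn_t base_gfn;		/* first guest frame covered by the slot */
	uint64_t npages;	/* number of pages in the slot */
};

static bool gfn_in_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
{
	return gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages;
}

static gfn_t gpa_to_gfn(gpa_t gpa)
{
	return gpa >> PAGE_SHIFT;
}

int main(void)
{
	struct kvm_memory_slot slot = { .base_gfn = 0x100, .npages = 0x40 };

	printf("%d\n", gfn_in_memslot(&slot, gpa_to_gfn(0x110000)));	/* 1 */
	printf("%d\n", gfn_in_memslot(&slot, gpa_to_gfn(0x140000)));	/* 0 */
	return 0;
}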



[RFC PATCH v3 04/19] KVM: x86: mmu: allow to enable write tracking externally

2022-04-27 Thread Maxim Levitsky
This will be used to enable write tracking from the nested AVIC code,
and can also be used to enable write tracking in the GVT-g module
only when it actually uses it, as opposed to always enabling it
whenever the module is compiled into the kernel.

No functional change intended.
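
The lazy-enable pattern this patch renames (check the flag, take the lock,
re-check, allocate, then publish the flag with a release store so readers
doing an acquire load also see the allocations) can be modeled in plain C11;
everything below is an illustrative sketch, not the kernel code:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

static atomic_bool tracking_enabled;
static pthread_mutex_t enable_lock = PTHREAD_MUTEX_INITIALIZER;
static int *tracking_metadata;	/* stands in for per-memslot allocations */

static bool tracking_is_enabled(void)
{
	/* pairs with the release store in enable_tracking() */
	return atomic_load_explicit(&tracking_enabled, memory_order_acquire);
}

static int enable_tracking(void)
{
	if (tracking_is_enabled())		/* fast path, no lock */
		return 0;

	pthread_mutex_lock(&enable_lock);
	if (!tracking_is_enabled()) {		/* re-check under the lock */
		tracking_metadata = calloc(64, sizeof(*tracking_metadata));
		if (!tracking_metadata) {
			pthread_mutex_unlock(&enable_lock);
			return -1;
		}
		/* publish only after the metadata is fully set up */
		atomic_store_explicit(&tracking_enabled, true,
				      memory_order_release);
	}
	pthread_mutex_unlock(&enable_lock);
	return 0;
}

int main(void)
{
	enable_tracking();
	printf("enabled: %d\n", tracking_is_enabled());
	return 0;
}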

Signed-off-by: Maxim Levitsky 
---
 arch/x86/include/asm/kvm_host.h   |  2 +-
 arch/x86/include/asm/kvm_page_track.h |  1 +
 arch/x86/kvm/mmu.h|  8 +---
 arch/x86/kvm/mmu/mmu.c| 17 ++---
 arch/x86/kvm/mmu/page_track.c | 10 --
 5 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 636df87542555..fc7df778a3d71 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1254,7 +1254,7 @@ struct kvm_arch {
 * is used as one input when determining whether certain memslot
 * related allocations are necessary.
 */
-   bool shadow_root_allocated;
+   bool mmu_page_tracking_enabled;
 
 #if IS_ENABLED(CONFIG_HYPERV)
hpa_t   hv_root_tdp;
diff --git a/arch/x86/include/asm/kvm_page_track.h 
b/arch/x86/include/asm/kvm_page_track.h
index eb186bc57f6a9..955a5ae07b10e 100644
--- a/arch/x86/include/asm/kvm_page_track.h
+++ b/arch/x86/include/asm/kvm_page_track.h
@@ -50,6 +50,7 @@ int kvm_page_track_init(struct kvm *kvm);
 void kvm_page_track_cleanup(struct kvm *kvm);
 
 bool kvm_page_track_write_tracking_enabled(struct kvm *kvm);
+int kvm_page_track_write_tracking_enable(struct kvm *kvm);
 int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot);
 
 void kvm_page_track_free_memslot(struct kvm_memory_slot *slot);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 671cfeccf04e9..44d15551f7156 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -269,7 +269,7 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
 int kvm_mmu_post_init_vm(struct kvm *kvm);
 void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
 
-static inline bool kvm_shadow_root_allocated(struct kvm *kvm)
+static inline bool mmu_page_tracking_enabled(struct kvm *kvm)
 {
/*
 * Read shadow_root_allocated before related pointers. Hence, threads
@@ -277,9 +277,11 @@ static inline bool kvm_shadow_root_allocated(struct kvm 
*kvm)
 * see the pointers. Pairs with smp_store_release in
 * mmu_first_shadow_root_alloc.
 */
-	return smp_load_acquire(&kvm->arch.shadow_root_allocated);
+	return smp_load_acquire(&kvm->arch.mmu_page_tracking_enabled);
 }
 
+int mmu_enable_write_tracking(struct kvm *kvm);
+
 #ifdef CONFIG_X86_64
 static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return 
kvm->arch.tdp_mmu_enabled; }
 #else
@@ -288,7 +290,7 @@ static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { 
return false; }
 
 static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
 {
-   return !is_tdp_mmu_enabled(kvm) || kvm_shadow_root_allocated(kvm);
+   return !is_tdp_mmu_enabled(kvm) || mmu_page_tracking_enabled(kvm);
 }
 
 static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 904f0faff2186..fb744616bf7df 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3389,7 +3389,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
return r;
 }
 
-static int mmu_first_shadow_root_alloc(struct kvm *kvm)
+int mmu_enable_write_tracking(struct kvm *kvm)
 {
struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
@@ -3399,21 +3399,20 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm)
 * Check if this is the first shadow root being allocated before
 * taking the lock.
 */
-   if (kvm_shadow_root_allocated(kvm))
+   if (mmu_page_tracking_enabled(kvm))
return 0;
 
	mutex_lock(&kvm->slots_arch_lock);
 
/* Recheck, under the lock, whether this is the first shadow root. */
-   if (kvm_shadow_root_allocated(kvm))
+   if (mmu_page_tracking_enabled(kvm))
goto out_unlock;
 
/*
 * Check if anything actually needs to be allocated, e.g. all metadata
 * will be allocated upfront if TDP is disabled.
 */
-   if (kvm_memslots_have_rmaps(kvm) &&
-   kvm_page_track_write_tracking_enabled(kvm))
+   if (kvm_memslots_have_rmaps(kvm) && mmu_page_tracking_enabled(kvm))
goto out_success;
 
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
@@ -3443,7 +3442,7 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm)
 * all the related pointers are set.
 */
 out_success:
-	smp_store_release(&kvm->arch.shadow_root_allocated, true);
+	smp_store_release(&kvm->arch.mmu_page_tracking_enabled, true);
 
 out_unlock:
	mutex_unlock(&kvm->slots_arch_lock);
@@ -3480,7 +3479,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
}
}
 
- 

[RFC PATCH v3 05/19] x86: KVMGT: use kvm_page_track_write_tracking_enable

2022-04-27 Thread Maxim Levitsky
This allows write tracking to be enabled only when KVMGT is
actually used, so it doesn't carry any penalty otherwise.

Tested by booting a VM with a kvmgt mdev device.

Signed-off-by: Maxim Levitsky 
---
 arch/x86/kvm/Kconfig | 3 ---
 arch/x86/kvm/mmu/mmu.c   | 2 +-
 drivers/gpu/drm/i915/Kconfig | 1 -
 drivers/gpu/drm/i915/gvt/kvmgt.c | 5 +
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index e3cbd77061364..41341905d3734 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -126,7 +126,4 @@ config KVM_XEN
 
  If in doubt, say "N".
 
-config KVM_EXTERNAL_WRITE_TRACKING
-   bool
-
 endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index fb744616bf7df..633a3138d68e1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5753,7 +5753,7 @@ int kvm_mmu_init_vm(struct kvm *kvm)
node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
kvm_page_track_register_notifier(kvm, node);
 
-   if (IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) || !tdp_enabled)
+   if (!tdp_enabled)
mmu_enable_write_tracking(kvm);
 
return 0;
diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index 98c5450b8eacc..7d8346f4bae11 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -130,7 +130,6 @@ config DRM_I915_GVT_KVMGT
depends on DRM_I915_GVT
depends on KVM
depends on VFIO_MDEV
-   select KVM_EXTERNAL_WRITE_TRACKING
default n
help
  Choose this option if you want to enable KVMGT support for
diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index 057ec44901045..4c62ab3ef245d 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -1933,6 +1933,7 @@ static int kvmgt_guest_init(struct mdev_device *mdev)
struct intel_vgpu *vgpu;
struct kvmgt_vdev *vdev;
struct kvm *kvm;
+   int ret;
 
vgpu = mdev_get_drvdata(mdev);
if (handle_valid(vgpu->handle))
@@ -1948,6 +1949,10 @@ static int kvmgt_guest_init(struct mdev_device *mdev)
if (__kvmgt_vgpu_exist(vgpu, kvm))
return -EEXIST;
 
+   ret = kvm_page_track_write_tracking_enable(kvm);
+   if (ret)
+   return ret;
+
info = vzalloc(sizeof(struct kvmgt_guest_info));
if (!info)
return -ENOMEM;
-- 
2.26.3



[RFC PATCH v3 03/19] KVM: x86: SVM: remove avic's broken code that updated APIC ID

2022-04-27 Thread Maxim Levitsky
AVIC is now inhibited if the guest changes the APIC ID, so remove
that broken code.

Signed-off-by: Maxim Levitsky 
---
 arch/x86/kvm/svm/avic.c | 35 ---
 1 file changed, 35 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 54fe03714f8a6..1102421668a11 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -508,35 +508,6 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
return ret;
 }
 
-static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
-{
-   u64 *old, *new;
-   struct vcpu_svm *svm = to_svm(vcpu);
-   u32 id = kvm_xapic_id(vcpu->arch.apic);
-
-   if (vcpu->vcpu_id == id)
-   return 0;
-
-   old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
-   new = avic_get_physical_id_entry(vcpu, id);
-   if (!new || !old)
-   return 1;
-
-   /* We need to move physical_id_entry to new offset */
-   *new = *old;
-   *old = 0ULL;
-   to_svm(vcpu)->avic_physical_id_cache = new;
-
-   /*
-* Also update the guest physical APIC ID in the logical
-* APIC ID table entry if already setup the LDR.
-*/
-   if (svm->ldr_reg)
-   avic_handle_ldr_update(vcpu);
-
-   return 0;
-}
-
 static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
 {
struct vcpu_svm *svm = to_svm(vcpu);
@@ -555,10 +526,6 @@ static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
AVIC_UNACCEL_ACCESS_OFFSET_MASK;
 
switch (offset) {
-   case APIC_ID:
-   if (avic_handle_apic_id_update(vcpu))
-   return 0;
-   break;
case APIC_LDR:
if (avic_handle_ldr_update(vcpu))
return 0;
@@ -650,8 +617,6 @@ int avic_init_vcpu(struct vcpu_svm *svm)
 
 void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
 {
-   if (avic_handle_apic_id_update(vcpu) != 0)
-   return;
avic_handle_dfr_update(vcpu);
avic_handle_ldr_update(vcpu);
 }
-- 
2.26.3



[RFC PATCH v3 02/19] KVM: x86: inhibit APICv/AVIC when the guest and/or host changes apic id/base from the defaults.

2022-04-27 Thread Maxim Levitsky
Neither of these settings should be changed by the guest, and it is
a burden to support them in the acceleration code, so just inhibit
APICv/AVIC instead.

Also add a boolean 'apic_id_changed' to indicate whether the APIC ID was
ever changed.

Signed-off-by: Maxim Levitsky 
---
 arch/x86/include/asm/kvm_host.h |  3 +++
 arch/x86/kvm/lapic.c| 25 ++---
 arch/x86/kvm/lapic.h|  8 
 3 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 63eae00625bda..636df87542555 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1070,6 +1070,8 @@ enum kvm_apicv_inhibit {
APICV_INHIBIT_REASON_ABSENT,
/* AVIC is disabled because SEV doesn't support it */
APICV_INHIBIT_REASON_SEV,
+   /* APIC ID and/or APIC base was changed by the guest */
+   APICV_INHIBIT_REASON_RO_SETTINGS,
 };
 
 struct kvm_arch {
@@ -1258,6 +1260,7 @@ struct kvm_arch {
hpa_t   hv_root_tdp;
spinlock_t hv_root_tdp_lock;
 #endif
+   bool apic_id_changed;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 66b0eb0bda94e..8996675b3ef4c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2038,6 +2038,19 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic 
*apic, u32 lvt0_val)
}
 }
 
+static void kvm_lapic_check_initial_apic_id(struct kvm_lapic *apic)
+{
+   if (kvm_apic_has_initial_apic_id(apic))
+   return;
+
+   pr_warn_once("APIC ID change is unsupported by KVM");
+
+   kvm_set_apicv_inhibit(apic->vcpu->kvm,
+   APICV_INHIBIT_REASON_RO_SETTINGS);
+
+   apic->vcpu->kvm->arch.apic_id_changed = true;
+}
+
 static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 {
int ret = 0;
@@ -2046,9 +2059,11 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, 
u32 reg, u32 val)
 
switch (reg) {
case APIC_ID:   /* Local APIC ID */
-   if (!apic_x2apic_mode(apic))
+   if (!apic_x2apic_mode(apic)) {
+
kvm_apic_set_xapic_id(apic, val >> 24);
-   else
+   kvm_lapic_check_initial_apic_id(apic);
+   } else
ret = 1;
break;
 
@@ -2335,8 +2350,11 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 MSR_IA32_APICBASE_BASE;
 
if ((value & MSR_IA32_APICBASE_ENABLE) &&
-apic->base_address != APIC_DEFAULT_PHYS_BASE)
+apic->base_address != APIC_DEFAULT_PHYS_BASE) {
+   kvm_set_apicv_inhibit(apic->vcpu->kvm,
+   APICV_INHIBIT_REASON_RO_SETTINGS);
pr_warn_once("APIC base relocation is unsupported by KVM");
+   }
 }
 
 void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
@@ -2649,6 +2667,7 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
}
}
 
+   kvm_lapic_check_initial_apic_id(vcpu->arch.apic);
return 0;
 }
 
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 4e4f8a22754f9..b9c406d383080 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -252,4 +252,12 @@ static inline u8 kvm_xapic_id(struct kvm_lapic *apic)
return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
 }
 
+static inline bool kvm_apic_has_initial_apic_id(struct kvm_lapic *apic)
+{
+   if (apic_x2apic_mode(apic))
+   return true;
+
+   return kvm_xapic_id(apic) == apic->vcpu->vcpu_id;
+}
+
 #endif
-- 
2.26.3



[RFC PATCH v3 01/19] KVM: x86: document AVIC/APICv inhibit reasons

2022-04-27 Thread Maxim Levitsky
These days there are too many AVIC/APICv inhibit
reasons, and it doesn't hurt to have some documentation
for them.

Signed-off-by: Maxim Levitsky 
---
 arch/x86/include/asm/kvm_host.h | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f164c6c1514a4..63eae00625bda 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1046,14 +1046,29 @@ struct kvm_x86_msr_filter {
 };
 
 enum kvm_apicv_inhibit {
+   /* APICv/AVIC is disabled by module param and/or not supported in 
hardware */
APICV_INHIBIT_REASON_DISABLE,
+   /* APICv/AVIC is inhibited because AutoEOI feature is being used by a 
HyperV guest*/
APICV_INHIBIT_REASON_HYPERV,
+   /* AVIC is inhibited on a CPU because it runs a nested guest */
APICV_INHIBIT_REASON_NESTED,
+   /* AVIC is inhibited due to wait for an irq window (AVIC doesn't 
support this) */
APICV_INHIBIT_REASON_IRQWIN,
+   /*
+* AVIC is inhibited because i8254 're-inject' mode is used
+* which needs EOI intercept which AVIC doesn't support
+*/
APICV_INHIBIT_REASON_PIT_REINJ,
+   /* AVIC is inhibited because the guest has x2apic in its CPUID*/
APICV_INHIBIT_REASON_X2APIC,
+   /* AVIC/APICv is inhibited because KVM_GUESTDBG_BLOCKIRQ was enabled */
APICV_INHIBIT_REASON_BLOCKIRQ,
+   /*
+* AVIC/APICv is inhibited because the guest didn't yet
+* enable kernel/split irqchip
+*/
APICV_INHIBIT_REASON_ABSENT,
+   /* AVIC is disabled because SEV doesn't support it */
APICV_INHIBIT_REASON_SEV,
 };
 
-- 
2.26.3



[RFC PATCH v3 00/19] RFC: nested AVIC

2022-04-27 Thread Maxim Levitsky
This is V3 of my nested AVIC patches.

I fixed a few more bugs, and I also split the code into smaller patches.

Review is welcome!

Best regards,
Maxim Levitsky

Maxim Levitsky (19):
  KVM: x86: document AVIC/APICv inhibit reasons
  KVM: x86: inhibit APICv/AVIC when the guest and/or host changes apic
id/base from the defaults.
  KVM: x86: SVM: remove avic's broken code that updated APIC ID
  KVM: x86: mmu: allow to enable write tracking externally
  x86: KVMGT: use kvm_page_track_write_tracking_enable
  KVM: x86: mmu: add gfn_in_memslot helper
  KVM: x86: mmu: tweak fast path for emulation of access to nested NPT
pages
  KVM: x86: SVM: move avic state to separate struct
  KVM: x86: nSVM: add nested AVIC tracepoints
  KVM: x86: nSVM: implement AVIC's physid/logid table access helpers
  KVM: x86: nSVM: implement shadowing of AVIC's physical id table
  KVM: x86: nSVM: make nested AVIC physid write tracking be aware of the
host scheduling
  KVM: x86: nSVM: wire nested AVIC to nested guest entry/exit
  KVM: x86: rename .set_apic_access_page_addr to reload_apic_access_page
  KVM: x86: nSVM: add code to reload AVIC physid table when it is
invalidated
  KVM: x86: nSVM: implement support for nested AVIC vmexits
  KVM: x86: nSVM: implement nested AVIC doorbell emulation
  KVM: x86: SVM/nSVM: add optional non strict AVIC doorbell mode
  KVM: x86: nSVM: expose the nested AVIC to the guest

 arch/x86/include/asm/kvm-x86-ops.h|   2 +-
 arch/x86/include/asm/kvm_host.h   |  23 +-
 arch/x86/include/asm/kvm_page_track.h |   1 +
 arch/x86/kvm/Kconfig  |   3 -
 arch/x86/kvm/lapic.c  |  25 +-
 arch/x86/kvm/lapic.h  |   8 +
 arch/x86/kvm/mmu.h|   8 +-
 arch/x86/kvm/mmu/mmu.c|  21 +-
 arch/x86/kvm/mmu/page_track.c |  10 +-
 arch/x86/kvm/svm/avic.c   | 985 +++---
 arch/x86/kvm/svm/nested.c | 141 +++-
 arch/x86/kvm/svm/svm.c|  39 +-
 arch/x86/kvm/svm/svm.h| 166 -
 arch/x86/kvm/trace.h  | 157 +++-
 arch/x86/kvm/vmx/vmx.c|   8 +-
 arch/x86/kvm/x86.c|  19 +-
 drivers/gpu/drm/i915/Kconfig  |   1 -
 drivers/gpu/drm/i915/gvt/kvmgt.c  |   5 +
 include/linux/kvm_host.h  |  10 +-
 19 files changed, 1507 insertions(+), 125 deletions(-)

-- 
2.26.3




Re: [PATCH 10/11] dt-bindings: display: convert Arm Mali-DP to DT schema

2022-04-27 Thread Rob Herring
On Wed, Apr 27, 2022 at 12:25:27PM +0100, Andre Przywara wrote:
> The Arm Mali Display Processor (DP) 5xx/6xx is a series of IP that scans
> out a framebuffer and hands the pixels over to a digital signal encoder.
> It supports multiple layers, scaling and rotation.
> 
> Convert the existing DT binding to DT schema.
> 
> Signed-off-by: Andre Przywara 
> ---
>  .../bindings/display/arm,malidp.txt   |  68 --
>  .../bindings/display/arm,malidp.yaml  | 117 ++
>  2 files changed, 117 insertions(+), 68 deletions(-)
>  delete mode 100644 Documentation/devicetree/bindings/display/arm,malidp.txt
>  create mode 100644 Documentation/devicetree/bindings/display/arm,malidp.yaml
> 
> diff --git a/Documentation/devicetree/bindings/display/arm,malidp.txt 
> b/Documentation/devicetree/bindings/display/arm,malidp.txt
> deleted file mode 100644
> index 7a97a2b48c2a2..0
> --- a/Documentation/devicetree/bindings/display/arm,malidp.txt
> +++ /dev/null
> @@ -1,68 +0,0 @@
> -ARM Mali-DP
> -
> -The following bindings apply to a family of Display Processors sold as
> -licensable IP by ARM Ltd. The bindings describe the Mali DP500, DP550 and
> -DP650 processors that offer multiple composition layers, support for
> -rotation and scaling output.
> -
> -Required properties:
> -  - compatible: should be one of
> - "arm,mali-dp500"
> - "arm,mali-dp550"
> - "arm,mali-dp650"
> -depending on the particular implementation present in the hardware
> -  - reg: Physical base address and size of the block of registers used by
> -the processor.
> -  - interrupts: Interrupt list, as defined in 
> ../interrupt-controller/interrupts.txt,
> -interrupt client nodes.
> -  - interrupt-names: name of the engine inside the processor that will
> -use the corresponding interrupt. Should be one of "DE" or "SE".
> -  - clocks: A list of phandle + clock-specifier pairs, one for each entry
> -in 'clock-names'
> -  - clock-names: A list of clock names. It should contain:
> -  - "pclk": for the APB interface clock
> -  - "aclk": for the AXI interface clock
> -  - "mclk": for the main processor clock
> -  - "pxlclk": for the pixel clock feeding the output PLL of the 
> processor.
> -  - arm,malidp-output-port-lines: Array of u8 values describing the number
> -of output lines per channel (R, G and B).
> -
> -Required sub-nodes:
> -  - port: The Mali DP connection to an encoder input port. The connection
> -is modelled using the OF graph bindings specified in
> -Documentation/devicetree/bindings/graph.txt
> -
> -Optional properties:
> -  - memory-region: phandle to a node describing memory (see
> -Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt)
> -to be used for the framebuffer; if not present, the framebuffer may
> -be located anywhere in memory.
> -  - arm,malidp-arqos-high-level: integer of u32 value describing the ARQoS
> -levels of DP500's QoS signaling.
> -
> -
> -Example:
> -
> -/ {
> - ...
> -
> - dp0: malidp@6f20 {
> - compatible = "arm,mali-dp650";
> - reg = <0 0x6f20 0 0x2>;
> - memory-region = <_reserved>;
> - interrupts = <0 168 IRQ_TYPE_LEVEL_HIGH>,
> -  <0 168 IRQ_TYPE_LEVEL_HIGH>;
> - interrupt-names = "DE", "SE";
> - clocks = <>, <>, <>, <>;
> - clock-names = "pxlclk", "mclk", "aclk", "pclk";
> - arm,malidp-output-port-lines = /bits/ 8 <8 8 8>;
> - arm,malidp-arqos-high-level = <0xd000d000>;
> - port {
> - dp0_output: endpoint {
> - remote-endpoint = <_2_input>;
> - };
> - };
> - };
> -
> - ...
> -};
> diff --git a/Documentation/devicetree/bindings/display/arm,malidp.yaml 
> b/Documentation/devicetree/bindings/display/arm,malidp.yaml
> new file mode 100644
> index 0..86b636662f803
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/display/arm,malidp.yaml
> @@ -0,0 +1,117 @@
> +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> +%YAML 1.2
> +---
> +$id: http://devicetree.org/schemas/display/arm,malidp.yaml#
> +$schema: http://devicetree.org/meta-schemas/core.yaml#
> +
> +title: Arm Mali Display Processor (Mali-DP) binding
> +
> +maintainers:
> +  - Liviu Dudau 
> +  - Andre Przywara 
> +
> +description: |+
> +  The following bindings apply to a family of Display Processors sold as
> +  licensable IP by ARM Ltd. The bindings describe the Mali DP500, DP550 and
> +  DP650 processors that offer multiple composition layers, support for
> +  rotation and scaling output.
> +
> +properties:
> +  compatible:
> +enum:
> +  - arm,mali-dp500
> +  - arm,mali-dp550
> +  - arm,mali-dp650
> +
> +  reg:
> +maxItems: 1
> +
> +  interrupts:
> +items:
> +  - description:
> +  The interrupt used by the Display 

Re: [PATCH 09/11] dt-bindings: display: convert Arm HDLCD to DT schema

2022-04-27 Thread Rob Herring
On Wed, Apr 27, 2022 at 12:25:26PM +0100, Andre Przywara wrote:
> The Arm HDLCD is a display controller that scans out a framebuffer and
> hands a signal to a digital encoder to generate a DVI or HDMI signal.
> 
> Convert the existing DT binding to DT schema.
> 
> Signed-off-by: Andre Przywara 
> ---
>  .../devicetree/bindings/display/arm,hdlcd.txt | 79 
>  .../bindings/display/arm,hdlcd.yaml   | 91 +++
>  2 files changed, 91 insertions(+), 79 deletions(-)
>  delete mode 100644 Documentation/devicetree/bindings/display/arm,hdlcd.txt
>  create mode 100644 Documentation/devicetree/bindings/display/arm,hdlcd.yaml
> 
> diff --git a/Documentation/devicetree/bindings/display/arm,hdlcd.txt 
> b/Documentation/devicetree/bindings/display/arm,hdlcd.txt
> deleted file mode 100644
> index 78bc24296f3e4..0
> --- a/Documentation/devicetree/bindings/display/arm,hdlcd.txt
> +++ /dev/null
> @@ -1,79 +0,0 @@
> -ARM HDLCD
> -
> -This is a display controller found on several development platforms produced
> -by ARM Ltd and in more modern of its' Fast Models. The HDLCD is an RGB
> -streamer that reads the data from a framebuffer and sends it to a single
> -digital encoder (DVI or HDMI).
> -
> -Required properties:
> -  - compatible: "arm,hdlcd"
> -  - reg: Physical base address and length of the controller's registers.
> -  - interrupts: One interrupt used by the display controller to notify the
> -interrupt controller when any of the interrupt sources programmed in
> -the interrupt mask register have activated.
> -  - clocks: A list of phandle + clock-specifier pairs, one for each
> -entry in 'clock-names'.
> -  - clock-names: A list of clock names. For HDLCD it should contain:
> -  - "pxlclk" for the clock feeding the output PLL of the controller.
> -
> -Required sub-nodes:
> -  - port: The HDLCD connection to an encoder chip. The connection is modeled
> -using the OF graph bindings specified in
> -Documentation/devicetree/bindings/graph.txt.
> -
> -Optional properties:
> -  - memory-region: phandle to a node describing memory (see
> -Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt) 
> to be
> -used for the framebuffer; if not present, the framebuffer may be located
> -anywhere in memory.
> -
> -
> -Example:
> -
> -/ {
> - ...
> -
> - hdlcd@2b00 {
> - compatible = "arm,hdlcd";
> - reg = <0 0x2b00 0 0x1000>;
> - interrupts = ;
> - clocks = <>;
> - clock-names = "pxlclk";
> - port {
> - hdlcd_output: endpoint@0 {
> - remote-endpoint = <_enc_input>;
> - };
> - };
> - };
> -
> - /* HDMI encoder on I2C bus */
> - i2c@7ffa {
> - 
> - hdmi-transmitter@70 {
> - compatible = ".";
> - reg = <0x70>;
> - port@0 {
> - hdmi_enc_input: endpoint {
> - remote-endpoint = <_output>;
> - };
> -
> - hdmi_enc_output: endpoint {
> - remote-endpoint = <_1_port>;
> - };
> - };
> - };
> -
> - };
> -
> - hdmi1: connector@1 {
> - compatible = "hdmi-connector";
> - type = "a";
> - port {
> - hdmi_1_port: endpoint {
> - remote-endpoint = <_enc_output>;
> - };
> - };
> - };
> -
> - ...
> -};
> diff --git a/Documentation/devicetree/bindings/display/arm,hdlcd.yaml 
> b/Documentation/devicetree/bindings/display/arm,hdlcd.yaml
> new file mode 100644
> index 0..1fe8e07334152
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/display/arm,hdlcd.yaml
> @@ -0,0 +1,91 @@
> +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> +%YAML 1.2
> +---
> +$id: http://devicetree.org/schemas/display/arm,hdlcd.yaml#
> +$schema: http://devicetree.org/meta-schemas/core.yaml#
> +
> +title: Arm HDLCD display controller binding
> +
> +maintainers:
> +  - Liviu Dudau 
> +  - Andre Przywara 
> +
> +description: |+
> +  The Arm HDLCD is a display controller found on several development 
> platforms
> +  produced by ARM Ltd and in more modern of its Fast Models. The HDLCD is an
> +  RGB streamer that reads the data from a framebuffer and sends it to a 
> single
> +  digital encoder (DVI or HDMI).
> +
> +properties:
> +  compatible:
> +const: arm,hdlcd
> +
> +  reg:
> +maxItems: 1
> +
> +  interrupts:
> +maxItems: 1
> +
> +  clock-names:
> +const: pxlclk
> +
> +  clocks:
> +maxItems: 1
> +description: The input reference for the pixel clock.
> +
> +  memory-region:
> +maxItems: 1
> +description:
> +  

Re: [PATCH v2 1/4] drm/i915/gt: GEM_BUG_ON unexpected NULL at scatterlist walking

2022-04-27 Thread Matthew Auld

On 25/04/2022 17:24, Ramalingam C wrote:

While locating the start of the ccs scatterlist in the smem scatterlist,
the offset has to be the lmem object size plus the corresponding ccs data
size. Report a bug if the scatterlist terminates before that length.

Signed-off-by: Ramalingam C 
---
  drivers/gpu/drm/i915/gt/intel_migrate.c | 6 ++
  1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 9d552f30b627..29d761da02c4 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -687,6 +687,12 @@ static void get_ccs_sg_sgt(struct sgt_dma *it, u32 
bytes_to_cpy)
bytes_to_cpy -= len;
  
  		it->sg = __sg_next(it->sg);

+
+   /*
+* scatterlist supposed to be the size of
+* bytes_to_cpy + GET_CCS_BYTES(bytes_to_copy).
+*/
+   GEM_BUG_ON(!it->sg);


It will crash and burn anyway, with the below NULL deref. Not sure if 
BUG_ON() is really much better, but I guess with the additional comment,

Reviewed-by: Matthew Auld 


it->dma = sg_dma_address(it->sg);
it->max = it->dma + sg_dma_len(it->sg);
} while (bytes_to_cpy);


Re: [PATCH v2 2/4] drm/i915/gt: optimize the ccs_sz calculation per chunk

2022-04-27 Thread Matthew Auld

On 25/04/2022 17:24, Ramalingam C wrote:

Calculate the ccs_sz that needs to be emitted based on the src
and dst pages emitted per chunk, and handle the return value of emit_pte
for the ccs pages.

Signed-off-by: Ramalingam C 
---
  drivers/gpu/drm/i915/gt/intel_migrate.c | 36 +
  1 file changed, 12 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 29d761da02c4..463a6a14b5f9 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -647,17 +647,9 @@ static int scatter_list_length(struct scatterlist *sg)
  
  static void

  calculate_chunk_sz(struct drm_i915_private *i915, bool src_is_lmem,
-  int *src_sz, int *ccs_sz, u32 bytes_to_cpy,
-  u32 ccs_bytes_to_cpy)
+  int *src_sz, u32 bytes_to_cpy, u32 ccs_bytes_to_cpy)
  {
if (ccs_bytes_to_cpy) {
-   /*
-* We can only copy the ccs data corresponding to
-* the CHUNK_SZ of lmem which is
-* GET_CCS_BYTES(i915, CHUNK_SZ))
-*/
-   *ccs_sz = min_t(int, ccs_bytes_to_cpy, GET_CCS_BYTES(i915, 
CHUNK_SZ));
-
if (!src_is_lmem)
/*
 * When CHUNK_SZ is passed all the pages upto CHUNK_SZ
@@ -713,10 +705,10 @@ intel_context_migrate_copy(struct intel_context *ce,
struct drm_i915_private *i915 = ce->engine->i915;
u32 ccs_bytes_to_cpy = 0, bytes_to_cpy;
enum i915_cache_level ccs_cache_level;
-   int src_sz, dst_sz, ccs_sz;
u32 src_offset, dst_offset;
u8 src_access, dst_access;
struct i915_request *rq;
+   int src_sz, dst_sz;
bool ccs_is_src;
int err;
  
@@ -770,7 +762,7 @@ intel_context_migrate_copy(struct intel_context *ce,

}
  
  	do {

-   int len;
+   int len, ccs_sz;


This could be moved into the reduced scope below.

Reviewed-by: Matthew Auld 

  
  		rq = i915_request_create(ce);

if (IS_ERR(rq)) {
@@ -797,7 +789,7 @@ intel_context_migrate_copy(struct intel_context *ce,
if (err)
goto out_rq;
  
-		calculate_chunk_sz(i915, src_is_lmem, &src_sz, &ccs_sz,

+		calculate_chunk_sz(i915, src_is_lmem, &src_sz,
   bytes_to_cpy, ccs_bytes_to_cpy);
  
		len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem,

@@ -835,33 +827,29 @@ intel_context_migrate_copy(struct intel_context *ce,
if (err)
goto out_rq;
  
+			ccs_sz = GET_CCS_BYTES(i915, len);

			err = emit_pte(rq, &it_ccs, ccs_cache_level, false,
   ccs_is_src ? src_offset : dst_offset,
   ccs_sz);
+   if (err < 0)
+   goto out_rq;
+   if (err < ccs_sz) {
+   err = -EINVAL;
+   goto out_rq;
+   }
  
  			err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);

if (err)
goto out_rq;
  
-			/*

-* Using max of src_sz and dst_sz, as we need to
-* pass the lmem size corresponding to the ccs
-* blocks we need to handle.
-*/
-   ccs_sz = max_t(int, ccs_is_src ? ccs_sz : src_sz,
-  ccs_is_src ? dst_sz : ccs_sz);
-
err = emit_copy_ccs(rq, dst_offset, dst_access,
-   src_offset, src_access, ccs_sz);
+   src_offset, src_access, len);
if (err)
goto out_rq;
  
  			err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);

if (err)
goto out_rq;
-
-   /* Converting back to ccs bytes */
-   ccs_sz = GET_CCS_BYTES(rq->engine->i915, ccs_sz);
ccs_bytes_to_cpy -= ccs_sz;
}
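
For scale, a standalone model of the GET_CCS_BYTES() arithmetic discussed
above, assuming the one-CCS-byte-per-256-main-surface-bytes ratio of flat
CCS; treat the constant and the helper name here as assumptions, not the
driver's definitions:

#include <stdio.h>

#define NUM_BYTES_PER_CCS_BYTE	256
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static unsigned long get_ccs_bytes(unsigned long main_bytes)
{
	return DIV_ROUND_UP(main_bytes, NUM_BYTES_PER_CCS_BYTE);
}

int main(void)
{
	unsigned long chunk = 8ul << 20;	/* e.g. an 8 MiB lmem chunk */

	printf("%lu bytes of lmem -> %lu bytes of CCS data\n",
	       chunk, get_ccs_bytes(chunk));
	return 0;
}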
  


Re: [PATCH] video: hyperv_fb: Allow resolutions with size > 64 MB for Gen1

2022-04-27 Thread Helge Deller
On 4/27/22 15:47, Saurabh Sengar wrote:
> This patch fixes a bug where Gen1 VMs don't allow resolutions that need
> more than 64 MB of framebuffer (e.g. 7680x4320). An unnecessary PCI check
> limits Gen1 VRAM to the legacy PCI BAR size only (i.e. 64 MB), so any
> resolution requesting more than 64 MB (e.g. 7680x4320) would fail. The
> MMIO region backing this memory shouldn't be limited by the PCI BAR size.

Is that right?
Allocating more memory than what the PCI BAR states?
That sounds to me like the theoretically "now bigger" framebuffer memory could
overwrite other memory areas or mapped PCI BARs.
I'd like someone else from Microsoft to please comment/ack/nack.

Helge
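
For reference, the arithmetic behind the 64 MB limit mentioned in the
commit message, as a quick standalone check (assuming 32 bits per pixel):

#include <stdio.h>

int main(void)
{
	unsigned long width = 7680, height = 4320, bits_per_pixel = 32;
	unsigned long fb_bytes = width * height * (bits_per_pixel / 8);

	printf("7680x4320 framebuffer: %lu bytes (%.1f MiB)\n",
	       fb_bytes, fb_bytes / (1024.0 * 1024.0));
	printf("legacy Gen1 PCI BAR:   64 MiB\n");
	return 0;
}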

> Signed-off-by: Saurabh Sengar 
> ---
>  drivers/video/fbdev/hyperv_fb.c | 19 +--
>  1 file changed, 1 insertion(+), 18 deletions(-)
>
> diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c
> index c8e0ea2..58c304a 100644
> --- a/drivers/video/fbdev/hyperv_fb.c
> +++ b/drivers/video/fbdev/hyperv_fb.c
> @@ -1009,7 +1009,6 @@ static int hvfb_getmem(struct hv_device *hdev, struct 
> fb_info *info)
>   struct pci_dev *pdev  = NULL;
>   void __iomem *fb_virt;
>   int gen2vm = efi_enabled(EFI_BOOT);
> - resource_size_t pot_start, pot_end;
>   phys_addr_t paddr;
>   int ret;
>
> @@ -1060,23 +1059,7 @@ static int hvfb_getmem(struct hv_device *hdev, struct 
> fb_info *info)
>   dio_fb_size =
>   screen_width * screen_height * screen_depth / 8;
>
> - if (gen2vm) {
> - pot_start = 0;
> - pot_end = -1;
> - } else {
> - if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) ||
> - pci_resource_len(pdev, 0) < screen_fb_size) {
> - pr_err("Resource not available or (0x%lx < 0x%lx)\n",
> -(unsigned long) pci_resource_len(pdev, 0),
> -(unsigned long) screen_fb_size);
> - goto err1;
> - }
> -
> - pot_end = pci_resource_end(pdev, 0);
> - pot_start = pot_end - screen_fb_size + 1;
> - }
> -
> -	ret = vmbus_allocate_mmio(&par->mem, hdev, pot_start, pot_end,
> +	ret = vmbus_allocate_mmio(&par->mem, hdev, 0, -1,
> screen_fb_size, 0x10, true);
>   if (ret != 0) {
>   pr_err("Unable to allocate framebuffer memory\n");



Re: dim question: How to revert patches?

2022-04-27 Thread Helge Deller
Hi Daniel,

On 4/27/22 16:21, Daniel Vetter wrote:
> On Thu, Apr 14, 2022 at 10:37:55PM +0200, Helge Deller wrote:
>> Hello dri-devel & dim users,
>
> Apologies for late reply, I'm way behind on stuff.
>
>> I committed this patch to the drm-misc-next branch:
>>
>> commit d6cd978f7e6b6f6895f8d0c4ce6e5d2c8e979afe
>> video: fbdev: fbmem: fix pointer reference to null device field
>>
>> then I noticed that it was fixed already in another branch which led to this 
>> error:
>>
>> Merging drm-misc/drm-misc-next... dim:
>> dim: FAILURE: Could not merge drm-misc/drm-misc-next
>> dim: See the section "Resolving Conflicts when Rebuilding drm-tip"
>> dim: in the drm-tip.rst documentation for how to handle this situation.
>>
>> I fixed it by reverting that patch above with this new commit in the 
>> drm-misc-next branch:
>>
>> commit cabfa2bbe617ddf0a0cc4d01f72b584dae4939ad (HEAD -> drm-misc-next, 
>> drm-misc/for-linux-next, drm-misc/drm-misc-next)
>> Author: Helge Deller 
>> Revert "video: fbdev: fbmem: fix pointer reference to null device field"
>>
>> My question (as "dim" newbie):
>> Was that the right solution?
>
> The patch wasn't really broken, so revert feels a bit silly. The hint was
> to look at the documentation referenced by the error message - the issue
> was only in rebuilding the integration tree:
>
> https://drm.pages.freedesktop.org/maintainer-tools/drm-tip.html#resolving-conflicts-when-rebuilding-drm-tip
>
> This should cover you even for really rare conflict situations.
>
>> Is there a possibility to drop those two patches from the drm-misc-next 
>> branch before it gets pushed upstream?
>
> It's a shared tree, mistakes are forever. The only time we did a forced
> push ever is when someone managed to push their local pile of hacks or
> something, and we're catching those pretty well now with a server-side
> test to make sure you're using dim to push.
>
> It's also no big deal, and next time you get a conflict just resolve it
> in drm-tip per the docs and it's all fine.

Thanks for the feedback!
So, basically I think I did the right thing (although a revert isn't nice).
There was no other useful fixup I could have come up with, because the other
conflicting patch had the right & better solution already pushed.

Helge


Re: [PATCH 0/3] HDR aux backlight range calculation

2022-04-27 Thread Lyude Paul
Hey! I will try to test this out ASAP on all of the HDR backlight machines I
have (so, many :) at some point this week, will let you know when

On Tue, 2022-04-26 at 15:30 +0300, Jouni Högander wrote:
> This patch set splits out static hdr metadata backlight range parsing
> from gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c into gpu/drm/drm_edid.c as
> a new function. This new function is then used in amdgpu_dm.c and
> intel_dp_aux_backlight.c
> 
> Cc: Maarten Lankhorst 
> Cc: Rodrigo Siqueira 
> Cc: Harry Wentland 
> Cc: Lyude Paul 
> Cc: Mika Kahola 
> Cc: Jani Nikula 
> 
> Jouni Högander (3):
>   drm: New function to get luminance range based on static hdr metadata
>   drm/amdgpu_dm: Use split out luminance calculation function
>   drm/i915: Use luminance range from static hdr metadata
> 
>  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 35 ++--
>  drivers/gpu/drm/drm_edid.c    | 55 +++
>  .../drm/i915/display/intel_dp_aux_backlight.c |  9 ++-
>  include/drm/drm_edid.h    |  4 ++
>  4 files changed, 70 insertions(+), 33 deletions(-)
> 

-- 
Cheers,
 Lyude Paul (she/her)
 Software Engineer at Red Hat
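
As background for testing: the range being parsed here comes from the EDID's
static HDR metadata block. A rough sketch of the CTA-861.3 conversion, purely
illustrative (the helper the series actually adds may differ in name,
placement and fixed-point handling):

/*
 * Illustrative only: the CTA-861.3 encoding of the desired content
 * luminance values found in the HDR static metadata block.  max_cv and
 * min_cv are the raw 8-bit code values from the EDID; the helper added
 * by this series may differ in name and use fixed-point math instead.
 */
#include <math.h>

static void luminance_range_from_cv(unsigned int max_cv, unsigned int min_cv,
				    double *max_nits, double *min_nits)
{
	/* Desired content max luminance: 50 * 2^(CV/32) cd/m2 */
	*max_nits = 50.0 * pow(2.0, max_cv / 32.0);

	/* Desired content min luminance: max * (CV/255)^2 / 100 */
	*min_nits = *max_nits * (min_cv / 255.0) * (min_cv / 255.0) / 100.0;
}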



Re: [Intel-gfx] [PATCH 0/2] Initial GuC firmware release for DG2

2022-04-27 Thread Timo Aaltonen

john.c.harri...@intel.com kirjoitti 27.4.2022 klo 19.55:

From: John Harrison 

Add GuC firmware for DG2.

Note that an older version of this patch exists in the CI topic
branch. Hence this set includes a revert of that patch before applying
the new version. When merging, the revert would simply be dropped and
the corresponding patch in the topic branch would also be dropped.

Signed-off-by: John Harrison 


John Harrison (2):
   Revert "drm/i915/dg2: Define GuC firmware version for DG2"
   drm/i915/dg2: Define GuC firmware version for DG2

  drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)



The firmware is not public yet, though? Shouldn't it have been sent 
upstream already? Same complaint applies to DMC.



--
t


Re: [PATCH v2] drm/msm: add trace_dma_fence_emit to msm_gpu_submit

2022-04-27 Thread Chia-I Wu
On Wed, Apr 27, 2022 at 9:07 AM Rob Clark  wrote:
>
> On Tue, Apr 26, 2022 at 11:20 PM Christian König
>  wrote:
> >
> > Am 26.04.22 um 20:50 schrieb Chia-I Wu:
> > > On Tue, Apr 26, 2022 at 11:02 AM Christian König
> > >  wrote:
> > >> Am 26.04.22 um 19:40 schrieb Chia-I Wu:
> > >>> [SNIP]
> > >> Well I just send a patch to completely remove the trace point.
> > >>
> > >> As I said it absolutely doesn't make sense to use this for
> > >> visualization, that's what the trace_dma_fence_init trace point is 
> > >> good for.
> > >>> I am a bit confused by this.  _emit and _signaled are a great way to
> > >>> see how many fences are pending from cpu's point of view.  How does
> > >>> _emit make no sense and _init is good instead?
> > >> We had exactly that confusion now multiple times and it's one of the
> > >> main reasons why I want to remove the _emit trace point.
> > >>
> > >> See, when you want to know how many fences are pending you need to
> > >> watch out for init/destroy and *NOT* emit.
> > >>
> > >> The reason is that in the special case where emit makes sense (e.g. the
> > >> GPU scheduler fences) emit comes later than init, but pending on the CPU
> > >> and taking up resources are all fences and not just the one emitted to
> > >> the hardware.
> > > I am more interested in pending on the GPU.
> > >
> > >> On the other hand when you want to measure how much time each operation
> > >> took on the hardware you need to take a look at the differences of the
> > >> signal events on each timeline.
> > > _signaled alone is not enough when the GPU is not always busy.  After
> > > the last pending fence signals but before the following _init/_emit,
> > > nothing is pending.
> >
> > Yeah, I'm perfectly aware of that.
> >
> > > For all drivers except virtio-gpu, _init and "ring head update" always
> > > happen close enough that I can see why _emit is redundant.  But I like
> > > having _emit as a generic tracepoint for timelines where _init and
> > > _emit can be apart, instead of requiring a special case tracepoint for
> > > each special case timeline.
> >
> > And I'm certainly not going to add _emit to all drivers just because of
> > that. As you said it is a special case for virtio-gpu and the GPU scheduler.
> >
> > And as I explained before the difference between _init and _emit
> > shouldn't matter to your visualization. The background is that as soon
> > as a dma_fence is initialized with _init it is "live" regarding the
> > dependency and memory management and exactly that's what matters for the
> > visualization.
> >
> > The latency between _init and _emit is just interesting for debugging
> > the scheduler and surprisingly virtio-gpu as well, for all other use
> > cases it is irrelevant.
>
> It might actually be *more* interesting for virtio-gpu.. unless there
> is some other way to link guest and host fences to see what the
> potential latency of guest->host is
>
> re: adding the tracepoint to other drivers, I'm fine with folks doing
> that as needed.  Unless you have a better proposal about how to
> visualize init vs emit latency, I think it's fine to have an extra
> tracepoint even if it is redundant in some cases.  The visualization
> tool is the customer here, we have to give it what it wants/needs.
As far as perfetto is concerned, I just use either _init or _emit on a
per-timeline basis.  We can drop this patch for msm, and do not need
to change drivers whose latencies between _init/_emit are ignorable.

init vs emit latency is still interesting.  I prefer keeping _init /
_emit as generic events that tools can parse, rather than adding
per-driver special cases that tools need to understand.
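
For readers following along, what the msm patch under discussion does is
essentially the following (a sketch; the example_* names are placeholders,
not msm symbols):

/*
 * Sketch only: call the existing dma_fence_emit tracepoint at the point
 * where the fence becomes pending on the hardware.  example_gpu and the
 * ring helpers are placeholders, not msm functions.
 */
#include <linux/dma-fence.h>
#include <trace/events/dma_fence.h>

struct example_gpu;					/* placeholder driver state */
static void example_ring_kick(struct example_gpu *gpu);	/* placeholder */

static void example_submit_to_ring(struct example_gpu *gpu,
				   struct dma_fence *fence)
{
	/* ... commands for this job have been written to the ring ... */

	trace_dma_fence_emit(fence);	/* fence is now pending on the GPU */

	example_ring_kick(gpu);		/* placeholder: ring the doorbell */
}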

>
> BR,
> -R
>
> >
> > Regards,
> > Christian.
> >
> > >> So there isn't really any use case for the emit trace point, except when
> > >> you want to figure out how much latency the scheduler introduce. Then
> > >> you want to take a look at init and emit, but that isn't really that
> > >> interesting for performance analyses.
> > >>
> > >> Regards,
> > >> Christian.
> > >>
> >


Re: [Intel-gfx] [PATCH v2] drm/doc: add rfc section for small BAR uapi

2022-04-27 Thread Matthew Auld

On 27/04/2022 09:36, Tvrtko Ursulin wrote:


On 20/04/2022 18:13, Matthew Auld wrote:

Add an entry for the new uapi needed for small BAR on DG2+.

v2:
   - Some spelling fixes and other small tweaks. (Akeem & Thomas)
   - Rework error capture interactions, including no longer needing
 NEEDS_CPU_ACCESS for objects marked for capture. (Thomas)
   - Add probed_cpu_visible_size. (Lionel)

Signed-off-by: Matthew Auld 
Cc: Thomas Hellström 
Cc: Lionel Landwerlin 
Cc: Jon Bloomfield 
Cc: Daniel Vetter 
Cc: Jordan Justen 
Cc: Kenneth Graunke 
Cc: Akeem G Abodunrin 
Cc: mesa-...@lists.freedesktop.org
---
  Documentation/gpu/rfc/i915_small_bar.h   | 190 +++
  Documentation/gpu/rfc/i915_small_bar.rst |  58 +++
  Documentation/gpu/rfc/index.rst  |   4 +
  3 files changed, 252 insertions(+)
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.h
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.rst

diff --git a/Documentation/gpu/rfc/i915_small_bar.h 
b/Documentation/gpu/rfc/i915_small_bar.h

new file mode 100644
index ..7bfd0cf44d35
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_small_bar.h
@@ -0,0 +1,190 @@
+/**
+ * struct __drm_i915_memory_region_info - Describes one region as 
known to the

+ * driver.
+ *
+ * Note this is using both struct drm_i915_query_item and struct 
drm_i915_query.
+ * For this new query we are adding the new query id 
DRM_I915_QUERY_MEMORY_REGIONS

+ * at &drm_i915_query_item.query_id.
+ */
+struct __drm_i915_memory_region_info {
+    /** @region: The class:instance pair encoding */
+    struct drm_i915_gem_memory_class_instance region;
+
+    /** @rsvd0: MBZ */
+    __u32 rsvd0;
+
+    /** @probed_size: Memory probed by the driver (-1 = unknown) */
+    __u64 probed_size;
+
+    /** @unallocated_size: Estimate of memory remaining (-1 = 
unknown) */

+    __u64 unallocated_size;
+
+    union {
+    /** @rsvd1: MBZ */
+    __u64 rsvd1[8];
+    struct {
+    /**
+ * @probed_cpu_visible_size: Memory probed by the driver
+ * that is CPU accessible. (-1 = unknown).
+ *
+ * This will be always be <= @probed_size, and the
+ * remainder(if there is any) will not be CPU
+ * accessible.
+ */
+    __u64 probed_cpu_visible_size;


Would unallocated_cpu_visible_size be useful, to follow the total 
unallocated_size?


Makes sense. But I don't think unallocated_size has actually been
properly wired up yet. It still just gives the same value as
probed_size. IIRC for unallocated_size we still need a real
user/usecase/umd before wiring that up for real with the existing avail
tracking. Once we have that we can also add unallocated_cpu_visible_size.
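
For completeness, this is roughly how userspace reads those sizes back
through the existing query uapi; note that probed_cpu_visible_size is the
field proposed in this RFC, so only a kernel implementing it will fill it in
(on older kernels those bytes are just MBZ rsvd1 space):

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static struct drm_i915_query_memory_regions *query_regions(int fd)
{
	struct drm_i915_query_item item = {
		.query_id = DRM_I915_QUERY_MEMORY_REGIONS,
	};
	struct drm_i915_query query = {
		.num_items = 1,
		.items_ptr = (uintptr_t)&item,
	};
	struct drm_i915_query_memory_regions *info;

	/* First pass: ask the kernel for the required blob length. */
	if (ioctl(fd, DRM_IOCTL_I915_QUERY, &query) || item.length <= 0)
		return NULL;

	info = calloc(1, item.length);
	item.data_ptr = (uintptr_t)info;

	/* Second pass: fill in the region array. */
	if (ioctl(fd, DRM_IOCTL_I915_QUERY, &query)) {
		free(info);
		return NULL;
	}
	return info;	/* caller frees; iterate info->regions[0..num_regions) */
}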




Btw, have we ever considered whether unallocated_size should require 
CAP_SYS_ADMIN/PERFMON or something?


Not sure. But just in case we do add it for real at some point, why the
added restriction?





+    };
+    };
+};
+
+/**
+ * struct __drm_i915_gem_create_ext - Existing gem_create behaviour, 
with added

+ * extension support using struct i915_user_extension.
+ *
+ * Note that new buffer flags should be added here, at least for the 
stuff that
+ * is immutable. Previously we would have two ioctls, one to create 
the object
+ * with gem_create, and another to apply various parameters, however 
this
+ * creates some ambiguity for the params which are considered 
immutable. Also in

+ * general we're phasing out the various SET/GET ioctls.
+ */
+struct __drm_i915_gem_create_ext {
+    /**
+ * @size: Requested size for the object.
+ *
+ * The (page-aligned) allocated size for the object will be 
returned.

+ *
+ * Note that for some devices we have might have further minimum
+ * page-size restrictions (larger than 4K), like for device
+ * local-memory.

+ * However in general the final size here should always reflect any
+ * rounding up, if for example using the 
I915_GEM_CREATE_EXT_MEMORY_REGIONS

+ * extension to place the object in device local-memory.
+ */
+    __u64 size;
+    /**
+ * @handle: Returned handle for the object.
+ *
+ * Object handles are nonzero.
+ */
+    __u32 handle;
+    /**
+ * @flags: Optional flags.
+ *
+ * Supported values:
+ *
+ * I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS - Signal to the 
kernel that

+ * the object will need to be accessed via the CPU.
+ *
+ * Only valid when placing objects in I915_MEMORY_CLASS_DEVICE, and
+ * only strictly required on platforms where only some of the device
+ * memory is directly visible or mappable through the CPU, like 
on DG2+.

+ *
+ * One of the placements MUST also be I915_MEMORY_CLASS_SYSTEM, to
+ * ensure we can always spill the allocation to system memory, if we
+ * can't place the object in the mappable part of
+ * I915_MEMORY_CLASS_DEVICE.
+ *
+ * Note that since the kernel only supports 

Re: [PATCH v3] drm/doc: Add sections about tiny drivers and external refs to intro page

2022-04-27 Thread Javier Martinez Canillas
Hello Daniel,

On 4/27/22 17:29, Daniel Vetter wrote:
> On Wed, Apr 20, 2022 at 09:24:11AM +0200, Javier Martinez Canillas wrote:
>> Learning about the DRM subsystem could be quite overwhelming for newcomers
>> but there are lots of useful talks, slides and articles available that can
>> help to understand the needed concepts and ease the learning curve.
>>
>> There are also simple DRM drivers that can be used as examples of how a
>> DRM driver should look.
>>
>> Add sections to the introduction page, that contains references to these.
>>
>> Suggested-by: Daniel Vetter 
>> Signed-off-by: Javier Martinez Canillas 
>> Acked-by: Pekka Paalanen 
>> Acked-by: Thomas Zimmermann 
> 
> Maybe needs more acks to land?
> 
> Acked-by: Daniel Vetter 
>

Thanks! But this landed already a few days ago in drm-misc-next:

https://cgit.freedesktop.org/drm/drm-misc/commit/?id=e41a2999f746
 
> Would be good if we could hand out links to pretty htmldocs instead of lore
> links to this patch, the latter is rather hard on the eyes :-)
>

Agreed :)
 -- 
Best regards,

Javier Martinez Canillas
Linux Engineering
Red Hat



Re: [PATCH v2 4/4] uapi/drm/i915: Document memory residency and Flat-CCS capability of obj

2022-04-27 Thread Matthew Auld

On 25/04/2022 17:24, Ramalingam C wrote:

Capture the impact of an object's memory region preference list on
its memory residency and on the Flat-CCS capability of the object.

v2:
   Fix the Flat-CCS capability of an obj with {lmem, smem} preference
   list [Thomas]

Signed-off-by: Ramalingam C 
cc: Matthew Auld 
cc: Thomas Hellstrom 
---
  include/uapi/drm/i915_drm.h | 18 ++
  1 file changed, 18 insertions(+)

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 35ca528803fd..ad191ed6547c 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -3393,6 +3393,24 @@ struct drm_i915_gem_create_ext {
   * At which point we get the object handle in &drm_i915_gem_create_ext.handle,
   * along with the final object size in &drm_i915_gem_create_ext.size, which
   * should account for any rounding up, if required.
+ *
+ * Objects with multiple memory regions in the preference list will be backed
+ * by one of the memory regions mentioned in the preference list. Though I915
+ * tries to honour the order of the memory regions in the preference list,
+ * based on the memory pressure of the regions, objects' backing region
+ * will be selected.
+ *
+ * Userspace has no means of knowing the backing region for such objects.


"Note that userspace has no means of knowing the current backing region 
for objects where @num_regions is larger than one. The kernel will only 
ensure that the priority order of the @regions array is honoured, either 
when initially placing the object, or when moving memory around due to 
memory pressure."



+ *
+ * On Flat-CCS capable HW, compression is supported for the objects residing
+ * in I915_MEMORY_CLASS_DEVICE. When such objects (compressed) has other
+ * memory class in preference list and migrated (by I915, due to memory
+ * constrain) to the non I915_MEMORY_CLASS_DEVICE region, then I915 needs to
+ * decompress the content. But I915 dont have the required information to


"doesn't", also prefer @regions etc instead of "preference list"

Anyway,
Reviewed-by: Matthew Auld 


+ * decompress the userspace compressed objects.
+ *
+ * So I915 supports Flat-CCS, only on the objects which can reside only on
+ * I915_MEMORY_CLASS_DEVICE regions.
+ */
  struct drm_i915_gem_create_ext_memory_regions {
/** @base: Extension link. See struct i915_user_extension. */


[PATCH 2/2] drm/i915/dg2: Define GuC firmware version for DG2

2022-04-27 Thread John . C . Harrison
From: John Harrison 

First release of GuC for DG2.

Signed-off-by: John Harrison 
CC: Tomasz Mistat 
CC: Ramalingam C 
CC: Daniele Ceraolo Spurio 
---
 drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c 
b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
index a876d39e6bcf..d078f884b5e3 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
@@ -53,6 +53,7 @@ void intel_uc_fw_change_status(struct intel_uc_fw *uc_fw,
  * firmware as TGL.
  */
 #define INTEL_GUC_FIRMWARE_DEFS(fw_def, guc_def) \
+   fw_def(DG2,  0, guc_def(dg2,  70, 1, 2)) \
fw_def(ALDERLAKE_P,  0, guc_def(adlp, 70, 1, 1)) \
fw_def(ALDERLAKE_S,  0, guc_def(tgl,  70, 1, 1)) \
fw_def(DG1,  0, guc_def(dg1,  70, 1, 1)) \
-- 
2.25.1



[PATCH 1/2] Revert "drm/i915/dg2: Define GuC firmware version for DG2"

2022-04-27 Thread John . C . Harrison
From: John Harrison 

This reverts commit 55c7f980e48e56861496526e02ed5bbfdac49ede.

The CI topic branch within drm-tip contains an old patch for
supporting GuC on DG2. That needs to be dropped and an updated patch
merged to drm-gt-next. Hence this patch reverts it so the new patch
can be sent in its correct form for CI testing.

Signed-off-by: John Harrison 
---
 drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c 
b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
index cb5dd16421d0..a876d39e6bcf 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
@@ -53,7 +53,6 @@ void intel_uc_fw_change_status(struct intel_uc_fw *uc_fw,
  * firmware as TGL.
  */
 #define INTEL_GUC_FIRMWARE_DEFS(fw_def, guc_def) \
-   fw_def(DG2,  0, guc_def(dg2,  70, 1, 1)) \
fw_def(ALDERLAKE_P,  0, guc_def(adlp, 70, 1, 1)) \
fw_def(ALDERLAKE_S,  0, guc_def(tgl,  70, 1, 1)) \
fw_def(DG1,  0, guc_def(dg1,  70, 1, 1)) \
-- 
2.25.1



[PATCH 0/2] Initial GuC firmware release for DG2

2022-04-27 Thread John . C . Harrison
From: John Harrison 

Add GuC firmware for DG2.

Note that an older version of this patch exists in the CI topic
branch. Hence this set includes a revert of that patch before applying
the new version. When merging, the revert would simply be dropped and
the corresponding patch in the topic branch would also be dropped.

Signed-off-by: John Harrison 


John Harrison (2):
  Revert "drm/i915/dg2: Define GuC firmware version for DG2"
  drm/i915/dg2: Define GuC firmware version for DG2

 drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

-- 
2.25.1



Re: [RESEND PATCH v3] drm/cma-helper: Describe what a "contiguous chunk" actually means

2022-04-27 Thread Lucas Stach
Am Mittwoch, dem 27.04.2022 um 15:09 +0100 schrieb Daniel Thompson:
> Since it's inception in 2012 it has been understood that the DRM GEM CMA
> helpers do not depend on CMA as the backend allocator. In fact the first
> bug fix to ensure the cma-helpers work correctly with an IOMMU backend
> appeared in 2014. However currently the documentation for
> drm_gem_cma_create() talks about "a contiguous chunk of memory" without
> making clear which address space it will be a contiguous part of.
> Additionally the CMA introduction is actively misleading because it only
> contemplates the CMA backend.
> 
> This matters because when the device accesses the bus through an IOMMU
> (and don't use the CMA backend) then the allocated memory is contiguous
> only in the IOVA space. This is a significant difference compared to the
> CMA backend and the behaviour can be a surprise even to someone who does
> a reasonable level of code browsing (but doesn't find all the relevant
> function pointers ;-) ).
> 
> Improve the kernel doc comments accordingly.
> 
> Signed-off-by: Daniel Thompson 

Reviewed-by: Lucas Stach 
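
For readers less familiar with these helpers, a minimal usage sketch of what
the updated documentation below describes (assuming the drm_gem_cma_* API as
of this patch; names may change in later releases):

#include <linux/err.h>
#include <drm/drm_gem_cma_helper.h>

static int example_alloc_fb_bo(struct drm_device *drm, size_t size)
{
	struct drm_gem_cma_object *cma_obj;

	cma_obj = drm_gem_cma_create(drm, size);
	if (IS_ERR(cma_obj))
		return PTR_ERR(cma_obj);

	/*
	 * cma_obj->paddr is the address the device should use; whether the
	 * backing pages are also physically contiguous depends on whether
	 * the device sits behind an IOMMU, exactly as the updated
	 * documentation explains.
	 */
	return 0;
}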

> ---
> 
> Notes:
> RESEND is unaltered but rebased on v5.18-rc3.
> 
> Changes in v3:
> - Rebased on v5.17-rc2
> - Minor improvements to wording.
> 
> Changes in v2:
> - Oops. I did a final proof read and accidentally committed these
>   changes as a seperate patch. This means that v1 contains only
>   one tenth of the actual patch. This is fixed in v2. Many apologies
>   for the noise!
> 
>  drivers/gpu/drm/drm_gem_cma_helper.c | 39 +---
>  1 file changed, 29 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_gem_cma_helper.c 
> b/drivers/gpu/drm/drm_gem_cma_helper.c
> index f36734c2c9e1..42abee9a0f4f 100644
> --- a/drivers/gpu/drm/drm_gem_cma_helper.c
> +++ b/drivers/gpu/drm/drm_gem_cma_helper.c
> @@ -26,12 +26,22 @@
>  /**
>   * DOC: cma helpers
>   *
> - * The Contiguous Memory Allocator reserves a pool of memory at early boot
> - * that is used to service requests for large blocks of contiguous memory.
> + * The DRM GEM/CMA helpers are a means to provide buffer objects that are
> + * presented to the device as a contiguous chunk of memory. This is useful
> + * for devices that do not support scatter-gather DMA (either directly or
> + * by using an intimately attached IOMMU).
>   *
> - * The DRM GEM/CMA helpers use this allocator as a means to provide buffer
> - * objects that are physically contiguous in memory. This is useful for
> - * display drivers that are unable to map scattered buffers via an IOMMU.
> + * Despite the name, the DRM GEM/CMA helpers are not hardwired to use the
> + * Contiguous Memory Allocator (CMA).
> + *
> + * For devices that access the memory bus through an (external) IOMMU then
> + * the buffer objects are allocated using a traditional page-based
> + * allocator and may be scattered through physical memory. However they
> + * are contiguous in the IOVA space so appear contiguous to devices using
> + * them.
> + *
> + * For other devices then the helpers rely on CMA to provide buffer
> + * objects that are physically contiguous in memory.
>   *
>   * For GEM callback helpers in struct &drm_gem_object functions, see likewise
>   * named functions with an _object_ infix (e.g., drm_gem_cma_object_vmap() 
> wraps
> @@ -111,8 +121,14 @@ __drm_gem_cma_create(struct drm_device *drm, size_t 
> size, bool private)
>   * @drm: DRM device
>   * @size: size of the object to allocate
>   *
> - * This function creates a CMA GEM object and allocates a contiguous chunk of
> - * memory as backing store.
> + * This function creates a CMA GEM object and allocates memory as backing 
> store.
> + * The allocated memory will occupy a contiguous chunk of bus address space.
> + *
> + * For devices that are directly connected to the memory bus then the 
> allocated
> + * memory will be physically contiguous. For devices that access through an
> + * IOMMU, then the allocated memory is not expected to be physically 
> contiguous
> + * because having contiguous IOVAs is sufficient to meet a devices DMA
> + * requirements.
>   *
>   * Returns:
>   * A struct drm_gem_cma_object * on success or an ERR_PTR()-encoded negative
> @@ -162,9 +178,12 @@ EXPORT_SYMBOL_GPL(drm_gem_cma_create);
>   * @size: size of the object to allocate
>   * @handle: return location for the GEM handle
>   *
> - * This function creates a CMA GEM object, allocating a physically contiguous
> - * chunk of memory as backing store. The GEM object is then added to the list
> - * of object associated with the given file and a handle to it is returned.
> + * This function creates a CMA GEM object, allocating a chunk of memory as
> + * backing store. The GEM object is then added to the list of object 
> associated
> + * with the given file and a handle to it is returned.
> + *
> + * The allocated memory will occupy a contiguous chunk of bus address space.
> + * See 

Re: [PATCH v2 3/4] drm/i915/gt: Document the eviction of the Flat-CCS objects

2022-04-27 Thread Matthew Auld

On 25/04/2022 17:24, Ramalingam C wrote:

Capture the eviction details for Flat-CCS capable, lmem objects.

v2:
   Fix the Flat-ccs capbility of lmem obj with smem residency
   possibility [Thomas]

Signed-off-by: Ramalingam C 
cc: Thomas Hellstrom 
cc: Matthew Auld 
---
  drivers/gpu/drm/i915/gt/intel_migrate.c | 23 ++-
  1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 463a6a14b5f9..930e0fd9795f 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -485,16 +485,21 @@ static bool wa_1209644611_applies(int ver, u32 size)
   * And CCS data can be copied in and out of CCS region through
   * XY_CTRL_SURF_COPY_BLT. CPU can't access the CCS data directly.
   *
- * When we exhaust the lmem, if the object's placements support smem, then we 
can
- * directly decompress the compressed lmem object into smem and start using it
- * from smem itself.
+ * I915 supports Flat-CCS on lmem only objects. When an objects has the smem in


"When an object has smem in"


+ * its preference list, on memory pressure, i915 needs to migarte the lmem


"migrate"


+ * content into smem. If the lmem object is Flat-CCS compressed by userspace,
+ * then i915 needs to decompress it. But I915 lack the required information
+ * for such decompression. Hence I915 supports Flat-CCS only on lmem only 
objects.
   *
- * But when we need to swapout the compressed lmem object into a smem region
- * though objects' placement doesn't support smem, then we copy the lmem 
content
- * as it is into smem region along with ccs data (using XY_CTRL_SURF_COPY_BLT).
- * When the object is referred, lmem content will be swaped in along with
- * restoration of the CCS data (using XY_CTRL_SURF_COPY_BLT) at corresponding
- * location.
+ * when we exhaust the lmem, Flat-CCS capable objects' lmem backing memory can


"When"

Otherwise,
Reviewed-by: Matthew Auld 


+ * be temporarily evicted to smem, along with the auxiliary CCS state, where
+ * it can be potentially swapped-out at a later point, if required.
+ * If userspace later touches the evicted pages, then we always move
+ * the backing memory back to lmem, which includes restoring the saved CCS 
state,
+ * and potentially performing any required swap-in.
+ *
+ * For the migration of the lmem objects with smem in placement list, such as
+ * {lmem, smem}, objects are treated as non Flat-CCS capable objects.
   */
  
  static inline u32 *i915_flush_dw(u32 *cmd, u32 flags)
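
To make the placement-list distinction concrete, here is a hedged userspace
sketch using the existing memory-regions extension: an lmem-only list keeps
the object Flat-CCS capable, while also listing I915_MEMORY_CLASS_SYSTEM
makes it migratable but, per this patch, treated as not Flat-CCS capable.
Error handling is omitted:

#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static uint32_t create_lmem_only_bo(int fd, uint64_t size)
{
	struct drm_i915_gem_memory_class_instance lmem = {
		.memory_class = I915_MEMORY_CLASS_DEVICE,
		.memory_instance = 0,
	};
	struct drm_i915_gem_create_ext_memory_regions regions = {
		.base.name = I915_GEM_CREATE_EXT_MEMORY_REGIONS,
		.num_regions = 1,
		.regions = (uintptr_t)&lmem,
	};
	struct drm_i915_gem_create_ext create = {
		.size = size,
		.extensions = (uintptr_t)&regions,
	};

	ioctl(fd, DRM_IOCTL_I915_GEM_CREATE_EXT, &create);
	return create.handle;	/* 0 on failure, since handles are nonzero */
}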


Re: [PATCH v2] drm/msm: add trace_dma_fence_emit to msm_gpu_submit

2022-04-27 Thread Rob Clark
On Tue, Apr 26, 2022 at 11:20 PM Christian König
 wrote:
>
> Am 26.04.22 um 20:50 schrieb Chia-I Wu:
> > On Tue, Apr 26, 2022 at 11:02 AM Christian König
> >  wrote:
> >> Am 26.04.22 um 19:40 schrieb Chia-I Wu:
> >>> [SNIP]
> >> Well I just send a patch to completely remove the trace point.
> >>
> >> As I said it absolutely doesn't make sense to use this for
> >> visualization, that's what the trace_dma_fence_init trace point is 
> >> good for.
> >>> I am a bit confused by this.  _emit and _signaled are a great way to
> >>> see how many fences are pending from cpu's point of view.  How does
> >>> _emit make no sense and _init is good instead?
> >> We had exactly that confusion now multiple times and it's one of the
> >> main reasons why I want to remove the _emit trace point.
> >>
> >> See, when you want to know how many fences are pending you need to
> >> watch out for init/destroy and *NOT* emit.
> >>
> >> The reason is that in the special case where emit makes sense (e.g. the
> >> GPU scheduler fences) emit comes later than init, but pending on the CPU
> >> and taking up resources are all fences and not just the one emitted to
> >> the hardware.
> > I am more interested in pending on the GPU.
> >
> >> On the other hand when you want to measure how much time each operation
> >> took on the hardware you need to take a look at the differences of the
> >> signal events on each timeline.
> > _signaled alone is not enough when the GPU is not always busy.  After
> > the last pending fence signals but before the following _init/_emit,
> > nothing is pending.
>
> Yeah, I'm perfectly aware of that.
>
> > For all drivers except virtio-gpu, _init and "ring head update" always
> > happen close enough that I can see why _emit is redundant.  But I like
> > having _emit as a generic tracepoint for timelines where _init and
> > _emit can be apart, instead of requiring a special case tracepoint for
> > each special case timeline.
>
> And I'm certainly not going to add _emit to all drivers just because of
> that. As you said it is a special case for virtio-gpu and the GPU scheduler.
>
> And as I explained before the difference between _init and _emit
> shouldn't matter to your visualization. The background is that as soon
> as a dma_fence is initialized with _init it is "live" regarding the
> dependency and memory management and exactly that's what matters for the
> visualization.
>
> The latency between _init and _emit is just interesting for debugging
> the scheduler and surprisingly virtio-gpu as well, for all other use
> cases it is irrelevant.

It might actually be *more* interesting for virtio-gpu.. unless there
is some other way to link guest and host fences to see what the
potential latency of guest->host is

re: adding the tracepoint to other drivers, I'm fine with folks doing
that as needed.  Unless you have a better proposal about how to
visualize init vs emit latency, I think it's fine to have an extra
tracepoint even if it is redundant in some cases.  The visualization
tool is the customer here, we have to give it what it wants/needs.

BR,
-R

>
> Regards,
> Christian.
>
> >> So there isn't really any use case for the emit trace point, except when
> >> you want to figure out how much latency the scheduler introduce. Then
> >> you want to take a look at init and emit, but that isn't really that
> >> interesting for performance analyses.
> >>
> >> Regards,
> >> Christian.
> >>
>


Re: [PATCH] drm: handle kernel fences in drm_gem_plane_helper_prepare_fb

2022-04-27 Thread Daniel Vetter
On Thu, Apr 21, 2022 at 09:10:02PM +0200, Christian König wrote:
> drm_gem_plane_helper_prepare_fb() was using
> drm_atomic_set_fence_for_plane() which ignores all implicit fences when an
> explicit fence is already set. That's rather unfortunate when the fb still
> has a kernel fence we need to wait for to avoid presenting garbage on the
> screen.
> 
> So instead update the fence in the plane state directly. While at it also
> take care of all potential GEM objects and not just the first one.
> 
> Also remove the now unused drm_atomic_set_fence_for_plane() function, new
> drivers should probably use the atomic helpers directly.
> 
> Signed-off-by: Christian König 

Is this enough to have amdgpu (and well others using ttm) fully rely on
this for atomic kms updates? Anything to clean up there? Would be neat to
include that in this series too if there's anything.
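
For context, drivers consume this helper simply by plugging it into their
plane helper funcs, so the fence handling change here is picked up without
driver changes; a minimal sketch (the remaining hooks stand in for
driver-specific callbacks):

#include <drm/drm_gem_atomic_helper.h>
#include <drm/drm_modeset_helper_vtables.h>

static const struct drm_plane_helper_funcs example_plane_helper_funcs = {
	.prepare_fb	= drm_gem_plane_helper_prepare_fb,
	/* .atomic_check / .atomic_update: driver specific */
};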


> ---
>  drivers/gpu/drm/drm_atomic_uapi.c   | 37 ---
>  drivers/gpu/drm/drm_gem_atomic_helper.c | 63 +
>  include/drm/drm_atomic_uapi.h   |  2 -
>  include/drm/drm_plane.h |  4 +-
>  4 files changed, 54 insertions(+), 52 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_atomic_uapi.c 
> b/drivers/gpu/drm/drm_atomic_uapi.c
> index c6394ba13b24..0f461261b3f3 100644
> --- a/drivers/gpu/drm/drm_atomic_uapi.c
> +++ b/drivers/gpu/drm/drm_atomic_uapi.c
> @@ -254,43 +254,6 @@ drm_atomic_set_fb_for_plane(struct drm_plane_state 
> *plane_state,
>  }
>  EXPORT_SYMBOL(drm_atomic_set_fb_for_plane);
>  
> -/**
> - * drm_atomic_set_fence_for_plane - set fence for plane
> - * @plane_state: atomic state object for the plane
> - * @fence: dma_fence to use for the plane
> - *
> - * Helper to setup the plane_state fence in case it is not set yet.
> - * By using this drivers doesn't need to worry if the user choose
> - * implicit or explicit fencing.
> - *
> - * This function will not set the fence to the state if it was set
> - * via explicit fencing interfaces on the atomic ioctl. In that case it will
> - * drop the reference to the fence as we are not storing it anywhere.
> - * Otherwise, if &drm_plane_state.fence is not set this function we just set 
> it
> - * with the received implicit fence. In both cases this function consumes a
> - * reference for @fence.
> - *
> - * This way explicit fencing can be used to overrule implicit fencing, which 
> is
> - * important to make explicit fencing use-cases work: One example is using 
> one
> - * buffer for 2 screens with different refresh rates. Implicit fencing will
> - * clamp rendering to the refresh rate of the slower screen, whereas explicit
> - * fence allows 2 independent render and display loops on a single buffer. 
> If a
> - * driver allows obeys both implicit and explicit fences for plane updates, 
> then
> - * it will break all the benefits of explicit fencing.
> - */
> -void
> -drm_atomic_set_fence_for_plane(struct drm_plane_state *plane_state,
> -struct dma_fence *fence)

I was a bit on the fence with ditching this, but the only offender not
using the prepare_fb helpers is i915, and so just more reasons that i915
needs to get off its hand-rolled atomic code with its own funky dependency
handling and everything.

> -{
> - if (plane_state->fence) {
> - dma_fence_put(fence);
> - return;
> - }
> -
> - plane_state->fence = fence;
> -}
> -EXPORT_SYMBOL(drm_atomic_set_fence_for_plane);
> -
>  /**
>   * drm_atomic_set_crtc_for_connector - set CRTC for connector
>   * @conn_state: atomic state object for the connector
> diff --git a/drivers/gpu/drm/drm_gem_atomic_helper.c 
> b/drivers/gpu/drm/drm_gem_atomic_helper.c
> index a6d89aed0bda..8fc0b42acdff 100644
> --- a/drivers/gpu/drm/drm_gem_atomic_helper.c
> +++ b/drivers/gpu/drm/drm_gem_atomic_helper.c
> @@ -1,6 +1,7 @@
>  // SPDX-License-Identifier: GPL-2.0-or-later
>  
>  #include 
> +#include 
>  
>  #include 
>  #include 
> @@ -141,25 +142,67 @@
>   * See drm_atomic_set_fence_for_plane() for a discussion of implicit and

You forgot to update the kerneldoc here, and also the reference to the
same function in the IN_FENCE_FD.

I think it'd be best to put a reference to that DOC: section here, and
adjust the uapi property doc to just state that the explicit fence will
overrule implicit sync. And then maybe also mention here that USAGE_KERNEL
fences are still obeyed.

With these changes (which should make sure that all references to
drm_atomic_set_fence_for_plane() are truly gone) this is

Reviewed-by: Daniel Vetter 



>   * explicit fencing in atomic modeset updates.
>   */
> -int drm_gem_plane_helper_prepare_fb(struct drm_plane *plane, struct 
> drm_plane_state *state)
> +int drm_gem_plane_helper_prepare_fb(struct drm_plane *plane,
> + struct drm_plane_state *state)
>  {
> + struct dma_fence *fence = dma_fence_get(state->fence);
>   struct drm_gem_object *obj;
> - struct dma_fence *fence;
> +

Re: [RFC v2 1/2] drm/doc/rfc: VM_BIND feature design document

2022-04-27 Thread Niranjana Vishwanathapura

On Wed, Apr 20, 2022 at 03:45:25PM -0700, Niranjana Vishwanathapura wrote:

On Thu, Mar 31, 2022 at 10:28:48AM +0200, Daniel Vetter wrote:

Adding a pile of people who've expressed interest in vm_bind for their
drivers.

Also note to the intel folks: This is largely written with me having my
subsystem co-maintainer hat on, i.e. what I think is the right thing to do
here for the subsystem at large. There is substantial rework involved
here, but it's not any different from i915 adopting ttm or i915 adopting
drm/sched, and I do think this stuff needs to happen in one form or
another.

On Mon, Mar 07, 2022 at 12:31:45PM -0800, Niranjana Vishwanathapura wrote:

VM_BIND design document with description of intended use cases.

Signed-off-by: Niranjana Vishwanathapura 
---
Documentation/gpu/rfc/i915_vm_bind.rst | 210 +
Documentation/gpu/rfc/index.rst|   4 +
2 files changed, 214 insertions(+)
create mode 100644 Documentation/gpu/rfc/i915_vm_bind.rst

diff --git a/Documentation/gpu/rfc/i915_vm_bind.rst 
b/Documentation/gpu/rfc/i915_vm_bind.rst
new file mode 100644
index ..cdc6bb25b942
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_vm_bind.rst
@@ -0,0 +1,210 @@
+==
+I915 VM_BIND feature design and use cases
+==
+
+VM_BIND feature
+
+DRM_I915_GEM_VM_BIND/UNBIND ioctls allows UMD to bind/unbind GEM buffer
+objects (BOs) or sections of a BOs at specified GPU virtual addresses on
+a specified address space (VM).
+
+These mappings (also referred to as persistent mappings) will be persistent
+across multiple GPU submissions (execbuff) issued by the UMD, without user
+having to provide a list of all required mappings during each submission
+(as required by older execbuff mode).
+
+VM_BIND ioctl defers binding the mappings until next execbuff submission
+where it will be required, or immediately if I915_GEM_VM_BIND_IMMEDIATE
+flag is set (useful if mapping is required for an active context).


So this is a screw-up I've done, and for upstream I think we need to fix
it: Implicit sync is bad, and it's also still a bad idea for vm_bind, and
I was wrong suggesting we should do this a few years back when we kicked
this off internally :-(

What I think we need is just always VM_BIND_IMMEDIATE mode, and then a few
things on top:
- in and out fences, like with execbuf, to allow userspace to sync with
execbuf as needed
- for compute-mode context this means userspace memory fences
- for legacy context this means a timeline syncobj in drm_syncobj

No sync_file or anything else like this at all. This means a bunch of
work, but also it'll have benefits because it means we should be able to
use exactly the same code paths and logic for both compute and for legacy
context, because drm_syncobj support future fence semantics.



Thanks Daniel,
Ok, will update
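
Purely as an illustration of the in/out fence point above (not proposed
uapi; every name and field here is hypothetical), a timeline-syncobj style
fence argument could look something like:

#include <linux/types.h>

struct example_vm_bind_syncobj_fence {
	__u32 handle;	/* drm_syncobj handle to wait on (in) or signal (out) */
	__u32 flags;	/* hypothetical: WAIT vs SIGNAL selection */
	__u64 value;	/* timeline point within the syncobj */
};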



I had a long conversation with Daniel on some of the points discussed here.
Thanks to Daniel for clarifying many points here.

Here is the summary of the discussion.

1) A prep patch is needed to update documentation of some existing uapi, and
   this new VM_BIND uapi can update/refer to that. I will include this prep
   patch in the next revision of this RFC series. Will also include the uapi
   header file in the rst file so that it gets rendered.

2) Will update documentation here with proper use of dma_resv_usage while
   adding fences to vm_bind objects. It is going to be
   DMA_RESV_USAGE_BOOKKEEP by default if not overridden with an execlist in
   the execbuff path.

3) Add an extension to the execbuff ioctl to specify the batch buffer as a
   GPU virtual address instead of having to pass it as a BO handle in the
   execlist. This will also make the execlist usage solely for implicit sync
   setting, which is further discussed below.

4) Need to look into when Jason's dma-buf fence import/export ioctl support
   will land and whether it will be used both for vl and gl. Need to sync
   with Jason on this. Probably the better option here would be to not
   support execlist in the execbuff path in vm_bind mode for initial vm_bind
   support (hoping Jason's dma-buf fence import/export ioctl will be enough).
   We can add support for execlist in execbuff for vm_bind mode later if
   required (say for gl).

5) There are lots of things in the execbuff path that don't apply in VM_BIND
   mode (like relocations, implicit sync etc). Separate them out by using
   function pointers wherever the functionality differs between the current
   design and the newer VM_BIND design.

6) Separate out i915_vma active reference counting in the execbuff path and
   do not use it in VM_BIND mode. Instead use dma-resv fence checking for
   VM_BIND mode. This should be easier to get working with the current TTM
   backend (which initial VM_BIND support will use). And remove i915_vma
   active reference counting fully while supporting the TTM backend for igfx.

7) As we support compute mode contexts only with GuC

Re: [PATCH v2] drm/doc: add rfc section for small BAR uapi

2022-04-27 Thread Lionel Landwerlin

On 27/04/2022 18:18, Matthew Auld wrote:

On 27/04/2022 07:48, Lionel Landwerlin wrote:
One question though, how do we detect that this flag 
(I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS) is accepted on a given 
kernel?
I assume older kernels are going to reject object creation if we use 
this flag?


From some offline discussion with Lionel, the plan here is to just do 
a dummy gem_create_ext to check if the kernel throws an error with the 
new flag or not.




I didn't plan to use __drm_i915_query_vma_info, but isn't it 
inconsistent to select the placement on the GEM object and then query 
whether it's mappable by address?
You made a comment stating this is racy, wouldn't querying on the GEM 
object prevent this?


Since mesa at this time doesn't currently have a use for this one, 
then I guess we should maybe just drop this part of the uapi, in this 
version at least, if no objections.



Just repeating what we discussed (maybe I missed some other discussion 
and that's why I was confused) :



The way I was planning to use this is to have 3 heaps in Vulkan :

    - heap0: local only, no cpu visible

    - heap1: system, cpu visible

    - heap2: local & cpu visible


With heap2 having the reported probed_cpu_visible_size size.

It is an error for the application to map from heap0 [1].


With that said, it means if we created a GEM BO without 
I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS, we'll never mmap it.


So why the query?

I guess it would be useful when we import a buffer from another 
application. But in that case, why not have the query on the BO?



-Lionel


[1] : 
https://www.khronos.org/registry/vulkan/specs/1.3-extensions/man/html/vkMapMemory.html 
(VUID-vkMapMemory-memory-00682)






Thanks,

-Lionel

On 27/04/2022 09:35, Lionel Landwerlin wrote:

Hi Matt,


The proposal looks good to me.

Looking forward to try it on drm-tip.


-Lionel

On 20/04/2022 20:13, Matthew Auld wrote:

Add an entry for the new uapi needed for small BAR on DG2+.

v2:
   - Some spelling fixes and other small tweaks. (Akeem & Thomas)
   - Rework error capture interactions, including no longer needing
 NEEDS_CPU_ACCESS for objects marked for capture. (Thomas)
   - Add probed_cpu_visible_size. (Lionel)

Signed-off-by: Matthew Auld 
Cc: Thomas Hellström 
Cc: Lionel Landwerlin 
Cc: Jon Bloomfield 
Cc: Daniel Vetter 
Cc: Jordan Justen 
Cc: Kenneth Graunke 
Cc: Akeem G Abodunrin 
Cc: mesa-...@lists.freedesktop.org
---
  Documentation/gpu/rfc/i915_small_bar.h   | 190 
+++

  Documentation/gpu/rfc/i915_small_bar.rst |  58 +++
  Documentation/gpu/rfc/index.rst  |   4 +
  3 files changed, 252 insertions(+)
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.h
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.rst

diff --git a/Documentation/gpu/rfc/i915_small_bar.h 
b/Documentation/gpu/rfc/i915_small_bar.h

new file mode 100644
index ..7bfd0cf44d35
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_small_bar.h
@@ -0,0 +1,190 @@
+/**
+ * struct __drm_i915_memory_region_info - Describes one region as 
known to the

+ * driver.
+ *
+ * Note this is using both struct drm_i915_query_item and struct 
drm_i915_query.
+ * For this new query we are adding the new query id 
DRM_I915_QUERY_MEMORY_REGIONS

+ * at &drm_i915_query_item.query_id.
+ */
+struct __drm_i915_memory_region_info {
+    /** @region: The class:instance pair encoding */
+    struct drm_i915_gem_memory_class_instance region;
+
+    /** @rsvd0: MBZ */
+    __u32 rsvd0;
+
+    /** @probed_size: Memory probed by the driver (-1 = unknown) */
+    __u64 probed_size;
+
+    /** @unallocated_size: Estimate of memory remaining (-1 = 
unknown) */

+    __u64 unallocated_size;
+
+    union {
+    /** @rsvd1: MBZ */
+    __u64 rsvd1[8];
+    struct {
+    /**
+ * @probed_cpu_visible_size: Memory probed by the driver
+ * that is CPU accessible. (-1 = unknown).
+ *
+ * This will be always be <= @probed_size, and the
+ * remainder(if there is any) will not be CPU
+ * accessible.
+ */
+    __u64 probed_cpu_visible_size;
+    };
+    };
+};
+
+/**
+ * struct __drm_i915_gem_create_ext - Existing gem_create 
behaviour, with added

+ * extension support using struct i915_user_extension.
+ *
+ * Note that new buffer flags should be added here, at least for 
the stuff that
+ * is immutable. Previously we would have two ioctls, one to 
create the object
+ * with gem_create, and another to apply various parameters, 
however this
+ * creates some ambiguity for the params which are considered 
immutable. Also in

+ * general we're phasing out the various SET/GET ioctls.
+ */
+struct __drm_i915_gem_create_ext {
+    /**
+ * @size: Requested size for the object.
+ *
+ * The (page-aligned) allocated size for the object will be 
returned.

+ *
+ * Note that for some devices we have might have further minimum
+ 

Re: [PATCH v2] drm/doc: add rfc section for small BAR uapi

2022-04-27 Thread Matthew Auld

On 27/04/2022 07:48, Lionel Landwerlin wrote:
One question though, how do we detect that this flag 
(I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS) is accepted on a given kernel?
I assume older kernels are going to reject object creation if we use 
this flag?


From some offline discussion with Lionel, the plan here is to just do a 
dummy gem_create_ext to check if the kernel throws an error with the new 
flag or not.
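
A minimal probe sketch along those lines, assuming the struct layout from
the RFC header (with a @flags field) and an assumed flag value; a real probe
may also need to attach an I915_GEM_CREATE_EXT_MEMORY_REGIONS extension with
a device-local placement, since the flag is only valid for
I915_MEMORY_CLASS_DEVICE objects:

/*
 * Sketch, not a finished probe: issue a dummy gem_create_ext with the
 * proposed NEEDS_CPU_ACCESS flag and treat a rejection as "kernel too
 * old".  The struct below mirrors the RFC's __drm_i915_gem_create_ext
 * layout (same size as the current drm_i915_gem_create_ext, so the
 * ioctl number is unchanged); the flag bit is an assumption.
 */
#include <stdbool.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/drm.h>
#include <drm/i915_drm.h>

struct probe_gem_create_ext {			/* mirrors the proposed uapi */
	uint64_t size;
	uint32_t handle;
	uint32_t flags;
#define PROBE_NEEDS_CPU_ACCESS (1u << 0)	/* assumed flag value */
	uint64_t extensions;
};

static bool kernel_has_needs_cpu_access(int fd)
{
	struct probe_gem_create_ext arg = {
		.size = 4096,
		.flags = PROBE_NEEDS_CPU_ACCESS,
	};
	struct drm_gem_close gem_close = { 0 };

	if (ioctl(fd, DRM_IOCTL_I915_GEM_CREATE_EXT, &arg))
		return false;	/* older kernel: unknown flag rejected */

	gem_close.handle = arg.handle;
	ioctl(fd, DRM_IOCTL_GEM_CLOSE, &gem_close);
	return true;
}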




I didn't plan to use __drm_i915_query_vma_info, but isn't it 
inconsistent to select the placement on the GEM object and then query 
whether it's mappable by address?
You made a comment stating this is racy, wouldn't querying on the GEM 
object prevent this?


Since mesa at this time doesn't currently have a use for this one, then 
I guess we should maybe just drop this part of the uapi, in this version 
at least, if no objections.




Thanks,

-Lionel

On 27/04/2022 09:35, Lionel Landwerlin wrote:

Hi Matt,


The proposal looks good to me.

Looking forward to try it on drm-tip.


-Lionel

On 20/04/2022 20:13, Matthew Auld wrote:

Add an entry for the new uapi needed for small BAR on DG2+.

v2:
   - Some spelling fixes and other small tweaks. (Akeem & Thomas)
   - Rework error capture interactions, including no longer needing
 NEEDS_CPU_ACCESS for objects marked for capture. (Thomas)
   - Add probed_cpu_visible_size. (Lionel)

Signed-off-by: Matthew Auld 
Cc: Thomas Hellström 
Cc: Lionel Landwerlin 
Cc: Jon Bloomfield 
Cc: Daniel Vetter 
Cc: Jordan Justen 
Cc: Kenneth Graunke 
Cc: Akeem G Abodunrin 
Cc: mesa-...@lists.freedesktop.org
---
  Documentation/gpu/rfc/i915_small_bar.h   | 190 +++
  Documentation/gpu/rfc/i915_small_bar.rst |  58 +++
  Documentation/gpu/rfc/index.rst  |   4 +
  3 files changed, 252 insertions(+)
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.h
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.rst

diff --git a/Documentation/gpu/rfc/i915_small_bar.h 
b/Documentation/gpu/rfc/i915_small_bar.h

new file mode 100644
index ..7bfd0cf44d35
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_small_bar.h
@@ -0,0 +1,190 @@
+/**
+ * struct __drm_i915_memory_region_info - Describes one region as 
known to the

+ * driver.
+ *
+ * Note this is using both struct drm_i915_query_item and struct 
drm_i915_query.
+ * For this new query we are adding the new query id 
DRM_I915_QUERY_MEMORY_REGIONS

+ * at _i915_query_item.query_id.
+ */
+struct __drm_i915_memory_region_info {
+    /** @region: The class:instance pair encoding */
+    struct drm_i915_gem_memory_class_instance region;
+
+    /** @rsvd0: MBZ */
+    __u32 rsvd0;
+
+    /** @probed_size: Memory probed by the driver (-1 = unknown) */
+    __u64 probed_size;
+
+    /** @unallocated_size: Estimate of memory remaining (-1 = 
unknown) */

+    __u64 unallocated_size;
+
+    union {
+    /** @rsvd1: MBZ */
+    __u64 rsvd1[8];
+    struct {
+    /**
+ * @probed_cpu_visible_size: Memory probed by the driver
+ * that is CPU accessible. (-1 = unknown).
+ *
+ * This will be always be <= @probed_size, and the
+ * remainder(if there is any) will not be CPU
+ * accessible.
+ */
+    __u64 probed_cpu_visible_size;
+    };
+    };
+};
+
+/**
+ * struct __drm_i915_gem_create_ext - Existing gem_create behaviour, 
with added

+ * extension support using struct i915_user_extension.
+ *
+ * Note that new buffer flags should be added here, at least for the 
stuff that
+ * is immutable. Previously we would have two ioctls, one to create 
the object
+ * with gem_create, and another to apply various parameters, however 
this
+ * creates some ambiguity for the params which are considered 
immutable. Also in

+ * general we're phasing out the various SET/GET ioctls.
+ */
+struct __drm_i915_gem_create_ext {
+    /**
+ * @size: Requested size for the object.
+ *
+ * The (page-aligned) allocated size for the object will be 
returned.

+ *
+ * Note that for some devices we have might have further minimum
+ * page-size restrictions(larger than 4K), like for device 
local-memory.

+ * However in general the final size here should always reflect any
+ * rounding up, if for example using the 
I915_GEM_CREATE_EXT_MEMORY_REGIONS

+ * extension to place the object in device local-memory.
+ */
+    __u64 size;
+    /**
+ * @handle: Returned handle for the object.
+ *
+ * Object handles are nonzero.
+ */
+    __u32 handle;
+    /**
+ * @flags: Optional flags.
+ *
+ * Supported values:
+ *
+ * I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS - Signal to the 
kernel that

+ * the object will need to be accessed via the CPU.
+ *
+ * Only valid when placing objects in I915_MEMORY_CLASS_DEVICE, and
+ * only strictly required on platforms where only some of the 
device
+ * memory is directly visible or 

Re: How should "max bpc" KMS property work?

2022-04-27 Thread Harry Wentland



On 2022-04-27 06:52, Pekka Paalanen wrote:
> Hi Ville and Alex,
> 
> thanks for the replies. More below.
> 
> TL;DR:
> 
> My take-away from this is that I should slam 'max bpc' to the max by
> default, and offer a knob for the user in case they want to lower it.
> 
> 
> On Tue, 26 Apr 2022 20:55:14 +0300
> Ville Syrjälä  wrote:
> 
>> On Tue, Apr 26, 2022 at 11:35:02AM +0300, Pekka Paalanen wrote:
>>> Hi all,
>>>
>>> I'm working on setting HDR & WCG video modes in Weston, and I thought
>>> setting "max bpc" KMS property on the connector would be a good idea.
>>> I'm confused about how it works though.
>>>
>>> I did some digging in 
>>> https://gitlab.freedesktop.org/wayland/weston/-/issues/612
>>>
>>> Summary:
>>>
>>> - Apparently the property was originally added as a manual workaround
>>>   for sink hardware behaving badly with high depth. A simple end user
>>>   setting for "max bpc" would suffice for this use.
>>>
>>> - Drivers will sometimes automatically choose a lower bpc than the "max
>>>   bpc" value, but never bigger.
>>>
>>> - amdgpu seems to (did?) default "max bpc" to 8, meaning that I
>>>   definitely want to raise it.  
>>

I've wanted to remove the 8 bpc limitations for a while now but it
looks like we never did for anything other than eDP.

The original problem we solved was that some monitors default timing
couldn't be driven at a high bpc. Therefore users were faced with black
displays. On some displays you also can't drive high refresh rate modes
with a higher bpc.

>> I've occasionally pondered about doing the same for i915, just to have
>> the safest default possible. But I'd hate to lose the deep color testing
>> coverage knowing very few people would in practice raise the limit.
>> Also the number of systems where deep color doesn't work reliably
>> (or can't be made to work by not using a crap cable) seems to be quite
>> low.
> 
> I think when HDR and WCG get into display servers, setting 'max bpc'
> will become a standard action.
> 
> It's a bit moot to e.g. render everything in electrical 10 bit RGB, if
> the link is just going to squash that into electrical 8 bit RGB, right?
> 
> So even 10 bit color would require setting 'max bpc' to at least 10 to
> be able to actually see it, source-side dithering aside.
> 
>>>
>>> If I always slam "max bpc" to the highest supported value for that
>>> property, do I lose more than workarounds for bad sink hardware?  
>>
>> We don't have any workarounds implemented like this in the kernel.
>> Or should not have at least. "max bpc" exists purely for the user
>> to have a say in the matter in addition to whatever the EDID/quirks
>> say. Ie. if the kernel knows for sure that deep color won't work on
>> a particular setup then it should just not allow deep color at all
>> despite what the prop value says.
>>
>> So the only danger is fighting with the user's wishes which I guess
>> you can overcome with some kind of user visible knob.
> 
> Right, good.
> 
> Furthermore, as a KMS client cannot make much assumptions about the KMS
> state it inherits from some other KMS client, it should know and
> program all possible KMS properties according to its own desires
> anyway. That, and the DRM master concept make sure that there cannot be
> any "third party" KMS configuration programs, like V4L2 has.
> 
>>> Do I lose the ability to set video modes that take too much bandwidth
>>> at uncapped driver-selected bpc while capping the bpc lower would allow
>>> me to use those video modes?
>>>
>>> Or, are drivers required to choose a lower-than-usual but highest
>>> usable bpc to make the requested video mode squeeze through the
>>> connector and link?  
>>
>> IMO drivers should implement the "reduce bpc until it fits"
>> fallback. We have that in i915, except for MST where we'd need
>> to potentially involve multiple streams in the fallback. That
>> is something we intend to remedy eventually but it's not an
>> entirely trivial thing to implement so will take some actual
>> work. ATM we just cap MST to <=8bpc to avoid users getting into
>> this situation so often.
> 
> Excellent, but judging from what Alex said, this is also not what
> amdgpu does. We have two drivers doing different things then?
> 
> So with Weston I probably have to document, that if you can't get the
> video mode you want working, try turning the 'max bpc' knob down and
> try again.
> 
> Or, I could cap 'max bpc' based on my framebuffer depth. If I have an
> electrical 8 bit FB (default in Weston), then there is not much use for
> having 'max bpc' > 8. This ignores the KMS color pipeline a bit. Does
> that make sense?
> 

I think both of those options make sense. I'll need to think about the
automatic fallback if we don't have enough bandwidth for max_bpc.

If a KMS driver falls back automatically we probably want some way
for a (color managed) compositor to know if the output bpc is reduced.

> Does KMS use dithering automatically, btw?
> 

amdgpu's display driver does.

> The only mention 

Re: [PATCH v2] drm/doc: add rfc section for small BAR uapi

2022-04-27 Thread Daniel Vetter
On Wed, Apr 27, 2022 at 08:55:07AM +0200, Christian König wrote:
> Well usually we increment the drm minor version when adding some new flags
> on amdgpu.
> 
> Additional to that just one comment from our experience with that: You don't
> just need one flag, but two. The first one is a hint which says "CPU access
> needed" and the second is a promise which says "CPU access never needed".
> 
> The background is that on a whole bunch of buffers you can 100% certain say
> that you will never ever need CPU access.
> 
> Then at least we have a whole bunch of buffers where we might need CPU
> access, but can't tell for sure.
> 
> And last we have stuff like transfer buffers you can be 100% sure that you
> need CPU access.
> 
> Separating it like this helped a lot with performance on small BAR systems.

So my assumption was that for transfer buffers you'd fill them with the
cpu first anyway, so no need for the extra flag.

I guess this is for transfer buffers for gpu -> cpu transfers, where it
would result in costly bo move and stalls and it's better to make sure
it's cpu accessible from the start? At least on current gpu we have where
there's no coherent interconnect, those buffers have to be in system
memory or your cpu access will be a disaster, so again they're naturally
cpu accessible.

What's the use-case for the "cpu access required" flag where "cpu access
before gpu access" isn't a good enough hint already to get the same perf
benefits?

Also for scanout my idea at least is that we just fail mmap when you
haven't set the flag and the scanout is pinned to unmappable, for two
reasons:
- 4k buffers are big; if we force them all into mappable, things are
  non-pretty.
- You need mesa anyway to access tiled buffers, and mesa knows how to use
  a transfer buffer. Desktop switching, fastboot and stuff like that should
  all work with the getfb2 ioctl (and without getfb2 it's doomed to garbage
  anyway).

So only dumb kms buffers (which are linear) would ever get the
NEEDS_CPU_ACCESS flag, and only those we'd ever pin into cpu accessible
range for scanout. Is there a hole in that plan?

Cheers, Daniel

> 
> Regards,
> Christian.
> 
> Am 27.04.22 um 08:48 schrieb Lionel Landwerlin:
> > One question though, how do we detect that this flag
> > (I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS) is accepted on a given
> > kernel?
> > I assume older kernels are going to reject object creation if we use
> > this flag?
> > 
> > I didn't plan to use __drm_i915_query_vma_info, but isn't it
> > inconsistent to select the placement on the GEM object and then query
> > whether it's mappable by address?
> > You made a comment stating this is racy, wouldn't querying on the GEM
> > object prevent this?
> > 
> > Thanks,
> > 
> > -Lionel
> > 
> > On 27/04/2022 09:35, Lionel Landwerlin wrote:
> > > Hi Matt,
> > > 
> > > 
> > > The proposal looks good to me.
> > > 
> > > Looking forward to try it on drm-tip.
> > > 
> > > 
> > > -Lionel
> > > 
> > > On 20/04/2022 20:13, Matthew Auld wrote:
> > > > Add an entry for the new uapi needed for small BAR on DG2+.
> > > > 
> > > > v2:
> > > >    - Some spelling fixes and other small tweaks. (Akeem & Thomas)
> > > >    - Rework error capture interactions, including no longer needing
> > > >  NEEDS_CPU_ACCESS for objects marked for capture. (Thomas)
> > > >    - Add probed_cpu_visible_size. (Lionel)
> > > > 
> > > > Signed-off-by: Matthew Auld 
> > > > Cc: Thomas Hellström 
> > > > Cc: Lionel Landwerlin 
> > > > Cc: Jon Bloomfield 
> > > > Cc: Daniel Vetter 
> > > > Cc: Jordan Justen 
> > > > Cc: Kenneth Graunke 
> > > > Cc: Akeem G Abodunrin 
> > > > Cc: mesa-...@lists.freedesktop.org
> > > > ---
> > > >   Documentation/gpu/rfc/i915_small_bar.h   | 190
> > > > +++
> > > >   Documentation/gpu/rfc/i915_small_bar.rst |  58 +++
> > > >   Documentation/gpu/rfc/index.rst  |   4 +
> > > >   3 files changed, 252 insertions(+)
> > > >   create mode 100644 Documentation/gpu/rfc/i915_small_bar.h
> > > >   create mode 100644 Documentation/gpu/rfc/i915_small_bar.rst
> > > > 
> > > > diff --git a/Documentation/gpu/rfc/i915_small_bar.h
> > > > b/Documentation/gpu/rfc/i915_small_bar.h
> > > > new file mode 100644
> > > > index ..7bfd0cf44d35
> > > > --- /dev/null
> > > > +++ b/Documentation/gpu/rfc/i915_small_bar.h
> > > > @@ -0,0 +1,190 @@
> > > > +/**
> > > > + * struct __drm_i915_memory_region_info - Describes one region
> > > > as known to the
> > > > + * driver.
> > > > + *
> > > > + * Note this is using both struct drm_i915_query_item and
> > > > struct drm_i915_query.
> > > > + * For this new query we are adding the new query id
> > > > DRM_I915_QUERY_MEMORY_REGIONS
> > > > + * at _i915_query_item.query_id.
> > > > + */
> > > > +struct __drm_i915_memory_region_info {
> > > > +    /** @region: The class:instance pair encoding */
> > > > +    struct drm_i915_gem_memory_class_instance region;
> > > > +
> > > > +    /** @rsvd0: 

Re: [PATCH v3] drm/doc: Add sections about tiny drivers and external refs to intro page

2022-04-27 Thread Daniel Vetter
On Wed, Apr 20, 2022 at 09:24:11AM +0200, Javier Martinez Canillas wrote:
> Learning about the DRM subsystem could be quite overwhelming for newcomers
> but there are lots of useful talks, slides and articles available that can
> help to understand the needed concepts and ease the learning curve.
> 
> There are also simple DRM drivers that can be used as examples of how a
> DRM driver should look.
> 
> Add sections to the introduction page, that contains references to these.
> 
> Suggested-by: Daniel Vetter 
> Signed-off-by: Javier Martinez Canillas 
> Acked-by: Pekka Paalanen 
> Acked-by: Thomas Zimmermann 

Maybe needs more acks to land?

Acked-by: Daniel Vetter 

Would be good if we could hand out links to pretty htmldocs instead of lore
links to this patch; the latter is rather hard on the eyes :-)

Cheers, Daniel

> ---
> 
> Changes in v3:
> - Fix typos and grammar errors that found when re-reading the changes.
> 
> Changes in v2:
> - Remove paragraph that gave wrong impression that DRM is complex (Pekka 
> Paalanen).
> - Add Thomas Zimmermann's and Pekka Paalanen's Acked-by tags.
> - Replace "Learning material" title with "External References" (Thomas 
> Zimmermann).
> - Add a section about tiny DRM drivers being a good first example (Daniel 
> Vetter).
> - Add some more external references that I found interesting since v1 was 
> posted.
> 
>  Documentation/gpu/introduction.rst | 60 ++
>  1 file changed, 60 insertions(+)
> 
> diff --git a/Documentation/gpu/introduction.rst 
> b/Documentation/gpu/introduction.rst
> index 25a56e9c0cfd..f05eccd2c07c 100644
> --- a/Documentation/gpu/introduction.rst
> +++ b/Documentation/gpu/introduction.rst
> @@ -112,3 +112,63 @@ Please conduct yourself in a respectful and civilised 
> manner when
>  interacting with community members on mailing lists, IRC, or bug
>  trackers. The community represents the project as a whole, and abusive
>  or bullying behaviour is not tolerated by the project.
> +
> +Simple DRM drivers to use as examples
> +=
> +
> +The DRM subsystem contains a lot of helper functions to ease writing drivers 
> for
> +simple graphic devices. For example, the `drivers/gpu/drm/tiny/` directory 
> has a
> +set of drivers that are simple enough to be implemented in a single source 
> file.
> +
> +These drivers make use of the `struct drm_simple_display_pipe_funcs`, that 
> hides
> +any complexity of the DRM subsystem and just requires drivers to implement a 
> few
> +functions needed to operate the device. This could be used for devices that 
> just
> +need a display pipeline with one full-screen scanout buffer feeding one 
> output.
> +
> +The tiny DRM drivers are good examples to understand how DRM drivers should 
> look
> +like. Since they are just a few hundred lines of code, they are quite easy to 
> read.
> +
> +External References
> +===
> +
> +Delving into a Linux kernel subsystem for the first time can be an 
> overwhelming
> +experience, one needs to get familiar with all the concepts and learn about 
> the
> +subsystem's internals, among other details.
> +
> +To shallow the learning curve, this section contains a list of presentations
> +and documents that can be used to learn about DRM/KMS and graphics in 
> general.
> +
> +There are different reasons why someone might want to get into DRM: porting 
> an
> +existing fbdev driver, write a DRM driver for a new hardware, fixing bugs 
> that
> +could face when working on the graphics user-space stack, etc. For this 
> reason,
> +the learning material covers many aspects of the Linux graphics stack. From 
> an
> +overview of the kernel and user-space stacks to very specific topics.
> +
> +The list is sorted in reverse chronological order, to keep the most 
> up-to-date
> +material at the top. But all of them contain useful information, and it can 
> be
> +valuable to go through older material to understand the rationale and context
> +in which the changes to the DRM subsystem were made.
> +
> +Conference talks
> +
> +
> +* `An Overview of the Linux and Userspace Graphics Stack 
> `_ - Paul Kocialkowski (2020)
> +* `Getting pixels on screen on Linux: introduction to Kernel Mode Setting 
> `_ - Simon Ser (2020)
> +* `Everything Great about Upstream Graphics 
> `_ - Daniel Vetter (2019)
> +* `An introduction to the Linux DRM subsystem 
> `_ - Maxime Ripard (2017)
> +* `Embrace the Atomic (Display) Age 
> `_ - Daniel Vetter (2016)
> +* `Anatomy of an Atomic KMS Driver 
> `_ - Laurent Pinchart (2015)
> +* `Atomic Modesetting for Drivers 
> `_ - Daniel Vetter (2015)
> +* `Anatomy of an Embedded KMS 

Re: [PATCH v4] drm/amdgpu: Disable ABM when AC mode

2022-04-27 Thread Harry Wentland




On 2022-04-27 04:08, Ryan Lin wrote:

Disable the ABM feature when the system is running on AC power to get
better contrast on the display.

v2: remove "UPSTREAM" from the subject.

v3: adev->pm.ac_power is updated by the amdgpu ACPI event handler.

V4: Add the file I lost to fix the build error.

Signed-off-by: Ryan Lin 

---
  drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c  |  3 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c|  1 +
  drivers/gpu/drm/amd/display/dc/dce/dmub_abm.c | 61 +++
  drivers/gpu/drm/amd/include/amd_acpi.h|  1 +
  drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h   |  1 +
  5 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
index 4811b0faafd9a..6ac331ee4255d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
@@ -822,7 +822,8 @@ static int amdgpu_acpi_event(struct notifier_block *nb,
struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, 
acpi_nb);
struct acpi_bus_event *entry = (struct acpi_bus_event *)data;
  
-	if (strcmp(entry->device_class, ACPI_AC_CLASS) == 0) {

+   if (strcmp(entry->device_class, ACPI_AC_CLASS) == 0 ||
+   strcmp(entry->device_class, ACPI_BATTERY_CLASS) == 0) {
if (power_supply_is_system_supplied() > 0)
DRM_DEBUG_DRIVER("pm: AC\n");
else
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index abfcc1304ba0c..3a0afe7602727 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3454,6 +3454,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
  
  	adev->gfx.gfx_off_req_count = 1;

adev->pm.ac_power = power_supply_is_system_supplied() > 0;
+   adev->pm.old_ac_power = true;
  
  	atomic_set(>throttling_logging_enabled, 1);

/*
diff --git a/drivers/gpu/drm/amd/display/dc/dce/dmub_abm.c 
b/drivers/gpu/drm/amd/display/dc/dce/dmub_abm.c
index 54a1408c8015c..090bd23410b45 100644
--- a/drivers/gpu/drm/amd/display/dc/dce/dmub_abm.c
+++ b/drivers/gpu/drm/amd/display/dc/dce/dmub_abm.c
@@ -22,7 +22,8 @@
   * Authors: AMD
   *
   */
-
+#include 
+#include "amdgpu.h"
  #include "dmub_abm.h"
  #include "dce_abm.h"
  #include "dc.h"
@@ -50,7 +51,7 @@
  
  #define DISABLE_ABM_IMMEDIATELY 255
  
-

+extern uint amdgpu_dm_abm_level;
  
  static void dmub_abm_enable_fractional_pwm(struct dc_context *dc)

  {
@@ -117,28 +118,6 @@ static void dmub_abm_init(struct abm *abm, uint32_t 
backlight)
dmub_abm_enable_fractional_pwm(abm->ctx);
  }
  
-static unsigned int dmub_abm_get_current_backlight(struct abm *abm)

-{
-   struct dce_abm *dce_abm = TO_DMUB_ABM(abm);
-   unsigned int backlight = REG_READ(BL1_PWM_CURRENT_ABM_LEVEL);
-
-   /* return backlight in hardware format which is unsigned 17 bits, with
-* 1 bit integer and 16 bit fractional
-*/
-   return backlight;
-}
-
-static unsigned int dmub_abm_get_target_backlight(struct abm *abm)
-{
-   struct dce_abm *dce_abm = TO_DMUB_ABM(abm);
-   unsigned int backlight = REG_READ(BL1_PWM_TARGET_ABM_LEVEL);
-
-   /* return backlight in hardware format which is unsigned 17 bits, with
-* 1 bit integer and 16 bit fractional
-*/
-   return backlight;
-}
-
  static bool dmub_abm_set_level(struct abm *abm, uint32_t level)
  {
union dmub_rb_cmd cmd;
@@ -147,6 +126,10 @@ static bool dmub_abm_set_level(struct abm *abm, uint32_t 
level)
int i;
int edp_num;
uint8_t panel_mask = 0;
+   struct amdgpu_device *dev = dc->driver_context;


NAK. We can't access amdgpu_device in DC. This is code that's
shared with other OSes.

I've mentioned this in my previous review a month ago.

What happened to the other suggestion I had? I never saw
a follow-up.

My previous comments, copy-pasted here again. Please address
or answer why you disagree:



This patch still has the problem of accessing adev from within DC.
That'll break things on other platforms. This information needs to
come in through the DC interface if we want to enable/disable ABM in
this function.

After a closer look I also don't think that amdgpu is the right place
to control the logic to disable ABM in AC mode, i.e. to switch between
ABM levels. Take a look at dm_connector_state.abm_level and the
abm_level_property. It's exposed to userspace as "abm level".

The "abm level" defaults to "0" unless userspace sets the "abm level"
to something else. The same component that sets the "abm level"
initially is the one that should set it to "0" when in AC mode.



It might be that the ABM level is controlled via the abmlevel module
parameter. If that's the case and there isn't a userspace that sets the
"abm level" property then the easiest way to handle this is to switch
between 0 and amdgpu_dm_abm_level 

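A rough sketch of that direction, for illustration only (the dm-level
function below is made up; only abm_level, amdgpu_dm_abm_level and
adev->pm.ac_power come from the existing code):

#include "amdgpu.h"
#include "amdgpu_dm.h"

/* Keep the AC/DC policy in amdgpu_dm, which owns the "abm level"
 * property, instead of reaching into adev from shared DC code.
 */
static void dm_update_abm_level_for_power_source(struct amdgpu_device *adev,
                                                 struct dm_connector_state *dm_state)
{
        /* disable ABM on AC for best contrast, restore the configured
         * level when running on battery
         */
        if (adev->pm.ac_power)
                dm_state->abm_level = 0;
        else
                dm_state->abm_level = amdgpu_dm_abm_level;
}
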
Re: [PATCH v2] drm/doc: add rfc section for small BAR uapi

2022-04-27 Thread Christian König

Am 27.04.22 um 17:02 schrieb Matthew Auld:

On 27/04/2022 07:55, Christian König wrote:
Well usually we increment the drm minor version when adding some new 
flags on amdgpu.


Additional to that just one comment from our experience with that: 
You don't just need one flag, but two. The first one is a hint which 
says "CPU access needed" and the second is a promise which says "CPU 
access never needed".


The background is that on a whole bunch of buffers you can 100% 
certain say that you will never ever need CPU access.


Then at least we have a whole bunch of buffers where we might need 
CPU access, but can't tell for sure.


And last we have stuff like transfer buffers you can be 100% sure 
that you need CPU access.


Separating it like this helped a lot with performance on small BAR 
systems.


Thanks for the comments. For the "CPU access never needed" flag, what 
extra stuff does that do on the kernel side vs not specifying any 
flag/hint? I assume it still prioritizes using the non-CPU visible 
portion first? What else does it do?


It's used as a hint when you need to pin BOs for scanout for example.

In general we try to allocate BOs which are marked "CPU access needed" 
in the CPU visible window if possible, but fall back to any memory if 
that won't fit.


Christian.
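
As a concrete illustration of the hint/promise split, amdgpu already
exposes exactly this pair of flags in its uapi; a minimal allocation
sketch (the vram_alloc() helper is made up, the flags and ioctl are the
existing amdgpu ones):

#include <stdint.h>
#include <xf86drm.h>
#include <drm/amdgpu_drm.h>

static int vram_alloc(int fd, uint64_t size, uint64_t access_flag,
                      uint32_t *handle)
{
        union drm_amdgpu_gem_create args = {
                .in = {
                        .bo_size = size,
                        .alignment = 4096,
                        .domains = AMDGPU_GEM_DOMAIN_VRAM,
                        .domain_flags = access_flag,
                },
        };
        int ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &args);

        if (!ret)
                *handle = args.out.handle;
        return ret;
}

/* transfer buffer, CPU access is certain:
 *     vram_alloc(fd, sz, AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED, &h);
 * render target the CPU will never touch, promise "never needed":
 *     vram_alloc(fd, sz, AMDGPU_GEM_CREATE_NO_CPU_ACCESS, &h);
 * not sure, pass neither flag and let the kernel decide:
 *     vram_alloc(fd, sz, 0, &h);
 */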





Regards,
Christian.

Am 27.04.22 um 08:48 schrieb Lionel Landwerlin:
One question though, how do we detect that this flag 
(I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS) is accepted on a given 
kernel?
I assume older kernels are going to reject object creation if we use 
this flag?


I didn't plan to use __drm_i915_query_vma_info, but isn't it 
inconsistent to select the placement on the GEM object and then 
query whether it's mappable by address?
You made a comment stating this is racy, wouldn't querying on the 
GEM object prevent this?


Thanks,

-Lionel

On 27/04/2022 09:35, Lionel Landwerlin wrote:

Hi Matt,


The proposal looks good to me.

Looking forward to try it on drm-tip.


-Lionel

On 20/04/2022 20:13, Matthew Auld wrote:

Add an entry for the new uapi needed for small BAR on DG2+.

v2:
   - Some spelling fixes and other small tweaks. (Akeem & Thomas)
   - Rework error capture interactions, including no longer needing
 NEEDS_CPU_ACCESS for objects marked for capture. (Thomas)
   - Add probed_cpu_visible_size. (Lionel)

Signed-off-by: Matthew Auld 
Cc: Thomas Hellström 
Cc: Lionel Landwerlin 
Cc: Jon Bloomfield 
Cc: Daniel Vetter 
Cc: Jordan Justen 
Cc: Kenneth Graunke 
Cc: Akeem G Abodunrin 
Cc: mesa-...@lists.freedesktop.org
---
  Documentation/gpu/rfc/i915_small_bar.h   | 190 
+++

  Documentation/gpu/rfc/i915_small_bar.rst |  58 +++
  Documentation/gpu/rfc/index.rst  |   4 +
  3 files changed, 252 insertions(+)
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.h
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.rst

diff --git a/Documentation/gpu/rfc/i915_small_bar.h 
b/Documentation/gpu/rfc/i915_small_bar.h

new file mode 100644
index ..7bfd0cf44d35
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_small_bar.h
@@ -0,0 +1,190 @@
+/**
+ * struct __drm_i915_memory_region_info - Describes one region as 
known to the

+ * driver.
+ *
+ * Note this is using both struct drm_i915_query_item and struct 
drm_i915_query.
+ * For this new query we are adding the new query id 
DRM_I915_QUERY_MEMORY_REGIONS

+ * at _i915_query_item.query_id.
+ */
+struct __drm_i915_memory_region_info {
+    /** @region: The class:instance pair encoding */
+    struct drm_i915_gem_memory_class_instance region;
+
+    /** @rsvd0: MBZ */
+    __u32 rsvd0;
+
+    /** @probed_size: Memory probed by the driver (-1 = unknown) */
+    __u64 probed_size;
+
+    /** @unallocated_size: Estimate of memory remaining (-1 = 
unknown) */

+    __u64 unallocated_size;
+
+    union {
+    /** @rsvd1: MBZ */
+    __u64 rsvd1[8];
+    struct {
+    /**
+ * @probed_cpu_visible_size: Memory probed by the driver
+ * that is CPU accessible. (-1 = unknown).
+ *
+ * This will be always be <= @probed_size, and the
+ * remainder(if there is any) will not be CPU
+ * accessible.
+ */
+    __u64 probed_cpu_visible_size;
+    };
+    };
+};
+
+/**
+ * struct __drm_i915_gem_create_ext - Existing gem_create 
behaviour, with added

+ * extension support using struct i915_user_extension.
+ *
+ * Note that new buffer flags should be added here, at least for 
the stuff that
+ * is immutable. Previously we would have two ioctls, one to 
create the object
+ * with gem_create, and another to apply various parameters, 
however this
+ * creates some ambiguity for the params which are considered 
immutable. Also in

+ * general we're phasing out the various SET/GET ioctls.
+ */
+struct __drm_i915_gem_create_ext {
+    /**
+ * @size: Requested size for the object.
+ *
+ * The (page-aligned) allocated 

Re: [PATCH v4 11/15] drm/shmem-helper: Add generic memory shrinker

2022-04-27 Thread Daniel Vetter
On Tue, Apr 19, 2022 at 11:40:41PM +0300, Dmitry Osipenko wrote:
> On 4/19/22 10:22, Thomas Zimmermann wrote:
> > Hi
> > 
> > Am 18.04.22 um 00:37 schrieb Dmitry Osipenko:
> >> Introduce a common DRM SHMEM shrinker. It allows reducing code
> >> duplication among DRM drivers that implement their own shrinkers.
> >> This is an initial version of the shrinker that covers the basic needs
> >> of GPU drivers; both purging and eviction of shmem objects are supported.
> >>
> >> This patch is based on a couple of ideas borrowed from Rob Clark's MSM
> >> shrinker and Thomas Zimmermann's variant of the SHMEM shrinker.
> >>
> >> In order to start using DRM SHMEM shrinker drivers should:
> >>
> >> 1. Implement new purge(), evict() + swap_in() GEM callbacks.
> >> 2. Register shrinker using drm_gem_shmem_shrinker_register(drm_device).
> >> 3. Use drm_gem_shmem_set_purgeable_and_evictable(shmem) and alike API
> >>     functions to activate shrinking of GEMs.
> >>
> >> Signed-off-by: Daniel Almeida 
> >> Signed-off-by: Dmitry Osipenko 
> >> ---
> >>   drivers/gpu/drm/drm_gem_shmem_helper.c | 765 -
> >>   include/drm/drm_device.h   |   4 +
> >>   include/drm/drm_gem.h  |  35 ++
> >>   include/drm/drm_gem_shmem_helper.h | 105 +++-
> >>   4 files changed, 877 insertions(+), 32 deletions(-)
> ...
> >> @@ -172,6 +172,41 @@ struct drm_gem_object_funcs {
> >>    * This is optional but necessary for mmap support.
> >>    */
> >>   const struct vm_operations_struct *vm_ops;
> >> +
> >> +    /**
> >> + * @purge:
> >> + *
> >> + * Releases the GEM object's allocated backing storage to the
> >> system.
> >> + *
> >> + * Returns the number of pages that have been freed by purging
> >> the GEM object.
> >> + *
> >> + * This callback is used by the GEM shrinker.
> >> + */
> >> +    unsigned long (*purge)(struct drm_gem_object *obj);

Hm I feel like drivers shouldn't need to know the difference here?

Like shmem helpers can track what's purgeable, and for eviction/purging
the driver callback should do the same?

The only difference is when we try to re-reserve the backing storage. When
the object has been evicted that should succeed, but when the object is
purged that will fail.

That's the difference between evict and purge for drivers?

> >> +
> >> +    /**
> >> + * @evict:
> >> + *
> >> + * Unpins the GEM object's allocated backing storage, allowing
> >> shmem pages
> >> + * to be swapped out.
> > 
> > What's the difference to the existing unpin() callback?
> 
> Drivers need to do more than just unpinning pages when GEMs are evicted.
> Unpinning is only a part of the eviction process. I'll improve the
> doc-comment in v5.
> 
> For example, for the VirtIO-GPU driver we need to detach the host from the
> guest's memory before pages are evicted [1].
> 
> [1]
> https://gitlab.collabora.com/dmitry.osipenko/linux-kernel-rd/-/blob/932eb03198bce3a21353b09ab71e95f1c19b84c2/drivers/gpu/drm/virtio/virtgpu_object.c#L145
> 
> In case of Panfrost driver, we will need to remove mappings before pages
> are evicted.

It might be good to align this with ttm, otoh that all works quite a bit
differently for ttm since ttm supports buffer moves and a lot more fancy
stuff.

I'm bringing this up since I have this fancy idea that eventually we could
glue shmem helpers into ttm in some cases for managing buffers when they
sit in system memory (as opposed to vram).

> >> + *
> >> + * Returns the number of pages that have been unpinned.
> >> + *
> >> + * This callback is used by the GEM shrinker.
> >> + */
> >> +    unsigned long (*evict)(struct drm_gem_object *obj);
> >> +
> >> +    /**
> >> + * @swap_in:
> >> + *
> >> + * Pins GEM object's allocated backing storage if it was
> >> previously evicted,
> >> + * moving swapped out pages back to memory.
> >> + *
> >> + * Returns 0 on success, or -errno on error.
> >> + *
> >> + * This callback is used by the GEM shrinker.
> >> + */
> >> +    int (*swap_in)(struct drm_gem_object *obj);
> > 
> > Why do you need swap_in()? This can be done on-demand as part of a pin
> > or vmap operation.
> 
> Similarly to the unpinning, the pinning of pages is only a part of what
> needs to be done for GPU drivers. Besides returning pages back to
> memory, we also need to make them accessible to the GPU and this is a
> driver-specific process. This is why we need the additional callbacks.

This is a bit much midlayer. The way this works in ttm is you reserve all
the objects you need (which makes sure they're physically available
again), and then the driver goes through and makes sure the page tables
are all set up again.

Once you get towards gpu vm that's really the only approach, since your
swap_in has no idea for which vm it needs to restore pagetables (and
restoring it for all is a bit meh).
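
In code, the shape of that flow is roughly (the foo_* names are made up,
only the drm_gem reservation helpers are real):

#include <drm/drm_gem.h>

struct foo_vm;
int foo_vm_rebuild_mappings(struct foo_vm *vm, struct drm_gem_object **bos,
                            int num_bos);

struct foo_job {
        struct drm_gem_object **bos;
        int num_bos;
        struct foo_vm *vm;
};

static int foo_prepare_job(struct foo_job *job)
{
        struct ww_acquire_ctx ctx;
        int ret;

        /* reserve every BO the job needs; this is the point where evicted
         * or swapped-out backing storage gets brought back in
         */
        ret = drm_gem_lock_reservations(job->bos, job->num_bos, &ctx);
        if (ret)
                return ret;

        /* driver-specific: restore the mappings of the one VM this job
         * actually runs in; only the driver knows which VM that is
         */
        ret = foo_vm_rebuild_mappings(job->vm, job->bos, job->num_bos);

        drm_gem_unlock_reservations(job->bos, job->num_bos, &ctx);
        return ret;
}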

If drivers want to optimize this they can adjust/set any tracking
information from their 

Re: [PATCH v2] drm/doc: add rfc section for small BAR uapi

2022-04-27 Thread Matthew Auld

On 27/04/2022 07:55, Christian König wrote:
Well usually we increment the drm minor version when adding some new 
flags on amdgpu.


Additional to that just one comment from our experience with that: You 
don't just need one flag, but two. The first one is a hint which says 
"CPU access needed" and the second is a promise which says "CPU access 
never needed".


The background is that on a whole bunch of buffers you can 100% certain 
say that you will never ever need CPU access.


Then at least we have a whole bunch of buffers where we might need CPU 
access, but can't tell for sure.


And last we have stuff like transfer buffers you can be 100% sure that 
you need CPU access.


Separating it like this helped a lot with performance on small BAR systems.


Thanks for the comments. For the "CPU access never needed" flag, what 
extra stuff does that do on the kernel side vs not specifying any 
flag/hint? I assume it still prioritizes using the non-CPU visible 
portion first? What else does it do?




Regards,
Christian.

Am 27.04.22 um 08:48 schrieb Lionel Landwerlin:
One question though, how do we detect that this flag 
(I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS) is accepted on a given 
kernel?
I assume older kernels are going to reject object creation if we use 
this flag?


I didn't plan to use __drm_i915_query_vma_info, but isn't it 
inconsistent to select the placement on the GEM object and then query 
whether it's mappable by address?
You made a comment stating this is racy, wouldn't querying on the GEM 
object prevent this?


Thanks,

-Lionel

On 27/04/2022 09:35, Lionel Landwerlin wrote:

Hi Matt,


The proposal looks good to me.

Looking forward to try it on drm-tip.


-Lionel

On 20/04/2022 20:13, Matthew Auld wrote:

Add an entry for the new uapi needed for small BAR on DG2+.

v2:
   - Some spelling fixes and other small tweaks. (Akeem & Thomas)
   - Rework error capture interactions, including no longer needing
 NEEDS_CPU_ACCESS for objects marked for capture. (Thomas)
   - Add probed_cpu_visible_size. (Lionel)

Signed-off-by: Matthew Auld 
Cc: Thomas Hellström 
Cc: Lionel Landwerlin 
Cc: Jon Bloomfield 
Cc: Daniel Vetter 
Cc: Jordan Justen 
Cc: Kenneth Graunke 
Cc: Akeem G Abodunrin 
Cc: mesa-...@lists.freedesktop.org
---
  Documentation/gpu/rfc/i915_small_bar.h   | 190 
+++

  Documentation/gpu/rfc/i915_small_bar.rst |  58 +++
  Documentation/gpu/rfc/index.rst  |   4 +
  3 files changed, 252 insertions(+)
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.h
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.rst

diff --git a/Documentation/gpu/rfc/i915_small_bar.h 
b/Documentation/gpu/rfc/i915_small_bar.h

new file mode 100644
index ..7bfd0cf44d35
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_small_bar.h
@@ -0,0 +1,190 @@
+/**
+ * struct __drm_i915_memory_region_info - Describes one region as 
known to the

+ * driver.
+ *
+ * Note this is using both struct drm_i915_query_item and struct 
drm_i915_query.
+ * For this new query we are adding the new query id 
DRM_I915_QUERY_MEMORY_REGIONS

+ * at _i915_query_item.query_id.
+ */
+struct __drm_i915_memory_region_info {
+    /** @region: The class:instance pair encoding */
+    struct drm_i915_gem_memory_class_instance region;
+
+    /** @rsvd0: MBZ */
+    __u32 rsvd0;
+
+    /** @probed_size: Memory probed by the driver (-1 = unknown) */
+    __u64 probed_size;
+
+    /** @unallocated_size: Estimate of memory remaining (-1 = 
unknown) */

+    __u64 unallocated_size;
+
+    union {
+    /** @rsvd1: MBZ */
+    __u64 rsvd1[8];
+    struct {
+    /**
+ * @probed_cpu_visible_size: Memory probed by the driver
+ * that is CPU accessible. (-1 = unknown).
+ *
+ * This will be always be <= @probed_size, and the
+ * remainder(if there is any) will not be CPU
+ * accessible.
+ */
+    __u64 probed_cpu_visible_size;
+    };
+    };
+};
+
+/**
+ * struct __drm_i915_gem_create_ext - Existing gem_create 
behaviour, with added

+ * extension support using struct i915_user_extension.
+ *
+ * Note that new buffer flags should be added here, at least for 
the stuff that
+ * is immutable. Previously we would have two ioctls, one to create 
the object
+ * with gem_create, and another to apply various parameters, 
however this
+ * creates some ambiguity for the params which are considered 
immutable. Also in

+ * general we're phasing out the various SET/GET ioctls.
+ */
+struct __drm_i915_gem_create_ext {
+    /**
+ * @size: Requested size for the object.
+ *
+ * The (page-aligned) allocated size for the object will be 
returned.

+ *
+ * Note that for some devices we have might have further minimum
+ * page-size restrictions(larger than 4K), like for device 
local-memory.
+ * However in general the final size here should always reflect 
any
+ * rounding up, 

Re: [PATCH v4 10/15] drm/shmem-helper: Take reservation lock instead of drm_gem_shmem locks

2022-04-27 Thread Daniel Vetter
On Mon, Apr 18, 2022 at 10:18:54PM +0300, Dmitry Osipenko wrote:
> Hello,
> 
> On 4/18/22 21:38, Thomas Zimmermann wrote:
> > Hi
> > 
> > Am 18.04.22 um 00:37 schrieb Dmitry Osipenko:
> >> Replace drm_gem_shmem locks with the reservation lock to make GEM
> >> lockings more consistent.
> >>
> >> Previously drm_gem_shmem_vmap() and drm_gem_shmem_get_pages() were
> >> protected by separate locks, now it's the same lock, but it doesn't
> >> make any difference for the current GEM SHMEM users. Only Panfrost
> >> and Lima drivers use vmap() and they do it in the slow code paths,
> >> hence there was no practical justification for the usage of separate
> >> lock in the vmap().
> >>
> >> Suggested-by: Daniel Vetter 
> >> Signed-off-by: Dmitry Osipenko 
> >> ---
> ...
> >>   @@ -310,7 +306,7 @@ static int drm_gem_shmem_vmap_locked(struct
> >> drm_gem_shmem_object *shmem,
> >>   } else {
> >>   pgprot_t prot = PAGE_KERNEL;
> >>   -    ret = drm_gem_shmem_get_pages(shmem);
> >> +    ret = drm_gem_shmem_get_pages_locked(shmem);
> >>   if (ret)
> >>   goto err_zero_use;
> >>   @@ -360,11 +356,11 @@ int drm_gem_shmem_vmap(struct
> >> drm_gem_shmem_object *shmem,
> >>   {
> >>   int ret;
> >>   -    ret = mutex_lock_interruptible(>vmap_lock);
> >> +    ret = dma_resv_lock_interruptible(shmem->base.resv, NULL);
> >>   if (ret)
> >>   return ret;
> >>   ret = drm_gem_shmem_vmap_locked(shmem, map);
> > 
> > Within drm_gem_shmem_vmap_locked(), there's a call to dma_buf_vmap() for
> > imported pages. If the exporter side also holds/acquires the same
> > reservation lock as our object, the whole thing can deadlock. We cannot
> > move dma_buf_vmap() out of the CS, because we still need to increment
> > the reference counter. I honestly don't know how to easily fix this
> > problem. There's a TODO item about replacing these locks at [1]. As
> > Daniel suggested this patch, we should talk to him about the issue.
> > 
> > Best regards
> > Thomas
> > 
> > [1]
> > https://www.kernel.org/doc/html/latest/gpu/todo.html#move-buffer-object-locking-to-dma-resv-lock
> 
> Indeed, good catch! Perhaps we could simply use a separate lock for the
> vmapping of the *imported* GEMs? The vmap_use_count is used only by
> vmap/vunmap, so it doesn't matter which lock is used by these functions
> in the case of imported GEMs since we only need to protect the
> vmap_use_count.

Apologies for the late reply, I'm flooded.

I discussed this with Daniel Stone last week in a chat, roughly what we
need to do is:

1. Pick a function from shmem helpers.

2. Go through all drivers that call this, and make sure that we acquire
dma_resv_lock in the top level driver entry point for this.

3. Once all driver code paths are converted, add a dma_resv_assert_held()
call to that function to make sure you have it all correctly.

4. Repeat 1-3 until all shmem helper functions are converted over.

5. Ditch the 3 different shmem helper locks.
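
For a single helper, one iteration of that plan looks roughly like this
(the foo_ driver entry point is made up; the lock and shmem calls are the
real ones):

#include <linux/dma-resv.h>
#include <drm/drm_gem_shmem_helper.h>

static int foo_driver_pin(struct drm_gem_object *obj)
{
        struct drm_gem_shmem_object *shmem = to_drm_gem_shmem_obj(obj);
        int ret;

        /* step 2: the top-level driver entry point takes the resv lock */
        ret = dma_resv_lock_interruptible(obj->resv, NULL);
        if (ret)
                return ret;

        /* step 3: once every caller does this, the helper itself only
         * carries a dma_resv_assert_held() instead of its own mutex
         */
        ret = drm_gem_shmem_pin(shmem);

        dma_resv_unlock(obj->resv);
        return ret;
}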

The trouble is that I forgot that vmap is a thing, so that needs more
work. I think there's two approaches here:
- Do the vmap at import time. This is the trick we used to untangle the
  dma_resv_lock issues around dma_buf_attachment_map()
- Change the dma_buf_vmap rules that callers must hold the dma_resv_lock.
- Maybe also do what you suggest and keep a separate lock for this, but
  the fundamental issue is that this doesn't really work - if you share
  buffers both ways with two drivers using shmem helpers, then the
  ordering of this vmap_count_mutex vs dma_resv_lock is inconsistent and
  you can get some nice deadlocks. So not a great approach (and also the
  reason why we really need to get everyone to move towards dma_resv_lock
  as _the_ buffer object lock, since otherwise we'll never get a
  consistent lock nesting hierarchy).

The trouble here is that trying to be clever and doing the conversion just
in shmem helpers won't work, because there's a lot of cases where the
drivers are all kinds of inconsistent with their locking.

Adding Daniel S, also maybe for questions it'd be fastest to chat on irc?
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch


Re: [PATCH] drm/amdgpu: do not use passthrough mode in Xen dom0

2022-04-27 Thread Alex Deucher
Applied.  Thanks!

Alex

On Wed, Apr 27, 2022 at 3:12 AM Marek Marczykowski-Górecki
 wrote:
>
> While technically Xen dom0 is a virtual machine too, it does have
> access to most of the hardware so it doesn't need to be considered a
> "passthrough". Commit b818a5d37454 ("drm/amdgpu/gmc: use PCI BARs for
> APUs in passthrough") changed how FB is accessed based on passthrough
> mode. This breaks amdgpu in Xen dom0 with message like this:
>
> [drm:dc_dmub_srv_wait_idle [amdgpu]] *ERROR* Error waiting for DMUB idle: 
> status=3
>
> While the reason for this failure is unclear, the passthrough mode is
> not really necessary in Xen dom0 anyway. So, to unbreak booting affected
> kernels, disable passthrough mode in this case.
>
> Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1985
> Fixes: b818a5d37454 ("drm/amdgpu/gmc: use PCI BARs for APUs in passthrough")
> Signed-off-by: Marek Marczykowski-Górecki 
> Cc: sta...@vger.kernel.org
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index a025f080aa6a..5e3756643da3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -24,6 +24,7 @@
>  #include 
>
>  #include 
> +#include 
>
>  #include "amdgpu.h"
>  #include "amdgpu_ras.h"
> @@ -710,7 +711,8 @@ void amdgpu_detect_virtualization(struct amdgpu_device 
> *adev)
> adev->virt.caps |= AMDGPU_SRIOV_CAPS_ENABLE_IOV;
>
> if (!reg) {
> -   if (is_virtual_machine())   /* passthrough mode exclus 
> sriov mod */
> +   /* passthrough mode exclus sriov mod */
> +   if (is_virtual_machine() && !xen_initial_domain())
> adev->virt.caps |= AMDGPU_PASSTHROUGH_MODE;
> }
>
> --
> 2.35.1
>


Re: [PATCH] gpu: drm: remove redundant dma_fence_put() when drm_sched_job_add_dependency() fails

2022-04-27 Thread Andrey Grodzovsky


On 2022-04-26 22:31, Hangyu Hua wrote:

On 2022/4/26 22:55, Andrey Grodzovsky wrote:


On 2022-04-25 22:54, Hangyu Hua wrote:

On 2022/4/25 23:42, Andrey Grodzovsky wrote:

On 2022-04-25 04:36, Hangyu Hua wrote:

When drm_sched_job_add_dependency() fails, dma_fence_put() will be 
called
internally. Calling it again after drm_sched_job_add_dependency() 
finishes

may result in a dangling pointer.

Fix this by removing redundant dma_fence_put().

Signed-off-by: Hangyu Hua 
---
  drivers/gpu/drm/lima/lima_gem.c    | 1 -
  drivers/gpu/drm/scheduler/sched_main.c | 1 -
  2 files changed, 2 deletions(-)

diff --git a/drivers/gpu/drm/lima/lima_gem.c 
b/drivers/gpu/drm/lima/lima_gem.c

index 55bb1ec3c4f7..99c8e7f6bb1c 100644
--- a/drivers/gpu/drm/lima/lima_gem.c
+++ b/drivers/gpu/drm/lima/lima_gem.c
@@ -291,7 +291,6 @@ static int lima_gem_add_deps(struct drm_file 
*file, struct lima_submit *submit)
  err = drm_sched_job_add_dependency(>task->base, 
fence);

  if (err) {
-    dma_fence_put(fence);
  return err;



Makes sense here



  }
  }
diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c

index b81fceb0b8a2..ebab9eca37a8 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -708,7 +708,6 @@ int 
drm_sched_job_add_implicit_dependencies(struct drm_sched_job *job,

  dma_fence_get(fence);
  ret = drm_sched_job_add_dependency(job, fence);
  if (ret) {
-    dma_fence_put(fence);




Not sure about this one since if you look at the relevant commits -
'drm/scheduler: fix drm_sched_job_add_implicit_dependencies' and
'drm/scheduler: fix drm_sched_job_add_implicit_dependencies harder'
You will see that the dma_fence_put here balances the extra 
dma_fence_get

above

Andrey



I don't think so. I checked the call chain and found no additional 
dma_fence_get(). But dma_fence_get() needs to be called before 
drm_sched_job_add_dependency() to keep the counter balanced. 



I'm not saying there is an additional get, I'm just saying that
drm_sched_job_add_dependency doesn't grab an extra reference to the
fences it stores, so this needs to be done outside. For that,
drm_sched_job_add_implicit_dependencies->dma_fence_get is called and,
if this addition fails, you just call dma_fence_put to keep the
counter balanced.




drm_sched_job_add_implicit_dependencies() will call 
drm_sched_job_add_dependency(). And drm_sched_job_add_dependency() 
already calls dma_fence_put() when it fails. Calling dma_fence_put() 
twice doesn't make sense.


dma_fence_get() is in [2]. But dma_fence_put() will be called in [1] 
and [3] when xa_alloc() fails.



The way I see it, [2] and [3] are matching *get* and *put* 
respectively. [1] *put* is against the original 
dma_fence_init->kref_init of the fence, which always sets the refcount to 1.
Also in support of this see commit 'drm/scheduler: fix 
drm_sched_job_add_implicit_dependencies harder' - it says there 
"drm_sched_job_add_dependency() could drop the last ref"  - this last 
ref is the original refcount set by dma_fence_init->kref


Andrey





int drm_sched_job_add_dependency(struct drm_sched_job *job,
 struct dma_fence *fence)
{
...
ret = xa_alloc(>dependencies, , fence, xa_limit_32b, 
GFP_KERNEL);

if (ret != 0)
    dma_fence_put(fence);    <--- [1]

return ret;
}
EXPORT_SYMBOL(drm_sched_job_add_dependency);


int drm_sched_job_add_implicit_dependencies(struct drm_sched_job *job,
    struct drm_gem_object *obj,
    bool write)
{
struct dma_resv_iter cursor;
struct dma_fence *fence;
int ret;

dma_resv_for_each_fence(, obj->resv, write, fence) {
    /* Make sure to grab an additional ref on the added fence */
    dma_fence_get(fence);    <--- [2]
    ret = drm_sched_job_add_dependency(job, fence);
    if (ret) {
    dma_fence_put(fence);    <--- [3]
    return ret;
    }
}
return 0;
}




On the other hand, dma_fence_get() and dma_fence_put() are 
meaningless here if there is an extra dma_fence_get(), because the 
counter will not decrease to 0 during drm_sched_job_add_dependency().


I check the call chain as follows:

msm_ioctl_gem_submit()
-> submit_fence_sync()
-> drm_sched_job_add_implicit_dependencies()



Can you maybe trace or print one such example of the problematic refcount 
that you are trying to fix? I still don't see where the problem is.


Andrey



I also wish I could. System logs would make this easy. But I don't have 
a corresponding physical GPU device. 
drm_sched_job_add_implicit_dependencies is only used by a few drivers.


Thanks.




Thanks,
Hangyu




  return ret;
  }
  }

Re: [PATCH v6 19/19] drm/msm/dpu: add wb_idx to DRM traces in dpu_encoder

2022-04-27 Thread Abhinav Kumar

Hi Dmitry

Thanks for fixing it up.

I agree about the indentation issue.

And yes, even wb_idx missing in TP_ARGS seems like a genuine miss.

But the weird part is it did not break my compilation. I tested even now 
without your fix.


Is there something I need to enable in my config to replicate the error, 
for future reference?


Thanks

Abhinav
On 4/27/2022 3:43 AM, Dmitry Baryshkov wrote:

On 26/04/2022 17:41, Abhinav Kumar wrote:

Change the DRM traces to include both the intf_mode
and wb_idx similar to the DRM prints in the previous change.

Signed-off-by: Abhinav Kumar 
Reviewed-by: Dmitry Baryshkov 


This commit broke the traces. I'm going to apply a fix.


---
  drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c | 13 -
  drivers/gpu/drm/msm/disp/dpu1/dpu_trace.h   | 26 
++

  2 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c

index 35080c4..52516eb 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c
@@ -1354,8 +1354,9 @@ static void dpu_encoder_frame_done_callback(
   * suppress frame_done without waiter,
   * likely autorefresh
   */
-    trace_dpu_enc_frame_done_cb_not_busy(DRMID(drm_enc),
-    event, ready_phys->intf_idx);
+    trace_dpu_enc_frame_done_cb_not_busy(DRMID(drm_enc), event,
+
dpu_encoder_helper_get_intf_type(ready_phys->intf_mode),

+    ready_phys->intf_idx, ready_phys->wb_idx);
  return;
  }
@@ -1433,9 +1434,11 @@ static void _dpu_encoder_trigger_flush(struct 
drm_encoder *drm_enc,

  if (ctl->ops.get_pending_flush)
  ret = ctl->ops.get_pending_flush(ctl);
-    trace_dpu_enc_trigger_flush(DRMID(drm_enc), phys->intf_idx,
-    pending_kickoff_cnt, ctl->idx,
-    extra_flush_bits, ret);
+    trace_dpu_enc_trigger_flush(DRMID(drm_enc),
+    dpu_encoder_helper_get_intf_type(phys->intf_mode),
+    phys->intf_idx, phys->wb_idx,
+    pending_kickoff_cnt, ctl->idx,
+    extra_flush_bits, ret);
  }
  /**
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_trace.h 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_trace.h

index 58b411f..1106d44 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_trace.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_trace.h
@@ -380,20 +380,26 @@ TRACE_EVENT(dpu_enc_rc,
  );
  TRACE_EVENT(dpu_enc_frame_done_cb_not_busy,
-    TP_PROTO(uint32_t drm_id, u32 event, enum dpu_intf intf_idx),
-    TP_ARGS(drm_id, event, intf_idx),
+    TP_PROTO(uint32_t drm_id, u32 event, char *intf_mode, enum 
dpu_intf intf_idx,

+    enum dpu_wb wb_idx),
+    TP_ARGS(drm_id, event, intf_mode, intf_idx, wb_idx),
  TP_STRUCT__entry(
  __field(    uint32_t,    drm_id    )
  __field(    u32,    event    )
+    __string(    intf_mode_str,    intf_mode    )
  __field(    enum dpu_intf,    intf_idx    )
+    __field(    enum dpu_wb,  wb_idx    )


Nit: indentation broken. Please use tabs.


  ),
  TP_fast_assign(
  __entry->drm_id = drm_id;
  __entry->event = event;
+    __assign_str(intf_mode_str, intf_mode);
  __entry->intf_idx = intf_idx;
+    __entry->wb_idx = wb_idx;
  ),
-    TP_printk("id=%u, event=%u, intf=%d", __entry->drm_id, 
__entry->event,

-  __entry->intf_idx)
+    TP_printk("id=%u, event=%u, intf_mode=%s intf=%d wb=%d", 
__entry->drm_id,

+    __entry->event, __get_str(intf_mode_str),
+    __entry->intf_idx, __entry->wb_idx)
  );
  TRACE_EVENT(dpu_enc_frame_done_cb,
@@ -415,14 +421,16 @@ TRACE_EVENT(dpu_enc_frame_done_cb,
  );
  TRACE_EVENT(dpu_enc_trigger_flush,
-    TP_PROTO(uint32_t drm_id, enum dpu_intf intf_idx,
+    TP_PROTO(uint32_t drm_id, char *intf_mode, enum dpu_intf 
intf_idx, enum dpu_wb wb_idx,

   int pending_kickoff_cnt, int ctl_idx, u32 extra_flush_bits,
   u32 pending_flush_ret),
-    TP_ARGS(drm_id, intf_idx, pending_kickoff_cnt, ctl_idx,
+    TP_ARGS(drm_id, intf_mode, intf_idx, pending_kickoff_cnt, ctl_idx,
  extra_flush_bits, pending_flush_ret),


wb_idx is missing from the TP_ARGS, so compilation fails.


  TP_STRUCT__entry(
  __field(    uint32_t,    drm_id    )
+    __string(    intf_mode_str,    intf_mode    )
  __field(    enum dpu_intf,    intf_idx    )
+    __field(    enum dpu_wb,  wb_idx    )


Nit: indentation broken. Please use tabs.


  __field(    int,    pending_kickoff_cnt    )
  __field(    int,    ctl_idx    )
  __field(    u32,    extra_flush_bits    )
@@ -430,15 +438,17 @@ TRACE_EVENT(dpu_enc_trigger_flush,
  ),
  TP_fast_assign(
  __entry->drm_id = drm_id;
+    __assign_str(intf_mode_str, intf_mode);
  

Re: [PATCH 2/2] Revert "drm: of: Lookup if child node has panel or bridge"

2022-04-27 Thread Maxime Ripard
On Tue, Apr 26, 2022 at 01:40:31PM +0530, Jagan Teki wrote:
> On Tue, Apr 26, 2022 at 1:24 PM Paul Kocialkowski
>  wrote:
> >
> > Hi,
> >
> > On Thu 21 Apr 22, 10:59, Paul Kocialkowski wrote:
> > > Hi Maxime,
> > >
> > > On Thu 21 Apr 22, 10:23, Maxime Ripard wrote:
> > > > On Thu, Apr 21, 2022 at 01:15:54PM +0530, Jagan Teki wrote:
> > > > > + Linus
> > > > > + Marek
> > > > > + Laurent
> > > > > + Robert
> > > > >
> > > > > On Thu, Apr 21, 2022 at 4:40 AM Bjorn Andersson
> > > > >  wrote:
> > > > > >
> > > > > > Commit '80253168dbfd ("drm: of: Lookup if child node has panel or
> > > > > > bridge")' attempted to simplify the case of expressing a simple 
> > > > > > panel
> > > > > > under a DSI controller, by assuming that the first non-graph child 
> > > > > > node
> > > > > > was a panel or bridge.
> > > > > >
> > > > > > Unfortunately for non-trivial cases the first child node might not 
> > > > > > be a
> > > > > > panel or bridge.  Examples of this can be a aux-bus in the case of
> > > > > > DisplayPort, or an opp-table represented before the panel node.
> > > > > >
> > > > > > In these cases the reverted commit prevents the caller from ever 
> > > > > > finding
> > > > > > a reference to the panel.
> > > > > >
> > > > > > This reverts commit '80253168dbfd ("drm: of: Lookup if child node 
> > > > > > has
> > > > > > panel or bridge")', in favor of using an explicit graph reference 
> > > > > > to the
> > > > > > panel in the trivial case as well.
> > > > >
> > > > > This eventually breaks many child-based devm_drm_of_get_bridge
> > > > > switched drivers.  Do you have any suggestions on how to proceed to
> > > > > succeed in those use cases as well?
> > > >
> > > > I guess we could create a new helper for those, like
> > > > devm_drm_of_get_bridge_with_panel, or something.
> > >
> > > Oh wow I feel stupid for not thinking about that.
> > >
> > > Yeah I agree that it seems like the best option.
> >
> > Should I prepare a patch with such a new helper?
> >
> > The idea would be to keep drm_of_find_panel_or_bridge only for the of graph
> > case and add one for the child node case, maybe:
> > drm_of_find_child_panel_or_bridge.
> >
> > I really don't have a clear idea of which driver would need to be switched
> > over though. Could someone (Jagan?) let me know where it would be needed?
> 
> sun6i_mipi_dsi

It doesn't look like sun6i_mipi_dsi is using devm_drm_of_get_bridge at all?

> exynos_drm_dsi

If you are referring to 711c7adc4687, I don't see why we would need to switch
it back to the old behaviour. It wasn't iterating over its child node
before, so what did the switch to drm_of_get_bridge break exactly?

> mcde_dsi (as of now)

Yeah, we do need to revert 3730bc6147b0 and 3d7039e1e649

Maxime
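
For reference, the helper discussed above could look roughly like this
(name taken from the thread, nothing of this exists yet; it only makes
sense for bindings where the panel or bridge really is a direct child
node of the DSI host):

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/of.h>
#include <drm/drm_bridge.h>
#include <drm/drm_panel.h>

static struct drm_bridge *
devm_drm_of_get_bridge_with_panel(struct device *dev, struct device_node *np)
{
        struct device_node *child;
        struct drm_bridge *bridge;
        struct drm_panel *panel;

        for_each_available_child_of_node(np, child) {
                panel = of_drm_find_panel(child);
                if (!IS_ERR(panel)) {
                        of_node_put(child);
                        return devm_drm_panel_bridge_add(dev, panel);
                }

                bridge = of_drm_find_bridge(child);
                if (bridge) {
                        of_node_put(child);
                        return bridge;
                }
        }

        return ERR_PTR(-EPROBE_DEFER);
}

That keeps drm_of_find_panel_or_bridge() purely OF-graph based, while the
drivers whose binding puts the panel under the DSI node get an explicit
child lookup instead of the implicit one that commit 80253168dbfd added.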


Re: [PATCH] drm/i915/uc: use io memcpy functions for device memory copy

2022-04-27 Thread Siva Mullati
LGTM

Acked-by: Siva Mullati 

On 06/04/22 14:48, Vivekanandan, Balasubramani wrote:
> When copying the RSA, use io memcpy functions if the destination address
> contains a GPU local memory address. Since even the source
> address can be in local memory, a bounce buffer is used to copy from io
> to io.
> The intention of this patch is to make i915 portable outside x86 mainly
> on ARM64.
>
> Signed-off-by: Balasubramani Vivekanandan 
> 
> ---
>  drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 23 +--
>  1 file changed, 21 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c 
> b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
> index bb864655c495..06d30670e15c 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
> @@ -589,7 +589,7 @@ static int uc_fw_rsa_data_create(struct intel_uc_fw 
> *uc_fw)
>   struct intel_gt *gt = __uc_fw_to_gt(uc_fw);
>   struct i915_vma *vma;
>   size_t copied;
> - void *vaddr;
> + void *vaddr, *bounce;
>   int err;
>  
>   err = i915_inject_probe_error(gt->i915, -ENXIO);
> @@ -621,7 +621,26 @@ static int uc_fw_rsa_data_create(struct intel_uc_fw 
> *uc_fw)
>   goto unpin_out;
>   }
>  
> - copied = intel_uc_fw_copy_rsa(uc_fw, vaddr, vma->size);
> + if (i915_gem_object_is_lmem(vma->obj)) {
> + /* When vma is allocated from the GPU local memmory, it means
> +  * the destination address contains an io memory and we need to
> +  * use memcpy function for io memory for copying, to ensure
> +  * i915 portability outside x86. It is most likely the RSA will
> +  * also be on local memory and so the source of copy will also
> +  * be an io address. Since we cannot directly copy from io to
> +  * io, we use a bounce buffer to copy.
> +  */
> + copied = 0;
> + bounce = kmalloc(vma->size, GFP_KERNEL);
> + if (likely(bounce)) {
> + copied = intel_uc_fw_copy_rsa(uc_fw, bounce, vma->size);
> + memcpy_toio((void __iomem *)vaddr, bounce, copied);
> + kfree(bounce);
> + }
> + } else {
> + copied = intel_uc_fw_copy_rsa(uc_fw, vaddr, vma->size);
> + }
> +
>   i915_gem_object_unpin_map(vma->obj);
>  
>   if (copied < uc_fw->rsa_size) {


Re: [RFC] drm/kms: control display brightness through drm_connector properties

2022-04-27 Thread Daniel Vetter
On Wed, Apr 27, 2022 at 05:23:22PM +0300, Jani Nikula wrote:
> On Wed, 27 Apr 2022, Daniel Vetter  wrote:
> > On Thu, Apr 14, 2022 at 01:24:30PM +0300, Jani Nikula wrote:
> >> On Mon, 11 Apr 2022, Alex Deucher  wrote:
> >> > On Mon, Apr 11, 2022 at 6:18 AM Hans de Goede  
> >> > wrote:
> >> >>
> >> >> Hi,
> >> >>
> >> >> On 4/8/22 17:11, Alex Deucher wrote:
> >> >> > On Fri, Apr 8, 2022 at 10:56 AM Hans de Goede  
> >> >> > wrote:
> >> >> >>
> >> >> >> Hi,
> >> >> >>
> >> >> >> On 4/8/22 16:08, Alex Deucher wrote:
> >> >> >>> On Fri, Apr 8, 2022 at 4:07 AM Daniel Vetter  
> >> >> >>> wrote:
> >> >> 
> >> >>  On Thu, Apr 07, 2022 at 05:05:52PM -0400, Alex Deucher wrote:
> >> >> > On Thu, Apr 7, 2022 at 1:43 PM Hans de Goede 
> >> >> >  wrote:
> >> >> >>
> >> >> >> Hi Simon,
> >> >> >>
> >> >> >> On 4/7/22 18:51, Simon Ser wrote:
> >> >> >>> Very nice plan! Big +1 for the overall approach.
> >> >> >>
> >> >> >> Thanks.
> >> >> >>
> >> >> >>> On Thursday, April 7th, 2022 at 17:38, Hans de Goede 
> >> >> >>>  wrote:
> >> >> >>>
> >> >>  The drm_connector brightness properties
> >> >>  ===
> >> >> 
> >> >>  bl_brightness: rw 0-int32_max property controlling the 
> >> >>  brightness setting
> >> >>  of the connected display. The actual maximum of this will be 
> >>  less than
> >> >>  int32_max and is given in bl_brightness_max.
> >> >> >>>
> >> >> >>> Do we need to split this up into two props for sw/hw state? The 
> >> >> >>> privacy screen
> >> >> >>> stuff needed this, but you're pretty familiar with that. :)
> >> >> >>
> >> >> >> Luckily that won't be necessary, since the privacy-screen is a 
> >> >> >> security
> >> >> >> feature the firmware/embedded-controller may refuse our requests
> >> >> >> (may temporarily lock-out changes) and/or may make changes 
> >> >> >> without
> >> >> >> us requesting them itself. Neither is really the case with the
> >> >> >> brightness setting of displays.
> >> >> >>
> >> >>  bl_brightness_max: ro 0-int32_max property giving the actual 
> >> >>  maximum
> >> >>  of the display's brightness setting. This will report 0 when 
> >> >>  brightness
> >> >>  control is not available (yet).
> >> >> >>>
> >> >> >>> I don't think we actually need that one. Integer KMS props all 
> >> >> >>> have a
> >> >> >>> range which can be fetched via drmModeGetProperty. The max can 
> >> >> >>> be
> >> >> >>> exposed via this range. Example with the existing alpha prop:
> >> >> >>>
> >> >> >>> "alpha": range [0, UINT16_MAX] = 65535
> >> >> >>
> >> >> >> Right, I already knew that, which is why I explicitly added a 
> >> >> >> range
> >> >> >> to the props already. The problem is that the range must be set
> >> >> >> before registering the connector and when the backlight driver
> >> >> >> only shows up (much) later during boot then we don't know the
> >> >> >> range when registering the connector. I guess we could "patch-up"
> >> >> >> the range later. But AFAIK that would be a bit of abuse of the
> >> >> >> property API as the range is intended to never change, not
> >> >> >> even after hotplug uevents. At least atm there is no infra
> >> >> >> in the kernel to change the range later.
> >> >> >>
> >> >> >> Which is why I added an explicit bl_brightness_max property
> >> >> >> of which the value gives the actual effective maximum of the
> >> >> >> brightness.
> >> >> 
> >> >>  Uh ... I'm not a huge fan tbh. The thing is, if we allow 
> >> >>  hotplugging
> >> >>  brightness control later on then we just perpetuate the nonsense 
> >> >>  we have
> >> >>  right now, forever.
> >> >> 
> >> >>  Imo we should support two kinds of drivers:
> >> >> 
> >> >>  - drivers which are non-crap, and make sure their backlight driver 
> >> >>  is
> >> >>    loaded before they register the drm_device (or at least the
> >> >>    drm_connector). For those we want the drm_connector->backlight 
> >> >>  pointer
> >> >>    to be static over the lifetime of the connector, and then we 
> >> >>  can also
> >> >>    set up the brightness range correctly.
> >> >> 
> >> >>  - funny drivers which implement the glorious fallback dance which
> >> >>    libbacklight implements currently in userspace. Imo for these 
> >> >>  drivers we
> >> >>    should have a libbacklight_heuristics_backlight, which 
> >> >>  normalizes or
> >> >>    whatever, and is always there. And then internally handles the
> >> >>    fallback mess to the "right" backlight driver.
> >> >> 
> >> >>  We might have some gaps on acpi systems to make sure the drm 
> >> >>  driver can
> >> >>  wait 

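To make the property side of this a bit more concrete, a kernel-side
sketch of wiring up the two RFC properties at connector init time (the
names are only the RFC proposal, nothing merged; foo_ is a placeholder):

#include <linux/errno.h>
#include <linux/limits.h>
#include <drm/drm_connector.h>
#include <drm/drm_property.h>

static int foo_connector_add_brightness_props(struct drm_connector *connector,
                                              u32 max_brightness)
{
        struct drm_device *dev = connector->dev;
        struct drm_property *prop;

        /* rw property, range fixed at registration time */
        prop = drm_property_create_range(dev, 0, "bl_brightness", 0, INT_MAX);
        if (!prop)
                return -ENOMEM;
        drm_object_attach_property(&connector->base, prop, 0);

        /* ro property reporting the actual maximum; 0 means "no
         * brightness control available (yet)"
         */
        prop = drm_property_create_range(dev, DRM_MODE_PROP_IMMUTABLE,
                                         "bl_brightness_max", 0, INT_MAX);
        if (!prop)
                return -ENOMEM;
        drm_object_attach_property(&connector->base, prop, max_brightness);

        return 0;
}

Whether the effective maximum is better expressed as a second property or
by fixing up the range of bl_brightness later is exactly the open question
in the thread above; the sketch just follows the RFC as written.
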
Re: [RFC] drm/kms: control display brightness through drm_connector properties

2022-04-27 Thread Jani Nikula
On Wed, 27 Apr 2022, Daniel Vetter  wrote:
> On Thu, Apr 14, 2022 at 01:24:30PM +0300, Jani Nikula wrote:
>> On Mon, 11 Apr 2022, Alex Deucher  wrote:
>> > On Mon, Apr 11, 2022 at 6:18 AM Hans de Goede  wrote:
>> >>
>> >> Hi,
>> >>
>> >> On 4/8/22 17:11, Alex Deucher wrote:
>> >> > On Fri, Apr 8, 2022 at 10:56 AM Hans de Goede  
>> >> > wrote:
>> >> >>
>> >> >> Hi,
>> >> >>
>> >> >> On 4/8/22 16:08, Alex Deucher wrote:
>> >> >>> On Fri, Apr 8, 2022 at 4:07 AM Daniel Vetter  wrote:
>> >> 
>> >>  On Thu, Apr 07, 2022 at 05:05:52PM -0400, Alex Deucher wrote:
>> >> > On Thu, Apr 7, 2022 at 1:43 PM Hans de Goede  
>> >> > wrote:
>> >> >>
>> >> >> Hi Simon,
>> >> >>
>> >> >> On 4/7/22 18:51, Simon Ser wrote:
>> >> >>> Very nice plan! Big +1 for the overall approach.
>> >> >>
>> >> >> Thanks.
>> >> >>
>> >> >>> On Thursday, April 7th, 2022 at 17:38, Hans de Goede 
>> >> >>>  wrote:
>> >> >>>
>> >>  The drm_connector brightness properties
>> >>  ===
>> >> 
>> >>  bl_brightness: rw 0-int32_max property controlling the 
>> >>  brightness setting
>> >>  of the connected display. The actual maximum of this will be 
>> >>  less than
>> >>  int32_max and is given in bl_brightness_max.
>> >> >>>
>> >> >>> Do we need to split this up into two props for sw/hw state? The 
>> >> >>> privacy screen
>> >> >>> stuff needed this, but you're pretty familiar with that. :)
>> >> >>
>> >> >> Luckily that won't be necessary, since the privacy-screen is a 
>> >> >> security
>> >> >> feature the firmware/embedded-controller may refuse our requests
>> >> >> (may temporarily lock-out changes) and/or may make changes without
>> >> >> us requesting them itself. Neither is really the case with the
>> >> >> brightness setting of displays.
>> >> >>
>> >>  bl_brightness_max: ro 0-int32_max property giving the actual 
>> >>  maximum
>> >>  of the display's brightness setting. This will report 0 when 
>> >>  brightness
>> >>  control is not available (yet).
>> >> >>>
>> >> >>> I don't think we actually need that one. Integer KMS props all 
>> >> >>> have a
>> >> >>> range which can be fetched via drmModeGetProperty. The max can be
>> >> >>> exposed via this range. Example with the existing alpha prop:
>> >> >>>
>> >> >>> "alpha": range [0, UINT16_MAX] = 65535
>> >> >>
>> >> >> Right, I already knew that, which is why I explicitly added a range
>> >> >> to the props already. The problem is that the range must be set
>> >> >> before registering the connector and when the backlight driver
>> >> >> only shows up (much) later during boot then we don't know the
>> >> >> range when registering the connector. I guess we could "patch-up"
>> >> >> the range later. But AFAIK that would be a bit of abuse of the
>> >> >> property API as the range is intended to never change, not
>> >> >> even after hotplug uevents. At least atm there is no infra
>> >> >> in the kernel to change the range later.
>> >> >>
>> >> >> Which is why I added an explicit bl_brightness_max property
>> >> >> of which the value gives the actual effective maximum of the
>> >> >> brightness.
>> >> 
>> >>  Uh ... I'm not a huge fan tbh. The thing is, if we allow hotplugging
>> >>  brightness control later on then we just perpetuate the nonsense we 
>> >>  have
>> >>  right now, forever.
>> >> 
>> >>  Imo we should support two kinds of drivers:
>> >> 
>> >>  - drivers which are non-crap, and make sure their backlight driver is
>> >>    loaded before they register the drm_device (or at least the
>> >>    drm_connector). For those we want the drm_connector->backlight 
>> >>  pointer
>> >>    to be static over the lifetime of the connector, and then we can 
>> >>  also
>> >>    set up the brightness range correctly.
>> >> 
>> >>  - funny drivers which implement the glorious fallback dance which
>> >>    libbacklight implements currently in userspace. Imo for these 
>> >>  drivers we
>> >>    should have a libbacklight_heuristics_backlight, which normalizes 
>> >>  or
>> >>    whatever, and is always there. And then internally handles the
>> >>    fallback mess to the "right" backlight driver.
>> >> 
>> >>  We might have some gaps on acpi systems to make sure the drm driver 
>> >>  can
>> >>  wait for the backlight driver to show up, but that's about it.
>> >> 
>> >>  Hotplugging random pieces later on is really not how drivers work 
>> >>  nowadays
>> >>  with deferred probe and component framework and all that.
>> >> 
>> >> >> I did consider using the range for this and updating it
>> >> >> on the fly I think nothing is 

Re: dim question: How to revert patches?

2022-04-27 Thread Daniel Vetter
On Thu, Apr 14, 2022 at 10:37:55PM +0200, Helge Deller wrote:
> Hello dri-devel & dim users,

Apologies for late reply, I'm way behind on stuff.

> I committed this patch to the drm-misc-next branch:
> 
> commit d6cd978f7e6b6f6895f8d0c4ce6e5d2c8e979afe
> video: fbdev: fbmem: fix pointer reference to null device field
> 
> then I noticed that it was fixed already in another branch which led to this 
> error:
> 
> Merging drm-misc/drm-misc-next... dim:
> dim: FAILURE: Could not merge drm-misc/drm-misc-next
> dim: See the section "Resolving Conflicts when Rebuilding drm-tip"
> dim: in the drm-tip.rst documentation for how to handle this situation.
> 
> I fixed it by reverting that patch above with this new commit in the 
> drm-misc-next branch:
> 
> commit cabfa2bbe617ddf0a0cc4d01f72b584dae4939ad (HEAD -> drm-misc-next, 
> drm-misc/for-linux-next, drm-misc/drm-misc-next)
> Author: Helge Deller 
> Revert "video: fbdev: fbmem: fix pointer reference to null device field"
> 
> My question (as "dim" newbie):
> Was that the right solution?

The patch wasn't really broken, so revert feels a bit silly. The hint was
to look at the documentation referenced by the error message - the issue
was only in rebuilding the integration tree:

https://drm.pages.freedesktop.org/maintainer-tools/drm-tip.html#resolving-conflicts-when-rebuilding-drm-tip

This should cover you even for really rare conflict situations.

> Is there a possibility to drop those two patches from the drm-misc-next 
> branch before it gets pushed upstream?

It's a shared tree, mistakes are forever. The only time we did a forced
push ever is when someone managed to push their local pile of hacks or
something, and we're catching those pretty well now with a server-side
test to make sure you're using dim to push.

It's also no big deal, and next time you get a conflict just resolve it
in drm-tip per the docs and it's all fine.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch


[PATCH 1/4] drm/format-helper: Implement drm_fb_swab() with per-line helpers

2022-04-27 Thread Thomas Zimmermann
Replace the inner loop of drm_fb_swab() with helper functions that
swap the bytes in each pixel. This will allow the outer loop to be
shared with other conversion helpers.
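
For context, a driver would typically call drm_fb_swab() from its damage or
flush path when copying into a shadow or device buffer. A minimal usage sketch
against the signature above (the driver-side function name is illustrative
only):

#include <drm/drm_format_helper.h>
#include <drm/drm_framebuffer.h>
#include <drm/drm_rect.h>

/* Illustrative only: copy the damaged clip into a shadow buffer while
 * swapping the bytes of each 16-bit or 32-bit pixel. Passing 0 as
 * dst_pitch lets the helper derive the pitch from the clip width.
 */
static void example_flush_swapped(void *shadow, const void *vmap,
				  const struct drm_framebuffer *fb,
				  const struct drm_rect *clip)
{
	drm_fb_swab(shadow, 0, vmap, fb, clip, false);
}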

Signed-off-by: Thomas Zimmermann 
---
 drivers/gpu/drm/drm_format_helper.c | 60 +
 1 file changed, 35 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/drm_format_helper.c 
b/drivers/gpu/drm/drm_format_helper.c
index 34b7ef443ad2..f70499344a04 100644
--- a/drivers/gpu/drm/drm_format_helper.c
+++ b/drivers/gpu/drm/drm_format_helper.c
@@ -100,6 +100,26 @@ void drm_fb_memcpy_toio(void __iomem *dst, unsigned int 
dst_pitch, const void *v
 }
 EXPORT_SYMBOL(drm_fb_memcpy_toio);
 
+static void drm_fb_swab16_line(void *dbuf, const void *sbuf, unsigned int 
pixels)
+{
+   u16 *dbuf16 = dbuf;
+   const u16 *sbuf16 = sbuf;
+   const u16 *send16 = sbuf16 + pixels;
+
+   while (sbuf16 < send16)
+   *dbuf16++ = swab16(*sbuf16++);
+}
+
+static void drm_fb_swab32_line(void *dbuf, const void *sbuf, unsigned int 
pixels)
+{
+   u32 *dbuf32 = dbuf;
+   const u32 *sbuf32 = sbuf;
+   const u32 *send32 = sbuf32 + pixels;
+
+   while (sbuf32 < send32)
+   *dbuf32++ = swab32(*sbuf32++);
+}
+
 /**
  * drm_fb_swab - Swap bytes into clip buffer
  * @dst: Destination buffer
@@ -120,12 +140,11 @@ void drm_fb_swab(void *dst, unsigned int dst_pitch, const 
void *src,
 bool cached)
 {
u8 cpp = fb->format->cpp[0];
-   size_t len = drm_rect_width(clip) * cpp;
-   const u16 *src16;
-   const u32 *src32;
-   u16 *dst16;
-   u32 *dst32;
-   unsigned int x, y;
+   unsigned long linepixels = drm_rect_width(clip);
+   size_t len = linepixels * cpp;
+   const void *sbuf;
+   void *dbuf;
+   unsigned int y;
void *buf = NULL;
 
if (WARN_ON_ONCE(cpp != 2 && cpp != 4))
@@ -133,31 +152,22 @@ void drm_fb_swab(void *dst, unsigned int dst_pitch, const 
void *src,
 
if (!dst_pitch)
dst_pitch = len;
+   src += clip_offset(clip, fb->pitches[0], cpp);
 
if (!cached)
buf = kmalloc(len, GFP_KERNEL);
 
-   src += clip_offset(clip, fb->pitches[0], cpp);
-
for (y = clip->y1; y < clip->y2; y++) {
-   if (buf) {
-   memcpy(buf, src, len);
-   src16 = buf;
-   src32 = buf;
-   } else {
-   src16 = src;
-   src32 = src;
-   }
-
-   dst16 = dst;
-   dst32 = dst;
+   if (buf)
+   sbuf = memcpy(buf, src, len);
+   else
+   sbuf = src;
+   dbuf = dst + clip->x1 * cpp;
 
-   for (x = clip->x1; x < clip->x2; x++) {
-   if (cpp == 4)
-   *dst32++ = swab32(*src32++);
-   else
-   *dst16++ = swab16(*src16++);
-   }
+   if (cpp == 4)
+   drm_fb_swab32_line(dbuf, sbuf, linepixels);
+   else
+   drm_fb_swab16_line(dbuf, sbuf, linepixels);
 
src += fb->pitches[0];
dst += dst_pitch;
-- 
2.36.0



[PATCH 4/4] drm/format-helper: Share implementation among conversion helpers

2022-04-27 Thread Thomas Zimmermann
Provide format-independent conversion helpers for system and I/O
memory. Implement most of the existing helpers on top of them. The source and
destination formats of each conversion are handled by a per-line
helper that is given to the generic implementation.
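
With the shared loop in place, each conversion helper essentially reduces to
picking its per-line function and destination pixel size. A sketch of the
pattern inside drm_format_helper.c (the wrapper name is illustrative, not the
final upstream code; it assumes the per-line helper uses the unified void
pointer prototype):

/* Sketch: convert XRGB8888 to RGB565 in system memory by handing the
 * per-line helper to the shared loop; dst_pixsize is 2 bytes for RGB565.
 */
static int example_xrgb8888_to_rgb565(void *dst, unsigned int dst_pitch,
				      const void *vaddr,
				      const struct drm_framebuffer *fb,
				      const struct drm_rect *clip)
{
	return drm_fb_xfrm(dst, dst_pitch, 2, vaddr, fb, clip, false,
			   drm_fb_xrgb8888_to_rgb565_line);
}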

Signed-off-by: Thomas Zimmermann 
---
 drivers/gpu/drm/drm_format_helper.c | 370 ++--
 1 file changed, 124 insertions(+), 246 deletions(-)

diff --git a/drivers/gpu/drm/drm_format_helper.c 
b/drivers/gpu/drm/drm_format_helper.c
index 21d0d282c6a1..6f8030ebb56d 100644
--- a/drivers/gpu/drm/drm_format_helper.c
+++ b/drivers/gpu/drm/drm_format_helper.c
@@ -40,6 +40,95 @@ unsigned int drm_fb_clip_offset(unsigned int pitch, const 
struct drm_format_info
 }
 EXPORT_SYMBOL(drm_fb_clip_offset);
 
+/* TODO: Make this function work with multi-plane formats. */
+static int drm_fb_xfrm(void *dst, unsigned long dst_pitch, unsigned long 
dst_pixsize,
+  const void *vaddr, const struct drm_framebuffer *fb,
+  const struct drm_rect *clip, bool vaddr_cached_hint,
+  void (*xfrm_line)(void *dbuf, const void *sbuf, unsigned 
int npixels))
+{
+   unsigned long linepixels = drm_rect_width(clip);
+   unsigned long lines = drm_rect_height(clip);
+   size_t sbuf_len = linepixels * fb->format->cpp[0];
+   void *stmp = NULL;
+   unsigned long i;
+   const void *sbuf;
+
+   /*
+* Some source buffers, such as CMA memory, use write-combine
+* caching, so reads are uncached. Speed up access by fetching
+* one line at a time.
+*/
+   if (!vaddr_cached_hint) {
+   stmp = kmalloc(sbuf_len, GFP_KERNEL);
+   if (!stmp)
+   return -ENOMEM;
+   }
+
+   if (!dst_pitch)
+   dst_pitch = drm_rect_width(clip) * dst_pixsize;
+   vaddr += clip_offset(clip, fb->pitches[0], fb->format->cpp[0]);
+
+   for (i = 0; i < lines; ++i) {
+   if (stmp)
+   sbuf = memcpy(stmp, vaddr, sbuf_len);
+   else
+   sbuf = vaddr;
+   xfrm_line(dst, sbuf, linepixels);
+   vaddr += fb->pitches[0];
+   dst += dst_pitch;
+   }
+
+   kfree(stmp);
+
+   return 0;
+}
+
+/* TODO: Make this function work with multi-plane formats. */
+static int drm_fb_xfrm_toio(void __iomem *dst, unsigned long dst_pitch, 
unsigned long dst_pixsize,
+   const void *vaddr, const struct drm_framebuffer *fb,
+   const struct drm_rect *clip, bool vaddr_cached_hint,
+   void (*xfrm_line)(void *dbuf, const void *sbuf, 
unsigned int npixels))
+{
+   unsigned long linepixels = drm_rect_width(clip);
+   unsigned long lines = drm_rect_height(clip);
+   size_t dbuf_len = linepixels * dst_pixsize;
+   size_t stmp_off = round_up(dbuf_len, ARCH_KMALLOC_MINALIGN); /* for 
sbuf alignment */
+   size_t sbuf_len = linepixels * fb->format->cpp[0];
+   void *stmp = NULL;
+   unsigned long i;
+   const void *sbuf;
+   void *dbuf;
+
+   if (vaddr_cached_hint) {
+   dbuf = kmalloc(dbuf_len, GFP_KERNEL);
+   } else {
+   dbuf = kmalloc(stmp_off + sbuf_len, GFP_KERNEL);
+   stmp = dbuf + stmp_off;
+   }
+   if (!dbuf)
+   return -ENOMEM;
+
+   if (!dst_pitch)
+   dst_pitch = linepixels * dst_pixsize;
+   vaddr += clip_offset(clip, fb->pitches[0], fb->format->cpp[0]);
+
+   for (i = 0; i < lines; ++i) {
+   if (stmp)
+   sbuf = memcpy(stmp, vaddr, sbuf_len);
+   else
+   sbuf = vaddr;
+   xfrm_line(dbuf, sbuf, linepixels);
+   memcpy_toio(dst, dbuf, dbuf_len);
+   vaddr += fb->pitches[0];
+   dst += dst_pitch;
+   }
+
+   kfree(dbuf);
+
+   return 0;
+}
+
+
 /**
  * drm_fb_memcpy - Copy clip buffer
  * @dst: Destination buffer
@@ -140,45 +229,23 @@ void drm_fb_swab(void *dst, unsigned int dst_pitch, const 
void *src,
 bool cached)
 {
u8 cpp = fb->format->cpp[0];
-   unsigned long linepixels = drm_rect_width(clip);
-   size_t len = linepixels * cpp;
-   const void *sbuf;
-   void *dbuf;
-   unsigned int y;
-   void *buf = NULL;
-
-   if (WARN_ON_ONCE(cpp != 2 && cpp != 4))
-   return;
-
-   if (!dst_pitch)
-   dst_pitch = len;
-   src += clip_offset(clip, fb->pitches[0], cpp);
-
-   if (!cached)
-   buf = kmalloc(len, GFP_KERNEL);
 
-   for (y = clip->y1; y < clip->y2; y++) {
-   if (buf)
-   sbuf = memcpy(buf, src, len);
-   else
-   sbuf = src;
-   dbuf = dst + clip->x1 * cpp;
-
-   if (cpp == 4)
-   drm_fb_swab32_line(dbuf, 

[PATCH 3/4] drm/format-helper: Unify the parameters of all per-line conversion helpers

2022-04-27 Thread Thomas Zimmermann
Give each per-line conversion helper pointers of type void and the
number of pixels in the line. Remove the unused swab parameters.
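
The unified prototype is what lets one generic loop drive every conversion
through a plain function pointer. Expressed as a type, the shape shared by all
per-line helpers after this patch is roughly the following (the typedef name is
illustrative; the code passes the function pointer directly):

/* Convert 'pixels' pixels from the source line 'sbuf' into the
 * destination line 'dbuf'; the source and destination formats are
 * fixed by the helper itself.
 */
typedef void (*drm_fb_xfrm_line_func)(void *dbuf, const void *sbuf,
				      unsigned int pixels);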

Signed-off-by: Thomas Zimmermann 
---
 drivers/gpu/drm/drm_format_helper.c | 87 +
 1 file changed, 50 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/drm_format_helper.c 
b/drivers/gpu/drm/drm_format_helper.c
index b7daa40fc856..21d0d282c6a1 100644
--- a/drivers/gpu/drm/drm_format_helper.c
+++ b/drivers/gpu/drm/drm_format_helper.c
@@ -177,16 +177,19 @@ void drm_fb_swab(void *dst, unsigned int dst_pitch, const 
void *src,
 }
 EXPORT_SYMBOL(drm_fb_swab);
 
-static void drm_fb_xrgb8888_to_rgb332_line(u8 *dbuf, const __le32 *sbuf, 
unsigned int pixels)
+static void drm_fb_xrgb8888_to_rgb332_line(void *dbuf, const void *sbuf, 
unsigned int pixels,
+  bool swab)
 {
+   u8 *dbuf8 = dbuf;
+   const __le32 *sbuf32 = sbuf;
unsigned int x;
u32 pix;
 
for (x = 0; x < pixels; x++) {
-   pix = le32_to_cpu(sbuf[x]);
-   dbuf[x] = ((pix & 0x00e00000) >> 16) |
- ((pix & 0x0000e000) >> 11) |
- ((pix & 0x000000c0) >> 6);
+   pix = le32_to_cpu(sbuf32[x]);
+   dbuf8[x] = ((pix & 0x00e00000) >> 16) |
+  ((pix & 0x0000e000) >> 11) |
+  ((pix & 0x000000c0) >> 6);
}
 }
 
@@ -219,7 +222,7 @@ void drm_fb_xrgb8888_to_rgb332(void *dst, unsigned int 
dst_pitch, const void *sr
src += clip_offset(clip, fb->pitches[0], sizeof(u32));
for (y = 0; y < drm_rect_height(clip); y++) {
memcpy(sbuf, src, src_len);
-   drm_fb_xrgb8888_to_rgb332_line(dst, sbuf, width);
+   drm_fb_xrgb8888_to_rgb332_line(dst, sbuf, width, false);
src += fb->pitches[0];
dst += dst_pitch;
}
@@ -228,31 +231,34 @@ void drm_fb_xrgb8888_to_rgb332(void *dst, unsigned int 
dst_pitch, const void *sr
 }
 EXPORT_SYMBOL(drm_fb_xrgb8888_to_rgb332);
 
-static void drm_fb_xrgb8888_to_rgb565_line(u16 *dbuf, const u32 *sbuf,
-  unsigned int pixels)
+static void drm_fb_xrgb8888_to_rgb565_line(void *dbuf, const void *sbuf, 
unsigned int pixels)
 {
+   u16 *dbuf16 = dbuf;
+   const u32 *sbuf32 = sbuf;
unsigned int x;
u16 val16;
 
for (x = 0; x < pixels; x++) {
-   val16 = ((sbuf[x] & 0x00F80000) >> 8) |
-   ((sbuf[x] & 0x0000FC00) >> 5) |
-   ((sbuf[x] & 0x000000F8) >> 3);
-   dbuf[x] = val16;
+   val16 = ((sbuf32[x] & 0x00F80000) >> 8) |
+   ((sbuf32[x] & 0x0000FC00) >> 5) |
+   ((sbuf32[x] & 0x000000F8) >> 3);
+   dbuf16[x] = val16;
}
 }
 
-static void drm_fb_xrgb8888_to_rgb565_swab_line(u16 *dbuf, const u32 *sbuf,
+static void drm_fb_xrgb8888_to_rgb565_swab_line(void *dbuf, const void *sbuf,
unsigned int pixels)
 {
+   u16 *dbuf16 = dbuf;
+   const u32 *sbuf32 = sbuf;
unsigned int x;
u16 val16;
 
for (x = 0; x < pixels; x++) {
-   val16 = ((sbuf[x] & 0x00F80000) >> 8) |
-   ((sbuf[x] & 0x0000FC00) >> 5) |
-   ((sbuf[x] & 0x000000F8) >> 3);
-   dbuf[x] = swab16(val16);
+   val16 = ((sbuf32[x] & 0x00F80000) >> 8) |
+   ((sbuf32[x] & 0x0000FC00) >> 5) |
+   ((sbuf32[x] & 0x000000F8) >> 3);
+   dbuf16[x] = swab16(val16);
}
 }
 
@@ -347,15 +353,16 @@ void drm_fb_xrgb8888_to_rgb565_toio(void __iomem *dst, 
unsigned int dst_pitch,
 }
 EXPORT_SYMBOL(drm_fb_xrgb_to_rgb565_toio);
 
-static void drm_fb_xrgb8888_to_rgb888_line(u8 *dbuf, const u32 *sbuf,
-  unsigned int pixels)
+static void drm_fb_xrgb8888_to_rgb888_line(void *dbuf, const void *sbuf, 
unsigned int pixels)
 {
+   u8 *dbuf8 = dbuf;
+   const u32 *sbuf32 = sbuf;
unsigned int x;
 
for (x = 0; x < pixels; x++) {
-   *dbuf++ = (sbuf[x] & 0x000000FF) >>  0;
-   *dbuf++ = (sbuf[x] & 0x0000FF00) >>  8;
-   *dbuf++ = (sbuf[x] & 0x00FF0000) >> 16;
+   *dbuf8++ = (sbuf32[x] & 0x000000FF) >>  0;
+   *dbuf8++ = (sbuf32[x] & 0x0000FF00) >>  8;
+   *dbuf8++ = (sbuf32[x] & 0x00FF0000) >> 16;
}
 }
 
@@ -521,17 +528,18 @@ static void drm_fb_rgb888_to_xrgb8888_toio(void __iomem 
*dst, unsigned int dst_p
kfree(dbuf);
 }
 
-static void drm_fb_xrgb8888_to_xrgb2101010_line(u32 *dbuf, const u32 *sbuf,
-   unsigned int pixels)
+static void drm_fb_xrgb8888_to_xrgb2101010_line(void *dbuf, const void *sbuf, 
unsigned int pixels)
 {
+   u32 *dbuf32 = dbuf;
+ 

[PATCH 0/4] drm/format-helper: Share common code among conversion helpers

2022-04-27 Thread Thomas Zimmermann
Move all format-specific handling into per-line conversion functions and
share the overall loop among conversion helpers. This is another step
towards composable format conversion. 

Thomas Zimmermann (4):
  drm/format-helper: Implement drm_fb_swab() with per-line helpers
  drm/format-helper: Remove optional byte-swap from line conversion
  drm/format-helper: Unify the parameters of all per-line conversion
helpers
  drm/format-helper: Share implementation among conversion helpers

 drivers/gpu/drm/drm_format_helper.c | 479 
 1 file changed, 198 insertions(+), 281 deletions(-)

-- 
2.36.0



[PATCH 2/4] drm/format-helper: Remove optional byte-swap from line conversion

2022-04-27 Thread Thomas Zimmermann
Implement per-pixel byte swapping in a separate conversion helper
for the single function that requires it. Select the correct helper
for each conversion.
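
The effect is that the byte-swap decision is made once per call instead of
once per pixel; callers pick the appropriate line helper up front, mirroring
the call sites in the diff below (a sketch, assuming the file-local helpers):

static void example_convert_line(u16 *dbuf, const u32 *sbuf,
				 unsigned int linepixels, bool swab)
{
	/* The swab decision is made once per line, keeping the per-pixel
	 * loop inside each helper branch-free.
	 */
	if (swab)
		drm_fb_xrgb8888_to_rgb565_swab_line(dbuf, sbuf, linepixels);
	else
		drm_fb_xrgb8888_to_rgb565_line(dbuf, sbuf, linepixels);
}

This also prepares for giving all line helpers an identical prototype later in
the series.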

Signed-off-by: Thomas Zimmermann 
---
 drivers/gpu/drm/drm_format_helper.c | 32 +
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/drm_format_helper.c 
b/drivers/gpu/drm/drm_format_helper.c
index f70499344a04..b7daa40fc856 100644
--- a/drivers/gpu/drm/drm_format_helper.c
+++ b/drivers/gpu/drm/drm_format_helper.c
@@ -229,8 +229,7 @@ void drm_fb_xrgb8888_to_rgb332(void *dst, unsigned int 
dst_pitch, const void *sr
 EXPORT_SYMBOL(drm_fb_xrgb8888_to_rgb332);
 
 static void drm_fb_xrgb8888_to_rgb565_line(u16 *dbuf, const u32 *sbuf,
-  unsigned int pixels,
-  bool swab)
+  unsigned int pixels)
 {
unsigned int x;
u16 val16;
@@ -239,10 +238,21 @@ static void drm_fb_xrgb8888_to_rgb565_line(u16 *dbuf, 
const u32 *sbuf,
val16 = ((sbuf[x] & 0x00F80000) >> 8) |
((sbuf[x] & 0x0000FC00) >> 5) |
((sbuf[x] & 0x000000F8) >> 3);
-   if (swab)
-   dbuf[x] = swab16(val16);
-   else
-   dbuf[x] = val16;
+   dbuf[x] = val16;
+   }
+}
+
+static void drm_fb_xrgb8888_to_rgb565_swab_line(u16 *dbuf, const u32 *sbuf,
+   unsigned int pixels)
+{
+   unsigned int x;
+   u16 val16;
+
+   for (x = 0; x < pixels; x++) {
+   val16 = ((sbuf[x] & 0x00F80000) >> 8) |
+   ((sbuf[x] & 0x0000FC00) >> 5) |
+   ((sbuf[x] & 0x000000F8) >> 3);
+   dbuf[x] = swab16(val16);
}
 }
 
@@ -282,7 +292,10 @@ void drm_fb_xrgb8888_to_rgb565(void *dst, unsigned int 
dst_pitch, const void *va
vaddr += clip_offset(clip, fb->pitches[0], sizeof(u32));
for (y = 0; y < lines; y++) {
memcpy(sbuf, vaddr, src_len);
-   drm_fb_xrgb8888_to_rgb565_line(dst, sbuf, linepixels, swab);
+   if (swab)
+   drm_fb_xrgb8888_to_rgb565_swab_line(dst, sbuf, 
linepixels);
+   else
+   drm_fb_xrgb8888_to_rgb565_line(dst, sbuf, linepixels);
vaddr += fb->pitches[0];
dst += dst_pitch;
}
@@ -321,7 +334,10 @@ void drm_fb_xrgb8888_to_rgb565_toio(void __iomem *dst, 
unsigned int dst_pitch,
 
vaddr += clip_offset(clip, fb->pitches[0], sizeof(u32));
for (y = 0; y < lines; y++) {
-   drm_fb_xrgb8888_to_rgb565_line(dbuf, vaddr, linepixels, swab);
+   if (swab)
+   drm_fb_xrgb8888_to_rgb565_swab_line(dbuf, vaddr, 
linepixels);
+   else
+   drm_fb_xrgb8888_to_rgb565_line(dbuf, vaddr, linepixels);
memcpy_toio(dst, dbuf, dst_len);
vaddr += fb->pitches[0];
dst += dst_pitch;
-- 
2.36.0



Re: [PATCH] drm/bochs: Explicitly include linux/module.h

2022-04-27 Thread Daniel Vetter
On Wed, Apr 13, 2022 at 06:12:59PM +0200, Michel Dänzer wrote:
> From: Michel Dänzer 
> 
> Instead of relying on it getting pulled in indirectly.
> 
> Signed-off-by: Michel Dänzer 

Reviewed-by: Daniel Vetter 

> ---
>  drivers/gpu/drm/tiny/bochs.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/gpu/drm/tiny/bochs.c b/drivers/gpu/drm/tiny/bochs.c
> index ed971c8bb446..4f8bf86633df 100644
> --- a/drivers/gpu/drm/tiny/bochs.c
> +++ b/drivers/gpu/drm/tiny/bochs.c
> @@ -1,5 +1,6 @@
>  // SPDX-License-Identifier: GPL-2.0-or-later
>  
> +#include 
>  #include 
>  
>  #include 
> -- 
> 2.35.1
> 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch


[RESEND PATCH v3] drm/cma-helper: Describe what a "contiguous chunk" actually means

2022-04-27 Thread Daniel Thompson
Since its inception in 2012 it has been understood that the DRM GEM CMA
helpers do not depend on CMA as the backend allocator. In fact the first
bug fix to ensure the cma-helpers work correctly with an IOMMU backend
appeared in 2014. However currently the documentation for
drm_gem_cma_create() talks about "a contiguous chunk of memory" without
making clear which address space it will be a contiguous part of.
Additionally the CMA introduction is actively misleading because it only
contemplates the CMA backend.

This matters because when the device accesses the bus through an IOMMU
(and don't use the CMA backend) then the allocated memory is contiguous
only in the IOVA space. This is a significant difference compared to the
CMA backend and the behaviour can be a surprise even to someone who does
a reasonable level of code browsing (but doesn't find all the relevant
function pointers ;-) ).

Improve the kernel doc comments accordingly.
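
As a usage illustration of the behaviour being documented (error handling
trimmed, function name hypothetical):

#include <linux/err.h>
#include <drm/drm_device.h>
#include <drm/drm_gem_cma_helper.h>

/* Allocate a buffer object that the device sees as one contiguous chunk.
 * Depending on the backend this is physically contiguous memory (CMA) or
 * IOMMU-mapped pages that are contiguous only in IOVA space.
 */
static struct drm_gem_cma_object *example_alloc(struct drm_device *drm,
						size_t size)
{
	struct drm_gem_cma_object *cma_obj;

	cma_obj = drm_gem_cma_create(drm, size);
	if (IS_ERR(cma_obj))
		return cma_obj;

	/* cma_obj->paddr is the bus/DMA address programmed into the device,
	 * whichever backend provided the memory.
	 */
	return cma_obj;
}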

Signed-off-by: Daniel Thompson 
---

Notes:
RESEND is unaltered but rebased on v5.18-rc3.

Changes in v3:
- Rebased on v5.17-rc2
- Minor improvements to wording.

Changes in v2:
- Oops. I did a final proof read and accidentally committed these
  changes as a seperate patch. This means that v1 contains only
  one tenth of the actual patch. This is fixed in v2. Many apologies
  for the noise!

 drivers/gpu/drm/drm_gem_cma_helper.c | 39 +---
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/drm_gem_cma_helper.c 
b/drivers/gpu/drm/drm_gem_cma_helper.c
index f36734c2c9e1..42abee9a0f4f 100644
--- a/drivers/gpu/drm/drm_gem_cma_helper.c
+++ b/drivers/gpu/drm/drm_gem_cma_helper.c
@@ -26,12 +26,22 @@
 /**
  * DOC: cma helpers
  *
- * The Contiguous Memory Allocator reserves a pool of memory at early boot
- * that is used to service requests for large blocks of contiguous memory.
+ * The DRM GEM/CMA helpers are a means to provide buffer objects that are
+ * presented to the device as a contiguous chunk of memory. This is useful
+ * for devices that do not support scatter-gather DMA (either directly or
+ * by using an intimately attached IOMMU).
  *
- * The DRM GEM/CMA helpers use this allocator as a means to provide buffer
- * objects that are physically contiguous in memory. This is useful for
- * display drivers that are unable to map scattered buffers via an IOMMU.
+ * Despite the name, the DRM GEM/CMA helpers are not hardwired to use the
+ * Contiguous Memory Allocator (CMA).
+ *
+ * For devices that access the memory bus through an (external) IOMMU,
+ * the buffer objects are allocated using a traditional page-based
+ * allocator and may be scattered through physical memory. However they
+ * are contiguous in the IOVA space so appear contiguous to devices using
+ * them.
+ *
+ * For all other devices, the helpers rely on CMA to provide buffer
+ * objects that are physically contiguous in memory.
  *
  * For GEM callback helpers in struct _gem_object functions, see likewise
  * named functions with an _object_ infix (e.g., drm_gem_cma_object_vmap() 
wraps
@@ -111,8 +121,14 @@ __drm_gem_cma_create(struct drm_device *drm, size_t size, 
bool private)
  * @drm: DRM device
  * @size: size of the object to allocate
  *
- * This function creates a CMA GEM object and allocates a contiguous chunk of
- * memory as backing store.
+ * This function creates a CMA GEM object and allocates memory as backing 
store.
+ * The allocated memory will occupy a contiguous chunk of bus address space.
+ *
+ * For devices that are directly connected to the memory bus, the allocated
+ * memory will be physically contiguous. For devices that access through an
+ * IOMMU, the allocated memory is not expected to be physically contiguous,
+ * because having contiguous IOVAs is sufficient to meet a device's DMA
+ * requirements.
  *
  * Returns:
  * A struct drm_gem_cma_object * on success or an ERR_PTR()-encoded negative
@@ -162,9 +178,12 @@ EXPORT_SYMBOL_GPL(drm_gem_cma_create);
  * @size: size of the object to allocate
  * @handle: return location for the GEM handle
  *
- * This function creates a CMA GEM object, allocating a physically contiguous
- * chunk of memory as backing store. The GEM object is then added to the list
- * of object associated with the given file and a handle to it is returned.
+ * This function creates a CMA GEM object, allocating a chunk of memory as
+ * backing store. The GEM object is then added to the list of object associated
+ * with the given file and a handle to it is returned.
+ *
+ * The allocated memory will occupy a contiguous chunk of bus address space.
+ * See drm_gem_cma_create() for more details.
  *
  * Returns:
  * A struct drm_gem_cma_object * on success or an ERR_PTR()-encoded negative

base-commit: b2d229d4ddb17db541098b83524d901257e93845
--
2.35.1



Re: [RFC] drm/kms: control display brightness through drm_connector properties

2022-04-27 Thread Daniel Vetter
On Thu, Apr 14, 2022 at 01:24:30PM +0300, Jani Nikula wrote:
> On Mon, 11 Apr 2022, Alex Deucher  wrote:
> > On Mon, Apr 11, 2022 at 6:18 AM Hans de Goede  wrote:
> >>
> >> Hi,
> >>
> >> On 4/8/22 17:11, Alex Deucher wrote:
> >> > On Fri, Apr 8, 2022 at 10:56 AM Hans de Goede  
> >> > wrote:
> >> >>
> >> >> Hi,
> >> >>
> >> >> On 4/8/22 16:08, Alex Deucher wrote:
> >> >>> On Fri, Apr 8, 2022 at 4:07 AM Daniel Vetter  wrote:
> >> 
> >>  On Thu, Apr 07, 2022 at 05:05:52PM -0400, Alex Deucher wrote:
> >> > On Thu, Apr 7, 2022 at 1:43 PM Hans de Goede  
> >> > wrote:
> >> >>
> >> >> Hi Simon,
> >> >>
> >> >> On 4/7/22 18:51, Simon Ser wrote:
> >> >>> Very nice plan! Big +1 for the overall approach.
> >> >>
> >> >> Thanks.
> >> >>
> >> >>> On Thursday, April 7th, 2022 at 17:38, Hans de Goede 
> >> >>>  wrote:
> >> >>>
> >>  The drm_connector brightness properties
> >>  ===
> >> 
> >>  bl_brightness: rw 0-int32_max property controlling the brightness 
> >>  setting
> >>  of the connected display. The actual maximum of this will be less 
> >>  than
> >>  int32_max and is given in bl_brightness_max.
> >> >>>
> >> >>> Do we need to split this up into two props for sw/hw state? The 
> >> >>> privacy screen
> >> >>> stuff needed this, but you're pretty familiar with that. :)
> >> >>
> >> >> Luckily that won't be necessary, since the privacy-screen is a 
> >> >> security
> >> >> feature the firmware/embedded-controller may refuse our requests
> >> >> (may temporarily lock-out changes) and/or may make changes without
> >> >> us requesting them itself. Neither is really the case with the
> >> >> brightness setting of displays.
> >> >>
> >>  bl_brightness_max: ro 0-int32_max property giving the actual 
> >>  maximum
> >>  of the display's brightness setting. This will report 0 when 
> >>  brightness
> >>  control is not available (yet).
> >> >>>
> >> >>> I don't think we actually need that one. Integer KMS props all 
> >> >>> have a
> >> >>> range which can be fetched via drmModeGetProperty. The max can be
> >> >>> exposed via this range. Example with the existing alpha prop:
> >> >>>
> >> >>> "alpha": range [0, UINT16_MAX] = 65535
> >> >>
> >> >> Right, I already knew that, which is why I explicitly added a range
> >> >> to the props already. The problem is that the range must be set
> >> >> before registering the connector and when the backlight driver
> >> >> only shows up (much) later during boot then we don't know the
> >> >> range when registering the connector. I guess we could "patch-up"
> >> >> the range later. But AFAIK that would be a bit of abuse of the
> >> >> property API as the range is intended to never change, not
> >> >> even after hotplug uevents. At least atm there is no infra
> >> >> in the kernel to change the range later.
> >> >>
> >> >> Which is why I added an explicit bl_brightness_max property
> >> >> of which the value gives the actual effective maximum of the
> >> >> brightness.
> >> 
> >>  Uh ... I'm not a huge fan tbh. The thing is, if we allow hotplugging
> >>  brightness control later on then we just perpetuate the nonsense we 
> >>  have
> >>  right now, forever.
> >> 
> >>  Imo we should support two kinds of drivers:
> >> 
> >>  - drivers which are non-crap, and make sure their backlight driver is
> >>    loaded before they register the drm_device (or at least the
> >>    drm_connector). For those we want the drm_connector->backlight 
> >>  pointer
> >>    to be static over the lifetime of the connector, and then we can 
> >>  also
> >>    set up the brightness range correctly.
> >> 
> >>  - funny drivers which implement the glorious fallback dance which
> >>    libbacklight implements currently in userspace. Imo for these 
> >>  drivers we
> >>    should have a libbacklight_heuristics_backlight, which normalizes or
> >>    whatever, and is always there. And then internally handles the
> >>    fallback mess to the "right" backlight driver.
> >> 
> >>  We might have some gaps on acpi systems to make sure the drm driver 
> >>  can
> >>  wait for the backlight driver to show up, but that's about it.
> >> 
> >>  Hotplugging random pieces later on is really not how drivers work 
> >>  nowadays
> >>  with deferred probe and component framework and all that.
> >> 
> >> >> I did consider using the range for this and updating it
> >> >> on the fly I think nothing is really preventing us from
> >> >> doing so, but it very much feels like abusing the generic
> >> >> properties API.
> >> >>
> >>  
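
The range limitation discussed above follows from how KMS integer properties
work: the minimum and maximum of a range property are fixed when the property
is created, before the connector is registered. A minimal sketch of what
creating the proposed properties could look like (hypothetical driver code,
property names as proposed in this RFC):

#include <linux/errno.h>
#include <linux/limits.h>
#include <drm/drm_connector.h>
#include <drm/drm_device.h>
#include <drm/drm_mode_object.h>
#include <drm/drm_property.h>

static int example_attach_brightness_props(struct drm_device *dev,
					   struct drm_connector *connector)
{
	struct drm_property *bl, *bl_max;

	/* The [min, max] range is fixed here and cannot be changed later. */
	bl = drm_property_create_range(dev, 0, "bl_brightness", 0, S32_MAX);
	bl_max = drm_property_create_range(dev, DRM_MODE_PROP_IMMUTABLE,
					   "bl_brightness_max", 0, S32_MAX);
	if (!bl || !bl_max)
		return -ENOMEM;

	drm_object_attach_property(&connector->base, bl, 0);
	drm_object_attach_property(&connector->base, bl_max, 0);

	return 0;
}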

Re: [RFC v2 1/2] drm/doc/rfc: VM_BIND feature design document

2022-04-27 Thread Daniel Vetter
On Wed, Apr 20, 2022 at 03:50:00PM -0700, Niranjana Vishwanathapura wrote:
> On Thu, Mar 31, 2022 at 01:37:08PM +0200, Daniel Vetter wrote:
> > One thing I've forgotten, since it's only hinted at here: If/when we
> > switch tlb flushing from the current dumb implementation
> > we now have in i915 in upstream to one with batching using dma_fence,
> > then I think that should be something which is done with a small
> > helper library of shared code too. The batching is somewhat tricky,
> > and you need to make sure you put the fence into the right
> > dma_resv_usage slot, and the trick of replacing the vm fence with a
> > tlb flush fence is also a good reason to share the code so we only
> > have it once.
> > 
> > Christian's recent work also has some prep work for this already with
> > the fence replacing trick.
> 
> Sure, but this optimization is not required for initial vm_bind support
> to land right? We can look at it soon after that. Is that ok?
> I have made a reference to this TLB flush batching work in the rst file.

Yeah for now we can just rely on the tlb flush we do on vma unbinding,
which also means there's no need for any separate tlb flushing in vm_bind
related code. This was just a thought I dropped on here to make sure we
have a complete picture.
-Daniel


> 
> Niranjana
> 
> > -Daniel
> > 
> > On Thu, 31 Mar 2022 at 10:28, Daniel Vetter  wrote:
> > > Adding a pile of people who've expressed interest in vm_bind for their
> > > drivers.
> > > 
> > > Also note to the intel folks: This is largely written with me having my
> > > subsystem co-maintainer hat on, i.e. what I think is the right thing to do
> > > here for the subsystem at large. There is substantial rework involved
> > > here, but it's not any different from i915 adopting ttm or i915 adopting
> > > drm/sched, and I do think this stuff needs to happen in one form or
> > > another.
> > > 
> > > On Mon, Mar 07, 2022 at 12:31:45PM -0800, Niranjana Vishwanathapura wrote:
> > > > VM_BIND design document with description of intended use cases.
> > > >
> > > > Signed-off-by: Niranjana Vishwanathapura 
> > > > 
> > > > ---
> > > >  Documentation/gpu/rfc/i915_vm_bind.rst | 210 +
> > > >  Documentation/gpu/rfc/index.rst|   4 +
> > > >  2 files changed, 214 insertions(+)
> > > >  create mode 100644 Documentation/gpu/rfc/i915_vm_bind.rst
> > > >
> > > > diff --git a/Documentation/gpu/rfc/i915_vm_bind.rst 
> > > > b/Documentation/gpu/rfc/i915_vm_bind.rst
> > > > new file mode 100644
> > > > index ..cdc6bb25b942
> > > > --- /dev/null
> > > > +++ b/Documentation/gpu/rfc/i915_vm_bind.rst
> > > > @@ -0,0 +1,210 @@
> > > > +==
> > > > +I915 VM_BIND feature design and use cases
> > > > +==
> > > > +
> > > > +VM_BIND feature
> > > > +
> > > > +DRM_I915_GEM_VM_BIND/UNBIND ioctls allows UMD to bind/unbind GEM buffer
> > > > +objects (BOs) or sections of a BO at specified GPU virtual addresses 
> > > > on
> > > > +a specified address space (VM).
> > > > +
> > > > +These mappings (also referred to as persistent mappings) will be 
> > > > persistent
> > > > +across multiple GPU submissions (execbuff) issued by the UMD, without 
> > > > user
> > > > +having to provide a list of all required mappings during each 
> > > > submission
> > > > +(as required by older execbuff mode).
> > > > +
> > > > +VM_BIND ioctl defers binding the mappings until next execbuff 
> > > > submission
> > > > +where it will be required, or immediately if I915_GEM_VM_BIND_IMMEDIATE
> > > > +flag is set (useful if mapping is required for an active context).
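
As a rough illustration of the semantics described in the quoted paragraph
above (the struct and field names below are hypothetical, not the proposed
i915 uAPI):

/* Hypothetical sketch of a bind request: map a section of a GEM buffer
 * object at a fixed GPU virtual address within a given address space.
 */
struct example_vm_bind {
	__u32 vm_id;	/* target address space (VM) */
	__u32 handle;	/* GEM BO handle */
	__u64 start;	/* GPU virtual address of the mapping */
	__u64 offset;	/* offset into the BO */
	__u64 length;	/* length of the mapping */
	__u64 flags;	/* e.g. bind immediately vs. defer to next execbuff */
};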
> > > 
> > > So this is a screw-up I've done, and for upstream I think we need to fix
> > > it: Implicit sync is bad, and it's also still a bad idea for vm_bind, and
> > > I was wrong suggesting we should do this a few years back when we kicked
> > > this off internally :-(
> > > 
> > > What I think we need is just always VM_BIND_IMMEDIATE mode, and then a few
> > > things on top:
> > > - in and out fences, like with execbuf, to allow userspace to sync with
> > >   execbuf as needed
> > > - for compute-mode context this means userspace memory fences
> > > - for legacy context this means a timeline syncobj in drm_syncobj
> > > 
> > > No sync_file or anything else like this at all. This means a bunch of
> > > work, but also it'll have benefits because it means we should be able to
> > > use exactly the same code paths and logic for both compute and for legacy
> > > context, because drm_syncobj supports future fence semantics.
> > > 
> > > Also on the implementation side we still need to install dma_fence to the
> > > various dma_resv, and for this we need the new dma_resv_usage series from
> > > Christian König first. vm_bind fences can then use the USAGE_BOOKKEEPING
> > > flag to make sure they never result in an oversync issue with execbuf. I
> > > don't think trying to land vm_bind without that 

[PATCH] video: hyperv_fb: Allow resolutions with size > 64 MB for Gen1

2022-04-27 Thread Saurabh Sengar
Fix a bug where Gen1 VMs do not allow resolutions that require more than
64 MB of framebuffer memory (e.g. 7680x4320). An unnecessary PCI check
limits Gen1 VRAM to the legacy PCI BAR size (64 MB), so any mode needing
more than that fails. The MMIO region backing this memory should not be
limited by the PCI BAR size.

Signed-off-by: Saurabh Sengar 
---
 drivers/video/fbdev/hyperv_fb.c | 19 +--
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c
index c8e0ea2..58c304a 100644
--- a/drivers/video/fbdev/hyperv_fb.c
+++ b/drivers/video/fbdev/hyperv_fb.c
@@ -1009,7 +1009,6 @@ static int hvfb_getmem(struct hv_device *hdev, struct 
fb_info *info)
struct pci_dev *pdev  = NULL;
void __iomem *fb_virt;
int gen2vm = efi_enabled(EFI_BOOT);
-   resource_size_t pot_start, pot_end;
phys_addr_t paddr;
int ret;
 
@@ -1060,23 +1059,7 @@ static int hvfb_getmem(struct hv_device *hdev, struct 
fb_info *info)
dio_fb_size =
screen_width * screen_height * screen_depth / 8;
 
-   if (gen2vm) {
-   pot_start = 0;
-   pot_end = -1;
-   } else {
-   if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) ||
-   pci_resource_len(pdev, 0) < screen_fb_size) {
-   pr_err("Resource not available or (0x%lx < 0x%lx)\n",
-  (unsigned long) pci_resource_len(pdev, 0),
-  (unsigned long) screen_fb_size);
-   goto err1;
-   }
-
-   pot_end = pci_resource_end(pdev, 0);
-   pot_start = pot_end - screen_fb_size + 1;
-   }
-
-   ret = vmbus_allocate_mmio(>mem, hdev, pot_start, pot_end,
+   ret = vmbus_allocate_mmio(>mem, hdev, 0, -1,
  screen_fb_size, 0x100000, true);
if (ret != 0) {
pr_err("Unable to allocate framebuffer memory\n");
-- 
1.8.3.1



[PATCH v2 3/4] soc: visconti: Add Toshiba Visconti AFFINE image processing accelerator

2022-04-27 Thread Yuji Ishikawa
Adds support for the AFFINE image processing accelerator on Toshiba Visconti ARM 
SoCs.
This accelerator supports affine transform, lens undistortion and LUT transform.

Signed-off-by: Yuji Ishikawa 
Reviewed-by: Nobuhiro Iwamatsu 
---
v1 -> v2:
  - apply checkpatch.pl --strict
  - renamed identifiers; hwd_AFFINE_ to hwd_affine_
---
 drivers/soc/visconti/Kconfig |   6 +
 drivers/soc/visconti/Makefile|   2 +
 drivers/soc/visconti/affine/Makefile |   6 +
 drivers/soc/visconti/affine/affine.c | 451 +++
 drivers/soc/visconti/affine/hwd_affine.c | 206 +
 drivers/soc/visconti/affine/hwd_affine.h |  83 
 drivers/soc/visconti/affine/hwd_affine_reg.h |  45 ++
 drivers/soc/visconti/uapi/affine.h   |  87 
 8 files changed, 886 insertions(+)
 create mode 100644 drivers/soc/visconti/affine/Makefile
 create mode 100644 drivers/soc/visconti/affine/affine.c
 create mode 100644 drivers/soc/visconti/affine/hwd_affine.c
 create mode 100644 drivers/soc/visconti/affine/hwd_affine.h
 create mode 100644 drivers/soc/visconti/affine/hwd_affine_reg.h
 create mode 100644 drivers/soc/visconti/uapi/affine.h

diff --git a/drivers/soc/visconti/Kconfig b/drivers/soc/visconti/Kconfig
index 8b1378917..01583d407 100644
--- a/drivers/soc/visconti/Kconfig
+++ b/drivers/soc/visconti/Kconfig
@@ -1 +1,7 @@
+if ARCH_VISCONTI
+
+config VISCONTI_AFFINE
+bool "Visconti Affine driver"
+
+endif
 
diff --git a/drivers/soc/visconti/Makefile b/drivers/soc/visconti/Makefile
index 8d710da08..b25a726c3 100644
--- a/drivers/soc/visconti/Makefile
+++ b/drivers/soc/visconti/Makefile
@@ -4,3 +4,5 @@
 #
 
 obj-y += ipa_common.o
+
+obj-$(CONFIG_VISCONTI_AFFINE) += affine/
diff --git a/drivers/soc/visconti/affine/Makefile 
b/drivers/soc/visconti/affine/Makefile
new file mode 100644
index 0..82f83b2d6
--- /dev/null
+++ b/drivers/soc/visconti/affine/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the Visconti AFFINE driver
+#
+
+obj-y += affine.o hwd_affine.o
diff --git a/drivers/soc/visconti/affine/affine.c 
b/drivers/soc/visconti/affine/affine.c
new file mode 100644
index 0..eea045dcf
--- /dev/null
+++ b/drivers/soc/visconti/affine/affine.c
@@ -0,0 +1,451 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
+/* Toshiba Visconti Affine Accelerator Support
+ *
+ * (C) Copyright 2022 TOSHIBA CORPORATION
+ * (C) Copyright 2022 Toshiba Electronic Devices & Storage Corporation
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "hwd_affine.h"
+#include "../ipa_common.h"
+#include "../uapi/affine.h"
+
+struct affine_priv {
+   struct device *dev;
+   struct miscdevice miscdev;
+   struct mutex lock;
+   void __iomem *regs;
+   int irq;
+   wait_queue_head_t waitq;
+   enum drv_ipa_state status;
+   unsigned int hwd_event;
+   unsigned int poll_event;
+   int id;
+   char name[16];
+   bool dma_coherent;
+   struct hwd_affine_status hwd_status;
+
+   struct dma_buf_attachment *dba[DRV_AFFINE_BUFFER_INDEX_MAX];
+   struct sg_table *sgt[DRV_AFFINE_BUFFER_INDEX_MAX];
+   enum dma_data_direction dma_dir[DRV_AFFINE_BUFFER_INDEX_MAX];
+   unsigned int dma_count;
+
+   dma_addr_t buffer_iova[DRV_AFFINE_BUFFER_INDEX_MAX];
+};
+
+static u32 affine_ipa_addr_to_iova(struct affine_priv *priv, struct 
drv_ipa_addr addr)
+{
+   u32 iova = 0;
+
+   if (addr.buffer_index < priv->dma_count &&
+   addr.offset < priv->dba[addr.buffer_index]->dmabuf->size)
+   iova = priv->buffer_iova[addr.buffer_index] + addr.offset;
+   return iova;
+}
+
+static int affine_attach_dma_buf(struct affine_priv *priv, unsigned int 
buffer_index,
+struct drv_ipa_buffer_info *buffer_info)
+{
+   int ret = 0;
+   dma_addr_t addr;
+
+   if (buffer_index >= DRV_AFFINE_BUFFER_INDEX_MAX) {
+   dev_err(priv->dev, "Buffer index invalid: index=%d\n", 
buffer_index);
+   return -EINVAL;
+   }
+
+   switch (buffer_info[buffer_index].direction) {
+   case DRV_IPA_DIR_NONE:
+   priv->dma_dir[priv->dma_count] = DMA_NONE;
+   break;
+   case DRV_IPA_DIR_TO_DEVICE:
+   priv->dma_dir[priv->dma_count] = DMA_TO_DEVICE;
+   break;
+   case DRV_IPA_DIR_FROM_DEVICE:
+   priv->dma_dir[priv->dma_count] = DMA_FROM_DEVICE;
+   break;
+   case DRV_IPA_DIR_BIDIRECTION:
+   priv->dma_dir[priv->dma_count] = DMA_BIDIRECTIONAL;
+   break;
+   default:
+   dev_err(priv->dev, "DMA direction invalid: index=%d dir=%d\n", 
buffer_index,
+   buffer_info[buffer_index].direction);
+   return -EINVAL;
+   }
+
+   if (!buffer_info[buffer_index].coherent) {
+  

  1   2   >