Re: [RFC 1/7] drm/amdgpu: UAPI for user queue management

2022-12-24 Thread Bas Nieuwenhuizen
On Fri, Dec 23, 2022 at 8:37 PM Shashank Sharma  wrote:
>
> From: Alex Deucher 
>
> This patch introduces a new UAPI/IOCTL for usermode graphics
> queue. The userspace app will fill this structure and request
> the graphics driver to add a graphics work queue for it. The
> output of this UAPI is a queue id.
>
> This UAPI maps the queue into the GPU, so the graphics app can start
> submitting work to the queue as soon as the call returns.
>
> Cc: Alex Deucher 
> Cc: Christian Koenig 
> Signed-off-by: Alex Deucher 
> Signed-off-by: Shashank Sharma 
> ---
>  include/uapi/drm/amdgpu_drm.h | 52 +++
>  1 file changed, 52 insertions(+)
>
> diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
> index 0d93ec132ebb..a3d0dd6f62c5 100644
> --- a/include/uapi/drm/amdgpu_drm.h
> +++ b/include/uapi/drm/amdgpu_drm.h
> @@ -54,6 +54,7 @@ extern "C" {
>  #define DRM_AMDGPU_VM  0x13
>  #define DRM_AMDGPU_FENCE_TO_HANDLE 0x14
>  #define DRM_AMDGPU_SCHED   0x15
> +#define DRM_AMDGPU_USERQ   0x16
>
>  #define DRM_IOCTL_AMDGPU_GEM_CREATE	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create)
>  #define DRM_IOCTL_AMDGPU_GEM_MMAP	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap)
> @@ -71,6 +72,7 @@ extern "C" {
>  #define DRM_IOCTL_AMDGPU_VM	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_VM, union drm_amdgpu_vm)
>  #define DRM_IOCTL_AMDGPU_FENCE_TO_HANDLE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_FENCE_TO_HANDLE, union drm_amdgpu_fence_to_handle)
>  #define DRM_IOCTL_AMDGPU_SCHED	DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_SCHED, union drm_amdgpu_sched)
> +#define DRM_IOCTL_AMDGPU_USERQ	DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ, union drm_amdgpu_userq)
>
>  /**
>   * DOC: memory domains
> @@ -288,6 +290,56 @@ union drm_amdgpu_ctx {
> union drm_amdgpu_ctx_out out;
>  };
>
> +/* user queue IOCTL */
> +#define AMDGPU_USERQ_OP_CREATE 1
> +#define AMDGPU_USERQ_OP_FREE   2
> +
> +#define AMDGPU_USERQ_MQD_FLAGS_SECURE  (1 << 0)
> +#define AMDGPU_USERQ_MQD_FLAGS_AQL (1 << 1)

Can we document what AQL means here?
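For reference: AQL here is presumably the HSA "Architected Queuing
Language" packet format used by compute queues (as opposed to PM4 for
graphics), so a comment along these lines could accompany the flag:

    /* Queue processes AQL (HSA Architected Queuing Language)
     * packets instead of PM4 */
    #define AMDGPU_USERQ_MQD_FLAGS_AQL (1 << 1)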


> +
> +struct drm_amdgpu_userq_mqd {
> +   /** Flags: AMDGPU_USERQ_MQD_FLAGS_* */
> +   __u32   flags;
> +   /** IP type: AMDGPU_HW_IP_* */
> +   __u32   ip_type;
> +   /** GEM object handle */
> +   __u32   doorbell_handle;
> +   /** Doorbell offset in dwords */
> +   __u32   doorbell_offset;

What are the doorbell handle/offset for? I don't see any of them used
in the rest of the series (we only check the handle isn't 0, which
isn't enough validation for a GEM handle to consider it valid), and
the kernel seems to allocate some kind of doorbell index in patch 4.
Does userspace need to know about that one? (similarly use_doorbell in
that patch seems like it is never explicitly written to)
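For reference, full validation would look closer to the following
sketch (assuming the handler has access to the drm_file that owns the
handle; "filp" and "mqd" are placeholders for whatever is actually in
scope in the queue code):

    struct drm_gem_object *obj;

    /* Resolve the doorbell handle instead of just checking != 0;
     * drm_gem_object_lookup() returns NULL for handles that were
     * never allocated or were already freed, and takes a reference
     * that has to be dropped again. */
    obj = drm_gem_object_lookup(filp, mqd->doorbell_handle);
    if (!obj)
            return -ENOENT;

    /* ... check size/placement of obj as a doorbell BO here ... */

    drm_gem_object_put(obj);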

The other questions I have are about how this interacts with memory
management. Does this have access to all BOs allocated with
AMDGPU_GEM_CREATE_VM_ALWAYS_VALID? What about imported BOs? How does
this interact with VA unmap/map operations? (AFAICT we have no way to
tell if pagetable modifying operations are complete from userspace for
now). What happens if we need to spill BOs from VRAM due to
(cross-process) memory pressure?

> +   /** GPU virtual address of the queue */
> +   __u64   queue_va;
> +   /** Size of the queue in bytes */
> +   __u64   queue_size;
> +   /** GPU virtual address of the rptr */
> +   __u64   rptr_va;
> +   /** GPU virtual address of the wptr */
> +   __u64   wptr_va;
> +};
> +
> +struct drm_amdgpu_userq_in {
> +   /** AMDGPU_USERQ_OP_* */
> +   __u32   op;
> +   /** Flags */
> +   __u32   flags;
> +   /** Context handle to associate the queue with */
> +   __u32   ctx_id;
> +   __u32   pad;
> +   /** Queue descriptor */
> +   struct drm_amdgpu_userq_mqd mqd;
> +};
> +
> +struct drm_amdgpu_userq_out {
> +   /** Queue handle */
> +   __u32   q_id;
> +   /** Flags */
> +   __u32   flags;
> +};
> +
> +union drm_amdgpu_userq {
> +   struct drm_amdgpu_userq_in in;
> +   struct drm_amdgpu_userq_out out;
> +};
> +
>  /* vm ioctl */
>  #define AMDGPU_VM_OP_RESERVE_VMID  1
>  #define AMDGPU_VM_OP_UNRESERVE_VMID2
> --
> 2.34.1
>
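For context, a rough sketch of how userspace would be expected to drive
this ioctl (using libdrm's drmCommandWriteRead(); the context, doorbell
BO and queue/rptr/wptr virtual addresses are assumed to have been set
up beforehand, and all variable names here are placeholders):

    union drm_amdgpu_userq args = {0};
    int r;

    args.in.op = AMDGPU_USERQ_OP_CREATE;
    args.in.ctx_id = ctx_id;            /* from DRM_AMDGPU_CTX */
    args.in.mqd.ip_type = AMDGPU_HW_IP_GFX;
    args.in.mqd.doorbell_handle = doorbell_handle;
    args.in.mqd.doorbell_offset = doorbell_offset;
    args.in.mqd.queue_va = queue_va;
    args.in.mqd.queue_size = queue_size;
    args.in.mqd.rptr_va = rptr_va;
    args.in.mqd.wptr_va = wptr_va;

    r = drmCommandWriteRead(fd, DRM_AMDGPU_USERQ, &args, sizeof(args));
    if (r)
            return r;
    /* args.out.q_id now holds the queue handle; note that reading
     * it back implies the ioctl should be DRM_IOWR rather than the
     * DRM_IOW used above. */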


Re: [RFC 2/7] drm/amdgpu: Add usermode queue for gfx work

2022-12-24 Thread Oded Gabbay
On Fri, Dec 23, 2022 at 9:37 PM Shashank Sharma  wrote:
>
> This patch adds skeleton code for usermode queue creation. It
> contains:
> - A new structure to keep all the user queue data in one place.
> - An IOCTL function to create/free a usermode queue.
> - A function to generate a unique index for the queue.
> - A global pointer in struct amdgpu_device
>
> Cc: Alex Deucher 
> Cc: Christian Koenig 
> Signed-off-by: Shashank Sharma 
> ---
>  drivers/gpu/drm/amd/amdgpu/Makefile   |   2 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |   6 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h   |   1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c | 187 ++
>  .../drm/amd/include/amdgpu_usermode_queue.h   |  50 +
>  5 files changed, 246 insertions(+)
>  create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
>  create mode 100644 drivers/gpu/drm/amd/include/amdgpu_usermode_queue.h
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
> index 6ad39cf71bdd..e2a34ee57bfb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -209,6 +209,8 @@ amdgpu-y += \
>  # add amdkfd interfaces
>  amdgpu-y += amdgpu_amdkfd.o
>
> +# add usermode queue
> +amdgpu-y += amdgpu_userqueue.o
>
>  ifneq ($(CONFIG_HSA_AMD),)
>  AMDKFD_PATH := ../amdkfd
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 8639a4f9c6e8..4b566fcfca18 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -749,6 +749,11 @@ struct amdgpu_mqd {
> struct amdgpu_mqd_prop *p);
>  };
>
> +struct amdgpu_userq_globals {
> +   struct ida ida;
> +   struct mutex userq_mutex;
> +};
> +
>  #define AMDGPU_RESET_MAGIC_NUM 64
>  #define AMDGPU_MAX_DF_PERFMONS 4
>  #define AMDGPU_PRODUCT_NAME_LEN 64
> @@ -955,6 +960,7 @@ struct amdgpu_device {
> boolenable_mes_kiq;
> struct amdgpu_mes   mes;
> struct amdgpu_mqd   mqds[AMDGPU_HW_IP_NUM];
> +   struct amdgpu_userq_globals userq;
>
> /* df */
> struct amdgpu_dfdf;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> index 0fa0e56daf67..f7413859b14f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> @@ -57,6 +57,7 @@ struct amdgpu_ctx {
> unsigned long   ras_counter_ce;
> unsigned long   ras_counter_ue;
> uint32_tstable_pstate;
> +   struct amdgpu_usermode_queue*userq;
>  };
>
>  struct amdgpu_ctx_mgr {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
> new file mode 100644
> index ..3b6e8f75495c
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
> @@ -0,0 +1,187 @@
> +/*
> + * Copyright 2022 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#include "amdgpu.h"
> +#include "amdgpu_vm.h"
> +#include "amdgpu_mes.h"
> +#include "amdgpu_usermode_queue.h"
> +#include "soc15_common.h"
> +
> +#define CHECK_ACCESS(a) (access_ok((const void __user *)a, sizeof(__u64)))
> +
> +static int
> +amdgpu_userqueue_index(struct amdgpu_device *adev)
> +{
> +int index;
> +struct amdgpu_userq_globals *uqg = &adev->userq;
> +
> +index = ida_simple_get(&uqg->ida, 2, AMDGPU_MAX_USERQ, GFP_KERNEL);
> +return index;
> +}
> +
> +static void
> +amdgpu_userqueue_remove_index(struct amdgpu_device *adev, struct amdgpu_usermode_queue *queue)
> +{
> +struct amdgpu_userq_globals *uqg = &adev->userq;
> +
> +ida_simple_remove(&uqg->ida, queue->queue_id);
> +}
> +
> +static int
> +amdgpu_userque
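An aside on the helpers quoted above: ida_simple_get()/
ida_simple_remove() are deprecated wrappers these days. A sketch of the
same logic with the current IDA API (note that ida_alloc_range() takes
an inclusive maximum, while ida_simple_get()'s end is exclusive):

    index = ida_alloc_range(&uqg->ida, 2, AMDGPU_MAX_USERQ - 1, GFP_KERNEL);
    ...
    ida_free(&uqg->ida, queue->queue_id);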

Re: [PATCH 0/2] Recover from failure to probe GPU

2022-12-24 Thread Thomas Zimmermann

Hi

Am 22.12.22 um 19:30 schrieb Mario Limonciello:

One of the first thing that KMS drivers do during initialization is
destroy the system firmware framebuffer by means of
`drm_aperture_remove_conflicting_pci_framebuffers`

This means that if the GPU fails to probe for any reason, the user
will be stuck with, at best, a screen frozen at the last thing that
was shown before the KMS driver continued its probe.

The problem is most pronounced when new GPU support is introduced
because users will need to have a recent linux-firmware snapshot
on their system when they boot a kernel with matching support.

However, the problem is further exacerbated in the case of amdgpu because
it has migrated to "IP discovery" where amdgpu will attempt to load
on "ALL" AMD GPUs even if the driver is missing support for IP blocks
contained in that GPU.

IP discovery requires some probing and isn't run until after the
framebuffer has been destroyed.

This means a situation can occur where a user purchases a new GPU not
yet supported by a distribution and when booting the installer it will
"freeze" even if the distribution doesn't have the matching kernel support
for those IP blocks.

The perfect example of this is Ubuntu 22.10 and the new dGPUs just
launched by AMD.  The installation media ships with kernel 5.19 (which
has IP discovery) but the amdgpu support for those IP blocks landed in
kernel 6.0. The matching linux-firmware was released after 22.10's launch.
The screen will freeze without nomodeset. Even if a user manages to install
and then upgrades to kernel 6.0 after install, they'll still have the
problem of missing firmware, and the same experience.

This is quite jarring for users, particularly if they don't know
that they have to use "nomodeset" to install.

To help the situation, allow drivers to re-run the init process for the
firmware framebuffer during a failed probe. As this problem is most
pronounced with amdgpu, this is the only driver changed.

But if this makes sense more generally for other KMS drivers, the call
can be added to the cleanup routine for those too.


Just a quick drive-by comment: as Javier noted, at some point while 
probing, your driver has changed the device's state and the system FB 
will be gone. You cannot reestablish the sysfb after that.


You are, however, free to read device state at any time, as long as 
doing so has no side effects.


So why not just move the call to 
drm_aperture_remove_conflicting_pci_framebuffers() to a later point, when 
you know that your driver supports the hardware? That's the solution we 
have always proposed for this kind of problem. It's safe and won't require 
any changes to the aperture helpers.
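A rough sketch of that ordering in amdgpu's probe path (the function
and symbol names below are assumptions based on the current tree, and
where exactly "the hardware is known to be supported" gets established
is simplified here):

    static int amdgpu_pci_probe(struct pci_dev *pdev,
                                const struct pci_device_id *ent)
    {
            ...
            /* Run IP discovery / early init first, while the firmware
             * framebuffer is still intact, so an unsupported GPU can
             * fail probe without taking the console down. */
            ret = amdgpu_device_ip_early_init(adev);
            if (ret)
                    return ret;

            /* Only take over the display hardware once we know the
             * driver can actually drive it. */
            ret = drm_aperture_remove_conflicting_pci_framebuffers(pdev,
                                                        &amdgpu_kms_driver);
            if (ret)
                    return ret;
            ...
    }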


Best regards
Thomas



Here is a sample of what happens with missing GPU firmware and this
series:

[5.950056] amdgpu :63:00.0: vgaarb: deactivate vga console
[5.950114] amdgpu :63:00.0: enabling device (0006 -> 0007)
[5.950883] [drm] initializing kernel modesetting (YELLOW_CARP 0x1002:0x1681 0x17AA:0x22F1 0xD2).
[5.952954] [drm] register mmio base: 0xB0A0
[5.952958] [drm] register mmio size: 524288
[5.954633] [drm] add ip block number 0 
[5.954636] [drm] add ip block number 1 
[5.954637] [drm] add ip block number 2 
[5.954638] [drm] add ip block number 3 
[5.954639] [drm] add ip block number 4 
[5.954641] [drm] add ip block number 5 
[5.954642] [drm] add ip block number 6 
[5.954643] [drm] add ip block number 7 
[5.954644] [drm] add ip block number 8 
[5.954645] [drm] add ip block number 9 
[5.954663] amdgpu :63:00.0: amdgpu: Fetched VBIOS from VFCT
[5.954666] amdgpu: ATOM BIOS: 113-REMBRANDT-X37
[5.954677] [drm] VCN(0) decode is enabled in VM mode
[5.954678] [drm] VCN(0) encode is enabled in VM mode
[5.954680] [drm] JPEG decode is enabled in VM mode
[5.954681] amdgpu :63:00.0: amdgpu: Trusted Memory Zone (TMZ) feature disabled as experimental (default)
[5.954683] amdgpu :63:00.0: amdgpu: PCIE atomic ops is not supported
[5.954724] [drm] vm size is 262144 GB, 4 levels, block size is 9-bit, fragment size is 9-bit
[5.954732] amdgpu :63:00.0: amdgpu: VRAM: 512M 0x00F4 - 0x00F41FFF (512M used)
[5.954735] amdgpu :63:00.0: amdgpu: GART: 1024M 0x - 0x3FFF
[5.954738] amdgpu :63:00.0: amdgpu: AGP: 267419648M 0x00F8 - 0x
[5.954747] [drm] Detected VRAM RAM=512M, BAR=512M
[5.954750] [drm] RAM width 256bits LPDDR5
[5.954834] [drm] amdgpu: 512M of VRAM memory ready
[5.954838] [drm] amdgpu: 15680M of GTT memory ready.
[5.954873] [drm] GART: num cpu pages 262144, num gpu pages 262144
[5.955333] [drm] PCIE GART of 1024M enabled (table at 0x00F41FC0).
[5.955502] amdgpu :63:00.0: Direct firmware load for amdgpu/yellow_carp_toc.bin failed with error -2
[5.955505] amdgpu :63:00.0: amdgpu: fail to request/validate toc microcode
[