Re: [PATCH 1/1] amdgpu: move asic id table to a separate file

2017-09-17 Thread Zhang, Jerry (Junwei)

looks fine to me, feel free to add my RB.
Reviewed-by: Junwei Zhang 

BTW, we also has 1 or 2 patch to improve the name parsing.
Please also take a look.

Jerry

On 05/11/2017 05:10 AM, Li, Samuel wrote:

Also attach a sample ids file for reference. The names are from marketing, not 
related to source code and no reviews necessary here:)  It can be put in 
directory /usr/share/libdrm.

Sam

-Original Message-
From: Li, Samuel
Sent: Wednesday, May 10, 2017 4:57 PM
To: amd-gfx@lists.freedesktop.org
Cc: Yuan, Xiaojie ; Li, Samuel 
Subject: [PATCH 1/1] amdgpu: move asic id table to a separate file

From: Xiaojie Yuan 

Change-Id: I12216da14910f5e2b0970bc1fafc2a20b0ef1ba9
Signed-off-by: Samuel Li 
---
  amdgpu/Makefile.am   |   2 +
  amdgpu/Makefile.sources  |   2 +-
  amdgpu/amdgpu_asic_id.c  | 198 +++
  amdgpu/amdgpu_asic_id.h  | 165 ---
  amdgpu/amdgpu_device.c   |  28 +--
  amdgpu/amdgpu_internal.h |  10 +++
  6 files changed, 232 insertions(+), 173 deletions(-)
  create mode 100644 amdgpu/amdgpu_asic_id.c
  delete mode 100644 amdgpu/amdgpu_asic_id.h

diff --git a/amdgpu/Makefile.am b/amdgpu/Makefile.am
index cf7bc1b..ecf9e82 100644
--- a/amdgpu/Makefile.am
+++ b/amdgpu/Makefile.am
@@ -30,6 +30,8 @@ AM_CFLAGS = \
$(PTHREADSTUBS_CFLAGS) \
-I$(top_srcdir)/include/drm

+AM_CPPFLAGS = -DAMDGPU_ASIC_ID_TABLE=\"${datadir}/libdrm/amdgpu.ids\"
+
  libdrm_amdgpu_la_LTLIBRARIES = libdrm_amdgpu.la
  libdrm_amdgpu_ladir = $(libdir)
  libdrm_amdgpu_la_LDFLAGS = -version-number 1:0:0 -no-undefined
diff --git a/amdgpu/Makefile.sources b/amdgpu/Makefile.sources
index 487b9e0..bc3abaa 100644
--- a/amdgpu/Makefile.sources
+++ b/amdgpu/Makefile.sources
@@ -1,5 +1,5 @@
  LIBDRM_AMDGPU_FILES := \
-   amdgpu_asic_id.h \
+   amdgpu_asic_id.c \
amdgpu_bo.c \
amdgpu_cs.c \
amdgpu_device.c \
diff --git a/amdgpu/amdgpu_asic_id.c b/amdgpu/amdgpu_asic_id.c
new file mode 100644
index 000..d50e21a
--- /dev/null
+++ b/amdgpu/amdgpu_asic_id.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright © 2017 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "amdgpu_drm.h"
+#include "amdgpu_internal.h"
+
+static int parse_one_line(const char *line, struct amdgpu_asic_id *id)
+{
+   char *buf;
+   char *s_did;
+   char *s_rid;
+   char *s_name;
+   char *endptr;
+   int r = 0;
+
+   buf = strdup(line);
+   if (!buf)
+   return -ENOMEM;
+
+   /* ignore empty line and commented line */
+   if (strlen(line) == 0 || line[0] == '#') {
+   r = -EAGAIN;
+   goto out;
+   }
+
+   /* device id */
+   s_did = strtok(buf, ",");
+   if (!s_did) {
+   r = -EINVAL;
+   goto out;
+   }
+
+   id->did = strtol(s_did, , 16);
+   if (*endptr) {
+   r = -EINVAL;
+   goto out;
+   }
+
+   /* revision id */
+   s_rid = strtok(NULL, ",");
+   if (!s_rid) {
+   r = -EINVAL;
+   goto out;
+   }
+
+   id->rid = strtol(s_rid, , 16);
+   if (*endptr) {
+   r = -EINVAL;
+   goto out;
+   }
+
+   /* marketing name */
+   s_name = strtok(NULL, ",");
+   if (!s_name) {
+   r = -EINVAL;
+   goto out;
+   }
+
+   id->marketing_name = strdup(s_name);
+   if (id->marketing_name == NULL) {
+   r = -EINVAL;
+   goto out;
+   }
+
+out:
+   free(buf);
+
+   return r;
+}
+
+int amdgpu_parse_asic_ids(struct amdgpu_asic_id 

Re: [PATCH] drm/amdgpu/psp: declare raven psp firmware

2017-09-17 Thread Zhang, Jerry (Junwei)

On 09/16/2017 05:37 AM, Alex Deucher wrote:

So it gets picked up properly by the kernel.

Signed-off-by: Alex Deucher 

Reviewed-by: Junwei Zhang 


---
  drivers/gpu/drm/amd/amdgpu/psp_v10_0.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c
index 6ec5c9f..77cab1f 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v10_0.c
@@ -35,6 +35,8 @@
  #include "raven1/GC/gc_9_1_offset.h"
  #include "raven1/SDMA0/sdma0_4_1_offset.h"

+MODULE_FIRMWARE("amdgpu/raven_asd.bin");
+
  static int
  psp_v10_0_get_fw_type(struct amdgpu_firmware_info *ucode, enum 
psp_gfx_fw_type *type)
  {


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amdkfd: check for null dev to avoid a null pointer dereference

2017-09-17 Thread Oded Gabbay
On Fri, Sep 8, 2017 at 5:13 PM, Colin King  wrote:
> From: Colin Ian King 
>
> The call to kfd_device_by_id can potentially return null, so check that
> dev is null and return with -EINVAL to avoid a null pointer dereference.
>
> Detected by CoverityScan CID#1454629 ("Dereference null return value")
>
> Fixes: 5d71dbc3a588 ("drm/amdkfd: Implement image tiling mode support v2")
> Signed-off-by: Colin Ian King 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index e4a8c2e52cb2..660b3fbade41 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -892,6 +892,8 @@ static int kfd_ioctl_get_tile_config(struct file *filep,
> int err = 0;
>
> dev = kfd_device_by_id(args->gpu_id);
> +   if (!dev)
> +   return -EINVAL;
>
> dev->kfd2kgd->get_tile_config(dev->kgd, );
>
> --
> 2.14.1
>
Thanks!
Applied to my -fixes tree
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 10/11] drm/amdkfd: Print event limit messages only once per process

2017-09-17 Thread Oded Gabbay
On Sat, Sep 16, 2017 at 2:43 AM, Felix Kuehling  wrote:
> To avoid spamming the log.
>
> Signed-off-by: Felix Kuehling 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_events.c | 5 -
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h   | 1 +
>  2 files changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> index 5979158..944abfa 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> @@ -292,7 +292,10 @@ static int create_signal_event(struct file *devkfd,
> struct kfd_event *ev)
>  {
> if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) {
> -   pr_warn("Signal event wasn't created because limit was 
> reached\n");
> +   if (!p->signal_event_limit_reached) {
> +   pr_warn("Signal event wasn't created because limit 
> was reached\n");
> +   p->signal_event_limit_reached = true;
> +   }
> return -ENOMEM;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index bb71697..a546d01 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -532,6 +532,7 @@ struct kfd_process {
> struct list_head signal_event_pages;
> u32 next_nonsignal_event_id;
> size_t signal_event_count;
> +   bool signal_event_limit_reached;
>  };
>
>  /**
> --
> 2.7.4
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
This patch is:
Reviewed-by: Oded Gabbay 
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 09/11] drm/amdkfd: Fix kernel-queue wrapping bugs

2017-09-17 Thread Oded Gabbay
On Sat, Sep 16, 2017 at 2:43 AM, Felix Kuehling  wrote:
> From: Yong Zhao 
>
> Avoid intermediate negative numbers when doing calculations with a mix
> of signed and unsigned variables where implicit conversions can lead
> to unexpected results.
>
> When kernel queue buffer wraps around to 0, we need to check that rptr
> won't be overwritten by the new packet.
>
> Signed-off-by: Yong Zhao 
> Signed-off-by: Felix Kuehling 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 18 +++---
>  1 file changed, 15 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 9ebb4c1..1c66334 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -210,6 +210,11 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
> uint32_t wptr, rptr;
> unsigned int *queue_address;
>
> +   /* When rptr == wptr, the buffer is empty.
Start comment text in a new line. First line should be just /*

> +* When rptr == wptr + 1, the buffer is full.
> +* It is always rptr that advances to the position of wptr, rather 
> than
> +* the opposite. So we can only use up to queue_size_dwords - 1 
> dwords.
> +*/
> rptr = *kq->rptr_kernel;
> wptr = *kq->wptr_kernel;
> queue_address = (unsigned int *)kq->pq_kernel_addr;
> @@ -219,11 +224,10 @@ static int acquire_packet_buffer(struct kernel_queue 
> *kq,
> pr_debug("wptr: %d\n", wptr);
> pr_debug("queue_address 0x%p\n", queue_address);
>
> -   available_size = (rptr - 1 - wptr + queue_size_dwords) %
> +   available_size = (rptr + queue_size_dwords - 1 - wptr) %
> queue_size_dwords;
>
> -   if (packet_size_in_dwords >= queue_size_dwords ||
> -   packet_size_in_dwords >= available_size) {
> +   if (packet_size_in_dwords > available_size) {
> /*
>  * make sure calling functions know
>  * acquire_packet_buffer() failed
> @@ -233,6 +237,14 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
> }
>
> if (wptr + packet_size_in_dwords >= queue_size_dwords) {
> +   /* make sure after rolling back to position 0, there is
> +* still enough space.
> +*/
> +   if (packet_size_in_dwords >= rptr) {
> +   *buffer_ptr = NULL;
> +   return -ENOMEM;
> +   }

I don't think the condition is correct.
Suppose, queue_size_dwords == 100, wptr == rptr == 50 (queue is empty)
and we have a new packet with size of 70.
Now, wptr + size is 120, which is >= 100
However, 70 >= rptr (50) which will give us -ENOMEM, but this is not
correct condition, because the packet *does* have enough room in the
queue.

I think the condition should be:
if (packet_size_in_dwords - (queue_size_dwords - wptr) >= rptr)
but please check this.

> +   /* fill nops, roll back and start at position 0 */
> while (wptr > 0) {
> queue_address[wptr] = kq->nop_packet;
> wptr = (wptr + 1) % queue_size_dwords;
> --
> 2.7.4
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 08/11] drm/amdkfd: Drop _nocpsch suffix from shared functions

2017-09-17 Thread Oded Gabbay
On Sat, Sep 16, 2017 at 2:42 AM, Felix Kuehling  wrote:
> From: Yong Zhao 
>
> Several functions in DQM are shared between cpsch and nocpsch code.
> Remove the misleading _nocpsch suffix from their names.
>
> Signed-off-by: Yong Zhao 
> Signed-off-by: Felix Kuehling 
> ---
>  .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 24 
> +++---
>  1 file changed, 12 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 0ecea67..169e061 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -386,7 +386,7 @@ static int update_queue(struct device_queue_manager *dqm, 
> struct queue *q)
> return retval;
>  }
>
> -static struct mqd_manager *get_mqd_manager_nocpsch(
> +static struct mqd_manager *get_mqd_manager(
> struct device_queue_manager *dqm, enum KFD_MQD_TYPE type)
>  {
> struct mqd_manager *mqd;
> @@ -407,7 +407,7 @@ static struct mqd_manager *get_mqd_manager_nocpsch(
> return mqd;
>  }
>
> -static int register_process_nocpsch(struct device_queue_manager *dqm,
> +static int register_process(struct device_queue_manager *dqm,
> struct qcm_process_device *qpd)
>  {
> struct device_process_node *n;
> @@ -431,7 +431,7 @@ static int register_process_nocpsch(struct 
> device_queue_manager *dqm,
> return retval;
>  }
>
> -static int unregister_process_nocpsch(struct device_queue_manager *dqm,
> +static int unregister_process(struct device_queue_manager *dqm,
> struct qcm_process_device *qpd)
>  {
> int retval;
> @@ -513,7 +513,7 @@ static int initialize_nocpsch(struct device_queue_manager 
> *dqm)
> return 0;
>  }
>
> -static void uninitialize_nocpsch(struct device_queue_manager *dqm)
> +static void uninitialize(struct device_queue_manager *dqm)
>  {
> int i;
>
> @@ -1097,10 +1097,10 @@ struct device_queue_manager 
> *device_queue_manager_init(struct kfd_dev *dev)
> dqm->ops.stop = stop_cpsch;
> dqm->ops.destroy_queue = destroy_queue_cpsch;
> dqm->ops.update_queue = update_queue;
> -   dqm->ops.get_mqd_manager = get_mqd_manager_nocpsch;
> -   dqm->ops.register_process = register_process_nocpsch;
> -   dqm->ops.unregister_process = unregister_process_nocpsch;
> -   dqm->ops.uninitialize = uninitialize_nocpsch;
> +   dqm->ops.get_mqd_manager = get_mqd_manager;
> +   dqm->ops.register_process = register_process;
> +   dqm->ops.unregister_process = unregister_process;
> +   dqm->ops.uninitialize = uninitialize;
> dqm->ops.create_kernel_queue = create_kernel_queue_cpsch;
> dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch;
> dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
> @@ -1112,11 +1112,11 @@ struct device_queue_manager 
> *device_queue_manager_init(struct kfd_dev *dev)
> dqm->ops.create_queue = create_queue_nocpsch;
> dqm->ops.destroy_queue = destroy_queue_nocpsch;
> dqm->ops.update_queue = update_queue;
> -   dqm->ops.get_mqd_manager = get_mqd_manager_nocpsch;
> -   dqm->ops.register_process = register_process_nocpsch;
> -   dqm->ops.unregister_process = unregister_process_nocpsch;
> +   dqm->ops.get_mqd_manager = get_mqd_manager;
> +   dqm->ops.register_process = register_process;
> +   dqm->ops.unregister_process = unregister_process;
> dqm->ops.initialize = initialize_nocpsch;
> -   dqm->ops.uninitialize = uninitialize_nocpsch;
> +   dqm->ops.uninitialize = uninitialize;
> dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
> break;
> default:
> --
> 2.7.4
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

This patch is:
Reviewed-by: Oded Gabbay 
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 05/11] drm/amdkfd: Fix incorrect destroy_mqd parameter

2017-09-17 Thread Oded Gabbay
On Sat, Sep 16, 2017 at 2:42 AM, Felix Kuehling  wrote:
> When uninitializing a kernel queue.
>
> Signed-off-by: Yong Zhao 
> Signed-off-by: Felix Kuehling 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 0c82446..09356d0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -184,7 +184,7 @@ static void uninitialize(struct kernel_queue *kq)
> if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
> kq->mqd->destroy_mqd(kq->mqd,
> NULL,
> -   false,
> +   KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
> KFD_UNMAP_LATENCY_MS,
> kq->queue->pipe,
> kq->queue->queue);
> --
> 2.7.4
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
This patch is:
Reviewed-by: Oded Gabbay 
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 04/11] drm/amdkfd: Adjust dequeue latencies and timeouts

2017-09-17 Thread Oded Gabbay
On Sat, Sep 16, 2017 at 2:42 AM, Felix Kuehling  wrote:
> Adjust latencies and timeouts for dequeueing with HWS and consolidate
> them in one place. Make them longer to allow long running waves to
> complete without causing a timeout. The timeout is twice as long as the
> latency plus some buffer to make sure we don't detect a timeout
> prematurely.
>
> Change timeouts for dequeueing HQDs without HWS. KFD_UNMAP_LATENCY is
> more consistent with what the HWS does for user queues.
>
> Signed-off-by: Yong Zhao 
> Signed-off-by: Felix Kuehling 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 4 +++-
>  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 2 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c   | 2 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 ---
>  5 files changed, 6 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 3db6a31..5da7ef4 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -323,7 +323,7 @@ static int destroy_queue_nocpsch(struct 
> device_queue_manager *dqm,
>
> retval = mqd->destroy_mqd(mqd, q->mqd,
> KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
> -   QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS,
> +   KFD_UNMAP_LATENCY_MS,
> q->pipe, q->queue);
>
> if (retval)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index faf820a..99e2305 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -29,7 +29,9 @@
>  #include "kfd_priv.h"
>  #include "kfd_mqd_manager.h"
>
> -#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS   (500)
> +#define KFD_UNMAP_LATENCY_MS   (4000)
> +#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000)
> +
>  #define CIK_VMID_NUM   (8)
>  #define KFD_VMID_START_OFFSET  (8)
>  #define VMID_PER_DEVICECIK_VMID_NUM
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 681b639..0c82446 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -185,7 +185,7 @@ static void uninitialize(struct kernel_queue *kq)
> kq->mqd->destroy_mqd(kq->mqd,
> NULL,
> false,
> -   QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS,
> +   KFD_UNMAP_LATENCY_MS,
> kq->queue->pipe,
> kq->queue->queue);
> else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> index 1d31260..9eda884 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> @@ -376,7 +376,7 @@ int pm_send_set_resources(struct packet_manager *pm,
> packet->bitfields2.queue_type =
> 
> queue_type__mes_set_resources__hsa_interface_queue_hiq;
> packet->bitfields2.vmid_mask = res->vmid_mask;
> -   packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY;
> +   packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100;
> packet->bitfields7.oac_mask = res->oac_mask;
> packet->bitfields8.gds_heap_base = res->gds_heap_base;
> packet->bitfields8.gds_heap_size = res->gds_heap_size;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index f8d6a8e..099dc33 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -673,11 +673,8 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
>
>  /* Packet Manager */
>
> -#define KFD_HIQ_TIMEOUT (500)
> -
>  #define KFD_FENCE_COMPLETED (100)
>  #define KFD_FENCE_INIT   (10)
> -#define KFD_UNMAP_LATENCY (150)
>
>  struct packet_manager {
> struct device_queue_manager *dqm;
> --
> 2.7.4
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

This patch is:
Reviewed-by: Oded Gabbay 
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org

Re: [PATCH 02/11] drm/amdkfd: Fix suspend/resume issue on Carrizo

2017-09-17 Thread Oded Gabbay
On Sat, Sep 16, 2017 at 2:42 AM, Felix Kuehling  wrote:
> From: Yong Zhao 
>
> When we do suspend/resume through "sudo pm-suspend" while there is
> HSA activity running, upon resume we will encounter HWS hanging, which
> is caused by memory read/write failures. The root cause is that when
> suspend, we neglected to unbind pasid from kfd device.
>
> Another major change is that the bind/unbinding is changed to be
> performed on a per process basis, instead of whether there are queues
> in dqm.
>
> Signed-off-by: Yong Zhao 
> Signed-off-by: Felix Kuehling 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c| 22 --
>  .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 13 
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h  | 15 +++-
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c   | 89 
> ++
>  4 files changed, 101 insertions(+), 38 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index cc8af11..ff3f97c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -191,7 +191,7 @@ static void iommu_pasid_shutdown_callback(struct pci_dev 
> *pdev, int pasid)
> struct kfd_dev *dev = kfd_device_by_pci_dev(pdev);
>
> if (dev)
> -   kfd_unbind_process_from_device(dev, pasid);
> +   kfd_process_iommu_unbind_callback(dev, pasid);
>  }
>
>  /*
> @@ -339,12 +339,16 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
>
>  void kgd2kfd_suspend(struct kfd_dev *kfd)
>  {
> -   if (kfd->init_complete) {
> -   kfd->dqm->ops.stop(kfd->dqm);
> -   amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
> -   amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
> -   amd_iommu_free_device(kfd->pdev);
> -   }
> +   if (!kfd->init_complete)
> +   return;
> +
> +   kfd->dqm->ops.stop(kfd->dqm);
> +
> +   kfd_unbind_processes_from_device(kfd);
> +
> +   amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
> +   amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
> +   amd_iommu_free_device(kfd->pdev);
>  }
>
>  int kgd2kfd_resume(struct kfd_dev *kfd)
> @@ -369,6 +373,10 @@ static int kfd_resume(struct kfd_dev *kfd)
> amd_iommu_set_invalid_ppr_cb(kfd->pdev,
>  iommu_invalid_ppr_cb);
>
> +   err = kfd_bind_processes_to_device(kfd);
> +   if (err)
> +   return -ENXIO;

You need to undo previous initialization in case
kfd_bind_processes_to_device fails, i.e. call amd_iommu_free_device()

> +
> err = kfd->dqm->ops.start(kfd->dqm);
> if (err) {
> dev_err(kfd_device,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 53a66e8..5db82b8 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -670,7 +670,6 @@ static int initialize_cpsch(struct device_queue_manager 
> *dqm)
>
>  static int start_cpsch(struct device_queue_manager *dqm)
>  {
> -   struct device_process_node *node;
> int retval;
>
> retval = 0;
> @@ -697,11 +696,6 @@ static int start_cpsch(struct device_queue_manager *dqm)
>
> init_interrupts(dqm);
>
> -   list_for_each_entry(node, >queues, list)
> -   if (node->qpd->pqm->process && dqm->dev)
> -   kfd_bind_process_to_device(dqm->dev,
> -   node->qpd->pqm->process);
> -
> execute_queues_cpsch(dqm, true);
>
> return 0;
> @@ -714,15 +708,8 @@ static int start_cpsch(struct device_queue_manager *dqm)
>
>  static int stop_cpsch(struct device_queue_manager *dqm)
>  {
> -   struct device_process_node *node;
> -   struct kfd_process_device *pdd;
> -
> destroy_queues_cpsch(dqm, true, true);
>
> -   list_for_each_entry(node, >queues, list) {
> -   pdd = qpd_to_pdd(node->qpd);
> -   pdd->bound = false;
> -   }
> kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
> pm_uninit(>packets);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index b397ec7..ef582cc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -435,6 +435,13 @@ struct qcm_process_device {
> uint32_t sh_hidden_private_base;
>  };
>
> +
> +enum kfd_pdd_bound {
> +   PDD_UNBOUND = 0,
> +   PDD_BOUND,
> +   PDD_BOUND_SUSPENDED,
> +};
> +
>  /* Data that is per-process-per device. */
>  struct kfd_process_device {
> /*
> @@ -459,7 +466,7 @@ struct kfd_process_device {
> uint64_t scratch_limit;
>
> /* Is this process/pasid bound to this device? 

Re: [PATCH 01/11] drm/amdkfd: Reorganize kfd resume code

2017-09-17 Thread Oded Gabbay
On Sat, Sep 16, 2017 at 2:42 AM, Felix Kuehling  wrote:
> From: Yong Zhao 
>
> The idea is to let kfd init and resume function share the same code path
> as much as possible, rather than to have two copies of almost identical
> code. That way improves the code readability and maintainability.
>
> Signed-off-by: Yong Zhao 
> Signed-off-by: Felix Kuehling 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c | 78 
> +
>  1 file changed, 40 insertions(+), 38 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 61fff25..cc8af11 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -92,6 +92,8 @@ static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned 
> int buf_size,
> unsigned int chunk_size);
>  static void kfd_gtt_sa_fini(struct kfd_dev *kfd);
>
> +static int kfd_resume(struct kfd_dev *kfd);
> +
>  static const struct kfd_device_info *lookup_device_info(unsigned short did)
>  {
> size_t i;
> @@ -176,15 +178,8 @@ static bool device_iommu_pasid_init(struct kfd_dev *kfd)
> pasid_limit,
> kfd->doorbell_process_limit - 1);
>
> -   err = amd_iommu_init_device(kfd->pdev, pasid_limit);
> -   if (err < 0) {
> -   dev_err(kfd_device, "error initializing iommu device\n");
> -   return false;
> -   }
> -
> if (!kfd_set_pasid_limit(pasid_limit)) {
> dev_err(kfd_device, "error setting pasid limit\n");
> -   amd_iommu_free_device(kfd->pdev);
> return false;
> }
>
> @@ -280,29 +275,22 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
> goto kfd_interrupt_error;
> }
>
> -   if (!device_iommu_pasid_init(kfd)) {
> -   dev_err(kfd_device,
> -   "Error initializing iommuv2 for device %x:%x\n",
> -   kfd->pdev->vendor, kfd->pdev->device);
> -   goto device_iommu_pasid_error;
> -   }
> -   amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
> -   
> iommu_pasid_shutdown_callback);
> -   amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
> -
> kfd->dqm = device_queue_manager_init(kfd);
> if (!kfd->dqm) {
> dev_err(kfd_device, "Error initializing queue manager\n");
> goto device_queue_manager_error;
> }
>
> -   if (kfd->dqm->ops.start(kfd->dqm)) {
> +   if (!device_iommu_pasid_init(kfd)) {
> dev_err(kfd_device,
> -   "Error starting queue manager for device %x:%x\n",
> +   "Error initializing iommuv2 for device %x:%x\n",
> kfd->pdev->vendor, kfd->pdev->device);
> -   goto dqm_start_error;
> +   goto device_iommu_pasid_error;
> }
>
> +   if (kfd_resume(kfd))
> +   goto kfd_resume_error;
> +
> kfd->dbgmgr = NULL;
>
> kfd->init_complete = true;
> @@ -314,11 +302,10 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>
> goto out;
>
> -dqm_start_error:
> +kfd_resume_error:
> +device_iommu_pasid_error:
> device_queue_manager_uninit(kfd->dqm);
>  device_queue_manager_error:
> -   amd_iommu_free_device(kfd->pdev);
> -device_iommu_pasid_error:
> kfd_interrupt_exit(kfd);
>  kfd_interrupt_error:
> kfd_topology_remove_device(kfd);
> @@ -338,8 +325,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>  void kgd2kfd_device_exit(struct kfd_dev *kfd)
>  {
> if (kfd->init_complete) {
> +   kgd2kfd_suspend(kfd);
> device_queue_manager_uninit(kfd->dqm);
> -   amd_iommu_free_device(kfd->pdev);
> kfd_interrupt_exit(kfd);
> kfd_topology_remove_device(kfd);
> kfd_doorbell_fini(kfd);
> @@ -362,25 +349,40 @@ void kgd2kfd_suspend(struct kfd_dev *kfd)
>
>  int kgd2kfd_resume(struct kfd_dev *kfd)
>  {
> -   unsigned int pasid_limit;
> -   int err;
> +   if (!kfd->init_complete)
> +   return 0;
>
> -   pasid_limit = kfd_get_pasid_limit();
> +   return kfd_resume(kfd);
>
> -   if (kfd->init_complete) {
> -   err = amd_iommu_init_device(kfd->pdev, pasid_limit);
> -   if (err < 0) {
> -   dev_err(kfd_device, "failed to initialize iommu\n");
> -   return -ENXIO;
> -   }
> +}
>
> -   amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
> -   
> iommu_pasid_shutdown_callback);
> -   amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
> -   

Re: [PATCH 07/11] drm/amdkfd: Reuse CHIP_* from amdgpu

2017-09-17 Thread Oded Gabbay
On Sat, Sep 16, 2017 at 2:42 AM, Felix Kuehling  wrote:
> From: Yong Zhao 
>
> There are already CHIP_* definitions under amd_shared.h file on amdgpu
> side, so KFD should reuse them rather than defining new ones.
>
> Using enum for asic type requires default cases on switch statements
> to prevent compiler warnings. BUG on unsupported ASICs. It should never
> get there because KFD should not be initialized on unsupported devices.

We did an effort to remove all BUG statements from the driver so
please don't introduce new ones.
Even if the code should never reach there, it is not a reason to crash
the entire kernel as it doesn't effect the rest of the system's
functionality.
Oded.

>
> Signed-off-by: Yong Zhao 
> Signed-off-by: Felix Kuehling 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 2 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c  | 2 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 9 +++--
>  4 files changed, 9 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 897ff083..0ecea67 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1132,6 +1132,8 @@ struct device_queue_manager 
> *device_queue_manager_init(struct kfd_dev *dev)
> case CHIP_KAVERI:
> device_queue_manager_init_cik(>ops_asic_specific);
> break;
> +   default:
> +   BUG();
Replace this with some error printing.

> }
>
> if (!dqm->ops.initialize(dqm))
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 09356d0..9ebb4c1 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -291,6 +291,8 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev 
> *dev,
> case CHIP_KAVERI:
> kernel_queue_init_cik(>ops_asic_specific);
> break;
> +   default:
> +   BUG();
Replace this with some error printing.

> }
>
> if (!kq->ops.initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE)) {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> index b1ef136..b5a87ba 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> @@ -31,6 +31,8 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
> return mqd_manager_init_cik(type, dev);
> case CHIP_CARRIZO:
> return mqd_manager_init_vi(type, dev);
> +   default:
> +   BUG();
Replace this with some error printing.

> }
>
> return NULL;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 7bed4ef..bb71697 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -33,6 +33,8 @@
>  #include 
>  #include 
>
> +#include "amd_shared.h"
> +
>  #define KFD_SYSFS_FILE_MODE 0444
>
>  #define KFD_MMAP_DOORBELL_MASK 0x8
> @@ -112,11 +114,6 @@ enum cache_policy {
> cache_policy_noncoherent
>  };
>
> -enum asic_family_type {
> -   CHIP_KAVERI = 0,
> -   CHIP_CARRIZO
> -};
> -
>  struct kfd_event_interrupt_class {
> bool (*interrupt_isr)(struct kfd_dev *dev,
> const uint32_t *ih_ring_entry);
> @@ -125,7 +122,7 @@ struct kfd_event_interrupt_class {
>  };
>
>  struct kfd_device_info {
> -   unsigned int asic_family;
> +   enum amd_asic_type asic_family;
> const struct kfd_event_interrupt_class *event_interrupt_class;
> unsigned int max_pasid_bits;
> unsigned int max_no_of_hqd;
> --
> 2.7.4
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx