RE: [PATCH v2] drm/amdkfd: Provide SMI events watch

2020-04-03 Thread Kuehling, Felix
[AMD Official Use Only - Internal Distribution Only]

So are you saying you'll make the event descriptions text rather than binary?

If you switch to a text format, I wouldn't use a binary header. Rather I'd make 
it a text format completely. You could use one line per event, that makes it 
easy to use something like fgets to read a line (event) at a time in user mode.

Each line could still start with an event identifier, but it would be text 
rather than binary. And you don't need the size if you define "\n" as 
the delimiter between events.

Regards,
  Felix

-----Original Message-----
From: Lin, Amber  
Sent: Friday, April 3, 2020 11:38
To: Kuehling, Felix ; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH v2] drm/amdkfd: Provide SMI events watch

On further thought, I'll use struct kfd_smi_msg_header followed by a 
description of the event, instead of using struct kfd_smi_msg_vmfault. 
This way we make it generic to all events.

On 2020-04-03 9:38 a.m., Amber Lin wrote:
> Thanks Felix. I'll make changes accordingly but please pay attention 
> to my last reply inline.
>
> On 2020-04-02 7:51 p.m., Felix Kuehling wrote:
>> On 2020-04-02 4:46 p.m., Amber Lin wrote:
>>> When the compute is malfunctioning or performance drops, the system 
>>> admin will use the SMI (System Management Interface) tool to 
>>> monitor/diagnose what went wrong. This patch provides an event 
>>> watch interface for user space to register the events it is 
>>> interested in. After the event is registered, the user can use the 
>>> anonymous file descriptor's poll function with a wait time specified 
>>> to wait for the event to happen. Once the event happens, the user 
>>> can use read() to retrieve information related to the event.
>>>
>>> VM fault event is done in this patch.
>>>
>>> v2: - remove UNREGISTER and add event ENABLE/DISABLE
>>>  - correct kfifo usage
>>>  - move event message API to kfd_ioctl.h
>>>
>>> Signed-off-by: Amber Lin 
>>> ---
>>>   drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
>>>   drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
>>>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  30 
>>>   drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
>>>   drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
>>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  12 ++
>>>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 177
>>> +++
>>>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  31 
>>>   include/uapi/linux/kfd_ioctl.h   |  30 +++-
>>>   9 files changed, 286 insertions(+), 2 deletions(-)
>>>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile
>>> b/drivers/gpu/drm/amd/amdkfd/Makefile
>>> index 6147462..cc98b4a 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
>>> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
>>> @@ -53,7 +53,8 @@ AMDKFD_FILES    := $(AMDKFD_PATH)/kfd_module.o \
>>>   $(AMDKFD_PATH)/kfd_int_process_v9.o \
>>>   $(AMDKFD_PATH)/kfd_dbgdev.o \
>>>   $(AMDKFD_PATH)/kfd_dbgmgr.o \
>>> -    $(AMDKFD_PATH)/kfd_crat.o
>>> +    $(AMDKFD_PATH)/kfd_crat.o \
>>> +    $(AMDKFD_PATH)/kfd_smi_events.o
>>>     ifneq ($(CONFIG_AMD_IOMMU_V2),)
>>>   AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o diff --git 
>>> a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> index 9f59ba9..24b4717 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> @@ -24,6 +24,7 @@
>>>   #include "kfd_events.h"
>>>   #include "cik_int.h"
>>>   #include "amdgpu_amdkfd.h"
>>> +#include "kfd_smi_events.h"
>>>     static bool cik_event_interrupt_isr(struct kfd_dev *dev,
>>>   const uint32_t *ih_ring_entry, @@ -107,6 
>>> +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
>>>   ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
>>>   struct kfd_vm_fault_info info;
>>>   +    kfd_smi_event_update_vmfault(dev, pasid);
>>>   kfd_process_vm_fault(dev->dqm, pasid);
>>>     memset(&info, 0, sizeof(info)); diff --git 
>>> a/drivers/gpu/drm/amd/amdkfd/k

Re: [PATCH v2] drm/amdkfd: Provide SMI events watch

2020-04-03 Thread Amber Lin
On further thought, I'll use struct kfd_smi_msg_header followed by a 
description of the event, instead of using struct kfd_smi_msg_vmfault. 
This way we make it generic to all events.


On 2020-04-03 9:38 a.m., Amber Lin wrote:
Thanks Felix. I'll make changes accordingly but please pay attention 
to my last reply inline.


On 2020-04-02 7:51 p.m., Felix Kuehling wrote:

On 2020-04-02 4:46 p.m., Amber Lin wrote:
When the compute is malfunctioning or performance drops, the system admin
will use the SMI (System Management Interface) tool to monitor/diagnose what
went wrong. This patch provides an event watch interface for user
space to register the events it is interested in. After the event is
registered, the user can use the anonymous file descriptor's poll function
with a wait time specified to wait for the event to happen. Once the event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

v2: - remove UNREGISTER and add event ENABLE/DISABLE
 - correct kfifo usage
 - move event message API to kfd_ioctl.h

Signed-off-by: Amber Lin 
---
  drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  30 
  drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  12 ++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 177 
+++

  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  31 
  include/uapi/linux/kfd_ioctl.h   |  30 +++-
  9 files changed, 286 insertions(+), 2 deletions(-)
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile

index 6147462..cc98b4a 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,7 +53,8 @@ AMDKFD_FILES    := $(AMDKFD_PATH)/kfd_module.o \
  $(AMDKFD_PATH)/kfd_int_process_v9.o \
  $(AMDKFD_PATH)/kfd_dbgdev.o \
  $(AMDKFD_PATH)/kfd_dbgmgr.o \
-    $(AMDKFD_PATH)/kfd_crat.o
+    $(AMDKFD_PATH)/kfd_crat.o \
+    $(AMDKFD_PATH)/kfd_smi_events.o
    ifneq ($(CONFIG_AMD_IOMMU_V2),)
  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c

index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
  #include "kfd_events.h"
  #include "cik_int.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
    static bool cik_event_interrupt_isr(struct kfd_dev *dev,
  const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct 
kfd_dev *dev,

  ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
  struct kfd_vm_fault_info info;
  +    kfd_smi_event_update_vmfault(dev, pasid);
  kfd_process_vm_fault(dev->dqm, pasid);
    memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c

index f8fa03a..591ac28 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
  #include "kfd_device_queue_manager.h"
  #include "kfd_dbgmgr.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
    static long kfd_ioctl(struct file *, unsigned int, unsigned long);
  static int kfd_open(struct inode *, struct file *);
@@ -1243,6 +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file 
*filep, struct kfd_process *p,

  return ret;
  }
  +/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+    struct kfd_process *p, void *data)
+{
+    struct kfd_ioctl_smi_events_args *args = data;
+    struct kfd_dev *dev;
+
+    dev = kfd_device_by_id(args->gpu_id);
+    if (!dev)
+    return -EINVAL;
+
+    switch (args->op) {
+    case KFD_SMI_EVENTS_REGISTER:
+    /* register the device */
+    return kfd_smi_event_register(dev, &args->data);
+    case KFD_SMI_EVENTS_ENABLE:
+    /* subscribe events to the device */
+    return kfd_smi_event_enable(dev, args->events);
+    case KFD_SMI_EVENTS_DISABLE:
+    /* unsubscribe events */
+    return kfd_smi_event_disable(dev, args->events);
+    }
+
+    return -EINVAL;
+}
+
  bool kfd_dev_is_large_bar(struct kfd_dev *dev)
  {
  struct kfd_local_mem_info mem_info;
@@ -1827,6 +1854,9 @@ static const struct amdkfd_ioctl_desc 
amdkfd_ioctls[] = {

    AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
  kfd_ioctl_alloc_queue_gws, 0),
+
+    AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+    

Re: [PATCH v2] drm/amdkfd: Provide SMI events watch

2020-04-03 Thread Amber Lin
Thanks Felix. I'll make changes accordingly but please pay attention to 
my last reply inline.


On 2020-04-02 7:51 p.m., Felix Kuehling wrote:

On 2020-04-02 4:46 p.m., Amber Lin wrote:
When the compute is malfunctioning or performance drops, the system admin
will use the SMI (System Management Interface) tool to monitor/diagnose what
went wrong. This patch provides an event watch interface for user
space to register the events it is interested in. After the event is
registered, the user can use the anonymous file descriptor's poll function
with a wait time specified to wait for the event to happen. Once the event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

v2: - remove UNREGISTER and add event ENABLE/DISABLE
 - correct kfifo usage
 - move event message API to kfd_ioctl.h

Signed-off-by: Amber Lin 
---
  drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  30 
  drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  12 ++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 177 
+++

  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  31 
  include/uapi/linux/kfd_ioctl.h   |  30 +++-
  9 files changed, 286 insertions(+), 2 deletions(-)
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile

index 6147462..cc98b4a 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,7 +53,8 @@ AMDKFD_FILES    := $(AMDKFD_PATH)/kfd_module.o \
  $(AMDKFD_PATH)/kfd_int_process_v9.o \
  $(AMDKFD_PATH)/kfd_dbgdev.o \
  $(AMDKFD_PATH)/kfd_dbgmgr.o \
-    $(AMDKFD_PATH)/kfd_crat.o
+    $(AMDKFD_PATH)/kfd_crat.o \
+    $(AMDKFD_PATH)/kfd_smi_events.o
    ifneq ($(CONFIG_AMD_IOMMU_V2),)
  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c

index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
  #include "kfd_events.h"
  #include "cik_int.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
    static bool cik_event_interrupt_isr(struct kfd_dev *dev,
  const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev 
*dev,

  ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
  struct kfd_vm_fault_info info;
  +    kfd_smi_event_update_vmfault(dev, pasid);
  kfd_process_vm_fault(dev->dqm, pasid);
    memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c

index f8fa03a..591ac28 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
  #include "kfd_device_queue_manager.h"
  #include "kfd_dbgmgr.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
    static long kfd_ioctl(struct file *, unsigned int, unsigned long);
  static int kfd_open(struct inode *, struct file *);
@@ -1243,6 +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file 
*filep, struct kfd_process *p,

  return ret;
  }
  +/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+    struct kfd_process *p, void *data)
+{
+    struct kfd_ioctl_smi_events_args *args = data;
+    struct kfd_dev *dev;
+
+    dev = kfd_device_by_id(args->gpu_id);
+    if (!dev)
+    return -EINVAL;
+
+    switch (args->op) {
+    case KFD_SMI_EVENTS_REGISTER:
+    /* register the device */
+    return kfd_smi_event_register(dev, &args->data);
+    case KFD_SMI_EVENTS_ENABLE:
+    /* subscribe events to the device */
+    return kfd_smi_event_enable(dev, args->events);
+    case KFD_SMI_EVENTS_DISABLE:
+    /* unsubscribe events */
+    return kfd_smi_event_disable(dev, args->events);
+    }
+
+    return -EINVAL;
+}
+
  bool kfd_dev_is_large_bar(struct kfd_dev *dev)
  {
  struct kfd_local_mem_info mem_info;
@@ -1827,6 +1854,9 @@ static const struct amdkfd_ioctl_desc 
amdkfd_ioctls[] = {

    AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
  kfd_ioctl_alloc_queue_gws, 0),
+
+    AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+    kfd_ioctl_smi_events, 0),
  };
    #define AMDKFD_CORE_IOCTL_COUNT    ARRAY_SIZE(amdkfd_ioctls)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c

index 0491ab2..6ac6f31 100644
--- 

Re: [PATCH v2] drm/amdkfd: Provide SMI events watch

2020-04-02 Thread Felix Kuehling

On 2020-04-02 4:46 p.m., Amber Lin wrote:

When the compute is malfunctioning or performance drops, the system admin
will use the SMI (System Management Interface) tool to monitor/diagnose what
went wrong. This patch provides an event watch interface for user
space to register the events it is interested in. After the event is
registered, the user can use the anonymous file descriptor's poll function
with a wait time specified to wait for the event to happen. Once the event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

v2: - remove UNREGISTER and add event ENABLE/DISABLE
 - correct kfifo usage
 - move event message API to kfd_ioctl.h

Signed-off-by: Amber Lin 
---
  drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  30 
  drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  12 ++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 177 +++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  31 
  include/uapi/linux/kfd_ioctl.h   |  30 +++-
  9 files changed, 286 insertions(+), 2 deletions(-)
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..cc98b4a 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_dbgdev.o \
$(AMDKFD_PATH)/kfd_dbgmgr.o \
-   $(AMDKFD_PATH)/kfd_crat.o
+   $(AMDKFD_PATH)/kfd_crat.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o
  
  ifneq ($(CONFIG_AMD_IOMMU_V2),)

  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
  #include "kfd_events.h"
  #include "cik_int.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
  
  static bool cik_event_interrupt_isr(struct kfd_dev *dev,

const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
  
+		kfd_smi_event_update_vmfault(dev, pasid);

kfd_process_vm_fault(dev->dqm, pasid);
  
  		memset(&info, 0, sizeof(info));

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..591ac28 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
  #include "kfd_device_queue_manager.h"
  #include "kfd_dbgmgr.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
  
  static long kfd_ioctl(struct file *, unsigned int, unsigned long);

  static int kfd_open(struct inode *, struct file *);
@@ -1243,6 +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file *filep, 
struct kfd_process *p,
return ret;
  }
  
+/* Handle requests for watching SMI events */

+static int kfd_ioctl_smi_events(struct file *filep,
+   struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_smi_events_args *args = data;
+   struct kfd_dev *dev;
+
+   dev = kfd_device_by_id(args->gpu_id);
+   if (!dev)
+   return -EINVAL;
+
+   switch (args->op) {
+   case KFD_SMI_EVENTS_REGISTER:
+   /* register the device */
+   return kfd_smi_event_register(dev, &args->data);
+   case KFD_SMI_EVENTS_ENABLE:
+   /* subscribe events to the device */
+   return kfd_smi_event_enable(dev, args->events);
+   case KFD_SMI_EVENTS_DISABLE:
+   /* unsubscribe events */
+   return kfd_smi_event_disable(dev, args->events);
+   }
+
+   return -EINVAL;
+}
+
  bool kfd_dev_is_large_bar(struct kfd_dev *dev)
  {
struct kfd_local_mem_info mem_info;
@@ -1827,6 +1854,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
  
  	AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,

kfd_ioctl_alloc_queue_gws, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+   kfd_ioctl_smi_events, 0),
  };
  
  #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 

[PATCH v2] drm/amdkfd: Provide SMI events watch

2020-04-02 Thread Amber Lin
When the compute is malfunctioning or performance drops, the system admin
will use the SMI (System Management Interface) tool to monitor/diagnose what
went wrong. This patch provides an event watch interface for user
space to register the events it is interested in. After the event is
registered, the user can use the anonymous file descriptor's poll function
with a wait time specified to wait for the event to happen. Once the event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

v2: - remove UNREGISTER and add event ENABLE/DISABLE
- correct kfifo usage
- move event message API to kfd_ioctl.h

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  30 
 drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  12 ++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 177 +++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  31 
 include/uapi/linux/kfd_ioctl.h   |  30 +++-
 9 files changed, 286 insertions(+), 2 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..cc98b4a 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_dbgdev.o \
$(AMDKFD_PATH)/kfd_dbgmgr.o \
-   $(AMDKFD_PATH)/kfd_crat.o
+   $(AMDKFD_PATH)/kfd_crat.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
 AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
 #include "kfd_events.h"
 #include "cik_int.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static bool cik_event_interrupt_isr(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
 
+   kfd_smi_event_update_vmfault(dev, pasid);
kfd_process_vm_fault(dev->dqm, pasid);
 
memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..591ac28 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
 #include "kfd_device_queue_manager.h"
 #include "kfd_dbgmgr.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -1243,6 +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file *filep, 
struct kfd_process *p,
return ret;
 }
 
+/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+   struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_smi_events_args *args = data;
+   struct kfd_dev *dev;
+
+   dev = kfd_device_by_id(args->gpu_id);
+   if (!dev)
+   return -EINVAL;
+
+   switch (args->op) {
+   case KFD_SMI_EVENTS_REGISTER:
+   /* register the device */
+   return kfd_smi_event_register(dev, &args->data);
+   case KFD_SMI_EVENTS_ENABLE:
+   /* subscribe events to the device */
+   return kfd_smi_event_enable(dev, args->events);
+   case KFD_SMI_EVENTS_DISABLE:
+   /* unsubscribe events */
+   return kfd_smi_event_disable(dev, args->events);
+   }
+
+   return -EINVAL;
+}
+
 bool kfd_dev_is_large_bar(struct kfd_dev *dev)
 {
struct kfd_local_mem_info mem_info;
@@ -1827,6 +1854,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
kfd_ioctl_alloc_queue_gws, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+   kfd_ioctl_smi_events, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 0491ab2..6ac6f31 100644
---