Re: [PATCH] drm/amdkfd: Provide SMI events watch

2020-05-13 Thread Felix Kuehling
Am 2020-05-13 um 3:41 p.m. schrieb Amber Lin:
> When the compute is malfunctioning or performance drops, the system admin
> will use the SMI (System Management Interface) tool to monitor/diagnose what
> went wrong. This patch provides an event watch interface for user space
> to register devices and subscribe to events they are interested in. After
> registering, the user can use the anonymous file descriptor's poll function
> with a wait-time specified and wait for events to happen. Once an event
> happens, the user can use read() to retrieve information related to the
> event.
>
> VM fault event is done in this patch.
>
> v2: - remove UNREGISTER and add event ENABLE/DISABLE
> - correct kfifo usage
> - move event message API to kfd_ioctl.h
> v3: send the event msg in text rather than in binary
> v4: support multiple clients
> v5: move events enablement from ioctl to fd write
>
> Signed-off-by: Amber Lin 

Reviewed-by: Felix Kuehling 


> ---
>  drivers/gpu/drm/amd/amdkfd/Makefile  |   1 +
>  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  18 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   7 +
>  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h|   4 +
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 214 
> +++
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  29 +++
>  include/uapi/linux/kfd_ioctl.h   |  16 +-
>  9 files changed, 292 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
> b/drivers/gpu/drm/amd/amdkfd/Makefile
> index 6147462..e1e4115 100644
> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
> @@ -53,6 +53,7 @@ AMDKFD_FILES:= $(AMDKFD_PATH)/kfd_module.o \
>   $(AMDKFD_PATH)/kfd_int_process_v9.o \
>   $(AMDKFD_PATH)/kfd_dbgdev.o \
>   $(AMDKFD_PATH)/kfd_dbgmgr.o \
> + $(AMDKFD_PATH)/kfd_smi_events.o \
>   $(AMDKFD_PATH)/kfd_crat.o
>  
>  ifneq ($(CONFIG_AMD_IOMMU_V2),)
> diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
> b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> index 9f59ba9..24b4717 100644
> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> @@ -24,6 +24,7 @@
>  #include "kfd_events.h"
>  #include "cik_int.h"
>  #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>  
>  static bool cik_event_interrupt_isr(struct kfd_dev *dev,
>   const uint32_t *ih_ring_entry,
> @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
>   ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
>   struct kfd_vm_fault_info info;
>  
> + kfd_smi_event_update_vmfault(dev, pasid);
>   kfd_process_vm_fault(dev->dqm, pasid);
>  
>   memset(&info, 0, sizeof(info));
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index cf0017f..e9b96ad 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -39,6 +39,7 @@
>  #include "kfd_device_queue_manager.h"
>  #include "kfd_dbgmgr.h"
>  #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>  
>  static long kfd_ioctl(struct file *, unsigned int, unsigned long);
>  static int kfd_open(struct inode *, struct file *);
> @@ -1740,6 +1741,20 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
>   return r;
>  }
>  
> +/* Handle requests for watching SMI events */
> +static int kfd_ioctl_smi_events(struct file *filep,
> + struct kfd_process *p, void *data)
> +{
> + struct kfd_ioctl_smi_events_args *args = data;
> + struct kfd_dev *dev;
> +
> + dev = kfd_device_by_id(args->gpuid);
> + if (!dev)
> + return -EINVAL;
> +
> + return kfd_smi_event_open(dev, &args->anon_fd);
> +}
> +
>  #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
>   [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
>   .cmd_drv = 0, .name = #ioctl}
> @@ -1835,6 +1850,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = 
> {
>  
>   AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
>   kfd_ioctl_alloc_queue_gws, 0),
> +
> + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
> + kfd_ioctl_smi_events, 0),
>  };
>  
>  #define AMDKFD_CORE_IOCTL_COUNT  ARRAY_SIZE(amdkfd_ioctls)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 0491ab2..2c030c2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -5

[PATCH] drm/amdkfd: Provide SMI events watch

2020-05-13 Thread Amber Lin
When the compute is malfunctioning or performance drops, the system admin
will use the SMI (System Management Interface) tool to monitor/diagnose what
went wrong. This patch provides an event watch interface for user space
to register devices and subscribe to events they are interested in. After
registering, the user can use the anonymous file descriptor's poll function
with a wait-time specified and wait for events to happen. Once an event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

v2: - remove UNREGISTER and add event ENABLE/DISABLE
- correct kfifo usage
- move event message API to kfd_ioctl.h
v3: send the event msg in text rather than in binary
v4: support multiple clients
v5: move events enablement from ioctl to fd write

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdkfd/Makefile  |   1 +
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  18 ++
 drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   7 +
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|   4 +
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 214 +++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  29 +++
 include/uapi/linux/kfd_ioctl.h   |  16 +-
 9 files changed, 292 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..e1e4115 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,6 +53,7 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_dbgdev.o \
$(AMDKFD_PATH)/kfd_dbgmgr.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o \
$(AMDKFD_PATH)/kfd_crat.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
 #include "kfd_events.h"
 #include "cik_int.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static bool cik_event_interrupt_isr(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
 
+   kfd_smi_event_update_vmfault(dev, pasid);
kfd_process_vm_fault(dev->dqm, pasid);
 
memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index cf0017f..e9b96ad 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
 #include "kfd_device_queue_manager.h"
 #include "kfd_dbgmgr.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -1740,6 +1741,20 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
return r;
 }
 
+/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+   struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_smi_events_args *args = data;
+   struct kfd_dev *dev;
+
+   dev = kfd_device_by_id(args->gpuid);
+   if (!dev)
+   return -EINVAL;
+
+   return kfd_smi_event_open(dev, &args->anon_fd);
+}
+
 #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
.cmd_drv = 0, .name = #ioctl}
@@ -1835,6 +1850,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
kfd_ioctl_alloc_queue_gws, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+   kfd_ioctl_smi_events, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNTARRAY_SIZE(amdkfd_ioctls)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 0491ab2..2c030c2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -586,6 +586,11 @@ static int kfd_gws_init(struct kfd_dev *kfd)
return ret;
 }
 
+static void kfd_smi_init(struct kfd_dev *dev) {
+   INIT_LIST_HEAD(&dev->smi_clients);
+   spin_lock_init(&dev->smi_lock);
+}
+
 bool kgd2kfd_device_init(struct kfd_dev *kfd

Re: [PATCH] drm/amdkfd: Provide SMI events watch

2020-04-01 Thread Felix Kuehling
Am 2020-04-01 um 9:10 a.m. schrieb Amber Lin:
>
> Thanks Felix for the review. I have a better understanding of how
> kfifo works now and have changed my code quite a bit. Couple of
> questions below inline regarding the gpu_id and data arguments.
>
Replies inline ...


> Thanks.
>
> Amber
>
> On 2020-03-26 4:53 p.m., Felix Kuehling wrote:
>>
>> Hi Amber,
>>
>> I see that this is based on the debugger event code. Jon and I are
>> just working through some issues with that code. The lessons from
>> that will need to be applied to this as well. But I think we can
>> define your API to simplify this a bit.
>>
>> The basic problem is, that we have one Fifo in the kfd_device, but
>> potentially multiple file descriptors referring to it. For the event
>> interface I think we can enforce only a single file descriptor per
>> device. If there is already one, your register call can fail. See
>> more comments inline.
>>
>> On 2020-03-17 13:57, Amber Lin wrote:
>>> When the compute is malfunctioning or performance drops, the system admin
>>> will use the SMI (System Management Interface) tool to monitor/diagnose what
>>> went wrong. This patch provides an event watch interface for user space
>>> to register events they are interested in. After the event is
>>> registered, the user can use the anonymous file descriptor's pull function
>>
>> pull -> poll
>>
> Thank you for spotting the typo. I’ll change that.
>
>>> with wait-time specified to wait for the event to happen. Once the event
>>> happens, the user can use read() to retrieve information related to the
>>> event.
>>>
>>> VM fault event is done in this patch.
>>>
>>> Signed-off-by: Amber Lin 
>>> ---
>>>  drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
>>>  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
>>>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  38 ++
>>>  drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
>>>  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
>>>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  10 ++
>>>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 143 
>>> +++
>>>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  41 +++
>>>  include/uapi/linux/kfd_ioctl.h   |  27 -
>>>  9 files changed, 265 insertions(+), 2 deletions(-)
>>>  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>>  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
>>> b/drivers/gpu/drm/amd/amdkfd/Makefile
>>> index 6147462..cc98b4a 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
>>> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
>>> @@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
>>> $(AMDKFD_PATH)/kfd_int_process_v9.o \
>>> $(AMDKFD_PATH)/kfd_dbgdev.o \
>>> $(AMDKFD_PATH)/kfd_dbgmgr.o \
>>> -   $(AMDKFD_PATH)/kfd_crat.o
>>> +   $(AMDKFD_PATH)/kfd_crat.o \
>>> +   $(AMDKFD_PATH)/kfd_smi_events.o
>>>  
>>>  ifneq ($(CONFIG_AMD_IOMMU_V2),)
>>>  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
>>> b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> index 9f59ba9..24b4717 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>>> @@ -24,6 +24,7 @@
>>>  #include "kfd_events.h"
>>>  #include "cik_int.h"
>>>  #include "amdgpu_amdkfd.h"
>>> +#include "kfd_smi_events.h"
>>>  
>>>  static bool cik_event_interrupt_isr(struct kfd_dev *dev,
>>> const uint32_t *ih_ring_entry,
>>> @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
>>> ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
>>> struct kfd_vm_fault_info info;
>>>  
>>> +   kfd_smi_event_update_vmfault(dev, pasid);
>>> kfd_process_vm_fault(dev->dqm, pasid);
>>>  
>>> memset(&info, 0, sizeof(info));
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> index f8fa03a..8e92956 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> @@ -39,6 +39,7 @@
>>>  #include "kfd_device_queue_manager.h"
>>>  #include "kfd_dbgmgr.h"
>>>  #include "amdgpu_amdkfd.h"
>>> +#include "kfd_smi_events.h"
>>>  
>>>  static long kfd_ioctl(struct file *, unsigned int, unsigned long);
>>>  static int kfd_open(struct inode *, struct file *);
>>> @@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, 
>>> struct kfd_process *p,
>>> return ret;
>>>  }
>>>  
>>> +/* Handle requests for watching SMI events */
>>> +static int kfd_ioctl_smi_events(struct file *filep,
>>> +   struct kfd_process *p, void *data)
>>> +{
>>> +   struct kfd_ioctl_smi_events_args *args = data;
>>> +   struct kfd_dev *dev;
>>

Re: [PATCH] drm/amdkfd: Provide SMI events watch

2020-04-01 Thread Amber Lin
Thanks Felix for the review. I have a better understanding of how kfifo 
works now and have changed my code quite a bit. Couple of questions 
below inline regarding the gpu_id and data arguments.


Thanks.

Amber

On 2020-03-26 4:53 p.m., Felix Kuehling wrote:


Hi Amber,

I see that this is based on the debugger event code. Jon and I are 
just working through some issues with that code. The lessons from that 
will need to be applied to this as well. But I think we can define 
your API to simplify this a bit.


The basic problem is, that we have one Fifo in the kfd_device, but 
potentially multiple file descriptors referring to it. For the event 
interface I think we can enforce only a single file descriptor per 
device. If there is already one, your register call can fail. See more 
comments inline.


On 2020-03-17 13:57, Amber Lin wrote:

When the compute is malfunctioning or performance drops, the system admin
will use the SMI (System Management Interface) tool to monitor/diagnose what
went wrong. This patch provides an event watch interface for user space
to register events they are interested in. After the event is
registered, the user can use the anonymous file descriptor's pull function


pull -> poll


Thank you for spotting the typo. I’ll change that.


with wait-time specified to wait for the event to happen. Once the event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

Signed-off-by: Amber Lin
---
  drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  38 ++
  drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  10 ++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 143 +++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  41 +++
  include/uapi/linux/kfd_ioctl.h   |  27 -
  9 files changed, 265 insertions(+), 2 deletions(-)
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..cc98b4a 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_dbgdev.o \
$(AMDKFD_PATH)/kfd_dbgmgr.o \
-   $(AMDKFD_PATH)/kfd_crat.o
+   $(AMDKFD_PATH)/kfd_crat.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o
  
  ifneq ($(CONFIG_AMD_IOMMU_V2),)

  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
  #include "kfd_events.h"
  #include "cik_int.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
  
  static bool cik_event_interrupt_isr(struct kfd_dev *dev,

const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
  
+		kfd_smi_event_update_vmfault(dev, pasid);

kfd_process_vm_fault(dev->dqm, pasid);
  
  		memset(&info, 0, sizeof(info));

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..8e92956 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
  #include "kfd_device_queue_manager.h"
  #include "kfd_dbgmgr.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
  
  static long kfd_ioctl(struct file *, unsigned int, unsigned long);

  static int kfd_open(struct inode *, struct file *);
@@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, 
struct kfd_process *p,
return ret;
  }
  
+/* Handle requests for watching SMI events */

+static int kfd_ioctl_smi_events(struct file *filep,
+   struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_smi_events_args *args = data;
+   struct kfd_dev *dev;
+   int ret = 0;
+
+   dev = kfd_device_by_id(args->gpu_id);
+   if (!dev)
+   return -EINVAL;
+
+   switch (args->op) {
+   case KFD_SMI_EVENTS_REGISTER:
+   ret = kfd_smi_event_register(dev, args->events);
+   if (ret >= 0) {
+   /* When the registration is successful, it returns the
+* annoym

Re: [PATCH] drm/amdkfd: Provide SMI events watch

2020-03-26 Thread Felix Kuehling

Hi Amber,

I see that this is based on the debugger event code. Jon and I are just 
working through some issues with that code. The lessons from that will 
need to be applied to this as well. But I think we can define your API 
to simplify this a bit.


The basic problem is, that we have one Fifo in the kfd_device, but 
potentially multiple file descriptors referring to it. For the event 
interface I think we can enforce only a single file descriptor per 
device. If there is already one, your register call can fail. See more 
comments inline.


On 2020-03-17 13:57, Amber Lin wrote:

When the compute is malfunctioning or performance drops, the system admin
will use the SMI (System Management Interface) tool to monitor/diagnose what
went wrong. This patch provides an event watch interface for user space
to register events they are interested in. After the event is
registered, the user can use the anonymous file descriptor's pull function


pull -> poll



with wait-time specified to wait for the event to happen. Once the event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

Signed-off-by: Amber Lin 
---
  drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  38 ++
  drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  10 ++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 143 +++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  41 +++
  include/uapi/linux/kfd_ioctl.h   |  27 -
  9 files changed, 265 insertions(+), 2 deletions(-)
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..cc98b4a 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_dbgdev.o \
$(AMDKFD_PATH)/kfd_dbgmgr.o \
-   $(AMDKFD_PATH)/kfd_crat.o
+   $(AMDKFD_PATH)/kfd_crat.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o
  
  ifneq ($(CONFIG_AMD_IOMMU_V2),)

  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
  #include "kfd_events.h"
  #include "cik_int.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
  
  static bool cik_event_interrupt_isr(struct kfd_dev *dev,

const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
  
+		kfd_smi_event_update_vmfault(dev, pasid);

kfd_process_vm_fault(dev->dqm, pasid);
  
  		memset(&info, 0, sizeof(info));

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..8e92956 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
  #include "kfd_device_queue_manager.h"
  #include "kfd_dbgmgr.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
  
  static long kfd_ioctl(struct file *, unsigned int, unsigned long);

  static int kfd_open(struct inode *, struct file *);
@@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, 
struct kfd_process *p,
return ret;
  }
  
+/* Handle requests for watching SMI events */

+static int kfd_ioctl_smi_events(struct file *filep,
+   struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_smi_events_args *args = data;
+   struct kfd_dev *dev;
+   int ret = 0;
+
+   dev = kfd_device_by_id(args->gpu_id);
+   if (!dev)
+   return -EINVAL;
+
+   switch (args->op) {
+   case KFD_SMI_EVENTS_REGISTER:
+   ret = kfd_smi_event_register(dev, args->events);
+   if (ret >= 0) {
+   /* When the registration is successful, it returns the
+* annoymous inode. Pass it to the user in data1
+*/
+   args->data1 = ret;
+   ret = 0;


You could return the file descriptor as the return value. On success it 
returns a positive fd. On failure it returns a negative error code.




+   }
+   

Re: [PATCH] drm/amdkfd: Provide SMI events watch

2020-03-24 Thread Amber Lin
Sorry for the messed-up link. This is the link (rocm-smi-lib) which 
makes use of the interface

https://github.com/RadeonOpenCompute/rocm_smi_lib

On 2020-03-23 2:19 p.m., Amber Lin wrote:

Somehow my reply didn't seem to reach the mailing list...

Hi Alex,

https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2FRadeonOpenCompute%2Frocm_smi_lib&data=02%7C01%7Camber.lin%40amd.com%7C37d1a82d9e734d9fec6d08d7cf56ce36%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637205844045641423&sdata=I%2BVkN3VKYFUiZ0xGW0Yst70rcqrMRXUTcd995RgfRa4%3D&reserved=0 
will use this interface. Those functions will be added to this library:


/* Get a handler for watching events */
rsmi_status_t rsmi_event_init(rsmi_event_handle_t *handle);
/* Register events for the device using the handler from init */ 
rsmi_status_t rsmi_event_register(uint32_t dv_ind, uint32_t events,

    rsmi_event_handle_t *handle);
/* Wait for events. If one of the events happens, a success is returned
 * with details in data.
 */
rsmi_status_t rsmi_event_wait(rsmi_event_handle_t handle, uint32_t 
timeout_ms,

    rsmi_event_data_t *data);
/* Stop watching events */
rsmi_status_t rsmi_event_free(rsmi_event_handle_t handle);

I added the ioctl to /dev/kfd, although it is debatable whether it should
be in /dev/dri/card* or /dev/dri/renderD* instead. The first event to report
is VM fault in this patch. Other events like RAS errors, PCIe errors,
GPU reset, etc. will be added for the system admin to diagnose the
system health. I see this as a system feature so I use /dev/kfd. I’d
like to hear if people think differently. Thanks.


Thanks.

Amber

On 2020-03-17 3:03 p.m., Alex Deucher wrote:

On Tue, Mar 17, 2020 at 1:57 PM Amber Lin  wrote:
When the compute is malfunctioning or performance drops, the system admin
will use the SMI (System Management Interface) tool to monitor/diagnose what
went wrong. This patch provides an event watch interface for user space
to register events they are interested in. After the event is
registered, the user can use the anonymous file descriptor's pull function
with wait-time specified to wait for the event to happen. Once the event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

Signed-off-by: Amber Lin 
Can you provide a link to the userspace tools that make use of this 
interface?


Thanks,

Alex


---
  drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  38 ++
  drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  10 ++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 143 
+++

  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  41 +++
  include/uapi/linux/kfd_ioctl.h   |  27 -
  9 files changed, 265 insertions(+), 2 deletions(-)
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile

index 6147462..cc98b4a 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
 $(AMDKFD_PATH)/kfd_int_process_v9.o \
 $(AMDKFD_PATH)/kfd_dbgdev.o \
 $(AMDKFD_PATH)/kfd_dbgmgr.o \
-   $(AMDKFD_PATH)/kfd_crat.o
+   $(AMDKFD_PATH)/kfd_crat.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o

  ifneq ($(CONFIG_AMD_IOMMU_V2),)
  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c

index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
  #include "kfd_events.h"
  #include "cik_int.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"

  static bool cik_event_interrupt_isr(struct kfd_dev *dev,
 const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct 
kfd_dev *dev,

 ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
 struct kfd_vm_fault_info info;

+   kfd_smi_event_update_vmfault(dev, pasid);
 kfd_process_vm_fault(dev->dqm, pasid);

 memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c

index f8fa03a..8e92956 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
  #include "kfd_device_queue_manag

Re: [PATCH] drm/amdkfd: Provide SMI events watch

2020-03-23 Thread Amber Lin

Somehow my reply didn't seem to reach the mailing list...

Hi Alex,

https://github.com/RadeonOpenCompute/rocm_smi_lib will use this 
interface. Those functions will be added to this library:


/* Get a handler for watching events */
rsmi_status_t rsmi_event_init(rsmi_event_handle_t *handle);
/* Register events for the device using the handler from init */ 
rsmi_status_t rsmi_event_register(uint32_t dv_ind, uint32_t events,

    rsmi_event_handle_t *handle);
/* Wait for events. If one of the events happens, a success is returned
 * with details in data.
 */
rsmi_status_t rsmi_event_wait(rsmi_event_handle_t handle, uint32_t 
timeout_ms,

    rsmi_event_data_t *data);
/* Stop watching events */
rsmi_status_t rsmi_event_free(rsmi_event_handle_t handle);

I added the ioctl to /dev/kfd, although it is debatable whether it should
be in /dev/dri/card* or /dev/dri/renderD* instead. The first event to report
is VM fault in this patch. Other events like RAS errors, PCIe errors,
GPU reset, etc. will be added for the system admin to diagnose the system
health. I see this as a system feature so I use /dev/kfd. I’d like to
hear if people think differently. Thanks.


Thanks.

Amber

On 2020-03-17 3:03 p.m., Alex Deucher wrote:

On Tue, Mar 17, 2020 at 1:57 PM Amber Lin  wrote:

When the compute is malfunctioning or performance drops, the system admin
will use the SMI (System Management Interface) tool to monitor/diagnose what
went wrong. This patch provides an event watch interface for user space
to register events they are interested in. After the event is
registered, the user can use the anonymous file descriptor's pull function
with wait-time specified to wait for the event to happen. Once the event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

Signed-off-by: Amber Lin 

Can you provide a link to the userspace tools that make use of this interface?

Thanks,

Alex


---
  drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  38 ++
  drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  10 ++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 143 +++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  41 +++
  include/uapi/linux/kfd_ioctl.h   |  27 -
  9 files changed, 265 insertions(+), 2 deletions(-)
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..cc98b4a 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
 $(AMDKFD_PATH)/kfd_int_process_v9.o \
 $(AMDKFD_PATH)/kfd_dbgdev.o \
 $(AMDKFD_PATH)/kfd_dbgmgr.o \
-   $(AMDKFD_PATH)/kfd_crat.o
+   $(AMDKFD_PATH)/kfd_crat.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o

  ifneq ($(CONFIG_AMD_IOMMU_V2),)
  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
  #include "kfd_events.h"
  #include "cik_int.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"

  static bool cik_event_interrupt_isr(struct kfd_dev *dev,
 const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
 ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
 struct kfd_vm_fault_info info;

+   kfd_smi_event_update_vmfault(dev, pasid);
 kfd_process_vm_fault(dev->dqm, pasid);

 memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..8e92956 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
  #include "kfd_device_queue_manager.h"
  #include "kfd_dbgmgr.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"

  static long kfd_ioctl(struct file *, unsigned int, unsigned long);
  static int kfd_open(struct inode *, struct file *);
@@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, 
struct kfd_process *p,
 return ret;
  }

+/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+  

RE: [PATCH] drm/amdkfd: Provide SMI events watch

2020-03-19 Thread Lin, Amber
[AMD Public Use]

Hi Alex,

https://github.com/RadeonOpenCompute/rocm_smi_lib will use this interface. 
Those functions will be added to this library:

/* Get a handler for watching events */
rsmi_status_t rsmi_event_init(rsmi_event_handle_t *handle);
/* Register events for the device using the handler from init */
rsmi_status_t rsmi_event_register(uint32_t dv_ind, uint32_t events,
rsmi_event_handle_t *handle);
/* Wait for events. If one of the events happens, a success is returned
 * with details in data.
 */
rsmi_status_t rsmi_event_wait(rsmi_event_handle_t handle, uint32_t timeout_ms,
rsmi_event_data_t *data);
/* Stop watching events */
rsmi_status_t rsmi_event_free(rsmi_event_handle_t handle);

I added the ioctl to /dev/kfd, though it is debatable whether it should be in 
/dev/dri/card* or /dev/dri/renderD* instead. The first event to report is VM 
fault in this patch. Other events like RAS errors, PCIe errors, GPU reset, etc. 
will be added for the system admin to diagnose the system health. I see this as 
a system feature so I use /dev/kfd. I’d like to hear if people think differently. Thanks.

Regards,
Amber

-Original Message-
From: Alex Deucher  
Sent: Tuesday, March 17, 2020 3:03 PM
To: Lin, Amber 
Cc: amd-gfx list 
Subject: Re: [PATCH] drm/amdkfd: Provide SMI events watch

On Tue, Mar 17, 2020 at 1:57 PM Amber Lin  wrote:
>
> When the compute is malfunctioning or performance drops, the system 
> admin will use SMI (System Management Interface) tool to 
> monitor/diagnose what went wrong. This patch provides an event watch 
> interface for the user space to register events they are interested in. 
> After the event is registered, the user can use anonymous file 
> descriptor's poll function with wait-time specified to wait for the 
> event to happen. Once the event happens, the user can use read() to 
> retrieve information related to the event.
>
> VM fault event is done in this patch.
>
> Signed-off-by: Amber Lin 

Can you provide a link to the userspace tools that make use of this interface?

Thanks,

Alex

> ---
>  drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
>  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  38 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
>  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  10 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 143 
> +++
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  41 +++
>  include/uapi/linux/kfd_ioctl.h   |  27 -
>  9 files changed, 265 insertions(+), 2 deletions(-)  create mode 
> 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
> b/drivers/gpu/drm/amd/amdkfd/Makefile
> index 6147462..cc98b4a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
> @@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
> $(AMDKFD_PATH)/kfd_int_process_v9.o \
> $(AMDKFD_PATH)/kfd_dbgdev.o \
> $(AMDKFD_PATH)/kfd_dbgmgr.o \
> -   $(AMDKFD_PATH)/kfd_crat.o
> +   $(AMDKFD_PATH)/kfd_crat.o \
> +   $(AMDKFD_PATH)/kfd_smi_events.o
>
>  ifneq ($(CONFIG_AMD_IOMMU_V2),)
>  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o diff --git 
> a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
> b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> index 9f59ba9..24b4717 100644
> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> @@ -24,6 +24,7 @@
>  #include "kfd_events.h"
>  #include "cik_int.h"
>  #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>
>  static bool cik_event_interrupt_isr(struct kfd_dev *dev,
> const uint32_t *ih_ring_entry, 
> @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
> ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
> struct kfd_vm_fault_info info;
>
> +   kfd_smi_event_update_vmfault(dev, pasid);
> kfd_process_vm_fault(dev->dqm, pasid);
>
> memset(&info, 0, sizeof(info)); diff --git 
> a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index f8fa03a..8e92956 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -39,6 +39,7 @@
>  #include "kfd_dev

Re: [PATCH] drm/amdkfd: Provide SMI events watch

2020-03-17 Thread Lin, Amber
[AMD Official Use Only - Internal Distribution Only]




From: Alex Deucher 
Sent: Tuesday, March 17, 2020 3:03 PM
To: Lin, Amber 
Cc: amd-gfx list 
Subject: Re: [PATCH] drm/amdkfd: Provide SMI events watch

On Tue, Mar 17, 2020 at 1:57 PM Amber Lin  wrote:
>
> When the compute is malfunctioning or performance drops, the system admin
> will use SMI (System Management Interface) tool to monitor/diagnose what
> went wrong. This patch provides an event watch interface for the user
> space to register events they are interested in. After the event is
> registered, the user can use anonymous file descriptor's poll function
> with wait-time specified to wait for the event to happen. Once the event
> happens, the user can use read() to retrieve information related to the
> event.
>
> VM fault event is done in this patch.
>
> Signed-off-by: Amber Lin 

Can you provide a link to the userspace tools that make use of this interface?

Thanks,

Alex
=
Hi Alex,

https://github.com/RadeonOpenCompute/rocm_smi_lib will use this interface. 
Those functions will be added to this library:

/* Get a handler for watching events */
rsmi_status_t rsmi_event_init(rsmi_event_handle_t *handle);
/* Register events for the device using the handler from init */
rsmi_status_t rsmi_event_register(uint32_t dv_ind, uint32_t events,
rsmi_event_handle_t *handle);
/* Wait for events. If one of the events happens, a success is returned
 * with details in data.
 */
rsmi_status_t rsmi_event_wait(rsmi_event_handle_t handle, uint32_t timeout_ms,
rsmi_event_data_t *data);
/* Stop watching events */
rsmi_status_t rsmi_event_free(rsmi_event_handle_t handle);

Regards,
Amber

> ---
>  drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
>  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  38 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
>  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  10 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 143 
> +++
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  41 +++
>  include/uapi/linux/kfd_ioctl.h   |  27 -
>  9 files changed, 265 insertions(+), 2 deletions(-)
>  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
> b/drivers/gpu/drm/amd/amdkfd/Makefile
> index 6147462..cc98b4a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
> @@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
> $(AMDKFD_PATH)/kfd_int_process_v9.o \
> $(AMDKFD_PATH)/kfd_dbgdev.o \
> $(AMDKFD_PATH)/kfd_dbgmgr.o \
> -   $(AMDKFD_PATH)/kfd_crat.o
> +   $(AMDKFD_PATH)/kfd_crat.o \
> +   $(AMDKFD_PATH)/kfd_smi_events.o
>
>  ifneq ($(CONFIG_AMD_IOMMU_V2),)
>  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
> diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
> b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> index 9f59ba9..24b4717 100644
> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> @@ -24,6 +24,7 @@
>  #include "kfd_events.h"
>  #include "cik_int.h"
>  #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>
>  static bool cik_event_interrupt_isr(struct kfd_dev *dev,
> const uint32_t *ih_ring_entry,
> @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
> ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
> struct kfd_vm_fault_info info;
>
> +   kfd_smi_event_update_vmfault(dev, pasid);
> kfd_process_vm_fault(dev->dqm, pasid);
>
> memset(&info, 0, sizeof(info));
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index f8fa03a..8e92956 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -39,6 +39,7 @@
>  #include "kfd_device_queue_manager.h"
>  #include "kfd_dbgmgr.h"
>  #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>
>  static long kfd_ioctl(struct file *, unsigned int, unsigned long);
>  static int kfd_open(struct inode *, struct file *);
> @@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct fil

Re: [PATCH] drm/amdkfd: Provide SMI events watch

2020-03-17 Thread Alex Deucher
On Tue, Mar 17, 2020 at 1:57 PM Amber Lin  wrote:
>
> When the compute is malfunctioning or performance drops, the system admin
> will use SMI (System Management Interface) tool to monitor/diagnose what
> went wrong. This patch provides an event watch interface for the user
> space to register events they are interested in. After the event is
> registered, the user can use anonymous file descriptor's poll function
> with wait-time specified to wait for the event to happen. Once the event
> happens, the user can use read() to retrieve information related to the
> event.
>
> VM fault event is done in this patch.
>
> Signed-off-by: Amber Lin 

Can you provide a link to the userspace tools that make use of this interface?

Thanks,

Alex

> ---
>  drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
>  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  38 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
>  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  10 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 143 
> +++
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  41 +++
>  include/uapi/linux/kfd_ioctl.h   |  27 -
>  9 files changed, 265 insertions(+), 2 deletions(-)
>  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
> b/drivers/gpu/drm/amd/amdkfd/Makefile
> index 6147462..cc98b4a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
> @@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
> $(AMDKFD_PATH)/kfd_int_process_v9.o \
> $(AMDKFD_PATH)/kfd_dbgdev.o \
> $(AMDKFD_PATH)/kfd_dbgmgr.o \
> -   $(AMDKFD_PATH)/kfd_crat.o
> +   $(AMDKFD_PATH)/kfd_crat.o \
> +   $(AMDKFD_PATH)/kfd_smi_events.o
>
>  ifneq ($(CONFIG_AMD_IOMMU_V2),)
>  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
> diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
> b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> index 9f59ba9..24b4717 100644
> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
> @@ -24,6 +24,7 @@
>  #include "kfd_events.h"
>  #include "cik_int.h"
>  #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>
>  static bool cik_event_interrupt_isr(struct kfd_dev *dev,
> const uint32_t *ih_ring_entry,
> @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
> ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
> struct kfd_vm_fault_info info;
>
> +   kfd_smi_event_update_vmfault(dev, pasid);
> kfd_process_vm_fault(dev->dqm, pasid);
>
> memset(&info, 0, sizeof(info));
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index f8fa03a..8e92956 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -39,6 +39,7 @@
>  #include "kfd_device_queue_manager.h"
>  #include "kfd_dbgmgr.h"
>  #include "amdgpu_amdkfd.h"
> +#include "kfd_smi_events.h"
>
>  static long kfd_ioctl(struct file *, unsigned int, unsigned long);
>  static int kfd_open(struct inode *, struct file *);
> @@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, 
> struct kfd_process *p,
> return ret;
>  }
>
> +/* Handle requests for watching SMI events */
> +static int kfd_ioctl_smi_events(struct file *filep,
> +   struct kfd_process *p, void *data)
> +{
> +   struct kfd_ioctl_smi_events_args *args = data;
> +   struct kfd_dev *dev;
> +   int ret = 0;
> +
> +   dev = kfd_device_by_id(args->gpu_id);
> +   if (!dev)
> +   return -EINVAL;
> +
> +   switch (args->op) {
> +   case KFD_SMI_EVENTS_REGISTER:
> +   ret = kfd_smi_event_register(dev, args->events);
> +   if (ret >= 0) {
> +   /* When the registration is successful, it returns the
> +* annoymous inode. Pass it to the user in data1
> +*/
> +   args->data1 = ret;
> +   ret = 0;
> +   }
> +   break;
> +   case KFD_SMI_EVENTS_UNREGISTER:
> +   kfd_smi_event_unregister(dev, args->events);
> +   break;
> +   default:
> +   ret = -EINVAL;
> +   break;
> +   }
> +
> +   return ret;
> +}
> +
>  bool kfd_dev_is_large_bar(struct kfd_dev *dev)
>  {
> struct kfd_local_mem_info mem_info;
> @@ -1827,6 +1862,9 @@ static const struct amdkfd_

[PATCH] drm/amdkfd: Provide SMI events watch

2020-03-17 Thread Amber Lin
When the compute is malfunctioning or performance drops, the system admin
will use SMI (System Management Interface) tool to monitor/diagnose what
went wrong. This patch provides an event watch interface for the user
space to register events they are interested in. After the event is
registered, the user can use anonymous file descriptor's poll function
with wait-time specified to wait for the event to happen. Once the event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  38 ++
 drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  10 ++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 143 +++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  41 +++
 include/uapi/linux/kfd_ioctl.h   |  27 -
 9 files changed, 265 insertions(+), 2 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..cc98b4a 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_dbgdev.o \
$(AMDKFD_PATH)/kfd_dbgmgr.o \
-   $(AMDKFD_PATH)/kfd_crat.o
+   $(AMDKFD_PATH)/kfd_crat.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
 AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
 #include "kfd_events.h"
 #include "cik_int.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static bool cik_event_interrupt_isr(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
 
+   kfd_smi_event_update_vmfault(dev, pasid);
kfd_process_vm_fault(dev->dqm, pasid);
 
memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..8e92956 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
 #include "kfd_device_queue_manager.h"
 #include "kfd_dbgmgr.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -1243,6 +1244,40 @@ static int kfd_ioctl_acquire_vm(struct file *filep, 
struct kfd_process *p,
return ret;
 }
 
+/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+   struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_smi_events_args *args = data;
+   struct kfd_dev *dev;
+   int ret = 0;
+
+   dev = kfd_device_by_id(args->gpu_id);
+   if (!dev)
+   return -EINVAL;
+
+   switch (args->op) {
+   case KFD_SMI_EVENTS_REGISTER:
+   ret = kfd_smi_event_register(dev, args->events);
+   if (ret >= 0) {
+   /* When the registration is successful, it returns the
+* annoymous inode. Pass it to the user in data1
+*/
+   args->data1 = ret;
+   ret = 0;
+   }
+   break;
+   case KFD_SMI_EVENTS_UNREGISTER:
+   kfd_smi_event_unregister(dev, args->events);
+   break;
+   default:
+   ret = -EINVAL;
+   break;
+   }
+
+   return ret;
+}
+
 bool kfd_dev_is_large_bar(struct kfd_dev *dev)
 {
struct kfd_local_mem_info mem_info;
@@ -1827,6 +1862,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
kfd_ioctl_alloc_queue_gws, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+   kfd_ioctl_smi_events, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNTARRAY_SIZE(amdkfd_ioctls)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/dri