RE: [PATCH] drm/amdkfd: Add GPU reset SMI event

2020-08-26 Thread Joshi, Mukul
[AMD Official Use Only - Internal Distribution Only]

Sorry I missed that. Thanks for pointing it out.
I will send out an updated patch.

Thanks,
Mukul

From: Nils Wallménius 
Sent: Wednesday, August 26, 2020 4:30 AM
To: Joshi, Mukul 
Cc: amd-gfx@lists.freedesktop.org; Kuehling, Felix 
Subject: Re: [PATCH] drm/amdkfd: Add GPU reset SMI event

[CAUTION: External Email]
Hi, see inline comment below.
Den tis 25 aug. 2020 21:12 Mukul Joshi <mukul.jo...@amd.com> skrev:
Add support for reporting GPU reset events through SMI. KFD
would report both pre and post GPU reset events.

Signed-off-by: Mukul Joshi <mukul.jo...@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c |  4 +++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h   |  2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 30 +
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h |  1 +
 include/uapi/linux/kfd_ioctl.h  |  2 ++
 5 files changed, 39 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index e1cd6599529f..aad1ecfa1239 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -812,6 +812,8 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
if (!kfd->init_complete)
return 0;

+   kfd_smi_event_update_gpu_reset(kfd, false);
+
kfd->dqm->ops.pre_reset(kfd->dqm);

kgd2kfd_suspend(kfd, false);
@@ -833,6 +835,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
if (!kfd->init_complete)
return 0;

+   kfd_smi_event_update_gpu_reset(kfd, true);
+
ret = kfd_resume(kfd);
if (ret)
return ret;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 18bc711f97ae..b1a2979e086f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -312,6 +312,8 @@ struct kfd_dev {
/* Clients watching SMI events */
struct list_head smi_clients;
spinlock_t smi_lock;
+
+   uint64_t reset_seq_num;
 };

 enum kfd_mempool {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 4d4b6e3ab697..448abfdde230 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -174,6 +174,36 @@ static void add_event_to_kfifo(struct kfd_dev *dev, 
unsigned int smi_event,
rcu_read_unlock();
 }

+void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset)
+{
+   /*
+* GpuReset msg = Reset seq number (incremented for
+* every reset message sent before GPU reset).
+* 1 byte event + 1 byte space + 16 bytes seq num +
+* 1 byte \n + 1 byte \0 = 20
+*/
+   char fifo_in[20];
+   int len;
+   unsigned int event;
+
+   if (list_empty(&dev->smi_clients)) {
+   return;
+   }
+
+   memset(fifo_in, 0x0, sizeof(fifo_in));
+
+   if (post_reset) {
+   event = KFD_SMI_EVENT_GPU_POST_RESET;
+   } else {
+   event = KFD_SMI_EVENT_GPU_PRE_RESET;
+   ++(dev->reset_seq_num);
+   }
+
+   len = snprintf(fifo_in, 4, "%x %llx\n", event, dev->reset_seq_num);

I think the 4 will cause truncation of the message here.

Regards
Nils

+
+   add_event_to_kfifo(dev, event, fifo_in, len);
+}
+
 void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
 uint32_t throttle_bitmask)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
index 15537b2cccb5..b9b0438202e2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -27,5 +27,6 @@ int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
 void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
 void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
 uint32_t throttle_bitmask);
+void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset);

 #endif
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index cb1f963a84e0..8b7368bfbd84 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -453,6 +453,8 @@ enum kfd_smi_event {
 KFD_SMI_EVENT_NONE = 0, /* not used */
 KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */
 KFD_SMI_EVENT_THERMAL_THROTTLE = 2,
+   KFD_SMI_EVENT_GPU_PRE_RESET = 3,
+   KFD_SMI_EVENT_GPU_POST_RESET = 4,
 };

 #define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
--
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH] drm/amdkfd: Add GPU reset SMI event

2020-08-26 Thread Nils Wallménius
Hi, see inline comment below.

Den tis 25 aug. 2020 21:12 Mukul Joshi <mukul.jo...@amd.com> skrev:

> Add support for reporting GPU reset events through SMI. KFD
> would report both pre and post GPU reset events.
>
> Signed-off-by: Mukul Joshi 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c |  4 +++
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h   |  2 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 30 +
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h |  1 +
>  include/uapi/linux/kfd_ioctl.h  |  2 ++
>  5 files changed, 39 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index e1cd6599529f..aad1ecfa1239 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -812,6 +812,8 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
> if (!kfd->init_complete)
> return 0;
>
> +   kfd_smi_event_update_gpu_reset(kfd, false);
> +
> kfd->dqm->ops.pre_reset(kfd->dqm);
>
> kgd2kfd_suspend(kfd, false);
> @@ -833,6 +835,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
> if (!kfd->init_complete)
> return 0;
>
> +   kfd_smi_event_update_gpu_reset(kfd, true);
> +
> ret = kfd_resume(kfd);
> if (ret)
> return ret;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 18bc711f97ae..b1a2979e086f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -312,6 +312,8 @@ struct kfd_dev {
> /* Clients watching SMI events */
> struct list_head smi_clients;
> spinlock_t smi_lock;
> +
> +   uint64_t reset_seq_num;
>  };
>
>  enum kfd_mempool {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> index 4d4b6e3ab697..448abfdde230 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -174,6 +174,36 @@ static void add_event_to_kfifo(struct kfd_dev *dev,
> unsigned int smi_event,
> rcu_read_unlock();
>  }
>
> +void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset)
> +{
> +   /*
> +* GpuReset msg = Reset seq number (incremented for
> +* every reset message sent before GPU reset).
> +* 1 byte event + 1 byte space + 16 bytes seq num +
> +* 1 byte \n + 1 byte \0 = 20
> +*/
> +   char fifo_in[20];
> +   int len;
> +   unsigned int event;
> +
> +   if (list_empty(&dev->smi_clients)) {
> +   return;
> +   }
> +
> +   memset(fifo_in, 0x0, sizeof(fifo_in));
> +
> +   if (post_reset) {
> +   event = KFD_SMI_EVENT_GPU_POST_RESET;
> +   } else {
> +   event = KFD_SMI_EVENT_GPU_PRE_RESET;
> +   ++(dev->reset_seq_num);
> +   }
> +
> +   len = snprintf(fifo_in, 4, "%x %llx\n", event, dev->reset_seq_num);
>

I think the 4 will cause truncation of the message here.

Regards
Nils

+
> +   add_event_to_kfifo(dev, event, fifo_in, len);
> +}
> +
>  void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
>  uint32_t throttle_bitmask)
>  {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> index 15537b2cccb5..b9b0438202e2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> @@ -27,5 +27,6 @@ int kfd_smi_event_open(struct kfd_dev *dev, uint32_t
> *fd);
>  void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
>  void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
>  uint32_t throttle_bitmask);
> +void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset);
>
>  #endif
> diff --git a/include/uapi/linux/kfd_ioctl.h
> b/include/uapi/linux/kfd_ioctl.h
> index cb1f963a84e0..8b7368bfbd84 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -453,6 +453,8 @@ enum kfd_smi_event {
>  KFD_SMI_EVENT_NONE = 0, /* not used */
>  KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */
>  KFD_SMI_EVENT_THERMAL_THROTTLE = 2,
> +   KFD_SMI_EVENT_GPU_PRE_RESET = 3,
> +   KFD_SMI_EVENT_GPU_POST_RESET = 4,
>  };
>
>  #define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
> --
> 2.17.1
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdkfd: Add GPU reset SMI event

2020-08-25 Thread Mukul Joshi
Add support for reporting GPU reset events through SMI. KFD
would report both pre and post GPU reset events.

Signed-off-by: Mukul Joshi 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c |  4 +++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h   |  2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 30 +
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h |  1 +
 include/uapi/linux/kfd_ioctl.h  |  2 ++
 5 files changed, 39 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index e1cd6599529f..aad1ecfa1239 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -812,6 +812,8 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
if (!kfd->init_complete)
return 0;
 
+   kfd_smi_event_update_gpu_reset(kfd, false);
+
kfd->dqm->ops.pre_reset(kfd->dqm);
 
kgd2kfd_suspend(kfd, false);
@@ -833,6 +835,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
if (!kfd->init_complete)
return 0;
 
+   kfd_smi_event_update_gpu_reset(kfd, true);
+
ret = kfd_resume(kfd);
if (ret)
return ret;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 18bc711f97ae..b1a2979e086f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -312,6 +312,8 @@ struct kfd_dev {
/* Clients watching SMI events */
struct list_head smi_clients;
spinlock_t smi_lock;
+
+   uint64_t reset_seq_num;
 };
 
 enum kfd_mempool {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 4d4b6e3ab697..448abfdde230 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -174,6 +174,36 @@ static void add_event_to_kfifo(struct kfd_dev *dev, 
unsigned int smi_event,
rcu_read_unlock();
 }
 
+void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset)
+{
+   /*
+* GpuReset msg = Reset seq number (incremented for
+* every reset message sent before GPU reset).
+* 1 byte event + 1 byte space + 16 bytes seq num +
+* 1 byte \n + 1 byte \0 = 20
+*/
+   char fifo_in[20];
+   int len;
+   unsigned int event;
+
+   if (list_empty(&dev->smi_clients)) {
+   return;
+   }
+
+   memset(fifo_in, 0x0, sizeof(fifo_in));
+
+   if (post_reset) {
+   event = KFD_SMI_EVENT_GPU_POST_RESET;
+   } else {
+   event = KFD_SMI_EVENT_GPU_PRE_RESET;
+   ++(dev->reset_seq_num);
+   }
+
+   len = snprintf(fifo_in, 4, "%x %llx\n", event, dev->reset_seq_num);
+
+   add_event_to_kfifo(dev, event, fifo_in, len);
+}
+
 void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
 uint32_t throttle_bitmask)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
index 15537b2cccb5..b9b0438202e2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -27,5 +27,6 @@ int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
 void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
 void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
 uint32_t throttle_bitmask);
+void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset);
 
 #endif
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index cb1f963a84e0..8b7368bfbd84 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -453,6 +453,8 @@ enum kfd_smi_event {
 KFD_SMI_EVENT_NONE = 0, /* not used */
 KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */
 KFD_SMI_EVENT_THERMAL_THROTTLE = 2,
+   KFD_SMI_EVENT_GPU_PRE_RESET = 3,
+   KFD_SMI_EVENT_GPU_POST_RESET = 4,
 };
 
 #define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdkfd: Add GPU reset SMI event

2020-07-27 Thread Mukul Joshi
Add support for reporting GPU reset events through SMI.

Signed-off-by: Mukul Joshi 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c |  2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 18 ++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h |  1 +
 include/uapi/linux/kfd_ioctl.h  |  1 +
 4 files changed, 22 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index d5e790f046b4..d788aa24ef3f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -811,6 +811,8 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
if (!kfd->init_complete)
return 0;
 
+   kfd_smi_event_update_gpu_reset(kfd);
+
kfd->dqm->ops.pre_reset(kfd->dqm);
 
kgd2kfd_suspend(kfd, false);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 4d4b6e3ab697..4de57923d9f5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -174,6 +174,24 @@ static void add_event_to_kfifo(struct kfd_dev *dev, 
unsigned int smi_event,
rcu_read_unlock();
 }
 
+void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev)
+{
+   /*
+* GpuReset msg = empty
+* 1 byte event + 1 byte space + 1 byte \n + 1 byte \0 = 4
+*/
+   char fifo_in[4];
+   int len;
+
+   if (list_empty(&dev->smi_clients)) {
+   return;
+   }
+
+   len = snprintf(fifo_in, 4, "%x \n", KFD_SMI_EVENT_GPU_RESET);
+
+   add_event_to_kfifo(dev, KFD_SMI_EVENT_GPU_RESET, fifo_in, len);
+}
+
 void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
 uint32_t throttle_bitmask)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
index 15537b2cccb5..ffdb822d120b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -27,5 +27,6 @@ int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
 void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
 void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
 uint32_t throttle_bitmask);
+void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev);
 
 #endif
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index cb1f963a84e0..128b6235b540 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -453,6 +453,7 @@ enum kfd_smi_event {
 KFD_SMI_EVENT_NONE = 0, /* not used */
 KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */
 KFD_SMI_EVENT_THERMAL_THROTTLE = 2,
+   KFD_SMI_EVENT_GPU_RESET = 3,
 };
 
 #define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx