RE: [PATCH] drm/amdkfd: Add GPU reset SMI event
[AMD Official Use Only - Internal Distribution Only] Sorry I missed that. Thanks for pointing it out. I will send out an updated patch. Thanks, Mukul From: Nils Wallménius Sent: Wednesday, August 26, 2020 4:30 AM To: Joshi, Mukul Cc: amd-gfx@lists.freedesktop.org; Kuehling, Felix Subject: Re: [PATCH] drm/amdkfd: Add GPU reset SMI event [CAUTION: External Email] Hi, see inline comment below. Den tis 25 aug. 2020 21:12Mukul Joshi mailto:mukul.jo...@amd.com>> skrev: Add support for reporting GPU reset events through SMI. KFD would report both pre and post GPU reset events. Signed-off-by: Mukul Joshi mailto:mukul.jo...@amd.com>> --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 4 +++ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 ++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 30 + drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 1 + include/uapi/linux/kfd_ioctl.h | 2 ++ 5 files changed, 39 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index e1cd6599529f..aad1ecfa1239 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -812,6 +812,8 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd) if (!kfd->init_complete) return 0; + kfd_smi_event_update_gpu_reset(kfd, false); + kfd->dqm->ops.pre_reset(kfd->dqm); kgd2kfd_suspend(kfd, false); @@ -833,6 +835,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) if (!kfd->init_complete) return 0; + kfd_smi_event_update_gpu_reset(kfd, true); + ret = kfd_resume(kfd); if (ret) return ret; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 18bc711f97ae..b1a2979e086f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -312,6 +312,8 @@ struct kfd_dev { /* Clients watching SMI events */ struct list_head smi_clients; spinlock_t smi_lock; + + uint64_t reset_seq_num; }; enum kfd_mempool { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 4d4b6e3ab697..448abfdde230 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -174,6 +174,36 @@ static void add_event_to_kfifo(struct kfd_dev *dev, unsigned int smi_event, rcu_read_unlock(); } +void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset) +{ + /* +* GpuReset msg = Reset seq number (incremented for +* every reset message sent before GPU reset). +* 1 byte event + 1 byte space + 16 bytes seq num + +* 1 byte \n + 1 byte \0 = 20 +*/ + char fifo_in[20]; + int len; + unsigned int event; + + if (list_empty(>smi_clients)) { + return; + } + + memset(fifo_in, 0x0, sizeof(fifo_in)); + + if (post_reset) { + event = KFD_SMI_EVENT_GPU_POST_RESET; + } else { + event = KFD_SMI_EVENT_GPU_PRE_RESET; + ++(dev->reset_seq_num); + } + + len = snprintf(fifo_in, 4, "%x %llx\n", event, dev->reset_seq_num); I think the 4 will cause truncation of the message here. Regards Nils + + add_event_to_kfifo(dev, event, fifo_in, len); +} + void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, uint32_t throttle_bitmask) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h index 15537b2cccb5..b9b0438202e2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h @@ -27,5 +27,6 @@ int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd); void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid); void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, uint32_t throttle_bitmask); +void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset); #endif diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index cb1f963a84e0..8b7368bfbd84 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -453,6 +453,8 @@ enum kfd_smi_event { KFD_SMI_EVENT_NONE = 0, /* not used */ KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */ KFD_SMI_EVENT_THERMAL_THROTTLE = 2, + KFD_SMI_EVENT_GPU_PRE_RESET = 3, + KFD_SMI_EVENT_GPU_POST_RESET = 4, }; #define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1)) -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org<mailto:amd-gfx@lists.freedesktop.org> https://lists.freedesktop.org/mailman/listinfo/amd-gfx<ht
Re: [PATCH] drm/amdkfd: Add GPU reset SMI event
Hi, see inline comment below. Den tis 25 aug. 2020 21:12Mukul Joshi skrev: > Add support for reporting GPU reset events through SMI. KFD > would report both pre and post GPU reset events. > > Signed-off-by: Mukul Joshi > --- > drivers/gpu/drm/amd/amdkfd/kfd_device.c | 4 +++ > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 ++ > drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 30 + > drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 1 + > include/uapi/linux/kfd_ioctl.h | 2 ++ > 5 files changed, 39 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c > b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > index e1cd6599529f..aad1ecfa1239 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > @@ -812,6 +812,8 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd) > if (!kfd->init_complete) > return 0; > > + kfd_smi_event_update_gpu_reset(kfd, false); > + > kfd->dqm->ops.pre_reset(kfd->dqm); > > kgd2kfd_suspend(kfd, false); > @@ -833,6 +835,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) > if (!kfd->init_complete) > return 0; > > + kfd_smi_event_update_gpu_reset(kfd, true); > + > ret = kfd_resume(kfd); > if (ret) > return ret; > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index 18bc711f97ae..b1a2979e086f 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -312,6 +312,8 @@ struct kfd_dev { > /* Clients watching SMI events */ > struct list_head smi_clients; > spinlock_t smi_lock; > + > + uint64_t reset_seq_num; > }; > > enum kfd_mempool { > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c > b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c > index 4d4b6e3ab697..448abfdde230 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c > @@ -174,6 +174,36 @@ static void add_event_to_kfifo(struct kfd_dev *dev, > unsigned int smi_event, > rcu_read_unlock(); > } > > +void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset) > +{ > + /* > +* GpuReset msg = Reset seq number (incremented for > +* every reset message sent before GPU reset). > +* 1 byte event + 1 byte space + 16 bytes seq num + > +* 1 byte \n + 1 byte \0 = 20 > +*/ > + char fifo_in[20]; > + int len; > + unsigned int event; > + > + if (list_empty(>smi_clients)) { > + return; > + } > + > + memset(fifo_in, 0x0, sizeof(fifo_in)); > + > + if (post_reset) { > + event = KFD_SMI_EVENT_GPU_POST_RESET; > + } else { > + event = KFD_SMI_EVENT_GPU_PRE_RESET; > + ++(dev->reset_seq_num); > + } > + > + len = snprintf(fifo_in, 4, "%x %llx\n", event, dev->reset_seq_num); > I think the 4 will cause truncation of the message here. Regards Nils + > + add_event_to_kfifo(dev, event, fifo_in, len); > +} > + > void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, > uint32_t throttle_bitmask) > { > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h > b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h > index 15537b2cccb5..b9b0438202e2 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h > @@ -27,5 +27,6 @@ int kfd_smi_event_open(struct kfd_dev *dev, uint32_t > *fd); > void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid); > void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, > uint32_t throttle_bitmask); > +void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset); > > #endif > diff --git a/include/uapi/linux/kfd_ioctl.h > b/include/uapi/linux/kfd_ioctl.h > index cb1f963a84e0..8b7368bfbd84 100644 > --- a/include/uapi/linux/kfd_ioctl.h > +++ b/include/uapi/linux/kfd_ioctl.h > @@ -453,6 +453,8 @@ enum kfd_smi_event { > KFD_SMI_EVENT_NONE = 0, /* not used */ > KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */ > KFD_SMI_EVENT_THERMAL_THROTTLE = 2, > + KFD_SMI_EVENT_GPU_PRE_RESET = 3, > + KFD_SMI_EVENT_GPU_POST_RESET = 4, > }; > > #define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1)) > -- > 2.17.1 > > ___ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx > ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdkfd: Add GPU reset SMI event
Add support for reporting GPU reset events through SMI. KFD would report both pre and post GPU reset events. Signed-off-by: Mukul Joshi --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 4 +++ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 ++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 30 + drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 1 + include/uapi/linux/kfd_ioctl.h | 2 ++ 5 files changed, 39 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index e1cd6599529f..aad1ecfa1239 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -812,6 +812,8 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd) if (!kfd->init_complete) return 0; + kfd_smi_event_update_gpu_reset(kfd, false); + kfd->dqm->ops.pre_reset(kfd->dqm); kgd2kfd_suspend(kfd, false); @@ -833,6 +835,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) if (!kfd->init_complete) return 0; + kfd_smi_event_update_gpu_reset(kfd, true); + ret = kfd_resume(kfd); if (ret) return ret; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 18bc711f97ae..b1a2979e086f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -312,6 +312,8 @@ struct kfd_dev { /* Clients watching SMI events */ struct list_head smi_clients; spinlock_t smi_lock; + + uint64_t reset_seq_num; }; enum kfd_mempool { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 4d4b6e3ab697..448abfdde230 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -174,6 +174,36 @@ static void add_event_to_kfifo(struct kfd_dev *dev, unsigned int smi_event, rcu_read_unlock(); } +void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset) +{ + /* +* GpuReset msg = Reset seq number (incremented for +* every reset message sent before GPU reset). +* 1 byte event + 1 byte space + 16 bytes seq num + +* 1 byte \n + 1 byte \0 = 20 +*/ + char fifo_in[20]; + int len; + unsigned int event; + + if (list_empty(>smi_clients)) { + return; + } + + memset(fifo_in, 0x0, sizeof(fifo_in)); + + if (post_reset) { + event = KFD_SMI_EVENT_GPU_POST_RESET; + } else { + event = KFD_SMI_EVENT_GPU_PRE_RESET; + ++(dev->reset_seq_num); + } + + len = snprintf(fifo_in, 4, "%x %llx\n", event, dev->reset_seq_num); + + add_event_to_kfifo(dev, event, fifo_in, len); +} + void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, uint32_t throttle_bitmask) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h index 15537b2cccb5..b9b0438202e2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h @@ -27,5 +27,6 @@ int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd); void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid); void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, uint32_t throttle_bitmask); +void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset); #endif diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index cb1f963a84e0..8b7368bfbd84 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -453,6 +453,8 @@ enum kfd_smi_event { KFD_SMI_EVENT_NONE = 0, /* not used */ KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */ KFD_SMI_EVENT_THERMAL_THROTTLE = 2, + KFD_SMI_EVENT_GPU_PRE_RESET = 3, + KFD_SMI_EVENT_GPU_POST_RESET = 4, }; #define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1)) -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdkfd: Add GPU reset SMI event
Add support for reporting GPU reset events through SMI. Signed-off-by: Mukul Joshi --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 2 ++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 18 ++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 1 + include/uapi/linux/kfd_ioctl.h | 1 + 4 files changed, 22 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index d5e790f046b4..d788aa24ef3f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -811,6 +811,8 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd) if (!kfd->init_complete) return 0; + kfd_smi_event_update_gpu_reset(kfd); + kfd->dqm->ops.pre_reset(kfd->dqm); kgd2kfd_suspend(kfd, false); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 4d4b6e3ab697..4de57923d9f5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -174,6 +174,24 @@ static void add_event_to_kfifo(struct kfd_dev *dev, unsigned int smi_event, rcu_read_unlock(); } +void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev) +{ + /* +* GpuReset msg = empty +* 1 byte event + 1 byte space + 1 byte \n + 1 byte \0 = 4 +*/ + char fifo_in[4]; + int len; + + if (list_empty(>smi_clients)) { + return; + } + + len = snprintf(fifo_in, 4, "%x \n", KFD_SMI_EVENT_GPU_RESET); + + add_event_to_kfifo(dev, KFD_SMI_EVENT_GPU_RESET, fifo_in, len); +} + void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, uint32_t throttle_bitmask) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h index 15537b2cccb5..ffdb822d120b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h @@ -27,5 +27,6 @@ int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd); void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid); void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, uint32_t throttle_bitmask); +void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev); #endif diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index cb1f963a84e0..128b6235b540 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -453,6 +453,7 @@ enum kfd_smi_event { KFD_SMI_EVENT_NONE = 0, /* not used */ KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */ KFD_SMI_EVENT_THERMAL_THROTTLE = 2, + KFD_SMI_EVENT_GPU_RESET = 3, }; #define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1)) -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx