[PATCH 2/3] drm/amdgpu: Fix RAS function interface

2021-05-26 Thread Luben Tuikov
The correctable and uncorrectable errors
are calculated at each invocation of this
function. Therefore, it is highly inefficient to
return just one of them based on a Boolean
input. If the caller wants both, twice the work
would be done. (And this work is O(n^3) on
Vega20.)

Fix this "interface" to simply return what it had
calculated--both values. Let the caller choose
what it wants to record, inspect, use.

Cc: Alexander Deucher 
Cc: John Clements 
Cc: Hawking Zhang 
Signed-off-by: Luben Tuikov 
Reviewed-by: Alexander Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 23 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  5 +++--
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e3a4c3a7635a..ed3c43e8b0b5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1043,29 +1043,36 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 }
 
 /* get the total error counts on all IPs */
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
-   bool is_ce)
+void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+ unsigned long *ce_count,
+ unsigned long *ue_count)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
-   struct ras_err_data data = {0, 0};
+   unsigned long ce, ue;
 
if (!adev->ras_enabled || !con)
-   return 0;
+   return;
 
+   ce = 0;
+   ue = 0;
list_for_each_entry(obj, &con->head, node) {
struct ras_query_if info = {
.head = obj->head,
};
 
if (amdgpu_ras_query_error_status(adev, &info))
-   return 0;
+   return;
 
-   data.ce_count += info.ce_count;
-   data.ue_count += info.ue_count;
+   ce += info.ce_count;
+   ue += info.ue_count;
}
 
-   return is_ce ? data.ce_count : data.ue_count;
+   if (ce_count)
+   *ce_count = ce;
+
+   if (ue_count)
+   *ue_count = ue;
 }
 /* query/inject/cure end */
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index bfa40c8ecc94..10fca0393106 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -485,8 +485,9 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device 
*adev,
 void amdgpu_ras_resume(struct amdgpu_device *adev);
 void amdgpu_ras_suspend(struct amdgpu_device *adev);
 
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
-   bool is_ce);
+void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+ unsigned long *ce_count,
+ unsigned long *ue_count);
 
 /* error handling functions */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
-- 
2.31.1.527.g2d677e5b15

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 2/3] drm/amdgpu: Fix RAS function interface

2021-05-25 Thread Luben Tuikov
The correctable and uncorrectable errors
are calculated at each invocation of this
function. Therefore, it is highly inefficient to
return just one of them based on a Boolean
input. If the caller wants both, twice the work
would be done. (And this work is O(n^3) on
Vega20.)

Fix this "interface" to simply return what it had
calculated--both values. Let the caller choose
what it wants to record, inspect, use.

Cc: Alexander Deucher 
Cc: John Clements 
Cc: Hawking Zhang 
Signed-off-by: Luben Tuikov 
Reviewed-by: Alexander Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 23 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  5 +++--
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e3a4c3a7635a..ed3c43e8b0b5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1043,29 +1043,36 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 }
 
 /* get the total error counts on all IPs */
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
-   bool is_ce)
+void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+ unsigned long *ce_count,
+ unsigned long *ue_count)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
-   struct ras_err_data data = {0, 0};
+   unsigned long ce, ue;
 
if (!adev->ras_enabled || !con)
-   return 0;
+   return;
 
+   ce = 0;
+   ue = 0;
list_for_each_entry(obj, &con->head, node) {
struct ras_query_if info = {
.head = obj->head,
};
 
if (amdgpu_ras_query_error_status(adev, &info))
-   return 0;
+   return;
 
-   data.ce_count += info.ce_count;
-   data.ue_count += info.ue_count;
+   ce += info.ce_count;
+   ue += info.ue_count;
}
 
-   return is_ce ? data.ce_count : data.ue_count;
+   if (ce_count)
+   *ce_count = ce;
+
+   if (ue_count)
+   *ue_count = ue;
 }
 /* query/inject/cure end */
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index bfa40c8ecc94..10fca0393106 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -485,8 +485,9 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device 
*adev,
 void amdgpu_ras_resume(struct amdgpu_device *adev);
 void amdgpu_ras_suspend(struct amdgpu_device *adev);
 
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
-   bool is_ce);
+void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+ unsigned long *ce_count,
+ unsigned long *ue_count);
 
 /* error handling functions */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
-- 
2.31.1.527.g2d677e5b15

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 2/3] drm/amdgpu: Fix RAS function interface

2021-05-21 Thread Luben Tuikov
The correctable and uncorrectable errors
are calculated at each invocation of this
function. Therefore, it is highly inefficient to
return just one of them based on a Boolean
input. If the caller wants both, twice the work
would be done. (And this work is O(n^3) on
Vega20.)

Fix this "interface" to simply return what it had
calculated--both values. Let the caller choose
what it wants to record, inspect, use.

Cc: Alexander Deucher 
Cc: John Clements 
Cc: Hawking Zhang 
Signed-off-by: Luben Tuikov 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 23 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  5 +++--
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e3a4c3a7635a..ed3c43e8b0b5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1043,29 +1043,36 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 }
 
 /* get the total error counts on all IPs */
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
-   bool is_ce)
+void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+ unsigned long *ce_count,
+ unsigned long *ue_count)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
-   struct ras_err_data data = {0, 0};
+   unsigned long ce, ue;
 
if (!adev->ras_enabled || !con)
-   return 0;
+   return;
 
+   ce = 0;
+   ue = 0;
list_for_each_entry(obj, &con->head, node) {
struct ras_query_if info = {
.head = obj->head,
};
 
if (amdgpu_ras_query_error_status(adev, &info))
-   return 0;
+   return;
 
-   data.ce_count += info.ce_count;
-   data.ue_count += info.ue_count;
+   ce += info.ce_count;
+   ue += info.ue_count;
}
 
-   return is_ce ? data.ce_count : data.ue_count;
+   if (ce_count)
+   *ce_count = ce;
+
+   if (ue_count)
+   *ue_count = ue;
 }
 /* query/inject/cure end */
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index bfa40c8ecc94..10fca0393106 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -485,8 +485,9 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device 
*adev,
 void amdgpu_ras_resume(struct amdgpu_device *adev);
 void amdgpu_ras_suspend(struct amdgpu_device *adev);
 
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
-   bool is_ce);
+void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+ unsigned long *ce_count,
+ unsigned long *ue_count);
 
 /* error handling functions */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
-- 
2.31.1.527.g2d677e5b15

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx