[PATCH] drm/radeon: disable any GPU activity after unrecovered lockup v3

2012-06-27 Thread j.gli...@gmail.com
From: Jerome Glisse 

After an unrecovered GPU lockup, avoid any GPU activity so that
things like kernel segfaults cannot happen in any of the paths
that assume the hw is working.

The segfault is due to the PCIE vram gart table being unmapped after
suspend in the GPU reset path. To avoid the segfault and to avoid
further GPU activity if resetting the GPU is unsuccessful, we
use the accel_working boolean to transform ttm activities into
noops. This does not impact the module load path because in that
path ttm has an empty schedule queue and accel_working will be
set to true as soon as the gart table is in a valid state. Because
ttm might have work queued, it is better to use accel_working
than to disable the radeon_bo ioctl.

To trigger the segfault, launch a program that repeatedly creates bo
in ttm and let it run in the background, then trigger a gpu lockup
from another process.

This patch also forces video mode restoring on r1xx,r2xx,r3xx,r4xx,
r5xx,rs4xx,rs6xx GPUs even if the GPU reset fails. When a GPU reset
fails it is very likely (so far I have never seen it not working) that
the modesetting part of the GPU is still alive. So we have a
chance to get a kernel backtrace or other debugging information
on the screen if we always restore the video mode.

v2: fix spelling error and disable accel before suspend and reenable
it after pcie gart initialization to be even more cautious about
possible segfault. Improve commit message
v3: Improve commit message to describe the video mode restoring no
matter what.

cc: stable at vger.kernel.org
Signed-off-by: Jerome Glisse 
---
 drivers/gpu/drm/radeon/evergreen.c |2 +-
 drivers/gpu/drm/radeon/ni.c|2 +-
 drivers/gpu/drm/radeon/r300.c  |2 +-
 drivers/gpu/drm/radeon/r520.c  |2 +-
 drivers/gpu/drm/radeon/r600.c  |2 +-
 drivers/gpu/drm/radeon/radeon_device.c |9 ---
 drivers/gpu/drm/radeon/radeon_object.c |7 ++
 drivers/gpu/drm/radeon/radeon_ttm.c|   41 
 drivers/gpu/drm/radeon/rs400.c |2 +-
 drivers/gpu/drm/radeon/rs600.c |2 +-
 drivers/gpu/drm/radeon/rs690.c |2 +-
 drivers/gpu/drm/radeon/rv515.c |2 +-
 drivers/gpu/drm/radeon/rv770.c |2 +-
 drivers/gpu/drm/radeon/si.c|2 +-
 drivers/gpu/drm/ttm/ttm_tt.c   |1 +
 15 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c
index c3073f7..5f154e3 100644
--- a/drivers/gpu/drm/radeon/evergreen.c
+++ b/drivers/gpu/drm/radeon/evergreen.c
@@ -3071,6 +3071,7 @@ static int evergreen_startup(struct radeon_device *rdev)
if (r)
return r;
}
+   rdev->accel_working = true;
evergreen_gpu_init(rdev);

r = evergreen_blit_init(rdev);
@@ -3145,7 +3146,6 @@ int evergreen_resume(struct radeon_device *rdev)
/* post card */
atom_asic_init(rdev->mode_info.atom_context);

-   rdev->accel_working = true;
r = evergreen_startup(rdev);
if (r) {
DRM_ERROR("evergreen startup failed on resume\n");
diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c
index dc2e34d..486faa8 100644
--- a/drivers/gpu/drm/radeon/ni.c
+++ b/drivers/gpu/drm/radeon/ni.c
@@ -1245,6 +1245,7 @@ static int cayman_startup(struct radeon_device *rdev)
r = cayman_pcie_gart_enable(rdev);
if (r)
return r;
+   rdev->accel_working = true;
cayman_gpu_init(rdev);

r = evergreen_blit_init(rdev);
@@ -1337,7 +1338,6 @@ int cayman_resume(struct radeon_device *rdev)
/* post card */
atom_asic_init(rdev->mode_info.atom_context);

-   rdev->accel_working = true;
r = cayman_startup(rdev);
if (r) {
DRM_ERROR("cayman startup failed on resume\n");
diff --git a/drivers/gpu/drm/radeon/r300.c b/drivers/gpu/drm/radeon/r300.c
index 97722a3..206ac1f 100644
--- a/drivers/gpu/drm/radeon/r300.c
+++ b/drivers/gpu/drm/radeon/r300.c
@@ -1358,6 +1358,7 @@ static int r300_startup(struct radeon_device *rdev)
if (r)
return r;
}
+   rdev->accel_working = true;

if (rdev->family == CHIP_R300 ||
rdev->family == CHIP_R350 ||
@@ -1426,7 +1427,6 @@ int r300_resume(struct radeon_device *rdev)
/* Initialize surface registers */
radeon_surface_init(rdev);

-   rdev->accel_working = true;
r = r300_startup(rdev);
if (r) {
rdev->accel_working = false;
diff --git a/drivers/gpu/drm/radeon/r520.c b/drivers/gpu/drm/radeon/r520.c
index b5cf837..6409eb0 100644
--- a/drivers/gpu/drm/radeon/r520.c
+++ b/drivers/gpu/drm/radeon/r520.c
@@ -181,6 +181,7 @@ static int r520_startup(struct radeon_device *rdev)
if (r)
return r;
}
+   rdev->accel_working = true;

/* allocate w

[PATCH] drm/radeon: disable any GPU activity after unrecovered lockup v3

2012-06-27 Thread j . glisse
From: Jerome Glisse 

After an unrecovered GPU lockup, avoid any GPU activity so that
things like kernel segfaults cannot happen in any of the paths
that assume the hw is working.

The segfault is due to the PCIE vram gart table being unmapped after
suspend in the GPU reset path. To avoid the segfault and to avoid
further GPU activity if resetting the GPU is unsuccessful, we
use the accel_working boolean to transform ttm activities into
noops. This does not impact the module load path because in that
path ttm has an empty schedule queue and accel_working will be
set to true as soon as the gart table is in a valid state. Because
ttm might have work queued, it is better to use accel_working
than to disable the radeon_bo ioctl.

To trigger the segfault, launch a program that repeatedly creates bo
in ttm and let it run in the background, then trigger a gpu lockup
from another process.

This patch also forces video mode restoring on r1xx,r2xx,r3xx,r4xx,
r5xx,rs4xx,rs6xx GPUs even if the GPU reset fails. When a GPU reset
fails it is very likely (so far I have never seen it not working) that
the modesetting part of the GPU is still alive. So we have a
chance to get a kernel backtrace or other debugging information
on the screen if we always restore the video mode.

v2: fix spelling error and disable accel before suspend and reenable
it after pcie gart initialization to be even more cautious about
possible segfault. Improve commit message
v3: Improve commit message to describe the video mode restoring no
matter what.

cc: sta...@vger.kernel.org
Signed-off-by: Jerome Glisse 
---
 drivers/gpu/drm/radeon/evergreen.c |2 +-
 drivers/gpu/drm/radeon/ni.c|2 +-
 drivers/gpu/drm/radeon/r300.c  |2 +-
 drivers/gpu/drm/radeon/r520.c  |2 +-
 drivers/gpu/drm/radeon/r600.c  |2 +-
 drivers/gpu/drm/radeon/radeon_device.c |9 ---
 drivers/gpu/drm/radeon/radeon_object.c |7 ++
 drivers/gpu/drm/radeon/radeon_ttm.c|   41 
 drivers/gpu/drm/radeon/rs400.c |2 +-
 drivers/gpu/drm/radeon/rs600.c |2 +-
 drivers/gpu/drm/radeon/rs690.c |2 +-
 drivers/gpu/drm/radeon/rv515.c |2 +-
 drivers/gpu/drm/radeon/rv770.c |2 +-
 drivers/gpu/drm/radeon/si.c|2 +-
 drivers/gpu/drm/ttm/ttm_tt.c   |1 +
 15 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c
index c3073f7..5f154e3 100644
--- a/drivers/gpu/drm/radeon/evergreen.c
+++ b/drivers/gpu/drm/radeon/evergreen.c
@@ -3071,6 +3071,7 @@ static int evergreen_startup(struct radeon_device *rdev)
if (r)
return r;
}
+   rdev->accel_working = true;
evergreen_gpu_init(rdev);
 
r = evergreen_blit_init(rdev);
@@ -3145,7 +3146,6 @@ int evergreen_resume(struct radeon_device *rdev)
/* post card */
atom_asic_init(rdev->mode_info.atom_context);
 
-   rdev->accel_working = true;
r = evergreen_startup(rdev);
if (r) {
DRM_ERROR("evergreen startup failed on resume\n");
diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c
index dc2e34d..486faa8 100644
--- a/drivers/gpu/drm/radeon/ni.c
+++ b/drivers/gpu/drm/radeon/ni.c
@@ -1245,6 +1245,7 @@ static int cayman_startup(struct radeon_device *rdev)
r = cayman_pcie_gart_enable(rdev);
if (r)
return r;
+   rdev->accel_working = true;
cayman_gpu_init(rdev);
 
r = evergreen_blit_init(rdev);
@@ -1337,7 +1338,6 @@ int cayman_resume(struct radeon_device *rdev)
/* post card */
atom_asic_init(rdev->mode_info.atom_context);
 
-   rdev->accel_working = true;
r = cayman_startup(rdev);
if (r) {
DRM_ERROR("cayman startup failed on resume\n");
diff --git a/drivers/gpu/drm/radeon/r300.c b/drivers/gpu/drm/radeon/r300.c
index 97722a3..206ac1f 100644
--- a/drivers/gpu/drm/radeon/r300.c
+++ b/drivers/gpu/drm/radeon/r300.c
@@ -1358,6 +1358,7 @@ static int r300_startup(struct radeon_device *rdev)
if (r)
return r;
}
+   rdev->accel_working = true;
 
if (rdev->family == CHIP_R300 ||
rdev->family == CHIP_R350 ||
@@ -1426,7 +1427,6 @@ int r300_resume(struct radeon_device *rdev)
/* Initialize surface registers */
radeon_surface_init(rdev);
 
-   rdev->accel_working = true;
r = r300_startup(rdev);
if (r) {
rdev->accel_working = false;
diff --git a/drivers/gpu/drm/radeon/r520.c b/drivers/gpu/drm/radeon/r520.c
index b5cf837..6409eb0 100644
--- a/drivers/gpu/drm/radeon/r520.c
+++ b/drivers/gpu/drm/radeon/r520.c
@@ -181,6 +181,7 @@ static int r520_startup(struct radeon_device *rdev)
if (r)
return r;
}
+   rdev->accel_working = true;
 
/* alloca