Re: [Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

2012-08-05 Thread Marcin Slusarz
Hi

I refreshed this patchset to current nouveau git.
http://people.freedesktop.org/~mslusarz/gpu-lockup-recovery/

Marcin
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

2012-05-27 Thread Marcin Slusarz
From: Marcin Slusarz 
Subject: [PATCH v4] drm/nouveau: gpu lockup recovery

Detect lockups by watching for vm flush / fence timeouts and signal them by
returning EIO. When EIOs are met at ioctl level, reset the card and repeat
last ioctl.

GPU reset is done by going through suspend / resume cycle with few tweaks:
- CPU-only bo eviction
- ignoring vm flush / fence timeouts
- shortening wait times

v2:
- move ioctl locking from drm core to nouveau
- make ioctl-side locking interruptible
- fix build bug on 32-bit systems

v3:
- make reset-side locking interruptible
- add module parameter to disable lockup recovery
- move reset code to nouveau_ioctl

v4:
- rebased on top current nouveau-git

Signed-off-by: Marcin Slusarz 
---
I skipped posting v3 because of possible other approach to the problem, but
I find this patch useful for debugging, so I'm posting rebased version for
other devs.
---
 drivers/gpu/drm/nouveau/Makefile|2 +-
 drivers/gpu/drm/nouveau/nouveau_bo.c|2 +-
 drivers/gpu/drm/nouveau/nouveau_drv.c   |   88 -
 drivers/gpu/drm/nouveau/nouveau_drv.h   |   47 -
 drivers/gpu/drm/nouveau/nouveau_fence.c |   10 ++-
 drivers/gpu/drm/nouveau/nouveau_reset.c |  166 +++
 drivers/gpu/drm/nouveau/nouveau_state.c |6 +
 drivers/gpu/drm/nouveau/nv50_graph.c|   11 +-
 8 files changed, 318 insertions(+), 14 deletions(-)
 create mode 100644 drivers/gpu/drm/nouveau/nouveau_reset.c

diff --git a/drivers/gpu/drm/nouveau/Makefile b/drivers/gpu/drm/nouveau/Makefile
index 338450e..1fa707c 100644
--- a/drivers/gpu/drm/nouveau/Makefile
+++ b/drivers/gpu/drm/nouveau/Makefile
@@ -10,7 +10,7 @@ nouveau-y := nouveau_device.o nouveau_subdev.o 
nouveau_engine.o \
  nouveau_bo.o nouveau_fence.o nouveau_gem.o nouveau_ttm.o \
  nouveau_hw.o nouveau_calc.o nouveau_bios.o nouveau_i2c.o \
  nouveau_display.o nouveau_connector.o nouveau_fbcon.o \
- nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o \
+ nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o nouveau_reset.o \
 nouveau_pm.o nouveau_volt.o nouveau_perf.o nouveau_therm.o \
 nouveau_mm.o nouveau_vm.o nouveau_mxm.o nouveau_gpio.o \
 nouveau_fanctl.o nouveau_abi16.o nouveau_agp.o \
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c 
b/drivers/gpu/drm/nouveau/nouveau_bo.c
index f30a75a..6827f2e 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -1133,7 +1133,7 @@ nouveau_bo_move(struct ttm_buffer_object *bo, bool evict, 
bool intr,
}
 
/* CPU copy if we have no accelerated method available */
-   if (!ndev->ttm.move) {
+   if (!ndev->ttm.move || nouveau_gpu_reset_in_progress(ndev)) {
ret = ttm_bo_move_memcpy(bo, evict, no_wait_reserve, 
no_wait_gpu, new_mem);
goto out;
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.c 
b/drivers/gpu/drm/nouveau/nouveau_drv.c
index 79b3236..1dccfcc 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.c
@@ -131,6 +131,10 @@ MODULE_PARM_DESC(mxmdcb, "Santise DCB table according to 
MXM-SIS");
 int nouveau_mxmdcb = 1;
 module_param_named(mxmdcb, nouveau_mxmdcb, int, 0400);
 
+MODULE_PARM_DESC(lockup_recovery, "Reset GPU on lockup (default: 1)\n");
+int nouveau_lockup_recovery = 1;
+module_param_named(lockup_recovery, nouveau_lockup_recovery, int, 0600);
+
 int nouveau_fbpercrtc;
 #if 0
 module_param_named(fbpercrtc, nouveau_fbpercrtc, int, 0400);
@@ -222,7 +226,7 @@ nouveau_pci_suspend(struct pci_dev *pdev, pm_message_t 
pm_state)
}
 
NV_INFO(ndev, "Disabling engines...\n");
-   ret = nouveau_device_fini(ndev, true);
+   ret = nouveau_device_fini(ndev, !nouveau_gpu_reset_in_progress(ndev));
if (ret)
goto out_abort;
 
@@ -362,11 +366,91 @@ static struct drm_ioctl_desc nouveau_ioctls[] = {
DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_INFO, nouveau_gem_ioctl_info, 
DRM_UNLOCKED|DRM_AUTH),
 };
 
+void intr_rwsem_init(struct intr_rwsem *r)
+{
+   atomic_set(&r->readers, 0);
+   mutex_init(&r->mutex);
+}
+
+int intr_rwsem_down_read_interruptible(struct intr_rwsem *r)
+{
+   int ret = mutex_lock_interruptible(&r->mutex);
+   if (ret)
+   return ret;
+   atomic_inc(&r->readers);
+   mutex_unlock(&r->mutex);
+   return 0;
+}
+
+void intr_rwsem_down_read(struct intr_rwsem *r)
+{
+   mutex_lock(&r->mutex);
+   atomic_inc(&r->readers);
+   mutex_unlock(&r->mutex);
+}
+
+void intr_rwsem_up_read(struct intr_rwsem *r)
+{
+   atomic_dec(&r->readers);
+}
+
+int intr_rwsem_down_write_interruptible(struct intr_rwsem *r)
+{
+   int ret = mutex_lock_interruptible(&r->mutex);
+   if (ret)
+   return ret;
+   while (atomic_read(&r->readers)) {
+   if (signal_pending(current)) {
+   mutex_unlock(&r->mutex);
+ 

Re: [Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

2012-05-02 Thread Martin Peres

On 02/05/2012 15:48, Ben Skeggs wrote:

On Wed, 2012-05-02 at 15:33 +0200, Martin Peres wrote:

On 02/05/2012 13:28, Ben Skeggs wrote:

Right, again, I don't disagree :)  I think we can improve a lot on the
big-hammer-suspend-the-gpu solution though, and instead reset only the
faulting engine.  It's (in theory) almost possible for us to do now, but
I have a couple of reworks to areas related to this pending (basically,
making the various driver subsystems more independent), which should be
ready soon.  This'll go a long way to making it very easy to reset a
single engine, and likely result in *far* faster recovery from hangs.

Hey,

What about kicking a channel that put the card in a bad state? Wouldn't
that be possible?

This way, we don't lose the context of other channels and only the
application that hung the card will be exited.

That's pretty much the idea.  The trouble comes in where PFIFO will hang
waiting for the stuck engine to report that it's done (eg. it will wait
for PGRAPH to go "i've finished unloading my context now" after it's
told PGRAPH to do so).

Hence why it's important to be able to (preferably) un-stick the stuck
engine (usually handling the appropriate interrupts properly will
achieve this), and failing that, reset it and lose the context for just
that channel.

The work I'm doing at the moment will, among other nice things, make
handling all of this a lot nicer.  And it should be nice and speedy in
comparison to the suspend/resume option, we won't have to evict all
buffers from vram without accel, which can take quite a while (not to
mention that it might not even be possible to get to the VRAM not mapped
into the FB BAR on earlier chipsets if accel dies).

I get it, that seems nice and good.



I wonder how pfifo handles commands sent to a non-existing channel, but
I'm sure it shouldn't hang or anything.

It can't happen anyway, if we destroyed the fifo context for a channel
we wouldn't be telling it to execute commands still :)
Right, but there may still be some commands left in the IB ring buffer, 
right?



Anyway, if this is not possible to only kick one channel, then what
about kicking all channels, rePOSTING the card and using KMS to output
the lockup report (and send a notification of the report through udev
and store the report in a sysfs file)?

Let's not try to be perfect, let us just be able to do better bug reports.

I'm still skeptical about how useful any kind of generic "lockup report"
can possibly be, beyond kernel logs..  However, as part of the work I'm
working on, there may be some additional information available via
debugfs..  I don't want to elaborate on this too much yet until I wrap
my head around what exactly I want to achieve, but I'll give you a
heads-up once I do :)
Well, a good report is important so that we can have an idea of what went 
wrong

and also, that would allow us to differentiate bug reports.

Basically, I'm now convinced that the nvaX random lockup is not actually 
one issue.

Having such an enhanced bug report could allow us to verify this theory.

PS: Speaking about nvaX lockups. I still get lockups (nva3/5) and I 
suspect that the
problem comes from the context switching micro code. Not losing the 
email I'm writing

simply because kwin's channel crashed would be a big win to me.

Martin
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

2012-05-02 Thread Ben Skeggs
On Wed, 2012-05-02 at 15:33 +0200, Martin Peres wrote:
> On 02/05/2012 13:28, Ben Skeggs wrote:
> > Right, again, I don't disagree :)  I think we can improve a lot on the
> > big-hammer-suspend-the-gpu solution though, and instead reset only the
> > faulting engine.  It's (in theory) almost possible for us to do now, but
> > I have a couple of reworks to areas related to this pending (basically,
> > making the various driver subsystems more independent), which should be
> > ready soon.  This'll go a long way to making it very easy to reset a
> > single engine, and likely result in *far* faster recovery from hangs.
> Hey,
> 
> What about kicking a channel that put the card in a bad state? Wouldn't 
> that be possible?
> 
> This way, we don't lose the context of other channels and only the 
> application that hung the card will be exited.
That's pretty much the idea.  The trouble comes in where PFIFO will hang
waiting for the stuck engine to report that it's done (eg. it will wait
for PGRAPH to go "i've finished unloading my context now" after it's
told PGRAPH to do so).

Hence why it's important to be able to (preferably) un-stick the stuck
engine (usually handling the appropriate interrupts properly will
achieve this), and failing that, reset it and lose the context for just
that channel.

The work I'm doing at the moment will, among other nice things, make
handling all of this a lot nicer.  And it should be nice and speedy in
comparison to the suspend/resume option, we won't have to evict all
buffers from vram without accel, which can take quite a while (not to
mention that it might not even be possible to get to the VRAM not mapped
into the FB BAR on earlier chipsets if accel dies).

> 
> I wonder how pfifo handles commands sent to a non-existing channel, but 
> I'm sure it shouldn't hang or anything.
It can't happen anyway, if we destroyed the fifo context for a channel
we wouldn't be telling it to execute commands still :)

> 
> Anyway, if this is not possible to only kick one channel, then what 
> about kicking all channels, rePOSTING the card and using KMS to output 
> the lockup report (and send a notification of the report through udev 
> and store the report in a sysfs file)?
> 
> Let's not try to be perfect, let us just be able to do better bug reports.
I'm still skeptical about how useful any kind of generic "lockup report"
can possibly be, beyond kernel logs..  However, as part of the work I'm
working on, there may be some additional information available via
debugfs..  I don't want to elaborate on this too much yet until I wrap
my head around what exactly I want to achieve, but I'll give you a
heads-up once I do :)

Ben.

> 
> Martin
> ___
> Nouveau mailing list
> Nouveau@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/nouveau


___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

2012-05-02 Thread Martin Peres

On 02/05/2012 13:28, Ben Skeggs wrote:

Right, again, I don't disagree :)  I think we can improve a lot on the
big-hammer-suspend-the-gpu solution though, and instead reset only the
faulting engine.  It's (in theory) almost possible for us to do now, but
I have a couple of reworks to areas related to this pending (basically,
making the various driver subsystems more independent), which should be
ready soon.  This'll go a long way to making it very easy to reset a
single engine, and likely result in *far* faster recovery from hangs.

Hey,

What about kicking a channel that put the card in a bad state? Wouldn't 
that be possible?


This way, we don't lose the context of other channels and only the 
application that hung the card will be exited.


I wonder how pfifo handles commands sent to a non-existing channel, but 
I'm sure it shouldn't hang or anything.


Anyway, if this is not possible to only kick one channel, then what 
about kicking all channels, rePOSTING the card and using KMS to output 
the lockup report (and send a notification of the report through udev 
and store the report in a sysfs file)?


Let's not try to be perfect, let us just be able to do better bug reports.

Martin
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

2012-05-02 Thread Ben Skeggs
On Sat, 2012-04-28 at 16:49 +0200, Marcin Slusarz wrote:
> On Thu, Apr 26, 2012 at 05:32:29PM +1000, Ben Skeggs wrote:
> > On Wed, 2012-04-25 at 23:20 +0200, Marcin Slusarz wrote:
> > > Overall idea:
> > > Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
> > > handle them at ioctl level, reset the GPU and repeat last ioctl.
> > > 
> > > GPU reset is done by doing suspend / resume cycle with few tweaks:
> > > - CPU-only bo eviction
> > > - ignoring vm flush / fence timeouts
> > > - shortening waits
> > Okay.  I've thought about this a bit for a couple of days and think I'll
> > be able to coherently share my thoughts on this issue now :)
> > 
> > Firstly, while I agree that we need to become more resilient to errors,
> > I don't think that following in the radeon/intel footsteps with
> > something (imo, hackish) like this is the right choice for us
> > necessarily.
> 
> This is not only radeon/intel way. Windows, since Vista SP1, does the
> same - see http://msdn.microsoft.com/en-us/windows/hardware/gg487368.
> It's funny how similar it is to this patch (I haven't seen this page earlier).
Yes, I am aware of this feature in Windows.  And I'm not arguing that
something like it isn't necessary.

> 
> If you fear people will stop reporting bugs - don't. GPU reset is painfully
> slow and can take up to 50 seconds (BO eviction is the most time consuming
> part), so people will be annoyed enough to report them.
> Currently, GPU lockups make users so angry, they frequently switch to blob
> without even thinking about reporting anything.
I'm not so concerned about the lost bug reports, I expect the same
people that are actually willing to report bugs now will continue to do
so :)

> 
> > The *vast* majority of "lockups" we have are as a result of us badly
> > mishandling exceptions reported to us by the GPU.  There are a couple of
> > exceptions, however, they're very rare..
> 
> > A very common example is where people gain DMA_PUSHERs for whatever
> > reason, and things go haywire eventually.
> 
> Nope, I had tens of lockups during testing, and only once I had DMA_PUSHER
> before detecting GPU lockup.
Out of curiosity, what were the lockup situations you were triggering
exactly?

> 
> > To handle a DMA_PUSHER
> > sanely, generally you have to drop all pending commands for the channel
> > (set GET=PUT, etc) and continue on.  However, this leaves us with fences
> > and semaphores unsignalled etc, causing issues further up the stack with
> > perfectly good channels hanging on attempting to sync with the crashed
> > channel etc.
> > 
> > The next most common example I can think of is nv4x hardware, getting a
> > LIMIT_COLOR/ZETA exception from PGRAPH, and then a hang.  The solution
> > is simple, learn how to handle the exception, log it, and PGRAPH
> > survives.
> > 
> > I strongly believe that if we focused our efforts on dealing with what
> > the GPU reports to us a lot better, we'll find we really don't need such
> > "lockup recovery".
> 
> While I agree we need to improve on error handling to make "lockup recovery"
> not needed, the reality is we can't predict everything and driver needs to
> cope with its own bugs.
Right, again, I don't disagree :)  I think we can improve a lot on the
big-hammer-suspend-the-gpu solution though, and instead reset only the
faulting engine.  It's (in theory) almost possible for us to do now, but
I have a couple of reworks to areas related to this pending (basically,
making the various driver subsystems more independent), which should be
ready soon.  This'll go a long way to making it very easy to reset a
single engine, and likely result in *far* faster recovery from hangs.

> 
> > I am, however, considering pulling the vm flush timeout error
> > propagation and break-out-of-waits-on-signals that builds on it.  As we
> > really do need to become better at having killable processes if things
> > go wrong :)
> 
> Good :)
> 
> Marcin
> ___
> Nouveau mailing list
> Nouveau@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/nouveau


___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

2012-04-30 Thread Martin Peres

Le 28/04/2012 16:56, Marcin Slusarz a écrit :

On Wed, Apr 25, 2012 at 11:20:36PM +0200, Marcin Slusarz wrote:

Overall idea:
Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
handle them at ioctl level, reset the GPU and repeat last ioctl.

GPU reset is done by doing suspend / resume cycle with few tweaks:
- CPU-only bo eviction
- ignoring vm flush / fence timeouts
- shortening waits

Signed-off-by: Marcin Slusarz
---

Martin,

I'm wondering how below patch (which builds upon the above) affects
reclocking stability. I can't test it on my card, because it has only
one performance level. Can you test it on yours?

Hi Marcin,

I was away from my computers for a few days. I'll test this patch tonight.

Martin

___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

2012-04-28 Thread Marcin Slusarz
On Wed, Apr 25, 2012 at 11:20:36PM +0200, Marcin Slusarz wrote:
> Overall idea:
> Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
> handle them at ioctl level, reset the GPU and repeat last ioctl.
> 
> GPU reset is done by doing suspend / resume cycle with few tweaks:
> - CPU-only bo eviction
> - ignoring vm flush / fence timeouts
> - shortening waits
> 
> Signed-off-by: Marcin Slusarz 
> ---

Martin,

I'm wondering how below patch (which builds upon the above) affects
reclocking stability. I can't test it on my card, because it has only
one performance level. Can you test it on yours?

---
From: Marcin Slusarz 
Subject: [PATCH] drm/nouveau: take ioctls_rwsem before reclocking

Signed-off-by: Marcin Slusarz 
---
 drivers/gpu/drm/nouveau/nouveau_pm.c|6 ++
 drivers/gpu/drm/nouveau/nouveau_reset.c |2 +-
 2 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_pm.c 
b/drivers/gpu/drm/nouveau/nouveau_pm.c
index 34d591b..4716f39 100644
--- a/drivers/gpu/drm/nouveau/nouveau_pm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_pm.c
@@ -383,9 +383,15 @@ nouveau_pm_set_perflvl(struct device *d, struct 
device_attribute *a,
   const char *buf, size_t count)
 {
struct drm_device *dev = pci_get_drvdata(to_pci_dev(d));
+   struct drm_nouveau_private *dev_priv = dev->dev_private;
int ret;
 
+   intr_rwsem_down_write(&dev_priv->ioctls_rwsem);
+
ret = nouveau_pm_profile_set(dev, buf);
+
+   intr_rwsem_up_write(&dev_priv->ioctls_rwsem);
+
if (ret)
return ret;
return strlen(buf);
diff --git a/drivers/gpu/drm/nouveau/nouveau_reset.c 
b/drivers/gpu/drm/nouveau/nouveau_reset.c
index e893096..7c25a3c 100644
--- a/drivers/gpu/drm/nouveau/nouveau_reset.c
+++ b/drivers/gpu/drm/nouveau/nouveau_reset.c
@@ -139,7 +139,7 @@ int nouveau_reset_device(struct drm_device *dev)
end = jiffies;
NV_INFO(dev, "GPU reset done, took %lu s\n", (end - start) / 
DRM_HZ);
while 
(intr_rwsem_down_read_interruptible(&dev_priv->ioctls_rwsem))
-   ; /* not possible, we are holding reset_lock */
+   ;
}
mutex_unlock(&dev_priv->reset_lock);
 
-- 
1.7.8.5

___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

2012-04-28 Thread Marcin Slusarz
On Thu, Apr 26, 2012 at 05:32:29PM +1000, Ben Skeggs wrote:
> On Wed, 2012-04-25 at 23:20 +0200, Marcin Slusarz wrote:
> > Overall idea:
> > Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
> > handle them at ioctl level, reset the GPU and repeat last ioctl.
> > 
> > GPU reset is done by doing suspend / resume cycle with few tweaks:
> > - CPU-only bo eviction
> > - ignoring vm flush / fence timeouts
> > - shortening waits
> Okay.  I've thought about this a bit for a couple of days and think I'll
> be able to coherently share my thoughts on this issue now :)
> 
> Firstly, while I agree that we need to become more resilient to errors,
> I don't think that following in the radeon/intel footsteps with
> something (imo, hackish) like this is the right choice for us
> necessarily.

This is not only radeon/intel way. Windows, since Vista SP1, does the
same - see http://msdn.microsoft.com/en-us/windows/hardware/gg487368.
It's funny how similar it is to this patch (I haven't seen this page earlier).

If you fear people will stop reporting bugs - don't. GPU reset is painfully
slow and can take up to 50 seconds (BO eviction is the most time consuming
part), so people will be annoyed enough to report them.
Currently, GPU lockups make users so angry, they frequently switch to blob
without even thinking about reporting anything.

> The *vast* majority of "lockups" we have are as a result of us badly
> mishandling exceptions reported to us by the GPU.  There are a couple of
> exceptions, however, they're very rare..

> A very common example is where people gain DMA_PUSHERs for whatever
> reason, and things go haywire eventually.

Nope, I had tens of lockups during testing, and only once I had DMA_PUSHER
before detecting GPU lockup.

> To handle a DMA_PUSHER
> sanely, generally you have to drop all pending commands for the channel
> (set GET=PUT, etc) and continue on.  However, this leaves us with fences
> and semaphores unsignalled etc, causing issues further up the stack with
> perfectly good channels hanging on attempting to sync with the crashed
> channel etc.
> 
> The next most common example I can think of is nv4x hardware, getting a
> LIMIT_COLOR/ZETA exception from PGRAPH, and then a hang.  The solution
> is simple, learn how to handle the exception, log it, and PGRAPH
> survives.
> 
> I strongly believe that if we focused our efforts on dealing with what
> the GPU reports to us a lot better, we'll find we really don't need such
> "lockup recovery".

While I agree we need to improve on error handling to make "lockup recovery"
not needed, the reality is we can't predict everything and driver needs to
cope with its own bugs.

> I am, however, considering pulling the vm flush timeout error
> propagation and break-out-of-waits-on-signals that builds on it.  As we
> really do need to become better at having killable processes if things
> go wrong :)

Good :)

Marcin
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

2012-04-26 Thread Ben Skeggs
On Wed, 2012-04-25 at 23:20 +0200, Marcin Slusarz wrote:
> Overall idea:
> Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
> handle them at ioctl level, reset the GPU and repeat last ioctl.
> 
> GPU reset is done by doing suspend / resume cycle with few tweaks:
> - CPU-only bo eviction
> - ignoring vm flush / fence timeouts
> - shortening waits
Okay.  I've thought about this a bit for a couple of days and think I'll
be able to coherently share my thoughts on this issue now :)

Firstly, while I agree that we need to become more resilient to errors,
I don't think that following in the radeon/intel footsteps with
something (imo, hackish) like this is the right choice for us
necessarily.

The *vast* majority of "lockups" we have are as a result of us badly
mishandling exceptions reported to us by the GPU.  There are a couple of
exceptions, however, they're very rare..

A very common example is where people gain DMA_PUSHERs for whatever
reason, and things go haywire eventually.  To handle a DMA_PUSHER
sanely, generally you have to drop all pending commands for the channel
(set GET=PUT, etc) and continue on.  However, this leaves us with fences
and semaphores unsignalled etc, causing issues further up the stack with
perfectly good channels hanging on attempting to sync with the crashed
channel etc.

The next most common example I can think of is nv4x hardware, getting a
LIMIT_COLOR/ZETA exception from PGRAPH, and then a hang.  The solution
is simple, learn how to handle the exception, log it, and PGRAPH
survives.

I strongly believe that if we focused our efforts on dealing with what
the GPU reports to us a lot better, we'll find we really don't need such
"lockup recovery".

I am, however, considering pulling the vm flush timeout error
propagation and break-out-of-waits-on-signals that builds on it.  As we
really do need to become better at having killable processes if things
go wrong :)

Ben.

> 
> Signed-off-by: Marcin Slusarz 
> ---
>  drivers/gpu/drm/nouveau/Makefile   |2 +-
>  drivers/gpu/drm/nouveau/nouveau_bo.c   |2 +-
>  drivers/gpu/drm/nouveau/nouveau_channel.c  |5 +-
>  drivers/gpu/drm/nouveau/nouveau_drv.c  |   56 ++-
>  drivers/gpu/drm/nouveau/nouveau_drv.h  |   45 -
>  drivers/gpu/drm/nouveau/nouveau_fence.c|7 +-
>  drivers/gpu/drm/nouveau/nouveau_gem.c  |   14 +++-
>  drivers/gpu/drm/nouveau/nouveau_notifier.c |3 +
>  drivers/gpu/drm/nouveau/nouveau_object.c   |6 +
>  drivers/gpu/drm/nouveau/nouveau_reset.c|  148 
> 
>  drivers/gpu/drm/nouveau/nouveau_state.c|6 +
>  drivers/gpu/drm/nouveau/nv50_graph.c   |   11 +-
>  12 files changed, 290 insertions(+), 15 deletions(-)
>  create mode 100644 drivers/gpu/drm/nouveau/nouveau_reset.c
> 
> diff --git a/drivers/gpu/drm/nouveau/Makefile 
> b/drivers/gpu/drm/nouveau/Makefile
> index 03860f5..77d0c33 100644
> --- a/drivers/gpu/drm/nouveau/Makefile
> +++ b/drivers/gpu/drm/nouveau/Makefile
> @@ -9,7 +9,7 @@ nouveau-y := nouveau_drv.o nouveau_state.o nouveau_channel.o 
> nouveau_mem.o \
>   nouveau_bo.o nouveau_fence.o nouveau_gem.o nouveau_ttm.o \
>   nouveau_hw.o nouveau_calc.o nouveau_bios.o nouveau_i2c.o \
>   nouveau_display.o nouveau_connector.o nouveau_fbcon.o \
> - nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o \
> + nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o nouveau_reset.o \
>nouveau_pm.o nouveau_volt.o nouveau_perf.o nouveau_temp.o \
>nouveau_mm.o nouveau_vm.o nouveau_mxm.o nouveau_gpio.o \
>   nv04_timer.o \
> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c 
> b/drivers/gpu/drm/nouveau/nouveau_bo.c
> index 5b0dc50..7de6cad 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_bo.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
> @@ -936,7 +936,7 @@ nouveau_bo_move(struct ttm_buffer_object *bo, bool evict, 
> bool intr,
>   }
>  
>   /* Software copy if the card isn't up and running yet. */
> - if (!dev_priv->channel) {
> + if (!dev_priv->channel || nouveau_gpu_reset_in_progress(dev_priv->dev)) 
> {
>   ret = ttm_bo_move_memcpy(bo, evict, no_wait_reserve, 
> no_wait_gpu, new_mem);
>   goto out;
>   }
> diff --git a/drivers/gpu/drm/nouveau/nouveau_channel.c 
> b/drivers/gpu/drm/nouveau/nouveau_channel.c
> index 846afb0..c0fa5a7 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_channel.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_channel.c
> @@ -420,7 +420,7 @@ nouveau_ioctl_fifo_alloc(struct drm_device *dev, void 
> *data,
>   init->fb_ctxdma_handle,
>   init->tt_ctxdma_handle);
>   if (ret)
> - return ret;
> + goto out;
>   init->channel  = chan->id;
>  
>   if (nouveau_vram_pushbuf == 0) {
> @@ -450,6 +450,9 @@ nouveau_ioctl_fifo_alloc(struct drm_device *dev, void 
> *data,
>   if (ret ==

Re: [Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

2012-04-25 Thread Marcin Slusarz
On Wed, Apr 25, 2012 at 11:20:36PM +0200, Marcin Slusarz wrote:
> Overall idea:
> Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
> handle them at ioctl level, reset the GPU and repeat last ioctl.
> 
> GPU reset is done by doing suspend / resume cycle with few tweaks:
> - CPU-only bo eviction
> - ignoring vm flush / fence timeouts
> - shortening waits
> 
> Signed-off-by: Marcin Slusarz 
> ---

What changed from v1:
- moved ioctl locking from drm core to nouveau
- made down_reads interruptible
- fixed build bug on 32-bit systems
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

2012-04-25 Thread Marcin Slusarz
Overall idea:
Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
handle them at ioctl level, reset the GPU and repeat last ioctl.

GPU reset is done by doing suspend / resume cycle with few tweaks:
- CPU-only bo eviction
- ignoring vm flush / fence timeouts
- shortening waits

Signed-off-by: Marcin Slusarz 
---
 drivers/gpu/drm/nouveau/Makefile   |2 +-
 drivers/gpu/drm/nouveau/nouveau_bo.c   |2 +-
 drivers/gpu/drm/nouveau/nouveau_channel.c  |5 +-
 drivers/gpu/drm/nouveau/nouveau_drv.c  |   56 ++-
 drivers/gpu/drm/nouveau/nouveau_drv.h  |   45 -
 drivers/gpu/drm/nouveau/nouveau_fence.c|7 +-
 drivers/gpu/drm/nouveau/nouveau_gem.c  |   14 +++-
 drivers/gpu/drm/nouveau/nouveau_notifier.c |3 +
 drivers/gpu/drm/nouveau/nouveau_object.c   |6 +
 drivers/gpu/drm/nouveau/nouveau_reset.c|  148 
 drivers/gpu/drm/nouveau/nouveau_state.c|6 +
 drivers/gpu/drm/nouveau/nv50_graph.c   |   11 +-
 12 files changed, 290 insertions(+), 15 deletions(-)
 create mode 100644 drivers/gpu/drm/nouveau/nouveau_reset.c

diff --git a/drivers/gpu/drm/nouveau/Makefile b/drivers/gpu/drm/nouveau/Makefile
index 03860f5..77d0c33 100644
--- a/drivers/gpu/drm/nouveau/Makefile
+++ b/drivers/gpu/drm/nouveau/Makefile
@@ -9,7 +9,7 @@ nouveau-y := nouveau_drv.o nouveau_state.o nouveau_channel.o 
nouveau_mem.o \
  nouveau_bo.o nouveau_fence.o nouveau_gem.o nouveau_ttm.o \
  nouveau_hw.o nouveau_calc.o nouveau_bios.o nouveau_i2c.o \
  nouveau_display.o nouveau_connector.o nouveau_fbcon.o \
- nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o \
+ nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o nouveau_reset.o \
 nouveau_pm.o nouveau_volt.o nouveau_perf.o nouveau_temp.o \
 nouveau_mm.o nouveau_vm.o nouveau_mxm.o nouveau_gpio.o \
  nv04_timer.o \
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c 
b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 5b0dc50..7de6cad 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -936,7 +936,7 @@ nouveau_bo_move(struct ttm_buffer_object *bo, bool evict, 
bool intr,
}
 
/* Software copy if the card isn't up and running yet. */
-   if (!dev_priv->channel) {
+   if (!dev_priv->channel || nouveau_gpu_reset_in_progress(dev_priv->dev)) 
{
ret = ttm_bo_move_memcpy(bo, evict, no_wait_reserve, 
no_wait_gpu, new_mem);
goto out;
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_channel.c 
b/drivers/gpu/drm/nouveau/nouveau_channel.c
index 846afb0..c0fa5a7 100644
--- a/drivers/gpu/drm/nouveau/nouveau_channel.c
+++ b/drivers/gpu/drm/nouveau/nouveau_channel.c
@@ -420,7 +420,7 @@ nouveau_ioctl_fifo_alloc(struct drm_device *dev, void *data,
init->fb_ctxdma_handle,
init->tt_ctxdma_handle);
if (ret)
-   return ret;
+   goto out;
init->channel  = chan->id;
 
if (nouveau_vram_pushbuf == 0) {
@@ -450,6 +450,9 @@ nouveau_ioctl_fifo_alloc(struct drm_device *dev, void *data,
if (ret == 0)
atomic_inc(&chan->users); /* userspace reference */
nouveau_channel_put(&chan);
+out:
+   if (ret == -EIO)
+   ret = nouveau_reset_device(dev);
return ret;
 }
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.c 
b/drivers/gpu/drm/nouveau/nouveau_drv.c
index 090fff6..261e1f5 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.c
@@ -237,7 +237,7 @@ nouveau_pci_suspend(struct pci_dev *pdev, pm_message_t 
pm_state)
if (!dev_priv->eng[e])
continue;
 
-   ret = dev_priv->eng[e]->fini(dev, e, true);
+   ret = dev_priv->eng[e]->fini(dev, e, 
!nouveau_gpu_reset_in_progress(dev));
if (ret) {
NV_ERROR(dev, "... engine %d failed: %d\n", e, ret);
goto out_abort;
@@ -443,11 +443,63 @@ nouveau_pci_resume(struct pci_dev *pdev)
return 0;
 }
 
+void intr_rwsem_init(struct intr_rwsem *r)
+{
+   init_rwsem(&r->rwsem);
+   mutex_init(&r->mutex);
+}
+
+int intr_rwsem_down_read_interruptible(struct intr_rwsem *r)
+{
+   while (down_read_trylock(&r->rwsem) == 0) {
+   int ret = mutex_lock_interruptible(&r->mutex);
+   if (ret)
+   return ret;
+   mutex_unlock(&r->mutex);
+   }
+   return 0;
+}
+
+void intr_rwsem_up_read(struct intr_rwsem *r)
+{
+   up_read(&r->rwsem);
+}
+
+void intr_rwsem_down_write(struct intr_rwsem *r)
+{
+   mutex_lock(&r->mutex);
+   down_write(&r->rwsem);
+}
+
+void intr_rwsem_up_write(struct intr_rwsem *r)
+{
+   up_write(&r->rwsem);
+   mutex_unlock(&r->mutex);
+}
+
+static long nouveau_ioctl(struct file *