[PATCH 10/10] drm/radeon: work around bugs in caymans compute rings

2012-06-01 Thread Christian König
From: Christian Koenig 

The shader preemption on cayman doesn't work
correctly with multiple rings. So to be able to
still make use of the compute rings we use a
semaphore to make sure that only one IB can execute
at the same time.

This isn't as effective as shader preemption, but
also isn't as bad as putting everything on the GFX ring.

Signed-off-by: Christian Koenig 
---
 drivers/gpu/drm/radeon/ni.c|  142 ++--
 drivers/gpu/drm/radeon/radeon.h|2 +
 drivers/gpu/drm/radeon/radeon_cs.c |2 +-
 3 files changed, 139 insertions(+), 7 deletions(-)
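
Not part of the patch itself: a minimal host-side sketch of the serialization scheme described above, assuming a POSIX semaphore can stand in for the GPU semaphore and one thread per ring (GFX/CP1/CP2). The names ring_lock, execute_ib and ring_thread are illustrative only; the radeon helpers named in the comments are mentioned purely for orientation, the program does not touch the driver.

/*
 * Illustration only, not driver code: one semaphore, signalled once at
 * setup, is waited on before and signalled after every IB, so at most
 * one IB from any ring runs at a time.
 */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

static sem_t ring_lock;			/* stands in for rdev->cayman_ring_lock */

static void execute_ib(const char *ring, int n)
{
	sem_wait(&ring_lock);		/* like radeon_semaphore_emit_wait() before the IB */
	printf("%s: IB %d runs exclusively\n", ring, n);
	sem_post(&ring_lock);		/* like radeon_semaphore_emit_signal() after the IB */
}

static void *ring_thread(void *name)
{
	int i;

	for (i = 0; i < 3; i++)
		execute_ib(name, i);
	return NULL;
}

int main(void)
{
	pthread_t cp1, cp2;

	sem_init(&ring_lock, 0, 1);	/* the one-time signal in cayman_cp_ring_create_workaround() */

	pthread_create(&cp1, NULL, ring_thread, "CP1");
	pthread_create(&cp2, NULL, ring_thread, "CP2");
	ring_thread("GFX");

	pthread_join(cp1, NULL);
	pthread_join(cp2, NULL);
	sem_destroy(&ring_lock);
	return 0;
}

Because the semaphore starts with a count of 1 (the one-time signal emitted on the GFX ring at setup), only one ring can sit between the wait and the signal at any moment, which is the exclusivity guarantee the workaround relies on.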

diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c
index 9d9f5ac..6a3e8a8 100644
--- a/drivers/gpu/drm/radeon/ni.c
+++ b/drivers/gpu/drm/radeon/ni.c
@@ -1125,13 +1125,75 @@ void cayman_fence_ring_emit(struct radeon_device *rdev,
radeon_ring_write(ring, 0);
 }

+/* The shader preemption on cayman doesn't work
+ * correctly with multiple rings. So to be able to
+ * still make use of the compute rings we use a
+ * semaphore to make sure that only one IB can execute
+ * at the same time
+ */
+static void cayman_cp_ring_create_workaround(struct radeon_device *rdev)
+{
+   struct radeon_ring *ring = &rdev->ring[RADEON_RING_TYPE_GFX_INDEX];
+   int r;
+
+   r = radeon_semaphore_create(rdev, &rdev->cayman_ring_lock);
+   if (r) {
+   dev_err(rdev->dev, "Can't allocate "
+   "cayman_ring_lock (%d)!\n", r);
+   return;
+   }
+
+   r = radeon_ring_alloc(rdev, ring, 8);
+   if (r) {
+   dev_err(rdev->dev, "Can't initialize "
+   "cayman_ring_lock (%d)!\n", r);
+   radeon_semaphore_free(rdev, &rdev->cayman_ring_lock, NULL);
+   return;
+   }
+
+   radeon_semaphore_emit_signal(rdev, RADEON_RING_TYPE_GFX_INDEX,
+rdev->cayman_ring_lock);
+
+   radeon_ring_commit(rdev, &rdev->ring[RADEON_RING_TYPE_GFX_INDEX]);
+}
+
+static void cayman_cp_ring_cleanup_workaround(struct radeon_device *rdev)
+{
+   struct radeon_fence *fence;
+   int r;
+
+   r = radeon_fence_emit(rdev, &fence, RADEON_RING_TYPE_GFX_INDEX);
+   if (r) {
+   dev_err(rdev->dev, "Can't cleanup "
+   "cayman_ring_lock (%d)!\n", r);
+   return;
+   }
+
+   radeon_semaphore_free(rdev, &rdev->cayman_ring_lock, fence);
+   radeon_fence_unref(&fence);
+}
+
 void cayman_ring_ib_execute(struct radeon_device *rdev, struct radeon_ib *ib)
 {
	struct radeon_ring *ring = &rdev->ring[ib->ring];

+   if (ib->ring != RADEON_RING_TYPE_GFX_INDEX) {
+   if (rdev->cayman_ring_lock == NULL) {
+   cayman_cp_ring_create_workaround(rdev);
+   }
+   } else {
+   if (rdev->cayman_ring_lock != NULL &&
+   !radeon_fence_count_emitted(rdev, CAYMAN_RING_TYPE_CP1_INDEX) &&
+   !radeon_fence_count_emitted(rdev, CAYMAN_RING_TYPE_CP2_INDEX)) {
+   cayman_cp_ring_cleanup_workaround(rdev);
+   }
+   }
+
/* set to DX10/11 mode */
radeon_ring_write(ring, PACKET3(PACKET3_MODE_CONTROL, 0));
radeon_ring_write(ring, 1);
+   if (rdev->cayman_ring_lock)
+   radeon_semaphore_emit_wait(rdev, ib->ring, rdev->cayman_ring_lock);
radeon_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
radeon_ring_write(ring,
 #ifdef __BIG_ENDIAN
@@ -1140,6 +1202,8 @@ void cayman_ring_ib_execute(struct radeon_device *rdev, struct radeon_ib *ib)
  (ib->gpu_addr & 0xFFFFFFFC));
radeon_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFF);
radeon_ring_write(ring, ib->length_dw | (ib->vm_id << 24));
+   if (rdev->cayman_ring_lock)
+   radeon_semaphore_emit_signal(rdev, ib->ring, rdev->cayman_ring_lock);

/* flush read cache over gart for this vmid */
radeon_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
@@ -1190,6 +1254,25 @@ static int cayman_cp_load_microcode(struct radeon_device *rdev)
return 0;
 }

+static int cayman_cp_start_compute(struct radeon_device *rdev, int ridx)
+{
+   struct radeon_ring *ring = &rdev->ring[ridx];
+   int r;
+
+   r = radeon_ring_lock(rdev, ring, 2);
+   if (r) {
+   DRM_ERROR("radeon: cp failed to lock ring (%d).\n", r);
+   return r;
+   }
+
+   /* clear the compute context state */
+   radeon_ring_write(ring, PACKET3(PACKET3_CLEAR_STATE, 0) | 2);
+   radeon_ring_write(ring, 0);
+
+   radeon_ring_unlock_commit(rdev, ring);
+   return 0;
+}
+
 static int cayman_cp_start(struct radeon_device *rdev)
 {
	struct radeon_ring *ring = &rdev->ring[RADEON_RING_TYPE_GFX_INDEX];
@@ -1251,7 +1334,17 @@ static int cayman_cp_start(struct radeon_device *rdev)

radeon_ring_unlock_commit(rdev, ring);

-   /* XXX init 
