[Nouveau] [PATCH 2/3] drm/nouveau: add lockless dynamic semaphore allocator (v2)

2010-02-09 Thread Luca Barbieri
Changes in v2:
- Addressed review comments
- Fixed lockless algorithm (must not dec if negative in addition to if 0)
- Made spinlock irqsave (fences are completed in IRQs)

This patch adds code to allocate semaphores in a dynamic way using
a lockless algorithm.

1. Semaphore BOs

Semaphore BOs are BOs containing semaphores. Each is 4KB large and
contains 1024 4-byte semaphores. They are pinned and mapped.

Semaphore BOs are allocated on-demand and freed at device takedown.
Those that are not fully allocated are kept on a free list.

Each is assigned a handle. DMA objects and references are created
on demand for each channel that needs to use a semaphore BO.
Those objects and references are automatically destroyed at channel
destruction time.

Typically only a single semaphore BO will be used.

2. Semaphore allocation

Each semaphore BO contains a bitmask of free semaphores within the BO.
Allocation is done in a lockless fashion using a count of free
semaphores and the bitmask.

Semaphores are released once the fence on the waiting side passed.
This is done by adding fields to nouveau_fence.

Semaphore values are zeroed when the semaphore BO is allocated, and
are afterwards only modified by the GPU.

This is performed by storing a bitmask that allows alternating
between using the values 0 and 1 for a given semaphore.

Signed-off-by: Luca Barbieri l...@luca-barbieri.com
---
 drivers/gpu/drm/nouveau/nouveau_drv.h   |9 +
 drivers/gpu/drm/nouveau/nouveau_fence.c |  265 +++
 drivers/gpu/drm/nouveau/nouveau_state.c |4 +
 3 files changed, 278 insertions(+), 0 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h 
b/drivers/gpu/drm/nouveau/nouveau_drv.h
index bb9024c..93e5427 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -621,6 +621,13 @@ struct drm_nouveau_private {
struct {
struct dentry *channel_root;
} debugfs;
+
+   struct {
+   spinlock_t free_list_lock;
+   struct nouveau_sem_bo *free_list;
+   uint32_t handles;
+   uint32_t max_handles;
+   } sem;
 };
 
 static inline struct drm_nouveau_private *
@@ -1142,6 +1149,8 @@ extern int nouveau_fence_flush(void *obj, void *arg);
 extern void nouveau_fence_unref(void **obj);
 extern void *nouveau_fence_ref(void *obj);
 extern void nouveau_fence_handler(struct drm_device *dev, int channel);
+extern void nouveau_fence_device_init(struct drm_device *dev);
+extern void nouveau_fence_device_takedown(struct drm_device *dev);
 
 /* nouveau_gem.c */
 extern int nouveau_gem_new(struct drm_device *, struct nouveau_channel *,
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c 
b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 9b1c2c3..7157148 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -32,6 +32,13 @@
 
 #define USE_REFCNT (dev_priv->card_type >= NV_10)
 
+#define NOUVEAU_SEM_BO_SIZE PAGE_SIZE
+
+/* reading fences can be very expensive
+ * use a threshold that would only use up half a single sem_bo
+ */
+#define NOUVEAU_SEM_MIN_THRESHOLD (NOUVEAU_SEM_BO_SIZE / 
(NOUVEAU_MAX_CHANNEL_NR * 2))
+
 struct nouveau_fence {
struct nouveau_channel *channel;
struct kref refcount;
@@ -47,6 +54,240 @@ nouveau_fence(void *sync_obj)
return (struct nouveau_fence *)sync_obj;
 }
 
+struct nouveau_sem_bo {
+   struct nouveau_sem_bo *next;
+   struct nouveau_bo *bo;
+   uint32_t handle;
+
+   /* >= 0: num_free + 1 slots are free, sem_bo is or is about to be on 
free_list
+   -1: all allocated, sem_bo is NOT on free_list
+   */
+   atomic_t num_free;
+
+   DECLARE_BITMAP(free_slots, NOUVEAU_SEM_BO_SIZE / sizeof(uint32_t));
+   DECLARE_BITMAP(values, NOUVEAU_SEM_BO_SIZE / sizeof(uint32_t));
+   DECLARE_BITMAP(channels, NOUVEAU_MAX_CHANNEL_NR);
+};
+
+struct nouveau_sem {
+   struct nouveau_sem_bo *sem_bo;
+   unsigned num;
+   uint32_t value;
+};
+
+static struct nouveau_sem_bo*
+nouveau_sem_bo_alloc(struct drm_device *dev)
+{
+   struct drm_nouveau_private *dev_priv = dev->dev_private;
+   struct nouveau_sem_bo *sem_bo;
+   struct nouveau_bo *bo;
+   int flags = TTM_PL_FLAG_VRAM;
+   int ret;
+   bool is_iomem;
+   void *mem;
+   unsigned handle;
+
+   do {
+   handle = dev_priv->sem.handles;
+   if (handle >= dev_priv->sem.max_handles)
+   return NULL;
+   } while (cmpxchg(&dev_priv->sem.handles, handle, handle + 1) != handle);
+
+   sem_bo = kmalloc(sizeof(*sem_bo), GFP_KERNEL);
+   if (!sem_bo)
+   return NULL;
+
+   sem_bo->handle = NvSem + handle;
+
+   ret = nouveau_bo_new(dev, NULL, NOUVEAU_SEM_BO_SIZE, 0, flags,
+   0, 0x, true, true, &bo);
+   if (ret)
+   goto out_free;
+
+   sem_bo->bo = bo;
+
+   ret = 

[Nouveau] [PATCH 2/3] drm/nouveau: add lockless dynamic semaphore allocator

2010-02-01 Thread Luca Barbieri
This patch adds code to allocate semaphores in a dynamic way using
an algorithm with a lockless fast path.

1. Semaphore BOs

Semaphore BOs are BOs containing semaphores. Each is 4KB large and
contains 1024 4-byte semaphores. They are pinned.

Semaphore BOs are allocated on-demand and freed at device takedown.
Those that are not fully allocated are kept on a free list.

Each is assigned a handle. DMA objects and references are created
on demand for each channel that needs to use a semaphore BO.
Those objects and references are automatically destroyed at channel
destruction time.

Typically only a single semaphore BO will be needed.

2. Semaphore allocation

Each semaphore BO contains a bitmask of free semaphores within the BO.
Allocation is done in a lockless fashion using a count of free
semaphores and the bitmask.

Semaphores are released once the fence on the waiting side passed.
This is done by adding fields to nouveau_fence.

Semaphore values are zeroed when the semaphore BO is allocated, and
are afterwards only modified by the GPU.

This is performed by storing a bitmask that allows to alternate
between using the values 0 and 1 for a given semaphore.

Signed-off-by: Luca Barbieri l...@luca-barbieri.com
---
 drivers/gpu/drm/nouveau/nouveau_dma.h   |1 +
 drivers/gpu/drm/nouveau/nouveau_drv.h   |7 +
 drivers/gpu/drm/nouveau/nouveau_fence.c |  243 +++
 drivers/gpu/drm/nouveau/nouveau_state.c |4 +
 4 files changed, 255 insertions(+), 0 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.h 
b/drivers/gpu/drm/nouveau/nouveau_dma.h
index dabfd65..0658979 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dma.h
+++ b/drivers/gpu/drm/nouveau/nouveau_dma.h
@@ -69,6 +69,7 @@ enum {
NvGdiRect   = 0x800c,
NvImageBlit = 0x800d,
NvSw= 0x800e,
+   NvSem   = 0x8100, /* range of 16M handles */
 
/* G80+ display objects */
NvEvoVRAM   = 0x0100,
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h 
b/drivers/gpu/drm/nouveau/nouveau_drv.h
index 2b78ee6..0a7abc7 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -620,6 +620,11 @@ struct drm_nouveau_private {
struct {
struct dentry *channel_root;
} debugfs;
+
+   spinlock_t sem_bo_free_list_lock;
+   struct nouveau_sem_bo *sem_bo_free_list;
+   atomic_t sem_handles;
+   uint32_t sem_max_handles;
 };
 
 static inline struct drm_nouveau_private *
@@ -1141,6 +1146,8 @@ extern int nouveau_fence_flush(void *obj, void *arg);
 extern void nouveau_fence_unref(void **obj);
 extern void *nouveau_fence_ref(void *obj);
 extern void nouveau_fence_handler(struct drm_device *dev, int channel);
+extern void nouveau_fence_device_init(struct drm_device *dev);
+extern void nouveau_fence_device_takedown(struct drm_device *dev);
 
 /* nouveau_gem.c */
 extern int nouveau_gem_new(struct drm_device *, struct nouveau_channel *,
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c 
b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 9b1c2c3..01152f3 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -32,6 +32,13 @@
 
 #define USE_REFCNT (dev_priv->card_type >= NV_10)
 
+#define NOUVEAU_SEM_BO_SIZE PAGE_SIZE
+
+/* reading fences can be very expensive
+ * use a threshold that would only use up half a single sem_bo
+ */
+#define NOUVEAU_SEM_MIN_THRESHOLD (NOUVEAU_SEM_BO_SIZE / 
(NOUVEAU_MAX_CHANNEL_NR * 2))
+
 struct nouveau_fence {
struct nouveau_channel *channel;
struct kref refcount;
@@ -47,6 +54,218 @@ nouveau_fence(void *sync_obj)
return (struct nouveau_fence *)sync_obj;
 }
 
+struct nouveau_sem_bo {
+   struct nouveau_sem_bo *next;
+   struct nouveau_bo *bo;
+   uint32_t handle;
+
+   /* = 0: num_free + 1 slots are free, sem_bo is or is about to be on 
free_list
+   -1: all allocated, sem_bo is NOT on free_list
+   */
+   atomic_t num_free;
+
+   DECLARE_BITMAP(free_slots, NOUVEAU_SEM_BO_SIZE / sizeof(uint32_t));
+   DECLARE_BITMAP(values, NOUVEAU_SEM_BO_SIZE / sizeof(uint32_t));
+   DECLARE_BITMAP(channels, NOUVEAU_MAX_CHANNEL_NR);
+};
+
+struct nouveau_sem {
+   struct nouveau_sem_bo *sem_bo;
+   unsigned num;
+   uint32_t value;
+};
+
+struct nouveau_sem_bo*
+nouveau_sem_bo_alloc(struct drm_device *dev)
+{
+   struct nouveau_sem_bo *sem_bo = kmalloc(sizeof(*sem_bo), GFP_KERNEL);
+   struct nouveau_bo *bo;
+   int flags = TTM_PL_FLAG_VRAM;
+   int ret;
+   bool is_iomem;
+   void *mem;
+
+   sem_bo = kmalloc(sizeof(*sem_bo), GFP_KERNEL);
+
+   if (!sem_bo)
+   return 0;
+
+   ret = nouveau_bo_new(dev, NULL, NOUVEAU_SEM_BO_SIZE, 0, flags,
+   0, 0x, true, true, bo);
+   if (ret)
+   goto out_free;
+
+   sem_bo->bo = bo;
+
+   ret = 

Re: [Nouveau] [PATCH 2/3] drm/nouveau: add lockless dynamic semaphore allocator

2010-02-01 Thread Francisco Jerez
Luca Barbieri l...@luca-barbieri.com writes:

 This patch adds code to allocate semaphores in a dynamic way using
 an algorithm with a lockless fast path.

 1. Semaphore BOs

 Semaphore BOs are BOs containing semaphores. Each is 4KB large and
 contains 1024 4-byte semaphores. They are pinned.

 Semaphore BOs are allocated on-demand and freed at device takedown.
 Those that are not fully allocated are kept on a free list.

 Each is assigned an handle. DMA objects and references are created
 on demand for each channel that needs to use a semaphore BO.
 Those objects and references are automatically destroyed at channel
 destruction time.

 Typically only a single semaphore BO will be needed.

 2. Semaphore allocation

 Each semaphore BO contains a bitmask of free semaphores within the BO.
 Allocation is done in a lockless fashion using a count of free
 semaphores and the bitmask.

 Semaphores are released once the fence on the waiting side passed.
 This is done by adding fields to nouveau_fence.

 Semaphore values are zeroed when the semaphore BO is allocated, and
 are afterwards only modified by the GPU.

 This is performed by storing a bitmask that allows to alternate
 between using the values 0 and 1 for a given semaphore.

 Signed-off-by: Luca Barbieri l...@luca-barbieri.com
 ---
  drivers/gpu/drm/nouveau/nouveau_dma.h   |1 +
  drivers/gpu/drm/nouveau/nouveau_drv.h   |7 +
  drivers/gpu/drm/nouveau/nouveau_fence.c |  243 
 +++
  drivers/gpu/drm/nouveau/nouveau_state.c |4 +
  4 files changed, 255 insertions(+), 0 deletions(-)

 diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.h 
 b/drivers/gpu/drm/nouveau/nouveau_dma.h
 index dabfd65..0658979 100644
 --- a/drivers/gpu/drm/nouveau/nouveau_dma.h
 +++ b/drivers/gpu/drm/nouveau/nouveau_dma.h
 @@ -69,6 +69,7 @@ enum {
   NvGdiRect   = 0x800c,
   NvImageBlit = 0x800d,
   NvSw= 0x800e,
 + NvSem   = 0x8100, /* range of 16M handles */
  
   /* G80+ display objects */
   NvEvoVRAM   = 0x0100,
 diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h 
 b/drivers/gpu/drm/nouveau/nouveau_drv.h
 index 2b78ee6..0a7abc7 100644
 --- a/drivers/gpu/drm/nouveau/nouveau_drv.h
 +++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
 @@ -620,6 +620,11 @@ struct drm_nouveau_private {
   struct {
   struct dentry *channel_root;
   } debugfs;
 +
 + spinlock_t sem_bo_free_list_lock;
 + struct nouveau_sem_bo *sem_bo_free_list;
 + atomic_t sem_handles;
 + uint32_t sem_max_handles;
  };
  
  static inline struct drm_nouveau_private *
 @@ -1141,6 +1146,8 @@ extern int nouveau_fence_flush(void *obj, void *arg);
  extern void nouveau_fence_unref(void **obj);
  extern void *nouveau_fence_ref(void *obj);
  extern void nouveau_fence_handler(struct drm_device *dev, int channel);
 +extern void nouveau_fence_device_init(struct drm_device *dev);
 +extern void nouveau_fence_device_takedown(struct drm_device *dev);
  
  /* nouveau_gem.c */
  extern int nouveau_gem_new(struct drm_device *, struct nouveau_channel *,
 diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c 
 b/drivers/gpu/drm/nouveau/nouveau_fence.c
 index 9b1c2c3..01152f3 100644
 --- a/drivers/gpu/drm/nouveau/nouveau_fence.c
 +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
 @@ -32,6 +32,13 @@
  
  #define USE_REFCNT (dev_priv-card_type = NV_10)
  
 +#define NOUVEAU_SEM_BO_SIZE PAGE_SIZE
 +
 +/* reading fences can be very expensive
 + * use a threshold that would only use up half a single sem_bo
 + */
 +#define NOUVEAU_SEM_MIN_THRESHOLD (NOUVEAU_SEM_BO_SIZE / 
 (NOUVEAU_MAX_CHANNEL_NR * 2))
 +
  struct nouveau_fence {
   struct nouveau_channel *channel;
   struct kref refcount;
 @@ -47,6 +54,218 @@ nouveau_fence(void *sync_obj)
   return (struct nouveau_fence *)sync_obj;
  }
  
 +struct nouveau_sem_bo {
 + struct nouveau_sem_bo *next;
 + struct nouveau_bo *bo;
 + uint32_t handle;
 +
 + /* = 0: num_free + 1 slots are free, sem_bo is or is about to be on 
 free_list
 + -1: all allocated, sem_bo is NOT on free_list
 + */
 + atomic_t num_free;
 +
 + DECLARE_BITMAP(free_slots, NOUVEAU_SEM_BO_SIZE / sizeof(uint32_t));
 + DECLARE_BITMAP(values, NOUVEAU_SEM_BO_SIZE / sizeof(uint32_t));
 + DECLARE_BITMAP(channels, NOUVEAU_MAX_CHANNEL_NR);
 +};
 +
 +struct nouveau_sem {
 + struct nouveau_sem_bo *sem_bo;
 + unsigned num;
 + uint32_t value;
 +};
 +
 +struct nouveau_sem_bo*
 +nouveau_sem_bo_alloc(struct drm_device *dev)
 +{
 + struct nouveau_sem_bo *sem_bo = kmalloc(sizeof(*sem_bo), GFP_KERNEL);
 + struct nouveau_bo *bo;
 + int flags = TTM_PL_FLAG_VRAM;
 + int ret;
 + bool is_iomem;
 + void *mem;
 +
 + sem_bo = kmalloc(sizeof(*sem_bo), GFP_KERNEL);
 +
 + if (!sem_bo)
 + return 0;
 +
 + ret = nouveau_bo_new(dev, NULL, NOUVEAU_SEM_BO_SIZE, 0, flags,
 + 0, 0x, true, true, bo);
 + 

Re: [Nouveau] [PATCH 2/3] drm/nouveau: add lockless dynamic semaphore allocator

2010-02-01 Thread Luca Barbieri
 How often do we expect cross-channel sync to kick in? Maybe 2-3 times
 per frame? I suspect contentions will be rare enough to make spinlocks
 as fast as atomics for all real-life cases, and they don't have such a
 high maintainability cost. What do you guys think?

For the case of a single (or a few) GL application the requirements
are indeed modest.

I'm not sure that spinlocks or an otherwise reduced solution would be
much simpler.
You basically would just avoid the retrying code.

Also if you have a multithreaded/multiprocess GPGPU application on
large SMP machine things may change, as you may have a lot of commands
and semaphores in flight, as well as high contention for anything
global.

Of course, currently we hold both the BKL and struct_mutex around
things, which makes it all moot, but hopefully we'll switch to
per-channel mutexes soon (I'm looking into that).
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [PATCH 2/3] drm/nouveau: add lockless dynamic semaphore allocator

2010-02-01 Thread Francisco Jerez
Luca Barbieri l...@luca-barbieri.com writes:

 How often do we expect cross-channel sync to kick in? Maybe 2-3 times
 per frame? I suspect contentions will be rare enough to make spinlocks
 as fast as atomics for all real-life cases, and they don't have such a
 high maintainability cost. What do you guys think?

 For the case of a single (or a few) GL application the requirements
 are indeed modest.

 I'm not sure that spinlocks or an otherwise reduced solution would be
 much simpler.
 You basically would just avoid the retrying code.

 Also if you have a multithreaded/multiprocess GPGPU application on
 large SMP machine things may change, as you may have a lot of commands
 and semaphores in flight, as well as high contention for anything
 global.

Sounds like premature optimization to me. I'm just stating my personal
view here, but I have a feeling a patch with 60% of lines could do very
well the same for most realistic cases.

Maarten, Ben, what do you think about this?

 Of course, currently we hold both the BKL and struct_mutex around
 things, which makes it all moot, but hopefully we'll switch to
 per-channel mutexes soon (I'm looking into that).

BTW, the kernel has some linked list helpers you might want to use for
sem_bo_free_list, and probably the best place for the sem stuff to live
is dev_priv-fence instead of the root of drm_nouveau_private.


pgpECYGXD4Dlu.pgp
Description: PGP signature
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [PATCH 2/3] drm/nouveau: add lockless dynamic semaphore allocator

2010-02-01 Thread Luca Barbieri
 Sounds like premature optimization to me. I'm just stating my personal
 view here, but I have a feeling a patch with 60% of lines could do very
 well the same for most realistic cases.

Perhaps, but really, the only thing you would probably save by using
spinlocks in the fast path is retrying in nouveau_sem_alloc, which
should be at most 10 lines saved.

You could save much more by supporting only a single static semaphore
BO, and still retain almost all flexibility by making it large.
However, it's somewhat inelegant, and why not just keep the
functionality so we never need to revisit this again?

 BTW, the kernel has some linked list helpers you might want to use for
 sem_bo_free_list
It is a singly linked list, and slist.h never got merged.
I could possibly make it doubly linked, even though it's a bit useless.

 and probably the best place for the sem stuff to live
 is dev_priv-fence instead of the root of drm_nouveau_private.
There is no fence currently in drm_nouveau_private.
Adding a sem or fence substructure could make sense though.
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [PATCH 2/3] drm/nouveau: add lockless dynamic semaphore allocator

2010-02-01 Thread Marcin Slusarz
On Mon, Feb 01, 2010 at 10:50:09AM +0100, Luca Barbieri wrote:
 This patch adds code to allocate semaphores in a dynamic way using
 an algorithm with a lockless fast path.

some minor comments below

 
 1. Semaphore BOs
 
 Semaphore BOs are BOs containing semaphores. Each is 4KB large and
 contains 1024 4-byte semaphores. They are pinned.
 
 Semaphore BOs are allocated on-demand and freed at device takedown.
 Those that are not fully allocated are kept on a free list.
 
 Each is assigned an handle. DMA objects and references are created
 on demand for each channel that needs to use a semaphore BO.
 Those objects and references are automatically destroyed at channel
 destruction time.
 
 Typically only a single semaphore BO will be needed.
 
 2. Semaphore allocation
 
 Each semaphore BO contains a bitmask of free semaphores within the BO.
 Allocation is done in a lockless fashion using a count of free
 semaphores and the bitmask.
 
 Semaphores are released once the fence on the waiting side passed.
 This is done by adding fields to nouveau_fence.
 
 Semaphore values are zeroed when the semaphore BO is allocated, and
 are afterwards only modified by the GPU.
 
 This is performed by storing a bitmask that allows to alternate
 between using the values 0 and 1 for a given semaphore.
 
 Signed-off-by: Luca Barbieri l...@luca-barbieri.com
 ---
  drivers/gpu/drm/nouveau/nouveau_dma.h   |1 +
  drivers/gpu/drm/nouveau/nouveau_drv.h   |7 +
  drivers/gpu/drm/nouveau/nouveau_fence.c |  243 
 +++
  drivers/gpu/drm/nouveau/nouveau_state.c |4 +
  4 files changed, 255 insertions(+), 0 deletions(-)
 
 diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.h 
 b/drivers/gpu/drm/nouveau/nouveau_dma.h
 index dabfd65..0658979 100644
 --- a/drivers/gpu/drm/nouveau/nouveau_dma.h
 +++ b/drivers/gpu/drm/nouveau/nouveau_dma.h
 @@ -69,6 +69,7 @@ enum {
   NvGdiRect   = 0x800c,
   NvImageBlit = 0x800d,
   NvSw= 0x800e,
 + NvSem   = 0x8100, /* range of 16M handles */
  
   /* G80+ display objects */
   NvEvoVRAM   = 0x0100,
 diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h 
 b/drivers/gpu/drm/nouveau/nouveau_drv.h
 index 2b78ee6..0a7abc7 100644
 --- a/drivers/gpu/drm/nouveau/nouveau_drv.h
 +++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
 @@ -620,6 +620,11 @@ struct drm_nouveau_private {
   struct {
   struct dentry *channel_root;
   } debugfs;
 +
 + spinlock_t sem_bo_free_list_lock;
 + struct nouveau_sem_bo *sem_bo_free_list;
 + atomic_t sem_handles;
 + uint32_t sem_max_handles;
  };
  
  static inline struct drm_nouveau_private *
 @@ -1141,6 +1146,8 @@ extern int nouveau_fence_flush(void *obj, void *arg);
  extern void nouveau_fence_unref(void **obj);
  extern void *nouveau_fence_ref(void *obj);
  extern void nouveau_fence_handler(struct drm_device *dev, int channel);
 +extern void nouveau_fence_device_init(struct drm_device *dev);
 +extern void nouveau_fence_device_takedown(struct drm_device *dev);
  
  /* nouveau_gem.c */
  extern int nouveau_gem_new(struct drm_device *, struct nouveau_channel *,
 diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c 
 b/drivers/gpu/drm/nouveau/nouveau_fence.c
 index 9b1c2c3..01152f3 100644
 --- a/drivers/gpu/drm/nouveau/nouveau_fence.c
 +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
 @@ -32,6 +32,13 @@
  
  #define USE_REFCNT (dev_priv-card_type = NV_10)
  
 +#define NOUVEAU_SEM_BO_SIZE PAGE_SIZE
 +
 +/* reading fences can be very expensive
 + * use a threshold that would only use up half a single sem_bo
 + */
 +#define NOUVEAU_SEM_MIN_THRESHOLD (NOUVEAU_SEM_BO_SIZE / 
 (NOUVEAU_MAX_CHANNEL_NR * 2))
 +
  struct nouveau_fence {
   struct nouveau_channel *channel;
   struct kref refcount;
 @@ -47,6 +54,218 @@ nouveau_fence(void *sync_obj)
   return (struct nouveau_fence *)sync_obj;
  }
  
 +struct nouveau_sem_bo {
 + struct nouveau_sem_bo *next;
 + struct nouveau_bo *bo;
 + uint32_t handle;
 +
 + /* = 0: num_free + 1 slots are free, sem_bo is or is about to be on 
 free_list
 + -1: all allocated, sem_bo is NOT on free_list
 + */
 + atomic_t num_free;
 +
 + DECLARE_BITMAP(free_slots, NOUVEAU_SEM_BO_SIZE / sizeof(uint32_t));
 + DECLARE_BITMAP(values, NOUVEAU_SEM_BO_SIZE / sizeof(uint32_t));
 + DECLARE_BITMAP(channels, NOUVEAU_MAX_CHANNEL_NR);
 +};
 +
 +struct nouveau_sem {
 + struct nouveau_sem_bo *sem_bo;
 + unsigned num;
 + uint32_t value;
 +};
 +
 +struct nouveau_sem_bo*
 +nouveau_sem_bo_alloc(struct drm_device *dev)
 +{
 + struct nouveau_sem_bo *sem_bo = kmalloc(sizeof(*sem_bo), GFP_KERNEL);


 + struct nouveau_bo *bo;
 + int flags = TTM_PL_FLAG_VRAM;
 + int ret;
 + bool is_iomem;
 + void *mem;
 +
 + sem_bo = kmalloc(sizeof(*sem_bo), GFP_KERNEL);

double allocation

 +
 + if (!sem_bo)
 + return 0;

sparse would probably complain about 0