[Mesa-dev] [PATCH] st/nine: Call GetPipe for implicit pipe usages

2016-12-18 Thread Axel Davy
With csmt, every usage of the pipe in the main thread
has to be protected by calling GetPipe.

Signed-off-by: Axel Davy 
---
 src/gallium/state_trackers/nine/device9.c| 6 ++
 src/gallium/state_trackers/nine/swapchain9.c | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/src/gallium/state_trackers/nine/device9.c 
b/src/gallium/state_trackers/nine/device9.c
index 8f5f117951..dce908e7f8 100644
--- a/src/gallium/state_trackers/nine/device9.c
+++ b/src/gallium/state_trackers/nine/device9.c
@@ -2821,6 +2821,8 @@ NineDevice9_DrawPrimitiveUP( struct NineDevice9 *This,
 vtxbuf.user_buffer = pVertexStreamZeroData;
 
 if (!This->driver_caps.user_vbufs) {
+/* Implicit use of context pipe */
+(void)NineDevice9_GetPipe(This);
 u_upload_data(This->vertex_uploader,
   0,
   (prim_count_to_vertex_count(PrimitiveType, 
PrimitiveCount)) * VertexStreamZeroStride, /* XXX */
@@ -2884,6 +2886,8 @@ NineDevice9_DrawIndexedPrimitiveUP( struct NineDevice9 
*This,
 
 if (!This->driver_caps.user_vbufs) {
 const unsigned base = MinVertexIndex * VertexStreamZeroStride;
+/* Implicit use of context pipe */
+(void)NineDevice9_GetPipe(This);
 u_upload_data(This->vertex_uploader,
   base,
   NumVertices * VertexStreamZeroStride, /* XXX */
@@ -2897,6 +2901,8 @@ NineDevice9_DrawIndexedPrimitiveUP( struct NineDevice9 
*This,
 vbuf.user_buffer = NULL;
 }
 if (!This->driver_caps.user_ibufs) {
+/* Implicit use of context pipe */
+(void)NineDevice9_GetPipe(This);
 u_upload_data(This->index_uploader,
   0,
   (prim_count_to_vertex_count(PrimitiveType, 
PrimitiveCount)) * ibuf.index_size,
diff --git a/src/gallium/state_trackers/nine/swapchain9.c 
b/src/gallium/state_trackers/nine/swapchain9.c
index 502736c221..51aab066a5 100644
--- a/src/gallium/state_trackers/nine/swapchain9.c
+++ b/src/gallium/state_trackers/nine/swapchain9.c
@@ -603,6 +603,8 @@ handle_draw_cursor_and_hud( struct NineSwapChain9 *This, 
struct pipe_resource *r
 }
 
 if (device->hud && resource) {
+/* Implicit use of context pipe */
+(void)NineDevice9_GetPipe(This->base.device);
 hud_draw(device->hud, resource); /* XXX: no offset */
 /* HUD doesn't clobber stipple */
 nine_state_restore_non_cso(device);
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 57/84 v2] st/nine: Implement Fast path for dynamic buffers and csmt

2016-12-18 Thread Axel Davy
Use the secondary pipe for DISCARD/NOOVERWRITE, which
avoids stalling to get the pipe from the worker thread.

v2: Flush at unmap. This is required, for example, if
the driver does hidden draw calls or copies. In the
unsynchronized case it is probably not required, but
it is safer.

Signed-off-by: Axel Davy 
---
 src/gallium/state_trackers/nine/buffer9.c  | 65 ++
 src/gallium/state_trackers/nine/buffer9.h  |  7 ++-
 src/gallium/state_trackers/nine/indexbuffer9.c |  2 +
 3 files changed, 64 insertions(+), 10 deletions(-)

diff --git a/src/gallium/state_trackers/nine/buffer9.c 
b/src/gallium/state_trackers/nine/buffer9.c
index e497222434..bd115ff70b 100644
--- a/src/gallium/state_trackers/nine/buffer9.c
+++ b/src/gallium/state_trackers/nine/buffer9.c
@@ -32,6 +32,7 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_format.h"
 #include "util/u_box.h"
+#include "util/u_inlines.h"
 
 #define DBG_CHANNEL (DBG_INDEXBUFFER|DBG_VERTEXBUFFER)
 
@@ -50,7 +51,7 @@ NineBuffer9_ctor( struct NineBuffer9 *This,
 
 user_assert(Pool != D3DPOOL_SCRATCH, D3DERR_INVALIDCALL);
 
-This->maps = MALLOC(sizeof(struct pipe_transfer *));
+This->maps = MALLOC(sizeof(struct NineTransfer));
 if (!This->maps)
 return E_OUTOFMEMORY;
 This->nmaps = 0;
@@ -169,6 +170,25 @@ NineBuffer9_GetResource( struct NineBuffer9 *This )
 return NineResource9_GetResource(&This->base);
 }
 
+static void
+NineBuffer9_RebindIfRequired( struct NineBuffer9 *This,
+  struct NineDevice9 *device )
+{
+int i;
+
+if (!This->bind_count)
+return;
+for (i = 0; i < device->caps.MaxStreams; i++) {
+if (device->state.stream[i] == (struct NineVertexBuffer9 *)This)
+nine_context_set_stream_source(device, i,
+   (struct NineVertexBuffer9 *)This,
+   
device->state.vtxbuf[i].buffer_offset,
+   device->state.vtxbuf[i].stride);
+}
+if (device->state.idxbuf == (struct NineIndexBuffer9 *)This)
+nine_context_set_indices(device, (struct NineIndexBuffer9 *)This);
+}
+
 HRESULT NINE_WINAPI
 NineBuffer9_Lock( struct NineBuffer9 *This,
 UINT OffsetToLock,
@@ -176,6 +196,7 @@ NineBuffer9_Lock( struct NineBuffer9 *This,
 void **ppbData,
 DWORD Flags )
 {
+struct NineDevice9 *device = This->base.base.device;
 struct pipe_box box;
 struct pipe_context *pipe;
 void *data;
@@ -241,9 +262,9 @@ NineBuffer9_Lock( struct NineBuffer9 *This,
 usage |= PIPE_TRANSFER_DONTBLOCK;
 
 if (This->nmaps == This->maxmaps) {
-struct pipe_transfer **newmaps =
-REALLOC(This->maps, sizeof(struct pipe_transfer *)*This->maxmaps,
-sizeof(struct pipe_transfer *)*(This->maxmaps << 1));
+struct NineTransfer *newmaps =
+REALLOC(This->maps, sizeof(struct NineTransfer)*This->maxmaps,
+sizeof(struct NineTransfer)*(This->maxmaps << 1));
 if (newmaps == NULL)
 return E_OUTOFMEMORY;
 
@@ -251,9 +272,29 @@ NineBuffer9_Lock( struct NineBuffer9 *This,
 This->maps = newmaps;
 }
 
-pipe = NineDevice9_GetPipe(This->base.base.device);
+This->maps[This->nmaps].is_pipe_secondary = FALSE;
+
+if (Flags & D3DLOCK_DISCARD && device->csmt_active) {
+struct pipe_screen *screen = NineDevice9_GetScreen(device);
+struct pipe_resource *new_res = screen->resource_create(screen, 
&This->base.info);
+if (new_res) {
+/* Use the new resource */
+pipe_resource_reference(&This->base.resource, new_res);
+pipe_resource_reference(&new_res, NULL);
+usage = PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED;
+NineBuffer9_RebindIfRequired(This, device);
+This->maps[This->nmaps].is_pipe_secondary = TRUE;
+}
+} else if (Flags & D3DLOCK_NOOVERWRITE && device->csmt_active)
+This->maps[This->nmaps].is_pipe_secondary = TRUE;
+
+if (This->maps[This->nmaps].is_pipe_secondary)
+pipe = device->pipe_secondary;
+else
+pipe = NineDevice9_GetPipe(device);
+
 data = pipe->transfer_map(pipe, This->base.resource, 0,
-  usage, &box, &This->maps[This->nmaps]);
+  usage, &box, &This->maps[This->nmaps].transfer);
 
 if (!data) {
 DBG("pipe::transfer_map failed\n"
@@ -277,15 +318,21 @@ NineBuffer9_Lock( struct NineBuffer9 *This,
 HRESULT NINE_WINAPI
 NineBuffer9_Unlock( struct NineBuffer9 *This )
 {
+struct NineDevice9 *device = This->base.base.device;
 struct pipe_context *pipe;
 DBG("This=%p\n", This);
 
 user_assert(This->nmaps > 0, D3DERR_INVALIDCALL);
+This->nmaps--;
 if (This->base.pool != D3DPOOL_MANAGED) {
-pipe = NineDevice9_GetPipe(This->b

[Mesa-dev] [PATCH 84/84 v5] st/nine: Implement new buffer upload path

2016-12-18 Thread Axel Davy
This new buffer upload path makes locking
faster than the normal path when using
DISCARD/NOOVERWRITE.

v2: Various cleanups and fixes.
v3: Fix allocation size for 'lone' buffers and
add more debug info.
v4: Rewrite the path to handle the case where DISCARD/NOOVERWRITE
is no longer used. The resource content is copied to the
new resource used.
v5: Flush for safety after unmap (not sure it is really required
here, but it is safer to flush).

Signed-off-by: Axel Davy 
---
 src/gallium/state_trackers/nine/Makefile.sources   |   2 +
 src/gallium/state_trackers/nine/buffer9.c  |  96 ++-
 src/gallium/state_trackers/nine/buffer9.h  |   9 +-
 src/gallium/state_trackers/nine/device9.c  |   5 +
 src/gallium/state_trackers/nine/device9.h  |   3 +
 src/gallium/state_trackers/nine/indexbuffer9.c |  10 +-
 src/gallium/state_trackers/nine/indexbuffer9.h |   2 -
 .../state_trackers/nine/nine_buffer_upload.c   | 299 +
 .../state_trackers/nine/nine_buffer_upload.h   |  59 
 src/gallium/state_trackers/nine/vertexbuffer9.c|   3 +-
 10 files changed, 464 insertions(+), 24 deletions(-)
 create mode 100644 src/gallium/state_trackers/nine/nine_buffer_upload.c
 create mode 100644 src/gallium/state_trackers/nine/nine_buffer_upload.h

diff --git a/src/gallium/state_trackers/nine/Makefile.sources 
b/src/gallium/state_trackers/nine/Makefile.sources
index 2bb08a2612..56698a19f1 100644
--- a/src/gallium/state_trackers/nine/Makefile.sources
+++ b/src/gallium/state_trackers/nine/Makefile.sources
@@ -23,6 +23,8 @@ C_SOURCES := \
indexbuffer9.h \
iunknown.c \
iunknown.h \
+   nine_buffer_upload.c \
+   nine_buffer_upload.h \
nine_csmt_helper.h \
nine_debug.c \
nine_debug.h \
diff --git a/src/gallium/state_trackers/nine/buffer9.c 
b/src/gallium/state_trackers/nine/buffer9.c
index c745d77c2c..bb492bcf47 100644
--- a/src/gallium/state_trackers/nine/buffer9.c
+++ b/src/gallium/state_trackers/nine/buffer9.c
@@ -23,6 +23,7 @@
 
 #include "buffer9.h"
 #include "device9.h"
+#include "nine_buffer_upload.h"
 #include "nine_helpers.h"
 #include "nine_pipe.h"
 
@@ -100,6 +101,9 @@ NineBuffer9_ctor( struct NineBuffer9 *This,
 else
 info->usage = PIPE_USAGE_DYNAMIC;
 
+/* When Writeonly is not set, we don't want to enable the
+ * optimizations */
+This->discard_nooverwrite_only = !!(Usage & D3DUSAGE_WRITEONLY);
 /* if (pDesc->Usage & D3DUSAGE_DONOTCLIP) { } */
 /* if (pDesc->Usage & D3DUSAGE_NONSECURE) { } */
 /* if (pDesc->Usage & D3DUSAGE_NPATCHES) { } */
@@ -161,12 +165,18 @@ NineBuffer9_dtor( struct NineBuffer9 *This )
 list_del(&This->managed.list2);
 }
 
+if (This->buf)
+nine_upload_release_buffer(This->base.base.device->buffer_upload, 
This->buf);
+
 NineResource9_dtor(&This->base);
 }
 
 struct pipe_resource *
-NineBuffer9_GetResource( struct NineBuffer9 *This )
+NineBuffer9_GetResource( struct NineBuffer9 *This, unsigned *offset )
 {
+if (This->buf)
+return nine_upload_buffer_resource_and_offset(This->buf, offset);
+*offset = 0;
 return NineResource9_GetResource(&This->base);
 }
 
@@ -264,6 +274,8 @@ NineBuffer9_Lock( struct NineBuffer9 *This,
 if (Flags & D3DLOCK_DONOTWAIT && !(This->base.usage & D3DUSAGE_DYNAMIC))
 usage |= PIPE_TRANSFER_DONTBLOCK;
 
+This->discard_nooverwrite_only &= !!(Flags & (D3DLOCK_DISCARD | 
D3DLOCK_NOOVERWRITE));
+
 if (This->nmaps == This->maxmaps) {
 struct NineTransfer *newmaps =
 REALLOC(This->maps, sizeof(struct NineTransfer)*This->maxmaps,
@@ -275,8 +287,67 @@ NineBuffer9_Lock( struct NineBuffer9 *This,
 This->maps = newmaps;
 }
 
-This->maps[This->nmaps].is_pipe_secondary = FALSE;
+if (This->buf && !This->discard_nooverwrite_only) {
+struct pipe_box src_box;
+unsigned offset;
+struct pipe_resource *src_res;
+DBG("Disabling nine_subbuffer for a buffer having"
+"used a nine_subbuffer buffer\n");
+/* Copy buffer content to the buffer resource, which
+ * we will now use.
+ * Note: The behaviour may be different from what is expected
+ * with double lock. However applications can't really make 
expectations
+ * about double locks, and don't really use them, so that's ok. */
+src_res = nine_upload_buffer_resource_and_offset(This->buf, &offset);
+u_box_1d(offset, This->size, &src_box);
+
+pipe = NineDevice9_GetPipe(device);
+pipe->resource_copy_region(pipe, This->base.resource, 0, 0, 0, 0,
+   src_res, 0, &src_box);
+/* Release previous resource */
+if (This->nmaps >= 1)
+This->maps[This->nmaps-1].should_destroy_buf = true;
+else
+nine_upload_release_buffer(device->buffer_upload, This->buf);
+This->buf = NULL;
+/* Rebind buffer */
+Ni

[Mesa-dev] [PATCH 84/84 v6] st/nine: Implement new buffer upload path

2016-12-18 Thread Axel Davy
This new buffer upload path makes locking
faster than the normal path when using
DISCARD/NOOVERWRITE.

v2: Various cleanups and fixes.
v3: Fix allocation size for 'lone' buffers and
add more debug info.
v4: Rewrite the path to handle the case where DISCARD/NOOVERWRITE
is no longer used. The resource content is copied to the
new resource used.
v5: Flush for safety after unmap (not sure it is really required
here, but it is safer to flush).
v6: Do not use the path if persistent coherent mapping is unavailable.
Fix buffer creation flags.

Signed-off-by: Axel Davy 
---
 src/gallium/state_trackers/nine/Makefile.sources   |   2 +
 src/gallium/state_trackers/nine/buffer9.c  |  97 ++-
 src/gallium/state_trackers/nine/buffer9.h  |   9 +-
 src/gallium/state_trackers/nine/device9.c  |   6 +
 src/gallium/state_trackers/nine/device9.h  |   3 +
 src/gallium/state_trackers/nine/indexbuffer9.c |  10 +-
 src/gallium/state_trackers/nine/indexbuffer9.h |   2 -
 .../state_trackers/nine/nine_buffer_upload.c   | 300 +
 .../state_trackers/nine/nine_buffer_upload.h   |  59 
 src/gallium/state_trackers/nine/vertexbuffer9.c|   3 +-
 10 files changed, 467 insertions(+), 24 deletions(-)
 create mode 100644 src/gallium/state_trackers/nine/nine_buffer_upload.c
 create mode 100644 src/gallium/state_trackers/nine/nine_buffer_upload.h

diff --git a/src/gallium/state_trackers/nine/Makefile.sources 
b/src/gallium/state_trackers/nine/Makefile.sources
index 2bb08a2612..56698a19f1 100644
--- a/src/gallium/state_trackers/nine/Makefile.sources
+++ b/src/gallium/state_trackers/nine/Makefile.sources
@@ -23,6 +23,8 @@ C_SOURCES := \
indexbuffer9.h \
iunknown.c \
iunknown.h \
+   nine_buffer_upload.c \
+   nine_buffer_upload.h \
nine_csmt_helper.h \
nine_debug.c \
nine_debug.h \
diff --git a/src/gallium/state_trackers/nine/buffer9.c 
b/src/gallium/state_trackers/nine/buffer9.c
index c745d77c2c..b22713b351 100644
--- a/src/gallium/state_trackers/nine/buffer9.c
+++ b/src/gallium/state_trackers/nine/buffer9.c
@@ -23,6 +23,7 @@
 
 #include "buffer9.h"
 #include "device9.h"
+#include "nine_buffer_upload.h"
 #include "nine_helpers.h"
 #include "nine_pipe.h"
 
@@ -100,6 +101,10 @@ NineBuffer9_ctor( struct NineBuffer9 *This,
 else
 info->usage = PIPE_USAGE_DYNAMIC;
 
+/* When Writeonly is not set, we don't want to enable the
+ * optimizations */
+This->discard_nooverwrite_only = !!(Usage & D3DUSAGE_WRITEONLY) &&
+ pParams->device->buffer_upload;
 /* if (pDesc->Usage & D3DUSAGE_DONOTCLIP) { } */
 /* if (pDesc->Usage & D3DUSAGE_NONSECURE) { } */
 /* if (pDesc->Usage & D3DUSAGE_NPATCHES) { } */
@@ -161,12 +166,18 @@ NineBuffer9_dtor( struct NineBuffer9 *This )
 list_del(&This->managed.list2);
 }
 
+if (This->buf)
+nine_upload_release_buffer(This->base.base.device->buffer_upload, 
This->buf);
+
 NineResource9_dtor(&This->base);
 }
 
 struct pipe_resource *
-NineBuffer9_GetResource( struct NineBuffer9 *This )
+NineBuffer9_GetResource( struct NineBuffer9 *This, unsigned *offset )
 {
+if (This->buf)
+return nine_upload_buffer_resource_and_offset(This->buf, offset);
+*offset = 0;
 return NineResource9_GetResource(&This->base);
 }
 
@@ -264,6 +275,8 @@ NineBuffer9_Lock( struct NineBuffer9 *This,
 if (Flags & D3DLOCK_DONOTWAIT && !(This->base.usage & D3DUSAGE_DYNAMIC))
 usage |= PIPE_TRANSFER_DONTBLOCK;
 
+This->discard_nooverwrite_only &= !!(Flags & (D3DLOCK_DISCARD | 
D3DLOCK_NOOVERWRITE));
+
 if (This->nmaps == This->maxmaps) {
 struct NineTransfer *newmaps =
 REALLOC(This->maps, sizeof(struct NineTransfer)*This->maxmaps,
@@ -275,8 +288,67 @@ NineBuffer9_Lock( struct NineBuffer9 *This,
 This->maps = newmaps;
 }
 
-This->maps[This->nmaps].is_pipe_secondary = FALSE;
+if (This->buf && !This->discard_nooverwrite_only) {
+struct pipe_box src_box;
+unsigned offset;
+struct pipe_resource *src_res;
+DBG("Disabling nine_subbuffer for a buffer having"
+"used a nine_subbuffer buffer\n");
+/* Copy buffer content to the buffer resource, which
+ * we will now use.
+ * Note: The behaviour may be different from what is expected
+ * with double lock. However applications can't really make 
expectations
+ * about double locks, and don't really use them, so that's ok. */
+src_res = nine_upload_buffer_resource_and_offset(This->buf, &offset);
+u_box_1d(offset, This->size, &src_box);
+
+pipe = NineDevice9_GetPipe(device);
+pipe->resource_copy_region(pipe, This->base.resource, 0, 0, 0, 0,
+   src_res, 0, &src_box);
+/* Release previous resource */
+if (This->nmaps >= 1)
+This->maps[This->nmaps-1].should_destr

[Mesa-dev] [PATCH 2/3] radeon: enable PIPE_CAP_THREAD_SAFE

2016-12-18 Thread Axel Davy
r600 and radeonsi are thread safe.

Signed-off-by: Axel Davy 
---
 src/gallium/drivers/r600/r600_pipe.c   | 2 +-
 src/gallium/drivers/radeonsi/si_pipe.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_pipe.c 
b/src/gallium/drivers/r600/r600_pipe.c
index 6036604d79..5156e16946 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -286,6 +286,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum 
pipe_cap param)
case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED:
case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_THREAD_SAFE:
return 1;
 
case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
@@ -376,7 +377,6 @@ static int r600_get_param(struct pipe_screen* pscreen, enum 
pipe_cap param)
case PIPE_CAP_TGSI_ARRAY_COMPONENTS:
case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
case PIPE_CAP_NATIVE_FENCE_FD:
-   case PIPE_CAP_THREAD_SAFE:
return 0;
 
case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c 
b/src/gallium/drivers/radeonsi/si_pipe.c
index 0d7d21fc39..e06bdd0a35 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -410,6 +410,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum 
pipe_cap param)
case PIPE_CAP_CULL_DISTANCE:
case PIPE_CAP_TGSI_ARRAY_COMPONENTS:
case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
+   case PIPE_CAP_THREAD_SAFE:
return 1;
 
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
@@ -460,7 +461,6 @@ static int si_get_param(struct pipe_screen* pscreen, enum 
pipe_cap param)
case PIPE_CAP_TGSI_VOTE:
case PIPE_CAP_MAX_WINDOW_RECTANGLES:
case PIPE_CAP_NATIVE_FENCE_FD:
-   case PIPE_CAP_THREAD_SAFE:
return 0;
 
case PIPE_CAP_QUERY_BUFFER_OBJECT:
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/3] gallium: add flag for transfers in a different context than draw calls

2016-12-18 Thread Axel Davy
Add cap PIPE_CAP_BUFFER_TRANSFER_EXTERNAL_CONTEXT.

See commit for detailed description of the flag.

Signed-off-by: Axel Davy 
---
 src/gallium/docs/source/screen.rst   | 9 +
 src/gallium/drivers/freedreno/freedreno_screen.c | 1 +
 src/gallium/drivers/i915/i915_screen.c   | 1 +
 src/gallium/drivers/ilo/ilo_screen.c | 1 +
 src/gallium/drivers/llvmpipe/lp_screen.c | 1 +
 src/gallium/drivers/nouveau/nv30/nv30_screen.c   | 1 +
 src/gallium/drivers/nouveau/nv50/nv50_screen.c   | 1 +
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c   | 1 +
 src/gallium/drivers/r300/r300_screen.c   | 1 +
 src/gallium/drivers/r600/r600_pipe.c | 1 +
 src/gallium/drivers/radeonsi/si_pipe.c   | 1 +
 src/gallium/drivers/softpipe/sp_screen.c | 1 +
 src/gallium/drivers/svga/svga_screen.c   | 1 +
 src/gallium/drivers/swr/swr_screen.cpp   | 1 +
 src/gallium/drivers/vc4/vc4_screen.c | 1 +
 src/gallium/drivers/virgl/virgl_screen.c | 1 +
 src/gallium/include/pipe/p_defines.h | 2 ++
 17 files changed, 26 insertions(+)

diff --git a/src/gallium/docs/source/screen.rst 
b/src/gallium/docs/source/screen.rst
index ea76dffe46..b179788c20 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -369,6 +369,15 @@ The integer capabilities:
 * ``PIPE_CAP_THREAD_SAFE``: Whether the driver is thread-safe. It is allowed to
   use at the same time several pipe_contexts in different threads, or a
   pipe_screen in a different thread than one of its pipe_context.
+* ``PIPE_CAP_BUFFER_TRANSFER_EXTERNAL_CONTEXT``: Whether
+  PIPE_RESOURCE_FLAG_TRANSFER_EXTERNAL_CONTEXT is supported for buffers.
+  When the flag is set, the resource may be mapped/unmapped in a different
+  context than the context using it. Like for other resources, resources with
+  this flag must be unmapped before any draw call, except with combination
+  of persistent/coherent where the resource may be mapped on one context,
+  and used for a draw call in another. No flush is required after unmapping
+  the resource. If a memory_barrier call is needed, it may be done from either
+  context.
 
 
 .. _pipe_capf:
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c 
b/src/gallium/drivers/freedreno/freedreno_screen.c
index e7155b25f8..17be678247 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -291,6 +291,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum 
pipe_cap param)
case PIPE_CAP_TGSI_ARRAY_COMPONENTS:
case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
case PIPE_CAP_THREAD_SAFE:
+   case PIPE_CAP_BUFFER_TRANSFER_EXTERNAL_CONTEXT:
return 0;
 
case PIPE_CAP_MAX_VIEWPORTS:
diff --git a/src/gallium/drivers/i915/i915_screen.c 
b/src/gallium/drivers/i915/i915_screen.c
index cd08ef0876..b80c291ff3 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -297,6 +297,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap 
cap)
case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
case PIPE_CAP_NATIVE_FENCE_FD:
case PIPE_CAP_THREAD_SAFE:
+   case PIPE_CAP_BUFFER_TRANSFER_EXTERNAL_CONTEXT:
   return 0;
 
case PIPE_CAP_MAX_VIEWPORTS:
diff --git a/src/gallium/drivers/ilo/ilo_screen.c 
b/src/gallium/drivers/ilo/ilo_screen.c
index f69d8d08ab..20a14b99ce 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -520,6 +520,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap 
param)
case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
case PIPE_CAP_NATIVE_FENCE_FD:
case PIPE_CAP_THREAD_SAFE:
+   case PIPE_CAP_BUFFER_TRANSFER_EXTERNAL_CONTEXT:
   return 0;
 
case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c 
b/src/gallium/drivers/llvmpipe/lp_screen.c
index bbd25c9998..2edddfe9c0 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -341,6 +341,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum 
pipe_cap param)
case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
case PIPE_CAP_NATIVE_FENCE_FD:
case PIPE_CAP_THREAD_SAFE:
+   case PIPE_CAP_BUFFER_TRANSFER_EXTERNAL_CONTEXT:
   return 0;
}
/* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c 
b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 429cc79b28..cd6181ed6b 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -206,6 +206,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum 
pipe_cap param)
case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
case PIPE_CAP_NATIVE_FENCE_FD:
case PIPE_CAP_THREAD_SAFE:
+   case PIPE_CAP_BUFFER_TRANSFER_EXTERNAL_CONTEXT:
   return 0;
 
case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_scre

[Mesa-dev] [PATCH 1/3] gallium: add PIPE_CAP_THREAD_SAFE

2016-12-18 Thread Axel Davy
Add flag indicating if the driver is thread safe or not.

See commit for documentation on the meaning of the flag.

Signed-off-by: Axel Davy 
---
 src/gallium/docs/source/screen.rst   | 3 +++
 src/gallium/drivers/freedreno/freedreno_screen.c | 1 +
 src/gallium/drivers/i915/i915_screen.c   | 1 +
 src/gallium/drivers/ilo/ilo_screen.c | 1 +
 src/gallium/drivers/llvmpipe/lp_screen.c | 1 +
 src/gallium/drivers/nouveau/nv30/nv30_screen.c   | 1 +
 src/gallium/drivers/nouveau/nv50/nv50_screen.c   | 1 +
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c   | 1 +
 src/gallium/drivers/r300/r300_screen.c   | 1 +
 src/gallium/drivers/r600/r600_pipe.c | 1 +
 src/gallium/drivers/radeonsi/si_pipe.c   | 1 +
 src/gallium/drivers/softpipe/sp_screen.c | 1 +
 src/gallium/drivers/svga/svga_screen.c   | 1 +
 src/gallium/drivers/swr/swr_screen.cpp   | 1 +
 src/gallium/drivers/vc4/vc4_screen.c | 1 +
 src/gallium/drivers/virgl/virgl_screen.c | 1 +
 src/gallium/include/pipe/p_defines.h | 1 +
 17 files changed, 19 insertions(+)

diff --git a/src/gallium/docs/source/screen.rst 
b/src/gallium/docs/source/screen.rst
index 7ac39ffc44..ea76dffe46 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -366,6 +366,9 @@ The integer capabilities:
   ARB_transform_feedback3.
 * ``PIPE_CAP_TGSI_CAN_READ_OUTPUTS``: Whether every TGSI shader stage can read
   from the output file.
+* ``PIPE_CAP_THREAD_SAFE``: Whether the driver is thread-safe. It is allowed to
+  use at the same time several pipe_contexts in different threads, or a
+  pipe_screen in a different thread than one of its pipe_context.
 
 
 .. _pipe_capf:
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c 
b/src/gallium/drivers/freedreno/freedreno_screen.c
index 8ab0e37fa0..e7155b25f8 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -290,6 +290,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum 
pipe_cap param)
case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS:
case PIPE_CAP_TGSI_ARRAY_COMPONENTS:
case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
+   case PIPE_CAP_THREAD_SAFE:
return 0;
 
case PIPE_CAP_MAX_VIEWPORTS:
diff --git a/src/gallium/drivers/i915/i915_screen.c 
b/src/gallium/drivers/i915/i915_screen.c
index 14f42717b1..cd08ef0876 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -296,6 +296,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap 
cap)
case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS:
case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
case PIPE_CAP_NATIVE_FENCE_FD:
+   case PIPE_CAP_THREAD_SAFE:
   return 0;
 
case PIPE_CAP_MAX_VIEWPORTS:
diff --git a/src/gallium/drivers/ilo/ilo_screen.c 
b/src/gallium/drivers/ilo/ilo_screen.c
index c3fad7314e..f69d8d08ab 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -519,6 +519,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap 
param)
case PIPE_CAP_TGSI_ARRAY_COMPONENTS:
case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
case PIPE_CAP_NATIVE_FENCE_FD:
+   case PIPE_CAP_THREAD_SAFE:
   return 0;
 
case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c 
b/src/gallium/drivers/llvmpipe/lp_screen.c
index 83045cd5e3..bbd25c9998 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -340,6 +340,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum 
pipe_cap param)
case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS:
case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
case PIPE_CAP_NATIVE_FENCE_FD:
+   case PIPE_CAP_THREAD_SAFE:
   return 0;
}
/* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c 
b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index beffeac7a1..429cc79b28 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -205,6 +205,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum 
pipe_cap param)
case PIPE_CAP_TGSI_ARRAY_COMPONENTS:
case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
case PIPE_CAP_NATIVE_FENCE_FD:
+   case PIPE_CAP_THREAD_SAFE:
   return 0;
 
case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c 
b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 5b388b5aa1..4b3b299426 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -257,6 +257,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum 
pipe_cap param)
case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS:
case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
case PIPE_CAP_NATIVE_FENCE_FD:
+   case PIPE_CAP_THREAD_SAFE:
   return 0;
 
case PIPE_CAP_VENDOR_ID:
diff --git a/src/galliu

[Mesa-dev] [RFC] New gallium flags for using different contexts in several threads

2016-12-18 Thread Axel Davy
Currently there is no real specification of what is allowed when
using different contexts in several threads, or when you
map/unmap a resource in one thread but use it in another for
draw calls.

For the gallium nine CSMT patchset, I figured it would be better
to add flags to describe what is allowed.

Please comment.

Yours,

Axel Davy

Axel Davy (3):
  gallium: add PIPE_CAP_THREAD_SAFE
  radeon: enable PIPE_CAP_THREAD_SAFE
  gallium: add flag for transfers in a different context than draw call

 src/gallium/docs/source/screen.rst   | 12 
 src/gallium/drivers/freedreno/freedreno_screen.c |  2 ++
 src/gallium/drivers/i915/i915_screen.c   |  2 ++
 src/gallium/drivers/ilo/ilo_screen.c |  2 ++
 src/gallium/drivers/llvmpipe/lp_screen.c |  2 ++
 src/gallium/drivers/nouveau/nv30/nv30_screen.c   |  2 ++
 src/gallium/drivers/nouveau/nv50/nv50_screen.c   |  2 ++
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c   |  2 ++
 src/gallium/drivers/r300/r300_screen.c   |  2 ++
 src/gallium/drivers/r600/r600_pipe.c |  2 ++
 src/gallium/drivers/radeonsi/si_pipe.c   |  2 ++
 src/gallium/drivers/softpipe/sp_screen.c |  2 ++
 src/gallium/drivers/svga/svga_screen.c   |  2 ++
 src/gallium/drivers/swr/swr_screen.cpp   |  2 ++
 src/gallium/drivers/vc4/vc4_screen.c |  2 ++
 src/gallium/drivers/virgl/virgl_screen.c |  2 ++
 src/gallium/include/pipe/p_defines.h |  3 +++
 17 files changed, 45 insertions(+)

-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/3 v2] gallium: add flag for transfers in a different context than draw calls

2016-12-18 Thread Axel Davy
Add cap PIPE_CAP_BUFFER_TRANSFER_EXTERNAL_CONTEXT.

See commit for detailed description of the flag.

v2: Explicit flush behaviour.

Signed-off-by: Axel Davy 
---
 src/gallium/docs/source/screen.rst   | 11 +++
 src/gallium/drivers/freedreno/freedreno_screen.c |  1 +
 src/gallium/drivers/i915/i915_screen.c   |  1 +
 src/gallium/drivers/ilo/ilo_screen.c |  1 +
 src/gallium/drivers/llvmpipe/lp_screen.c |  1 +
 src/gallium/drivers/nouveau/nv30/nv30_screen.c   |  1 +
 src/gallium/drivers/nouveau/nv50/nv50_screen.c   |  1 +
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c   |  1 +
 src/gallium/drivers/r300/r300_screen.c   |  1 +
 src/gallium/drivers/r600/r600_pipe.c |  1 +
 src/gallium/drivers/radeonsi/si_pipe.c   |  1 +
 src/gallium/drivers/softpipe/sp_screen.c |  1 +
 src/gallium/drivers/svga/svga_screen.c   |  1 +
 src/gallium/drivers/swr/swr_screen.cpp   |  1 +
 src/gallium/drivers/vc4/vc4_screen.c |  1 +
 src/gallium/drivers/virgl/virgl_screen.c |  1 +
 src/gallium/include/pipe/p_defines.h |  2 ++
 17 files changed, 28 insertions(+)

diff --git a/src/gallium/docs/source/screen.rst 
b/src/gallium/docs/source/screen.rst
index ea76dffe46..1aed86fd04 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -369,6 +369,17 @@ The integer capabilities:
 * ``PIPE_CAP_THREAD_SAFE``: Whether the driver is thread-safe. It is allowed to
   use at the same time several pipe_contexts in different threads, or a
   pipe_screen in a different thread than one of its pipe_context.
+* ``PIPE_CAP_BUFFER_TRANSFER_EXTERNAL_CONTEXT``: Whether
+  PIPE_RESOURCE_FLAG_TRANSFER_EXTERNAL_CONTEXT is supported for buffers.
+  When the flag is set, the resource may be mapped/unmapped in a different
+  context than the context using it. Like for other resources, resources with
+  this flag must be unmapped before any draw call, except with combination
+  of persistent/coherent where the resource may be mapped on one context,
+  and used for a draw call in another. No flush is required after unmapping
+  the resource to make all contexts see the change. If a memory_barrier call
+  is needed, it may be done from either context. Previous rendering using the
+  resource must be flushed if a transfer_map with a different context is
+  expected to wait for them.
 
 
 .. _pipe_capf:
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c 
b/src/gallium/drivers/freedreno/freedreno_screen.c
index e7155b25f8..17be678247 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -291,6 +291,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum 
pipe_cap param)
case PIPE_CAP_TGSI_ARRAY_COMPONENTS:
case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
case PIPE_CAP_THREAD_SAFE:
+   case PIPE_CAP_BUFFER_TRANSFER_EXTERNAL_CONTEXT:
return 0;
 
case PIPE_CAP_MAX_VIEWPORTS:
diff --git a/src/gallium/drivers/i915/i915_screen.c 
b/src/gallium/drivers/i915/i915_screen.c
index cd08ef0876..b80c291ff3 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -297,6 +297,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap 
cap)
case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
case PIPE_CAP_NATIVE_FENCE_FD:
case PIPE_CAP_THREAD_SAFE:
+   case PIPE_CAP_BUFFER_TRANSFER_EXTERNAL_CONTEXT:
   return 0;
 
case PIPE_CAP_MAX_VIEWPORTS:
diff --git a/src/gallium/drivers/ilo/ilo_screen.c 
b/src/gallium/drivers/ilo/ilo_screen.c
index f69d8d08ab..20a14b99ce 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -520,6 +520,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap 
param)
case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
case PIPE_CAP_NATIVE_FENCE_FD:
case PIPE_CAP_THREAD_SAFE:
+   case PIPE_CAP_BUFFER_TRANSFER_EXTERNAL_CONTEXT:
   return 0;
 
case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c 
b/src/gallium/drivers/llvmpipe/lp_screen.c
index bbd25c9998..2edddfe9c0 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -341,6 +341,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum 
pipe_cap param)
case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
case PIPE_CAP_NATIVE_FENCE_FD:
case PIPE_CAP_THREAD_SAFE:
+   case PIPE_CAP_BUFFER_TRANSFER_EXTERNAL_CONTEXT:
   return 0;
}
/* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c 
b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 429cc79b28..cd6181ed6b 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -206,6 +206,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum 
pipe_cap param)
case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
  

Re: [Mesa-dev] [RFC] New gallium flags for using different contexts in several threads

2016-12-18 Thread Nicolai Hähnle

On 18.12.2016 13:38, Axel Davy wrote:

Currently there is no real specification on what is allowed for
using different contexts in several threads, or when you
map/unmap a resource in a thread, but uses it in another for
draw calls.

For the gallium nine CSMT patchset, I've figured out it would be better
to add flags to describe what is allowed.

Please comment.


I don't see the point of this. All existing drivers must already support 
the multi-threading scenarios described in the commits, because they can 
happen as part of correct use of OpenGL. If drivers don't support them, 
then they're already broken.


I'm happy to be convinced otherwise if I missed something, but using 
multiple contexts from different threads, or using Map/UnmapBuffer from 
one context but sourcing the buffer from draw calls in another context 
are all perfectly supported OpenGL use cases.


Nicolai



Yours,

Axel Davy

Axel Davy (3):
  gallium: add PIPE_CAP_THREAD_SAFE
  radeon: enable PIPE_CAP_THREAD_SAFE
  gallium: add flag for transfers in a different context than draw call

 src/gallium/docs/source/screen.rst   | 12 
 src/gallium/drivers/freedreno/freedreno_screen.c |  2 ++
 src/gallium/drivers/i915/i915_screen.c   |  2 ++
 src/gallium/drivers/ilo/ilo_screen.c |  2 ++
 src/gallium/drivers/llvmpipe/lp_screen.c |  2 ++
 src/gallium/drivers/nouveau/nv30/nv30_screen.c   |  2 ++
 src/gallium/drivers/nouveau/nv50/nv50_screen.c   |  2 ++
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c   |  2 ++
 src/gallium/drivers/r300/r300_screen.c   |  2 ++
 src/gallium/drivers/r600/r600_pipe.c |  2 ++
 src/gallium/drivers/radeonsi/si_pipe.c   |  2 ++
 src/gallium/drivers/softpipe/sp_screen.c |  2 ++
 src/gallium/drivers/svga/svga_screen.c   |  2 ++
 src/gallium/drivers/swr/swr_screen.cpp   |  2 ++
 src/gallium/drivers/vc4/vc4_screen.c |  2 ++
 src/gallium/drivers/virgl/virgl_screen.c |  2 ++
 src/gallium/include/pipe/p_defines.h |  3 +++
 17 files changed, 45 insertions(+)


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [RFC] New gallium flags for using different contexts in several threads

2016-12-18 Thread Axel Davy

On 18/12/2016 16:57, Nicolai Hähnle wrote:

On 18.12.2016 13:38, Axel Davy wrote:

Currently there is no real specification on what is allowed for
using different contexts in several threads, or when you
map/unmap a resource in a thread, but uses it in another for
draw calls.

For the gallium nine CSMT patchset, I've figured out it would be better
to add flags to describe what is allowed.

Please comment.


I don't see the point of this. All existing drivers must already 
support the multi-threading scenarios described in the commits, 
because they can happen as part of correct use of OpenGL. If drivers 
don't support them, then they're already broken.


Some drivers aren't thread safe. For example nouveau isn't. I guess 
other drivers may be.
I'm happy to be convinced otherwise if I missed something, but using 
multiple contexts from different threads, or using Map/UnmapBuffer 
from one context but sourcing the buffer from draw calls in another 
context are all perfectly supported OpenGL use cases.
For some drivers and combination flags, map gives a staging buffer and 
unmap does trigger a copy of it to the real buffer. This is not 
necessarily flushed. I'd like this flush to be implicit, because if we 
can avoid it (most of the cases) it's better.


Axel


Nicolai



Yours,

Axel Davy

Axel Davy (3):
  gallium: add PIPE_CAP_THREAD_SAFE
  radeon: enable PIPE_CAP_THREAD_SAFE
  gallium: add flag for transfers in a different context than draw call

 src/gallium/docs/source/screen.rst   | 12 
 src/gallium/drivers/freedreno/freedreno_screen.c |  2 ++
 src/gallium/drivers/i915/i915_screen.c   |  2 ++
 src/gallium/drivers/ilo/ilo_screen.c |  2 ++
 src/gallium/drivers/llvmpipe/lp_screen.c |  2 ++
 src/gallium/drivers/nouveau/nv30/nv30_screen.c   |  2 ++
 src/gallium/drivers/nouveau/nv50/nv50_screen.c   |  2 ++
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c   |  2 ++
 src/gallium/drivers/r300/r300_screen.c   |  2 ++
 src/gallium/drivers/r600/r600_pipe.c |  2 ++
 src/gallium/drivers/radeonsi/si_pipe.c   |  2 ++
 src/gallium/drivers/softpipe/sp_screen.c |  2 ++
 src/gallium/drivers/svga/svga_screen.c   |  2 ++
 src/gallium/drivers/swr/swr_screen.cpp   |  2 ++
 src/gallium/drivers/vc4/vc4_screen.c |  2 ++
 src/gallium/drivers/virgl/virgl_screen.c |  2 ++
 src/gallium/include/pipe/p_defines.h |  3 +++
 17 files changed, 45 insertions(+)





___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [RFC] New gallium flags for using different contexts in several threads

2016-12-18 Thread Axel Davy

On 18/12/2016 16:57, Nicolai Hähnle wrote:



I'm happy to be convinced otherwise if I missed something, but using 
multiple contexts from different threads, or using Map/UnmapBuffer 
from one context but sourcing the buffer from draw calls in another 
context are all perfectly supported OpenGL use cases.
There is also the case of having persistent coherent buffer mapped with 
one context, and used with another one.
If implementation of coherent requires driver flushes the range at draw 
calls, I guess it may not work as is with the multi context scenario 
(looking at nouveau, it seems to check if the buffer is mapped in 
current context for example, not for all possible contexts)


Axel


Nicolai







___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [RFC] New gallium flags for using different contexts in several threads

2016-12-18 Thread Nicolai Hähnle

On 18.12.2016 17:37, Axel Davy wrote:

On 18/12/2016 16:57, Nicolai Hähnle wrote:

On 18.12.2016 13:38, Axel Davy wrote:

Currently there is no real specification on what is allowed for
using different contexts in several threads, or when you
map/unmap a resource in a thread, but uses it in another for
draw calls.

For the gallium nine CSMT patchset, I've figured out it would be better
to add flags to describe what is allowed.

Please comment.


I don't see the point of this. All existing drivers must already
support the multi-threading scenarios described in the commits,
because they can happen as part of correct use of OpenGL. If drivers
don't support them, then they're already broken.


Some drivers aren't thread safe. For example nouveau isn't. I guess
other drivers may be.


Then this needs to be fixed regardless because it could have happened in 
OpenGL applications all along.




I'm happy to be convinced otherwise if I missed something, but using
multiple contexts from different threads, or using Map/UnmapBuffer
from one context but sourcing the buffer from draw calls in another
context are all perfectly supported OpenGL use cases.

For some drivers and combination flags, map gives a staging buffer and
unmap does trigger a copy of it to the real buffer. This is not
necessarily flushed. I'd like this flush to be implicit, because if we
can avoid it (most of the cases) it's better.


Is this for some internal transfers done by nine?

I _really_ don't like the "EXTERNAL_CONTEXT" naming. "External contexts" 
may be a motivation for using the feature, but they're not what the 
feature is really about. It's about the interaction of unmap and flush.


Remember: Even today, all resources can be mapped from context A and 
drawn from using context B. You don't need a flag for that.


We really shouldn't add more places for drivers to do implicit flushes. 
Reduce the magic, please. If anything, we could consider adding a 
feedback mechanism where the driver tells you at transfer_map time 
whether the unmap requires a flush to be effective. That has the 
advantage of working with multiple such unmaps in a row.


Then again, why not just call flush unconditionally? If the flush was 
unnecessary, it should be a no-op, and the driver should already have a 
fast path for that anyway.


Nicolai
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [RFC] New gallium flags for using different contexts in several threads

2016-12-18 Thread Nicolai Hähnle

On 18.12.2016 17:40, Axel Davy wrote:

On 18/12/2016 16:57, Nicolai Hähnle wrote:



I'm happy to be convinced otherwise if I missed something, but using
multiple contexts from different threads, or using Map/UnmapBuffer
from one context but sourcing the buffer from draw calls in another
context are all perfectly supported OpenGL use cases.

There is also the case of having persistent coherent buffer mapped with
one context, and used with another one.
If implementation of coherent requires driver flushes the range at draw
calls, I guess it may not work as is with the multi context scenario
(looking at nouveau, it seems to check if the buffer is mapped in
current context for example, not for all possible contexts)


That sounds like a bug in nouveau, if true. The mapping state of a 
buffer object is per-buffer object, not per-context, at least in OpenGL.


Come to think of it, OpenGL probably allows unmapping a buffer from a 
different context than it was mapped from, and it seems like we may have 
a mismatch there with the Gallium API.


Nicolai
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [RFC] New gallium flags for using different contexts in several threads

2016-12-18 Thread Axel Davy

On 18/12/2016 18:34, Nicolai Hähnle wrote:

On 18.12.2016 17:37, Axel Davy wrote:

On 18/12/2016 16:57, Nicolai Hähnle wrote:

On 18.12.2016 13:38, Axel Davy wrote:

Currently there is no real specification on what is allowed for
using different contexts in several threads, or when you
map/unmap a resource in a thread, but uses it in another for
draw calls.

For the gallium nine CSMT patchset, I've figured out it would be 
better

to add flags to describe what is allowed.

Please comment.


I don't see the point of this. All existing drivers must already
support the multi-threading scenarios described in the commits,
because they can happen as part of correct use of OpenGL. If drivers
don't support them, then they're already broken.


Some drivers aren't thread safe. For example nouveau isn't. I guess
other drivers may be.


Then this needs to be fixed regardless because it could have happened 
in OpenGL applications all along.




I'm happy to be convinced otherwise if I missed something, but using
multiple contexts from different threads, or using Map/UnmapBuffer
from one context but sourcing the buffer from draw calls in another
context are all perfectly supported OpenGL use cases.

For some drivers and combination flags, map gives a staging buffer and
unmap does trigger a copy of it to the real buffer. This is not
necessarily flushed. I'd like this flush to be implicit, because if we
can avoid it (most of the cases) it's better.


Is this for some internal transfers done by nine?

I _really_ don't like the "EXTERNAL_CONTEXT" naming. "External 
contexts" may be a motivation for using the feature, but they're not 
what the feature is really about. It's about the interaction of unmap 
and flush.


Remember: Even today, all resources can be mapped from context A and 
drawn from using context B. You don't need a flag for that.


We really shouldn't add more places for drivers to do implicit 
flushes. Reduce the magic, please. If anything, we could consider 
adding a feedback mechanism where the driver tells you at transfer_map 
time whether the unmap requires a flush to be effective. That has the 
advantage of working with multiple such unmaps in a row.


Then again, why not just call flush unconditionally? If the flush was 
unnecessary, it should be a no-op, and the driver should already have 
a fast path for that anyway.

If drivers flush noop very fast, I'm okay doing that (but do they ?).
My main thought relative to that was: What about persistent, coherent 
and barriers ? Does it make sense for them to flush after unmap ? Is it 
supposed to work using them in another context without unmap or flush ?
Is it expected for current drivers that you can map such buffers with 
one context, and draw with the others ?
In case barriers are needed (nine doesn't use persistent without 
coherent so doesn't need this, but it would need to be clarified), which 
context should wait for the barrier ?


Axel


Nicolai



___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [RFC] New gallium flags for using different contexts in several threads

2016-12-18 Thread Ilia Mirkin
On Sun, Dec 18, 2016 at 12:43 PM, Nicolai Hähnle  wrote:
> On 18.12.2016 17:40, Axel Davy wrote:
>>
>> On 18/12/2016 16:57, Nicolai Hähnle wrote:
>>>
>>>
>>>
>>> I'm happy to be convinced otherwise if I missed something, but using
>>> multiple contexts from different threads, or using Map/UnmapBuffer
>>> from one context but sourcing the buffer from draw calls in another
>>> context are all perfectly supported OpenGL use cases.
>>
>> There is also the case of having persistent coherent buffer mapped with
>> one context, and used with another one.
>> If implementation of coherent requires driver flushes the range at draw
>> calls, I guess it may not work as is with the multi context scenario
>> (looking at nouveau, it seems to check if the buffer is mapped in
>> current context for example, not for all possible contexts)
>
>
> That sounds like a bug in nouveau, if true. The mapping state of a buffer
> object is per-buffer object, not per-context, at least in OpenGL.

I'm not aware of anything offhand that would cause this. If a buffer
is mapped coherently, we always flush everything if there's any hint
of it being bound in the context when there's a draw. If it's not
mapped coherently (but persistently), we rely on the update
notification thing to mark which buffers are dirty and thus cause
things to get flushed out. I only roughly remember how this works, but
I could imagine a situation where updating a
persistent-but-not-coherent buffer in one context (and never drawing)
and then using it in another could miss some necessary cache flushes.
However it shouldn't be difficult to fix if that does indeed happen,
esp if a piglit came along exercising that.

  -ilia
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [RFC] New gallium flags for using different contexts in several threads

2016-12-18 Thread Axel Davy

On 18/12/2016 18:34, Nicolai Hähnle wrote:



Then again, why not just call flush unconditionally? If the flush was 
unnecessary, it should be a no-op, and the driver should already have 
a fast path for that anyway.


I just looked at the radeon source with amdgpu, and it looks to me like a
flush will wait until:
1) the current commands in the context are submitted to the thread 
submitting the cs to the context

2) the submitting thread has finished all its work.

While 1) would be noop, 2) wouldn't be in my understanding, because it 
would wait on work done in other contexts.


Axel
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] st/nine: Upload on secondary context for Draw*Up

2016-12-18 Thread Axel Davy
Avoid synchronization by using the secondary context
for uploading the vertex data for Draw*Up.

Signed-off-by: Axel Davy 
---
 src/gallium/state_trackers/nine/device9.c | 18 ++
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/gallium/state_trackers/nine/device9.c 
b/src/gallium/state_trackers/nine/device9.c
index 9f2575309a..9af9b5b578 100644
--- a/src/gallium/state_trackers/nine/device9.c
+++ b/src/gallium/state_trackers/nine/device9.c
@@ -473,12 +473,12 @@ NineDevice9_ctor( struct NineDevice9 *This,
 This->driver_caps.user_sw_cbufs = 
This->screen_sw->get_param(This->screen_sw, PIPE_CAP_USER_CONSTANT_BUFFERS);
 
 if (!This->driver_caps.user_vbufs)
-This->vertex_uploader = u_upload_create(This->context.pipe, 65536,
+This->vertex_uploader = u_upload_create(This->pipe_secondary, 65536,
 PIPE_BIND_VERTEX_BUFFER, 
PIPE_USAGE_STREAM);
 This->vertex_sw_uploader = u_upload_create(This->pipe_sw, 65536,
 PIPE_BIND_VERTEX_BUFFER, 
PIPE_USAGE_STREAM);
 if (!This->driver_caps.user_ibufs)
-This->index_uploader = u_upload_create(This->context.pipe, 128 * 1024,
+This->index_uploader = u_upload_create(This->pipe_secondary, 128 * 
1024,
PIPE_BIND_INDEX_BUFFER, 
PIPE_USAGE_STREAM);
 if (!This->driver_caps.user_cbufs) {
 This->constbuf_alignment = GET_PCAP(CONSTANT_BUFFER_OFFSET_ALIGNMENT);
@@ -2835,8 +2835,6 @@ NineDevice9_DrawPrimitiveUP( struct NineDevice9 *This,
 vtxbuf.user_buffer = pVertexStreamZeroData;
 
 if (!This->driver_caps.user_vbufs) {
-/* Implicit use of context pipe */
-(void)NineDevice9_GetPipe(This);
 u_upload_data(This->vertex_uploader,
   0,
   (prim_count_to_vertex_count(PrimitiveType, 
PrimitiveCount)) * VertexStreamZeroStride, /* XXX */
@@ -2846,6 +2844,9 @@ NineDevice9_DrawPrimitiveUP( struct NineDevice9 *This,
   &vtxbuf.buffer);
 u_upload_unmap(This->vertex_uploader);
 vtxbuf.user_buffer = NULL;
+/* Flush to make sure u_upload_unmap, which is on pipe_secondary,
+ * is visible to context pipe. */
+This->pipe_secondary->flush(This->pipe_secondary, NULL, 0);
 }
 
 NineBeforeDraw(This);
@@ -2900,8 +2901,6 @@ NineDevice9_DrawIndexedPrimitiveUP( struct NineDevice9 
*This,
 
 if (!This->driver_caps.user_vbufs) {
 const unsigned base = MinVertexIndex * VertexStreamZeroStride;
-/* Implicit use of context pipe */
-(void)NineDevice9_GetPipe(This);
 u_upload_data(This->vertex_uploader,
   base,
   NumVertices * VertexStreamZeroStride, /* XXX */
@@ -2915,8 +2914,6 @@ NineDevice9_DrawIndexedPrimitiveUP( struct NineDevice9 
*This,
 vbuf.user_buffer = NULL;
 }
 if (!This->driver_caps.user_ibufs) {
-/* Implicit use of context pipe */
-(void)NineDevice9_GetPipe(This);
 u_upload_data(This->index_uploader,
   0,
   (prim_count_to_vertex_count(PrimitiveType, 
PrimitiveCount)) * ibuf.index_size,
@@ -2928,6 +2925,11 @@ NineDevice9_DrawIndexedPrimitiveUP( struct NineDevice9 
*This,
 ibuf.user_buffer = NULL;
 }
 
+if (!This->driver_caps.user_vbufs || !This->driver_caps.user_ibufs)
+/* Flush to make sure u_upload_unmap, which is on pipe_secondary,
+ * is visible to context pipe. */
+This->pipe_secondary->flush(This->pipe_secondary, NULL, 0);
+
 NineBeforeDraw(This);
 nine_context_draw_indexed_primitive_from_vtxbuf_idxbuf(This, PrimitiveType,
MinVertexIndex,
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 06/23] radv: add a compute shader implementation for buffer to image

2016-12-18 Thread Bas Nieuwenhuizen
From: Dave Airlie 

This implements the reverse of the current buffer->image path
and can be used when we need to do image transfer on compute queues

This just adds the code turned off as we don't support separate
compute queues yet, and we don't want to use this path on the GFX
queues for DCC reasons.

Reviewed-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/radv_meta.h  |   6 +
 src/amd/vulkan/radv_meta_bufimage.c | 302 +++-
 src/amd/vulkan/radv_meta_copy.c |  23 ++-
 3 files changed, 325 insertions(+), 6 deletions(-)

diff --git a/src/amd/vulkan/radv_meta.h b/src/amd/vulkan/radv_meta.h
index 97d020cea1..aa0d30c338 100644
--- a/src/amd/vulkan/radv_meta.h
+++ b/src/amd/vulkan/radv_meta.h
@@ -166,6 +166,12 @@ void radv_meta_image_to_buffer(struct radv_cmd_buffer 
*cmd_buffer,
   unsigned num_rects,
   struct radv_meta_blit2d_rect *rects);
 
+void radv_meta_buffer_to_image_cs(struct radv_cmd_buffer *cmd_buffer,
+ struct radv_meta_blit2d_buffer *src,
+ struct radv_meta_blit2d_surf *dst,
+ unsigned num_rects,
+ struct radv_meta_blit2d_rect *rects);
+
 void radv_decompress_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer,
 struct radv_image *image,
 VkImageSubresourceRange 
*subresourceRange);
diff --git a/src/amd/vulkan/radv_meta_bufimage.c 
b/src/amd/vulkan/radv_meta_bufimage.c
index 310771829b..efe02cc854 100644
--- a/src/amd/vulkan/radv_meta_bufimage.c
+++ b/src/amd/vulkan/radv_meta_bufimage.c
@@ -225,10 +225,206 @@ radv_device_finish_meta_itob_state(struct radv_device 
*device)
}
 }
 
+static nir_shader *
+build_nir_btoi_compute_shader(struct radv_device *dev)
+{
+   nir_builder b;
+   const struct glsl_type *buf_type = 
glsl_sampler_type(GLSL_SAMPLER_DIM_BUF,
+false,
+false,
+GLSL_TYPE_FLOAT);
+   const struct glsl_type *img_type = 
glsl_sampler_type(GLSL_SAMPLER_DIM_2D,
+false,
+false,
+GLSL_TYPE_FLOAT);
+   nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
+   b.shader->info->name = ralloc_strdup(b.shader, "meta_btoi_cs");
+   b.shader->info->cs.local_size[0] = 16;
+   b.shader->info->cs.local_size[1] = 16;
+   b.shader->info->cs.local_size[2] = 1;
+   nir_variable *input_img = nir_variable_create(b.shader, nir_var_uniform,
+ buf_type, "s_tex");
+   input_img->data.descriptor_set = 0;
+   input_img->data.binding = 0;
+
+   nir_variable *output_img = nir_variable_create(b.shader, 
nir_var_uniform,
+  img_type, "out_img");
+   output_img->data.descriptor_set = 0;
+   output_img->data.binding = 1;
+
+   nir_ssa_def *invoc_id = nir_load_system_value(&b, 
nir_intrinsic_load_local_invocation_id, 0);
+   nir_ssa_def *wg_id = nir_load_system_value(&b, 
nir_intrinsic_load_work_group_id, 0);
+   nir_ssa_def *block_size = nir_imm_ivec4(&b,
+   
b.shader->info->cs.local_size[0],
+   
b.shader->info->cs.local_size[1],
+   
b.shader->info->cs.local_size[2], 0);
+
+   nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), 
invoc_id);
+
+   nir_intrinsic_instr *offset = nir_intrinsic_instr_create(b.shader, 
nir_intrinsic_load_push_constant);
+   offset->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
+   offset->num_components = 2;
+   nir_ssa_dest_init(&offset->instr, &offset->dest, 2, 32, "offset");
+   nir_builder_instr_insert(&b, &offset->instr);
+
+   nir_intrinsic_instr *stride = nir_intrinsic_instr_create(b.shader, 
nir_intrinsic_load_push_constant);
+   stride->src[0] = nir_src_for_ssa(nir_imm_int(&b, 8));
+   stride->num_components = 1;
+   nir_ssa_dest_init(&stride->instr, &stride->dest, 1, 32, "stride");
+   nir_builder_instr_insert(&b, &stride->instr);
+
+   nir_ssa_def *pos_x = nir_channel(&b, global_id, 0);
+   nir_ssa_def *pos_y = nir_channel(&b, global_id, 1);
+
+   nir_ssa_def *tmp = nir_imul(&b, pos_y, &stride->dest.ssa);
+   tmp = nir_iadd(&b, tmp, pos_x);
+
+   nir_ssa_def *buf_coord = nir_vec4(&b, tmp, tmp, tmp, tmp);
+
+   nir_ssa_def *img_coord = nir_iadd(&b, global_id, &offset->dest.ssa);
+
+   nir_tex_instr *tex = nir_tex_inst

[Mesa-dev] [PATCH 01/23] radv/winsys: Expose number of compute/dma rings.

2016-12-18 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/radv_radeon_winsys.h   |  3 ++-
 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c | 14 +-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/amd/vulkan/radv_radeon_winsys.h 
b/src/amd/vulkan/radv_radeon_winsys.h
index 76363a3552..db7650c9db 100644
--- a/src/amd/vulkan/radv_radeon_winsys.h
+++ b/src/amd/vulkan/radv_radeon_winsys.h
@@ -89,8 +89,9 @@ struct radeon_info {
boolhas_dedicated_vram;
bool has_virtual_memory;
boolgfx_ib_pad_with_type2;
-   bool has_sdma;
bool has_uvd;
+   uint32_tsdma_rings;
+   uint32_tcompute_rings;
uint32_tvce_fw_version;
uint32_tvce_harvest_config;
uint32_tclock_crystal_freq;
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c 
b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
index b2e171a082..1ae78ac8d1 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
@@ -118,6 +118,7 @@ do_winsys_init(struct radv_amdgpu_winsys *ws, int fd)
struct amdgpu_buffer_size_alignments alignment_info = {};
struct amdgpu_heap_info vram, visible_vram, gtt;
struct drm_amdgpu_info_hw_ip dma = {};
+   struct drm_amdgpu_info_hw_ip compute = {};
drmDevicePtr devinfo;
int r;
int i, j;
@@ -170,6 +171,12 @@ do_winsys_init(struct radv_amdgpu_winsys *ws, int fd)
fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(dma) 
failed.\n");
goto fail;
}
+
+   r = amdgpu_query_hw_ip_info(ws->dev, AMDGPU_HW_IP_COMPUTE, 0, &compute);
+   if (r) {
+   fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(compute) 
failed.\n");
+   goto fail;
+   }
ws->info.pci_id = ws->amdinfo.asic_id; /* TODO: is this correct? */
ws->info.vce_harvest_config = ws->amdinfo.vce_harvest_config;
 
@@ -273,6 +280,10 @@ do_winsys_init(struct radv_amdgpu_winsys *ws, int fd)
fprintf(stderr, "amdgpu: Cannot create addrlib.\n");
goto fail;
}
+
+   assert(util_is_power_of_two(dma.available_rings + 1));
+   assert(util_is_power_of_two(compute.available_rings + 1));
+
/* Set hardware information. */
ws->info.name = get_chip_name(ws->info.family);
ws->info.gart_size = gtt.heap_size;
@@ -290,7 +301,8 @@ do_winsys_init(struct radv_amdgpu_winsys *ws, int fd)
ws->info.num_tile_pipes = radv_cik_get_num_tile_pipes(&ws->amdinfo);
ws->info.pipe_interleave_bytes = 256 << ((ws->amdinfo.gb_addr_cfg >> 4) 
& 0x7);
ws->info.has_virtual_memory = TRUE;
-   ws->info.has_sdma = dma.available_rings != 0;
+   ws->info.sdma_rings = util_bitcount(dma.available_rings);
+   ws->info.compute_rings = util_bitcount(compute.available_rings);
 
/* Get the number of good compute units. */
ws->info.num_good_compute_units = 0;
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 00/23] Compute queues for radv

2016-12-18 Thread Bas Nieuwenhuizen
Hi Dave,

I finally got the compute queues working for me, tested with CTS, Talos and
dota2.

The most significant functional changes, besides just implementing different
command formats for the MEC are:
 - Fixing a pitch bug that caused VM faults when storing a compressed texture as
   RGBA32. Needed for the compute copy functions.
 - The image layout implementation is queue dependent, as we can't eliminate the
   cmask on the compute queue. DCC does not suffer from this yet, as it is
   already disabled for textures with image_store or copy usages.
 - Rewrote the whole wait idle code, as the shortcut you implemented won't work,
   and was causing the *.semaphore.* tests to be failing.
 - Removed some of the DMA queue debugging stuff.

Note that the compute queues got an overhaul in CIK with the MEC and I've added
a helper, so we mostly use the non-MEC variant for all differently formatted
packets and stuff I'd guess to be CIK+, but it is completely non-tested on SI.
Furthermore, given the amount of reverse engineering I needed for stuff that was
not really in the open before, I'm not going to implement SI support, especially
not without a SI card.

The series is also available at
https://github.com/BNieuwenhuizen/mesa/commits/radv-wip-cs-queue

Yours sincerely,
Bas Nieuwenhuizen

Bas Nieuwenhuizen (10):
  radv/winsys: Expose number of compute/dma rings.
  radv: Use correct pitch for views with different block size.
  radv/winsys: Make WaitIdle queue aware.
  radv: Implement cache flushing for the MEC.
  radv: update vkCmdUpdateBuffer for the MEC.
  radv: Implement indirect dispatch for the MEC.
  radv: Use RELEASE_MEM packet for MEC timestamp query.
  radv: Don't enable CMASK on compute queues.
  radv: Create an empty CS per ring type.
  radv: Only emit PFP ME syncs for DMA on the GFX queue.

Dave Airlie (13):
  radv/winsys: start adding support for DMA/compute queue
  radv: start fixing up queue allocate for multiple queues
  radv: Store queue family in command buffers.
  radv: add a compute shader implementation for buffer to image
  radv: implement image->image copies using compute shader
  radv/meta: split clear image out into a separate layer clear function
  radv: clear image implementation for compute queue
  radv: hook compute clears into clear image api.
  radv/meta: update header info
  radv: init compute queue and avoid initing transfer queues
  radv: pass queue index into winsys submission
  radv: add semaphore support
  radv: expose the compute queue

 src/amd/common/sid.h  |   1 +
 src/amd/vulkan/radv_cmd_buffer.c  | 137 +++-
 src/amd/vulkan/radv_device.c  | 172 -
 src/amd/vulkan/radv_image.c   |  18 +-
 src/amd/vulkan/radv_meta.h|  23 +-
 src/amd/vulkan/radv_meta_buffer.c |   4 +-
 src/amd/vulkan/radv_meta_bufimage.c   | 871 +-
 src/amd/vulkan/radv_meta_clear.c  | 280 ---
 src/amd/vulkan/radv_meta_copy.c   |  42 +-
 src/amd/vulkan/radv_private.h |  47 +-
 src/amd/vulkan/radv_query.c   |  29 +-
 src/amd/vulkan/radv_radeon_winsys.h   |  16 +-
 src/amd/vulkan/si_cmd_buffer.c|  47 +-
 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c | 109 ++-
 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.h |   6 +-
 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c |  16 +-
 16 files changed, 1559 insertions(+), 259 deletions(-)

-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 08/23] radv/meta: split clear image out into a separate layer clear function

2016-12-18 Thread Bas Nieuwenhuizen
From: Dave Airlie 

This will make it easier to add support for clears on compute queues.

Reviewed-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/radv_meta_clear.c | 245 ---
 1 file changed, 128 insertions(+), 117 deletions(-)

diff --git a/src/amd/vulkan/radv_meta_clear.c b/src/amd/vulkan/radv_meta_clear.c
index 77f7dc42aa..dba9ead121 100644
--- a/src/amd/vulkan/radv_meta_clear.c
+++ b/src/amd/vulkan/radv_meta_clear.c
@@ -976,6 +976,132 @@ radv_cmd_buffer_clear_subpass(struct radv_cmd_buffer 
*cmd_buffer)
 }
 
 static void
+radv_clear_image_layer(struct radv_cmd_buffer *cmd_buffer,
+  struct radv_image *image,
+  VkImageLayout image_layout,
+  const VkImageSubresourceRange *range,
+  VkFormat format, int level, int layer,
+  const VkClearValue *clear_val)
+{
+   VkDevice device_h = radv_device_to_handle(cmd_buffer->device);
+   struct radv_image_view iview;
+   radv_image_view_init(&iview, cmd_buffer->device,
+&(VkImageViewCreateInfo) {
+.sType = 
VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+.image = 
radv_image_to_handle(image),
+.viewType = 
radv_meta_get_view_type(image),
+.format = format,
+.subresourceRange = {
+.aspectMask = range->aspectMask,
+.baseMipLevel = 
range->baseMipLevel + level,
+.levelCount = 1,
+.baseArrayLayer = 
range->baseArrayLayer + layer,
+.layerCount = 1
+},
+},
+cmd_buffer, VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT);
+
+   VkFramebuffer fb;
+   radv_CreateFramebuffer(device_h,
+  &(VkFramebufferCreateInfo) {
+  .sType = 
VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
+  .attachmentCount = 1,
+  .pAttachments = (VkImageView[]) {
+  
radv_image_view_to_handle(&iview),
+  },
+  .width = iview.extent.width,
+   .height = 
iview.extent.height,
+   .layers = 1
+  },
+  &cmd_buffer->pool->alloc,
+  &fb);
+
+   VkAttachmentDescription att_desc = {
+   .format = iview.vk_format,
+   .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
+   .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
+   .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
+   .stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE,
+   .initialLayout = image_layout,
+   .finalLayout = image_layout,
+   };
+
+   VkSubpassDescription subpass_desc = {
+   .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
+   .inputAttachmentCount = 0,
+   .colorAttachmentCount = 0,
+   .pColorAttachments = NULL,
+   .pResolveAttachments = NULL,
+   .pDepthStencilAttachment = NULL,
+   .preserveAttachmentCount = 0,
+   .pPreserveAttachments = NULL,
+   };
+
+   const VkAttachmentReference att_ref = {
+   .attachment = 0,
+   .layout = image_layout,
+   };
+
+   if (range->aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
+   subpass_desc.colorAttachmentCount = 1;
+   subpass_desc.pColorAttachments = &att_ref;
+   } else {
+   subpass_desc.pDepthStencilAttachment = &att_ref;
+   }
+
+   VkRenderPass pass;
+   radv_CreateRenderPass(device_h,
+ &(VkRenderPassCreateInfo) {
+ .sType = 
VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
+ .attachmentCount = 1,
+ .pAttachments = &att_desc,
+ .subpassCount = 1,
+ .pSubpasses = &subpass_desc,
+ },
+ &cmd_buffer->pool->alloc,
+ &pass);
+
+   radv_CmdBeginRenderPass(radv_cmd_buffer_to_handle(cmd_buffer),
+   &(VkRenderPassBeginInfo) {
+

[Mesa-dev] [PATCH 05/23] radv: Use correct pitch for views with different block size.

2016-12-18 Thread Bas Nieuwenhuizen
Needed when accessing a compressed texture as R32G32B32A32 from a shader. This
was not encountered previously, as we used the CB for the reinterpretation,
which does not use this pitch.

Signed-off-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/radv_image.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/amd/vulkan/radv_image.c b/src/amd/vulkan/radv_image.c
index c6318f2882..fee98ba94c 100644
--- a/src/amd/vulkan/radv_image.c
+++ b/src/amd/vulkan/radv_image.c
@@ -742,6 +742,7 @@ radv_image_view_init(struct radv_image_view *iview,
 {
RADV_FROM_HANDLE(radv_image, image, pCreateInfo->image);
const VkImageSubresourceRange *range = &pCreateInfo->subresourceRange;
+   uint32_t blk_w;
bool is_stencil = false;
switch (image->type) {
case VK_IMAGE_TYPE_1D:
@@ -779,6 +780,8 @@ radv_image_view_init(struct radv_image_view *iview,
iview->extent.height = round_up_u32(iview->extent.height * 
vk_format_get_blockheight(iview->vk_format),

vk_format_get_blockheight(image->vk_format));
 
+   assert(image->surface.blk_w % 
vk_format_get_blockwidth(image->vk_format) == 0);
+   blk_w = image->surface.blk_w / 
vk_format_get_blockwidth(image->vk_format) * 
vk_format_get_blockwidth(iview->vk_format);
iview->base_layer = range->baseArrayLayer;
iview->layer_count = radv_get_layerCount(image, range);
iview->base_mip = range->baseMipLevel;
@@ -798,7 +801,7 @@ radv_image_view_init(struct radv_image_view *iview,
si_set_mutable_tex_desc_fields(device, image,
   is_stencil ? 
&image->surface.stencil_level[range->baseMipLevel] : 
&image->surface.level[range->baseMipLevel], range->baseMipLevel,
   range->baseMipLevel,
-  image->surface.blk_w, is_stencil, 
iview->descriptor);
+  blk_w, is_stencil, iview->descriptor);
 }
 
 void radv_image_set_optimal_micro_tile_mode(struct radv_device *device,
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 10/23] radv: hook compute clears into clear image api.

2016-12-18 Thread Bas Nieuwenhuizen
From: Dave Airlie 

These aren't used yet but we will want to use them when we
implement a separate compute queue.

Signed-off-by: Dave Airlie 
Reviewed-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/radv_meta_clear.c | 41 
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/src/amd/vulkan/radv_meta_clear.c b/src/amd/vulkan/radv_meta_clear.c
index dba9ead121..d6af29187f 100644
--- a/src/amd/vulkan/radv_meta_clear.c
+++ b/src/amd/vulkan/radv_meta_clear.c
@@ -1107,7 +1107,8 @@ radv_cmd_clear_image(struct radv_cmd_buffer *cmd_buffer,
 VkImageLayout image_layout,
 const VkClearValue *clear_value,
 uint32_t range_count,
-const VkImageSubresourceRange *ranges)
+const VkImageSubresourceRange *ranges,
+bool cs)
 {
VkFormat format = image->vk_format;
VkClearValue internal_clear_value = *clear_value;
@@ -1126,13 +1127,30 @@ radv_cmd_clear_image(struct radv_cmd_buffer *cmd_buffer,
radv_minify(image->extent.depth, 
range->baseMipLevel + l) :
radv_get_layerCount(image, range);
for (uint32_t s = 0; s < layer_count; ++s) {
-   radv_clear_image_layer(cmd_buffer, image, 
image_layout,
-  range, format, l, s, 
&internal_clear_value);
+
+   if (cs) {
+   struct radv_meta_blit2d_surf surf;
+   surf.format = format;
+   surf.image = image;
+   surf.level = range->baseMipLevel + l;
+   surf.layer = range->baseArrayLayer + s;
+   surf.aspect_mask = range->aspectMask;
+   radv_meta_clear_image_cs(cmd_buffer, 
&surf,
+
&internal_clear_value.color);
+   } else {
+   radv_clear_image_layer(cmd_buffer, 
image, image_layout,
+  range, format, 
l, s, &internal_clear_value);
+   }
}
}
}
 }
 
+union meta_saved_state {
+   struct radv_meta_saved_state gfx;
+   struct radv_meta_saved_compute_state compute;
+};
+
 void radv_CmdClearColorImage(
VkCommandBuffer commandBuffer,
VkImage image_h,
@@ -1143,15 +1161,22 @@ void radv_CmdClearColorImage(
 {
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
RADV_FROM_HANDLE(radv_image, image, image_h);
-   struct radv_meta_saved_state saved_state;
+   union meta_saved_state saved_state;
+   bool cs = cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE;
 
-   radv_meta_save_graphics_reset_vport_scissor(&saved_state, cmd_buffer);
+   if (cs)
+   radv_meta_begin_cleari(cmd_buffer, &saved_state.compute);
+   else
+   radv_meta_save_graphics_reset_vport_scissor(&saved_state.gfx, 
cmd_buffer);
 
radv_cmd_clear_image(cmd_buffer, image, imageLayout,
 (const VkClearValue *) pColor,
-rangeCount, pRanges);
+rangeCount, pRanges, cs);
 
-   radv_meta_restore(&saved_state, cmd_buffer);
+   if (cs)
+   radv_meta_end_cleari(cmd_buffer, &saved_state.compute);
+   else
+   radv_meta_restore(&saved_state.gfx, cmd_buffer);
 }
 
 void radv_CmdClearDepthStencilImage(
@@ -1170,7 +1195,7 @@ void radv_CmdClearDepthStencilImage(
 
radv_cmd_clear_image(cmd_buffer, image, imageLayout,
 (const VkClearValue *) pDepthStencil,
-rangeCount, pRanges);
+rangeCount, pRanges, false);
 
radv_meta_restore(&saved_state, cmd_buffer);
 }
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 04/23] radv: Store queue family in command buffers.

2016-12-18 Thread Bas Nieuwenhuizen
From: Dave Airlie 

v2: Added helper (Bas)

Reviewed-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/radv_cmd_buffer.c | 30 --
 src/amd/vulkan/radv_private.h|  7 +++
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 8e99fc0be9..798dd6aa17 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -110,6 +110,25 @@ radv_dynamic_state_copy(struct radv_dynamic_state *dest,
dest->stencil_reference = src->stencil_reference;
 }
 
+bool radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
+{
+   return cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
+  cmd_buffer->device->instance->physicalDevice.rad_info.chip_class 
>= CIK;
+}
+
+enum ring_type radv_queue_family_to_ring(int f) {
+   switch (f) {
+   case RADV_QUEUE_GENERAL:
+   return RING_GFX;
+   case RADV_QUEUE_COMPUTE:
+   return RING_COMPUTE;
+   case RADV_QUEUE_TRANSFER:
+   return RING_DMA;
+   default:
+   unreachable("Unknown queue family");
+   }
+}
+
 static VkResult radv_create_cmd_buffer(
struct radv_device * device,
struct radv_cmd_pool *   pool,
@@ -118,7 +137,7 @@ static VkResult radv_create_cmd_buffer(
 {
struct radv_cmd_buffer *cmd_buffer;
VkResult result;
-
+   unsigned ring;
cmd_buffer = vk_alloc(&pool->alloc, sizeof(*cmd_buffer), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (cmd_buffer == NULL)
@@ -132,14 +151,19 @@ static VkResult radv_create_cmd_buffer(
 
if (pool) {
list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
+   cmd_buffer->queue_family_index = pool->queue_family_index;
+
} else {
/* Init the pool_link so we can safefly call list_del when we 
destroy
 * the command buffer
 */
list_inithead(&cmd_buffer->pool_link);
+   cmd_buffer->queue_family_index = RADV_QUEUE_GENERAL;
}
 
-   cmd_buffer->cs = device->ws->cs_create(device->ws, RING_GFX);
+   ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);
+
+   cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
if (!cmd_buffer->cs) {
result = VK_ERROR_OUT_OF_HOST_MEMORY;
goto fail;
@@ -1775,6 +1799,8 @@ VkResult radv_CreateCommandPool(
 
list_inithead(&pool->cmd_buffers);
 
+   pool->queue_family_index = pCreateInfo->queueFamilyIndex;
+
*pCmdPool = radv_cmd_pool_to_handle(pool);
 
return VK_SUCCESS;
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index aa5b477c19..e58053b1be 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -441,6 +441,8 @@ struct radv_meta_state {
 
 #define RADV_MAX_QUEUE_FAMILIES 3
 
+enum ring_type radv_queue_family_to_ring(int f);
+
 struct radv_queue {
VK_LOADER_DATA  _loader_data;
struct radv_device * device;
@@ -666,9 +668,11 @@ struct radv_cmd_state {
floatoffset_scale;
uint32_t  descriptors_dirty;
 };
+
 struct radv_cmd_pool {
VkAllocationCallbacksalloc;
struct list_head cmd_buffers;
+   uint32_t queue_family_index;
 };
 
 struct radv_cmd_buffer_upload {
@@ -691,6 +695,7 @@ struct radv_cmd_buffer {
VkCommandBufferLevel level;
struct radeon_winsys_cs *cs;
struct radv_cmd_state state;
+   uint32_t queue_family_index;
 
uint8_t push_constants[MAX_PUSH_CONSTANTS_SIZE];
uint32_t dynamic_buffers[16 * MAX_DYNAMIC_BUFFERS];
@@ -703,6 +708,8 @@ struct radv_cmd_buffer {
 
 struct radv_image;
 
+bool radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer);
+
 void si_init_config(struct radv_physical_device *physical_device,
struct radv_cmd_buffer *cmd_buffer);
 void si_write_viewport(struct radeon_winsys_cs *cs, int first_vp,
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 07/23] radv: implement image->image copies using compute shader

2016-12-18 Thread Bas Nieuwenhuizen
From: Dave Airlie 

This is required for having a separate compute queue; we
probably can't use this on the GFX queue due to DCC.

v2: Set coord_components = 2 for itoi texture fetch. (Bas)

Signed-off-by: Dave Airlie 
Reviewed-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/radv_meta.h  |  10 +-
 src/amd/vulkan/radv_meta_bufimage.c | 315 
 src/amd/vulkan/radv_meta_copy.c |  19 ++-
 src/amd/vulkan/radv_private.h   |   5 +
 4 files changed, 343 insertions(+), 6 deletions(-)

diff --git a/src/amd/vulkan/radv_meta.h b/src/amd/vulkan/radv_meta.h
index aa0d30c338..5994a7ce8f 100644
--- a/src/amd/vulkan/radv_meta.h
+++ b/src/amd/vulkan/radv_meta.h
@@ -159,7 +159,10 @@ void radv_meta_begin_bufimage(struct radv_cmd_buffer 
*cmd_buffer,
  struct radv_meta_saved_compute_state *save);
 void radv_meta_end_bufimage(struct radv_cmd_buffer *cmd_buffer,
struct radv_meta_saved_compute_state *save);
-
+void radv_meta_begin_itoi(struct radv_cmd_buffer *cmd_buffer,
+ struct radv_meta_saved_compute_state *save);
+void radv_meta_end_itoi(struct radv_cmd_buffer *cmd_buffer,
+   struct radv_meta_saved_compute_state *save);
 void radv_meta_image_to_buffer(struct radv_cmd_buffer *cmd_buffer,
   struct radv_meta_blit2d_surf *src,
   struct radv_meta_blit2d_buffer *dst,
@@ -171,6 +174,11 @@ void radv_meta_buffer_to_image_cs(struct radv_cmd_buffer 
*cmd_buffer,
  struct radv_meta_blit2d_surf *dst,
  unsigned num_rects,
  struct radv_meta_blit2d_rect *rects);
+void radv_meta_image_to_image_cs(struct radv_cmd_buffer *cmd_buffer,
+struct radv_meta_blit2d_surf *src,
+struct radv_meta_blit2d_surf *dst,
+unsigned num_rects,
+struct radv_meta_blit2d_rect *rects);
 
 void radv_decompress_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer,
 struct radv_image *image,
diff --git a/src/amd/vulkan/radv_meta_bufimage.c 
b/src/amd/vulkan/radv_meta_bufimage.c
index efe02cc854..638208f939 100644
--- a/src/amd/vulkan/radv_meta_bufimage.c
+++ b/src/amd/vulkan/radv_meta_bufimage.c
@@ -420,11 +420,201 @@ radv_device_finish_meta_btoi_state(struct radv_device 
*device)
}
 }
 
+static nir_shader *
+build_nir_itoi_compute_shader(struct radv_device *dev)
+{
+   nir_builder b;
+   const struct glsl_type *buf_type = 
glsl_sampler_type(GLSL_SAMPLER_DIM_2D,
+false,
+false,
+GLSL_TYPE_FLOAT);
+   const struct glsl_type *img_type = 
glsl_sampler_type(GLSL_SAMPLER_DIM_2D,
+false,
+false,
+GLSL_TYPE_FLOAT);
+   nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
+   b.shader->info->name = ralloc_strdup(b.shader, "meta_itoi_cs");
+   b.shader->info->cs.local_size[0] = 16;
+   b.shader->info->cs.local_size[1] = 16;
+   b.shader->info->cs.local_size[2] = 1;
+   nir_variable *input_img = nir_variable_create(b.shader, nir_var_uniform,
+ buf_type, "s_tex");
+   input_img->data.descriptor_set = 0;
+   input_img->data.binding = 0;
+
+   nir_variable *output_img = nir_variable_create(b.shader, 
nir_var_uniform,
+  img_type, "out_img");
+   output_img->data.descriptor_set = 0;
+   output_img->data.binding = 1;
+
+   nir_ssa_def *invoc_id = nir_load_system_value(&b, 
nir_intrinsic_load_local_invocation_id, 0);
+   nir_ssa_def *wg_id = nir_load_system_value(&b, 
nir_intrinsic_load_work_group_id, 0);
+   nir_ssa_def *block_size = nir_imm_ivec4(&b,
+   
b.shader->info->cs.local_size[0],
+   
b.shader->info->cs.local_size[1],
+   
b.shader->info->cs.local_size[2], 0);
+
+   nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), 
invoc_id);
+
+   nir_intrinsic_instr *src_offset = nir_intrinsic_instr_create(b.shader, 
nir_intrinsic_load_push_constant);
+   src_offset->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
+   src_offset->num_components = 2;
+   nir_ssa_dest_init(&src_offset->instr, &src_offset->dest, 2, 32, 
"src_offset");
+   nir_builder_instr_insert(&b, &src_offset->instr);
+
+   nir_intrinsic_inst

[Mesa-dev] [PATCH 22/23] radv: Only emit PFP ME syncs for DMA on the GFX queue.

2016-12-18 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/si_cmd_buffer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index 4b2624cb8e..e3f883f50b 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -776,7 +776,7 @@ static void si_emit_cp_dma_copy_buffer(struct 
radv_cmd_buffer *cmd_buffer,
 * indices. If we wanted to execute CP DMA in PFP, this packet
 * should precede it.
 */
-   if (sync_flag) {
+   if (sync_flag && cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
radeon_emit(cs, 0);
}
@@ -816,7 +816,7 @@ static void si_emit_cp_dma_clear_buffer(struct 
radv_cmd_buffer *cmd_buffer,
}
 
/* See "copy_buffer" for explanation. */
-   if (sync_flag) {
+   if (sync_flag && cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
radeon_emit(cs, 0);
}
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 20/23] radv: Don't enable CMASK on compute queues.

2016-12-18 Thread Bas Nieuwenhuizen
We can't fast clear on compute queues.

Signed-off-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/radv_cmd_buffer.c | 40 
 src/amd/vulkan/radv_image.c  | 18 --
 src/amd/vulkan/radv_meta_clear.c |  2 +-
 src/amd/vulkan/radv_private.h| 10 +-
 4 files changed, 62 insertions(+), 8 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 3c5fe25ce6..0572cb85e5 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -36,6 +36,8 @@ static void radv_handle_image_transition(struct 
radv_cmd_buffer *cmd_buffer,
 struct radv_image *image,
 VkImageLayout src_layout,
 VkImageLayout dst_layout,
+int src_family,
+int dst_family,
 VkImageSubresourceRange range,
 VkImageAspectFlags pending_clears);
 
@@ -1207,7 +1209,7 @@ static void radv_handle_subpass_image_transition(struct 
radv_cmd_buffer *cmd_buf
radv_handle_image_transition(cmd_buffer,
 view->image,
 
cmd_buffer->state.attachments[idx].current_layout,
-att.layout, range,
+att.layout, 0, 0, range,
 
cmd_buffer->state.attachments[idx].pending_clear_aspects);
 
cmd_buffer->state.attachments[idx].current_layout = att.layout;
@@ -2386,6 +2388,8 @@ static void radv_handle_cmask_image_transition(struct 
radv_cmd_buffer *cmd_buffe
   struct radv_image *image,
   VkImageLayout src_layout,
   VkImageLayout dst_layout,
+  unsigned src_queue_mask,
+  unsigned dst_queue_mask,
   VkImageSubresourceRange range,
   VkImageAspectFlags 
pending_clears)
 {
@@ -2394,8 +2398,8 @@ static void radv_handle_cmask_image_transition(struct 
radv_cmd_buffer *cmd_buffe
radv_initialise_cmask(cmd_buffer, image, 0xu);
else
radv_initialise_cmask(cmd_buffer, image, 0xu);
-   } else if (radv_layout_has_cmask(image, src_layout) &&
-  !radv_layout_has_cmask(image, dst_layout)) {
+   } else if (radv_layout_has_cmask(image, src_layout, src_queue_mask) &&
+  !radv_layout_has_cmask(image, dst_layout, dst_queue_mask)) {
radv_fast_clear_flush_image_inplace(cmd_buffer, image);
}
 }
@@ -2436,16 +2440,40 @@ static void radv_handle_image_transition(struct 
radv_cmd_buffer *cmd_buffer,
 struct radv_image *image,
 VkImageLayout src_layout,
 VkImageLayout dst_layout,
+int src_family,
+int dst_family,
 VkImageSubresourceRange range,
 VkImageAspectFlags pending_clears)
 {
+   if (image->exclusive && src_family != dst_family) {
+   /* This is an acquire or a release operation and there will be
+* a corresponding release/acquire. Do the transition in the
+* most flexible queue. */
+
+   assert(src_family == cmd_buffer->queue_family_index ||
+  dst_family == cmd_buffer->queue_family_index);
+
+   if (cmd_buffer->queue_family_index == RADV_QUEUE_TRANSFER)
+   return;
+
+   if (cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
+   (src_family == RADV_QUEUE_GENERAL ||
+dst_family == RADV_QUEUE_GENERAL))
+   return;
+   }
+
+   unsigned src_queue_mask = radv_image_queue_family_mask(image, 
src_family);
+   unsigned dst_queue_mask = radv_image_queue_family_mask(image, 
dst_family);
+
if (image->htile.size)
radv_handle_depth_image_transition(cmd_buffer, image, 
src_layout,
   dst_layout, range, 
pending_clears);
 
if (image->cmask.size)
radv_handle_cmask_image_transition(cmd_buffer, image, 
src_layout,
-  dst_layout, range, 
pending_clears);
+  dst_layout, src_queue_mask,
+  

[Mesa-dev] [PATCH 09/23] radv: clear image implementation for compute queue

2016-12-18 Thread Bas Nieuwenhuizen
From: Dave Airlie 

Reviewed-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/radv_meta.h  |   7 +
 src/amd/vulkan/radv_meta_bufimage.c | 269 ++--
 src/amd/vulkan/radv_private.h   |   5 +
 3 files changed, 272 insertions(+), 9 deletions(-)

diff --git a/src/amd/vulkan/radv_meta.h b/src/amd/vulkan/radv_meta.h
index 5994a7ce8f..55f3a4b132 100644
--- a/src/amd/vulkan/radv_meta.h
+++ b/src/amd/vulkan/radv_meta.h
@@ -163,6 +163,10 @@ void radv_meta_begin_itoi(struct radv_cmd_buffer 
*cmd_buffer,
  struct radv_meta_saved_compute_state *save);
 void radv_meta_end_itoi(struct radv_cmd_buffer *cmd_buffer,
struct radv_meta_saved_compute_state *save);
+void radv_meta_begin_cleari(struct radv_cmd_buffer *cmd_buffer,
+   struct radv_meta_saved_compute_state *save);
+void radv_meta_end_cleari(struct radv_cmd_buffer *cmd_buffer,
+ struct radv_meta_saved_compute_state *save);
 void radv_meta_image_to_buffer(struct radv_cmd_buffer *cmd_buffer,
   struct radv_meta_blit2d_surf *src,
   struct radv_meta_blit2d_buffer *dst,
@@ -179,6 +183,9 @@ void radv_meta_image_to_image_cs(struct radv_cmd_buffer 
*cmd_buffer,
 struct radv_meta_blit2d_surf *dst,
 unsigned num_rects,
 struct radv_meta_blit2d_rect *rects);
+void radv_meta_clear_image_cs(struct radv_cmd_buffer *cmd_buffer,
+ struct radv_meta_blit2d_surf *dst,
+ const VkClearColorValue *clear_color);
 
 void radv_decompress_depth_image_inplace(struct radv_cmd_buffer *cmd_buffer,
 struct radv_image *image,
diff --git a/src/amd/vulkan/radv_meta_bufimage.c 
b/src/amd/vulkan/radv_meta_bufimage.c
index 638208f939..984b3472e8 100644
--- a/src/amd/vulkan/radv_meta_bufimage.c
+++ b/src/amd/vulkan/radv_meta_bufimage.c
@@ -609,12 +609,159 @@ radv_device_finish_meta_itoi_state(struct radv_device 
*device)
}
 }
 
+static nir_shader *
+build_nir_cleari_compute_shader(struct radv_device *dev)
+{
+   nir_builder b;
+   const struct glsl_type *img_type = 
glsl_sampler_type(GLSL_SAMPLER_DIM_2D,
+false,
+false,
+GLSL_TYPE_FLOAT);
+   nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
+   b.shader->info->name = ralloc_strdup(b.shader, "meta_cleari_cs");
+   b.shader->info->cs.local_size[0] = 16;
+   b.shader->info->cs.local_size[1] = 16;
+   b.shader->info->cs.local_size[2] = 1;
+
+   nir_variable *output_img = nir_variable_create(b.shader, 
nir_var_uniform,
+  img_type, "out_img");
+   output_img->data.descriptor_set = 0;
+   output_img->data.binding = 0;
+
+   nir_ssa_def *invoc_id = nir_load_system_value(&b, 
nir_intrinsic_load_local_invocation_id, 0);
+   nir_ssa_def *wg_id = nir_load_system_value(&b, 
nir_intrinsic_load_work_group_id, 0);
+   nir_ssa_def *block_size = nir_imm_ivec4(&b,
+   
b.shader->info->cs.local_size[0],
+   
b.shader->info->cs.local_size[1],
+   
b.shader->info->cs.local_size[2], 0);
+
+   nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), 
invoc_id);
+
+   nir_intrinsic_instr *clear_val = nir_intrinsic_instr_create(b.shader, 
nir_intrinsic_load_push_constant);
+   clear_val->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
+   clear_val->num_components = 4;
+   nir_ssa_dest_init(&clear_val->instr, &clear_val->dest, 4, 32, 
"clear_value");
+   nir_builder_instr_insert(&b, &clear_val->instr);
+
+   nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, 
nir_intrinsic_image_store);
+   store->src[0] = nir_src_for_ssa(global_id);
+   store->src[1] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32));
+   store->src[2] = nir_src_for_ssa(&clear_val->dest.ssa);
+   store->variables[0] = nir_deref_var_create(store, output_img);
+
+   nir_builder_instr_insert(&b, &store->instr);
+   return b.shader;
+}
+
+static VkResult
+radv_device_init_meta_cleari_state(struct radv_device *device)
+{
+   VkResult result;
+   struct radv_shader_module cs = { .nir = NULL };
+
+   zero(device->meta_state.cleari);
+
+   cs.nir = build_nir_cleari_compute_shader(device);
+
+   /*
+* two descriptors one for the image being sampled
+* one for the buffer being written.
+*/
+   VkDescriptorSetLayoutCreateInfo ds_create_info = {
+   .sType = VK_STRUCTURE_TYPE_DESCRIPT

[Mesa-dev] [PATCH 03/23] radv: start fixing up queue allocate for multiple queues

2016-12-18 Thread Bas Nieuwenhuizen
From: Dave Airlie 

v2: Fix error handling and zero-initialize the device (Bas)

Reviewed-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/radv_device.c  | 52 ++-
 src/amd/vulkan/radv_private.h | 16 +
 2 files changed, 53 insertions(+), 15 deletions(-)

diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index c1438feb85..0cac5bc989 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -619,10 +619,13 @@ void radv_GetPhysicalDeviceMemoryProperties(
 }
 
 static void
-radv_queue_init(struct radv_device *device, struct radv_queue *queue)
+radv_queue_init(struct radv_device *device, struct radv_queue *queue,
+   int queue_family_index, int idx)
 {
queue->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
queue->device = device;
+   queue->queue_family_index = queue_family_index;
+   queue->queue_idx = idx;
 }
 
 static void
@@ -659,6 +662,8 @@ VkResult radv_CreateDevice(
if (!device)
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
 
+   memset(device, 0, sizeof(*device));
+
device->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
device->instance = physical_device->instance;
device->shader_stats_dump = false;
@@ -669,18 +674,33 @@ VkResult radv_CreateDevice(
else
device->alloc = physical_device->instance->alloc;
 
+   for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
+   const VkDeviceQueueCreateInfo *queue_create = 
&pCreateInfo->pQueueCreateInfos[i];
+   uint32_t qfi = queue_create->queueFamilyIndex;
+
+   device->queues[qfi] = vk_alloc(&device->alloc,
+  queue_create->queueCount * 
sizeof(struct radv_queue), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+   if (!device->queues[qfi]) {
+   result = VK_ERROR_OUT_OF_HOST_MEMORY;
+   goto fail;
+   }
+
+   device->queue_count[qfi] = queue_create->queueCount;
+
+   for (unsigned q = 0; q < queue_create->queueCount; q++)
+   radv_queue_init(device, &device->queues[qfi][q], qfi, 
q);
+   }
+
device->hw_ctx = device->ws->ctx_create(device->ws);
if (!device->hw_ctx) {
result = VK_ERROR_OUT_OF_HOST_MEMORY;
-   goto fail_free;
+   goto fail;
}
 
-   radv_queue_init(device, &device->queue);
-
result = radv_device_init_meta(device);
if (result != VK_SUCCESS) {
device->ws->ctx_destroy(device->hw_ctx);
-   goto fail_free;
+   goto fail;
}
device->allow_fast_clears = env_var_as_boolean("RADV_FAST_CLEARS", 
false);
device->allow_dcc = !env_var_as_boolean("RADV_DCC_DISABLE", false);
@@ -697,7 +717,14 @@ VkResult radv_CreateDevice(
device->ws->cs_finalize(device->empty_cs);
*pDevice = radv_device_to_handle(device);
return VK_SUCCESS;
-fail_free:
+
+fail:
+   for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) {
+   for (unsigned q = 0; q < device->queue_count[i]; q++)
+   radv_queue_finish(&device->queues[i][q]);
+   if (device->queue_count[i])
+   vk_free(&device->alloc, device->queues[i]);
+   }
vk_free(&device->alloc, device);
return result;
 }
@@ -709,7 +736,12 @@ void radv_DestroyDevice(
RADV_FROM_HANDLE(radv_device, device, _device);
 
device->ws->ctx_destroy(device->hw_ctx);
-   radv_queue_finish(&device->queue);
+   for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) {
+   for (unsigned q = 0; q < device->queue_count[i]; q++)
+   radv_queue_finish(&device->queues[i][q]);
+   if (device->queue_count[i])
+   vk_free(&device->alloc, device->queues[i]);
+   }
radv_device_finish_meta(device);
 
vk_free(&device->alloc, device);
@@ -783,15 +815,13 @@ VkResult radv_EnumerateDeviceLayerProperties(
 
 void radv_GetDeviceQueue(
VkDevice_device,
-   uint32_tqueueNodeIndex,
+   uint32_tqueueFamilyIndex,
uint32_tqueueIndex,
VkQueue*pQueue)
 {
RADV_FROM_HANDLE(radv_device, device, _device);
 
-   assert(queueIndex == 0);
-
-   *pQueue = radv_queue_to_handle(&device->queue);
+   *pQueue = 
radv_queue_to_handle(&device->queues[queueFamilyIndex][queueIndex]);
 }
 
 VkResult radv_QueueSubmit(
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 3d4b111d25..aa5b477c19 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -434,12 +434,18 @@ struct radv

[Mesa-dev] [PATCH 19/23] radv: Use RELEASE_MEM packet for MEC timestamp query.

2016-12-18 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen 
---
 src/amd/common/sid.h|  1 +
 src/amd/vulkan/radv_query.c | 29 -
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/src/amd/common/sid.h b/src/amd/common/sid.h
index 3b3983fe27..0a2c616e64 100644
--- a/src/amd/common/sid.h
+++ b/src/amd/common/sid.h
@@ -156,6 +156,7 @@
  * DST_SEL=MC. Only CIK chips are affected.
  */
 /*#define PKT3_EVENT_WRITE_EOS   0x48*/ /* fix CP DMA before 
uncommenting */
+#define PKT3_RELEASE_MEM   0x49
 #define PKT3_ONE_REG_WRITE 0x57 /* not on CIK */
 #define PKT3_ACQUIRE_MEM   0x58 /* new for CIK */
 #define PKT3_SET_CONFIG_REG0x68
diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c
index 185968689a..06762dee08 100644
--- a/src/amd/vulkan/radv_query.c
+++ b/src/amd/vulkan/radv_query.c
@@ -387,6 +387,7 @@ void radv_CmdWriteTimestamp(
 {
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
+   bool mec = radv_cmd_buffer_uses_mec(cmd_buffer);
struct radeon_winsys_cs *cs = cmd_buffer->cs;
uint64_t va = cmd_buffer->device->ws->buffer_get_va(pool->bo);
uint64_t avail_va = va + pool->availability_offset + 4 * query;
@@ -394,17 +395,27 @@ void radv_CmdWriteTimestamp(
 
cmd_buffer->device->ws->cs_add_buffer(cs, pool->bo, 5);
 
-   MAYBE_UNUSED unsigned cdw_max = 
radeon_check_space(cmd_buffer->device->ws, cs, 11);
-
-   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
-   radeon_emit(cs, EVENT_TYPE(V_028A90_BOTTOM_OF_PIPE_TS) | 
EVENT_INDEX(5));
-   radeon_emit(cs, query_va);
-   radeon_emit(cs, (3 << 29) | ((query_va >> 32) & 0xFFFF));
-   radeon_emit(cs, 0);
-   radeon_emit(cs, 0);
+   MAYBE_UNUSED unsigned cdw_max = 
radeon_check_space(cmd_buffer->device->ws, cs, 12);
+
+   if (mec) {
+   radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 5, 0));
+   radeon_emit(cs, EVENT_TYPE(V_028A90_BOTTOM_OF_PIPE_TS) | 
EVENT_INDEX(5));
+   radeon_emit(cs, 3 << 29);
+   radeon_emit(cs, query_va);
+   radeon_emit(cs, query_va >> 32);
+   radeon_emit(cs, 0);
+   radeon_emit(cs, 0);
+   } else {
+   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
+   radeon_emit(cs, EVENT_TYPE(V_028A90_BOTTOM_OF_PIPE_TS) | 
EVENT_INDEX(5));
+   radeon_emit(cs, query_va);
+   radeon_emit(cs, (3 << 29) | ((query_va >> 32) & 0xFFFF));
+   radeon_emit(cs, 0);
+   radeon_emit(cs, 0);
+   }
 
radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
-   radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) |
+   radeon_emit(cs, S_370_DST_SEL(mec ? V_370_MEM_ASYNC : 
V_370_MEMORY_SYNC) |
S_370_WR_CONFIRM(1) |
S_370_ENGINE_SEL(V_370_ME));
radeon_emit(cs, avail_va);
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 13/23] radv: init compute queue and avoid initing transfer queues

2016-12-18 Thread Bas Nieuwenhuizen
From: Dave Airlie 

Reviewed-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/radv_cmd_buffer.c | 41 
 src/amd/vulkan/radv_private.h|  2 ++
 src/amd/vulkan/si_cmd_buffer.c   |  7 ---
 3 files changed, 35 insertions(+), 15 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 798dd6aa17..7d7f55a145 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -1393,17 +1393,33 @@ VkResult radv_BeginCommandBuffer(
 
/* setup initial configuration into command buffer */
if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
-   /* Flush read caches at the beginning of CS not flushed by the 
kernel. */
-   cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_ICACHE |
-   RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
-   RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
-   RADV_CMD_FLAG_INV_VMEM_L1 |
-   RADV_CMD_FLAG_INV_SMEM_L1 |
-   RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER |
-   RADV_CMD_FLAG_INV_GLOBAL_L2;
-   si_init_config(&cmd_buffer->device->instance->physicalDevice, 
cmd_buffer);
-   radv_set_db_count_control(cmd_buffer);
-   si_emit_cache_flush(cmd_buffer);
+   switch (cmd_buffer->queue_family_index) {
+   case RADV_QUEUE_GENERAL:
+   /* Flush read caches at the beginning of CS not flushed 
by the kernel. */
+   cmd_buffer->state.flush_bits |= 
RADV_CMD_FLAG_INV_ICACHE |
+   RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
+   RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
+   RADV_CMD_FLAG_INV_VMEM_L1 |
+   RADV_CMD_FLAG_INV_SMEM_L1 |
+   RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER |
+   RADV_CMD_FLAG_INV_GLOBAL_L2;
+   
si_init_config(&cmd_buffer->device->instance->physicalDevice, cmd_buffer);
+   radv_set_db_count_control(cmd_buffer);
+   si_emit_cache_flush(cmd_buffer);
+   break;
+   case RADV_QUEUE_COMPUTE:
+   cmd_buffer->state.flush_bits = RADV_CMD_FLAG_INV_ICACHE 
|
+   RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
+   RADV_CMD_FLAG_INV_VMEM_L1 |
+   RADV_CMD_FLAG_INV_SMEM_L1 |
+   RADV_CMD_FLAG_INV_GLOBAL_L2;
+   
si_init_compute(&cmd_buffer->device->instance->physicalDevice, cmd_buffer);
+   si_emit_cache_flush(cmd_buffer);
+   break;
+   case RADV_QUEUE_TRANSFER:
+   default:
+   break;
+   }
}
 
if (pBeginInfo->flags & 
VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
@@ -1539,7 +1555,8 @@ VkResult radv_EndCommandBuffer(
 {
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
 
-   si_emit_cache_flush(cmd_buffer);
+   if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER)
+   si_emit_cache_flush(cmd_buffer);
if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs) ||
cmd_buffer->record_fail)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 849c54006d..e6f6c29c91 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -720,6 +720,8 @@ struct radv_image;
 
 bool radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer);
 
+void si_init_compute(struct radv_physical_device *physical_device,
+struct radv_cmd_buffer *cmd_buffer);
 void si_init_config(struct radv_physical_device *physical_device,
struct radv_cmd_buffer *cmd_buffer);
 void si_write_viewport(struct radeon_winsys_cs *cs, int first_vp,
diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index a61a950de6..5ac2a14809 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -170,10 +170,11 @@ si_write_harvested_raster_configs(struct 
radv_physical_device *physical_device,
   S_030800_INSTANCE_BROADCAST_WRITES(1));
 }
 
-static void
+void
 si_init_compute(struct radv_physical_device *physical_device,
-struct radeon_winsys_cs *cs)
+struct radv_cmd_buffer *cmd_buffer)
 {
+   struct radeon_winsys_cs *cs = cmd_buffer->cs;
radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
radeon_emit(cs, 0);
radeon_emit(cs, 0);
@@ -419,7 +420,7 @@ void si_init_config(struct radv_physical_device 
*physical_device,
if (physical_device->rad_info.family == CHIP_STONEY)
radeo

[Mesa-dev] [PATCH 02/23] radv/winsys: start adding support for DMA/compute queue

2016-12-18 Thread Bas Nieuwenhuizen
From: Dave Airlie 

Reviewed-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c | 25 -
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c 
b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
index 56bfbb2a47..325458f41b 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -54,6 +54,7 @@ struct radv_amdgpu_cs {
boolis_chained;
 
int buffer_hash_table[1024];
+   unsignedhw_ip;
 };
 
 static inline struct radv_amdgpu_cs *
@@ -62,6 +63,19 @@ radv_amdgpu_cs(struct radeon_winsys_cs *base)
return (struct radv_amdgpu_cs*)base;
 }
 
+static int ring_to_hw_ip(enum ring_type ring)
+{
+   switch (ring) {
+   case RING_GFX:
+   return AMDGPU_HW_IP_GFX;
+   case RING_DMA:
+   return AMDGPU_HW_IP_DMA;
+   case RING_COMPUTE:
+   return AMDGPU_HW_IP_COMPUTE;
+   default:
+   unreachable("unsupported ring");
+   }
+}
 
 static void radv_amdgpu_request_to_fence(struct radv_amdgpu_ctx *ctx,
 struct amdgpu_cs_fence *fence,
@@ -137,6 +151,7 @@ static boolean radv_amdgpu_init_cs(struct radv_amdgpu_cs 
*cs,
for (int i = 0; i < ARRAY_SIZE(cs->buffer_hash_table); ++i)
cs->buffer_hash_table[i] = -1;
 
+   cs->hw_ip = ring_to_hw_ip(ring_type);
return true;
 }
 
@@ -151,7 +166,7 @@ radv_amdgpu_cs_create(struct radeon_winsys *ws,
return NULL;
 
cs->ws = radv_amdgpu_winsys(ws);
-   radv_amdgpu_init_cs(cs, RING_GFX);
+   radv_amdgpu_init_cs(cs, ring_type);
 
if (cs->ws->use_ib_bos) {
cs->ib_buffer = ws->buffer_create(ws, ib_size, 0,
@@ -526,7 +541,7 @@ static int radv_amdgpu_winsys_cs_submit_chained(struct 
radeon_winsys_ctx *_ctx,
return r;
}
 
-   request.ip_type = AMDGPU_HW_IP_GFX;
+   request.ip_type = cs0->hw_ip;
request.number_of_ibs = 1;
request.ibs = &cs0->ib;
request.resources = bo_list;
@@ -576,7 +591,7 @@ static int radv_amdgpu_winsys_cs_submit_fallback(struct 
radeon_winsys_ctx *_ctx,
return r;
}
 
-   request.ip_type = AMDGPU_HW_IP_GFX;
+   request.ip_type = cs0->hw_ip;
request.resources = bo_list;
request.number_of_ibs = cnt;
request.ibs = ibs;
@@ -676,7 +691,7 @@ static int radv_amdgpu_winsys_cs_submit_sysmem(struct 
radeon_winsys_ctx *_ctx,
ib.size = size;
ib.ib_mc_address = ws->buffer_get_va(bo);
 
-   request.ip_type = AMDGPU_HW_IP_GFX;
+   request.ip_type = cs0->hw_ip;
request.resources = bo_list;
request.number_of_ibs = 1;
request.ibs = &ib;
@@ -759,7 +774,7 @@ static bool radv_amdgpu_ctx_wait_idle(struct 
radeon_winsys_ctx *rwctx)
struct amdgpu_cs_fence fence;
 
fence.context = ctx->ctx;
-   fence.ip_type = RING_GFX;
+   fence.ip_type = AMDGPU_HW_IP_GFX;
fence.ip_instance = 0;
fence.ring = 0;
fence.fence = ctx->last_seq_no;
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 21/23] radv: Create an empty CS per ring type.

2016-12-18 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/radv_device.c  | 27 ---
 src/amd/vulkan/radv_private.h |  2 +-
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 3aac247f8b..2e802d2b7e 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -703,18 +703,30 @@ VkResult radv_CreateDevice(
goto fail;
}
device->allow_fast_clears = env_var_as_boolean("RADV_FAST_CLEARS", 
false);
-   device->allow_dcc = !env_var_as_boolean("RADV_DCC_DISABLE", false);
+   device->allow_dcc = !env_var_as_boolean("RADV_DCC_DISABLE", true);
device->shader_stats_dump = env_var_as_boolean("RADV_SHADER_STATS", 
false);
 
if (device->allow_fast_clears && device->allow_dcc)
radv_finishme("DCC fast clears have not been tested\n");
 
radv_device_init_msaa(device);
-   device->empty_cs = device->ws->cs_create(device->ws, RING_GFX);
-   radeon_emit(device->empty_cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
-   radeon_emit(device->empty_cs, CONTEXT_CONTROL_LOAD_ENABLE(1));
-   radeon_emit(device->empty_cs, CONTEXT_CONTROL_SHADOW_ENABLE(1));
-   device->ws->cs_finalize(device->empty_cs);
+
+   for (int family = 0; family < RADV_MAX_QUEUE_FAMILIES; ++family) {
+   device->empty_cs[family] = device->ws->cs_create(device->ws, 
family);
+   switch (family) {
+   case RADV_QUEUE_GENERAL:
+   radeon_emit(device->empty_cs[family], 
PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
+   radeon_emit(device->empty_cs[family], 
CONTEXT_CONTROL_LOAD_ENABLE(1));
+   radeon_emit(device->empty_cs[family], 
CONTEXT_CONTROL_SHADOW_ENABLE(1));
+   break;
+   case RADV_QUEUE_COMPUTE:
+   radeon_emit(device->empty_cs[family], PKT3(PKT3_NOP, 0, 
0));
+   radeon_emit(device->empty_cs[family], 0);
+   break;
+   }
+   device->ws->cs_finalize(device->empty_cs[family]);
+   }
+
*pDevice = radv_device_to_handle(device);
return VK_SUCCESS;
 
@@ -869,7 +881,8 @@ VkResult radv_QueueSubmit(
 
if (fence) {
if (!submitCount)
-   ret = queue->device->ws->cs_submit(ctx, 
queue->queue_idx, &queue->device->empty_cs,
+   ret = queue->device->ws->cs_submit(ctx, 
queue->queue_idx,
+  
&queue->device->empty_cs[queue->queue_family_index],
   1, NULL, 0, NULL, 0, 
false, base_fence);
 
fence->submitted = true;
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 03d295986f..e15556ea57 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -473,7 +473,7 @@ struct radv_device {
 
struct radv_queue *queues[RADV_MAX_QUEUE_FAMILIES];
int queue_count[RADV_MAX_QUEUE_FAMILIES];
-   struct radeon_winsys_cs *empty_cs;
+   struct radeon_winsys_cs *empty_cs[RADV_MAX_QUEUE_FAMILIES];
 
bool allow_fast_clears;
bool allow_dcc;
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 14/23] radv: pass queue index into winsys submission

2016-12-18 Thread Bas Nieuwenhuizen
From: Dave Airlie 

This is so we can submit on separate queues if needed

Reviewed-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/radv_device.c  |  4 ++--
 src/amd/vulkan/radv_radeon_winsys.h   |  1 +
 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c | 13 ++---
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index cc89387ff7..fd0ef720d8 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -855,7 +855,7 @@ VkResult radv_QueueSubmit(
if ((cmd_buffer->usage_flags & 
VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
can_patch = false;
}
-   ret = queue->device->ws->cs_submit(ctx, cs_array,
+   ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, 
cs_array,
   
pSubmits[i].commandBufferCount,
   can_patch, base_fence);
if (ret)
@@ -865,7 +865,7 @@ VkResult radv_QueueSubmit(
 
if (fence) {
if (!submitCount)
-   ret = queue->device->ws->cs_submit(ctx, 
&queue->device->empty_cs,
+   ret = queue->device->ws->cs_submit(ctx, 
queue->queue_idx, &queue->device->empty_cs,
   1, false, 
base_fence);
 
fence->submitted = true;
diff --git a/src/amd/vulkan/radv_radeon_winsys.h 
b/src/amd/vulkan/radv_radeon_winsys.h
index f29071be94..38cb4408ff 100644
--- a/src/amd/vulkan/radv_radeon_winsys.h
+++ b/src/amd/vulkan/radv_radeon_winsys.h
@@ -301,6 +301,7 @@ struct radeon_winsys {
void (*cs_grow)(struct radeon_winsys_cs * cs, size_t min_size);
 
int (*cs_submit)(struct radeon_winsys_ctx *ctx,
+int queue_index,
 struct radeon_winsys_cs **cs_array,
 unsigned cs_count,
 bool can_patch,
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c 
b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
index fc02d49263..7337918680 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -510,6 +510,7 @@ static void radv_assign_last_submit(struct radv_amdgpu_ctx 
*ctx,
 }
 
 static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx,
+   int queue_idx,
struct radeon_winsys_cs 
**cs_array,
unsigned cs_count,
struct radeon_winsys_fence 
*_fence)
@@ -550,6 +551,7 @@ static int radv_amdgpu_winsys_cs_submit_chained(struct 
radeon_winsys_ctx *_ctx,
}
 
request.ip_type = cs0->hw_ip;
+   request.ring = queue_idx;
request.number_of_ibs = 1;
request.ibs = &cs0->ib;
request.resources = bo_list;
@@ -574,6 +576,7 @@ static int radv_amdgpu_winsys_cs_submit_chained(struct 
radeon_winsys_ctx *_ctx,
 }
 
 static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx 
*_ctx,
+int queue_idx,
 struct radeon_winsys_cs 
**cs_array,
 unsigned cs_count,
 struct radeon_winsys_fence 
*_fence)
@@ -600,6 +603,7 @@ static int radv_amdgpu_winsys_cs_submit_fallback(struct 
radeon_winsys_ctx *_ctx,
}
 
request.ip_type = cs0->hw_ip;
+   request.ring = queue_idx;
request.resources = bo_list;
request.number_of_ibs = cnt;
request.ibs = ibs;
@@ -639,6 +643,7 @@ static int radv_amdgpu_winsys_cs_submit_fallback(struct 
radeon_winsys_ctx *_ctx,
 }
 
 static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx,
+  int queue_idx,
   struct radeon_winsys_cs 
**cs_array,
   unsigned cs_count,
   struct radeon_winsys_fence 
*_fence)
@@ -700,6 +705,7 @@ static int radv_amdgpu_winsys_cs_submit_sysmem(struct 
radeon_winsys_ctx *_ctx,
ib.ib_mc_address = ws->buffer_get_va(bo);
 
request.ip_type = cs0->hw_ip;
+   request.ring = queue_idx;
request.resources = bo_list;
request.number_of_ibs = 1;
request.ibs = &ib;
@@ -730,6 +736,7 @@ static int radv_amdgpu_winsys_cs_submit_sysmem(struct 
radeon_winsys_ctx *_ctx,
 }
 
 static int radv_amdgpu_winsys_cs_submit(struct radeon_winsys_ctx *_ctx,
+   int queue_idx,
   

[Mesa-dev] [PATCH 18/23] radv: Implement indirect dispatch for the MEC.

2016-12-18 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/radv_cmd_buffer.c | 26 +-
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 7d7f55a145..3c5fe25ce6 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -2217,16 +2217,24 @@ void radv_CmdDispatchIndirect(
}
}
 
-   radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_BASE, 2, 0) |
-   PKT3_SHADER_TYPE_S(1));
-   radeon_emit(cmd_buffer->cs, 1);
-   radeon_emit(cmd_buffer->cs, va);
-   radeon_emit(cmd_buffer->cs, va >> 32);
+   if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
+   radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) |
+   PKT3_SHADER_TYPE_S(1));
+   radeon_emit(cmd_buffer->cs, va);
+   radeon_emit(cmd_buffer->cs, va >> 32);
+   radeon_emit(cmd_buffer->cs, 1);
+   } else {
+   radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_BASE, 2, 0) |
+   PKT3_SHADER_TYPE_S(1));
+   radeon_emit(cmd_buffer->cs, 1);
+   radeon_emit(cmd_buffer->cs, va);
+   radeon_emit(cmd_buffer->cs, va >> 32);
 
-   radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, 0) |
-   PKT3_SHADER_TYPE_S(1));
-   radeon_emit(cmd_buffer->cs, 0);
-   radeon_emit(cmd_buffer->cs, 1);
+   radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, 0) |
+   PKT3_SHADER_TYPE_S(1));
+   radeon_emit(cmd_buffer->cs, 0);
+   radeon_emit(cmd_buffer->cs, 1);
+   }
 
assert(cmd_buffer->cs->cdw <= cdw_max);
 }
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 23/23] radv: expose the compute queue

2016-12-18 Thread Bas Nieuwenhuizen
From: Dave Airlie 

v2: Don't expose the SDMA queue and use the CIK check also in the
second if. (Bas)

Reviewed-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/radv_device.c | 52 ++--
 1 file changed, 41 insertions(+), 11 deletions(-)

diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 2e802d2b7e..7ae5c1bb86 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -553,20 +553,50 @@ void radv_GetPhysicalDeviceQueueFamilyProperties(
uint32_t*   pCount,
VkQueueFamilyProperties*pQueueFamilyProperties)
 {
+   RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
+   int num_queue_families = 1;
+   bool all_queues = env_var_as_boolean("RADV_SHOW_QUEUES", true);
+   int idx;
+   if (all_queues && pdevice->rad_info.chip_class >= CIK) {
+   if (pdevice->rad_info.compute_rings > 0)
+   num_queue_families++;
+   }
+
if (pQueueFamilyProperties == NULL) {
-   *pCount = 1;
+   *pCount = num_queue_families;
return;
}
-   assert(*pCount >= 1);
-
-   *pQueueFamilyProperties = (VkQueueFamilyProperties) {
-   .queueFlags = VK_QUEUE_GRAPHICS_BIT |
-   VK_QUEUE_COMPUTE_BIT |
-   VK_QUEUE_TRANSFER_BIT,
-   .queueCount = 1,
-   .timestampValidBits = 64,
-   .minImageTransferGranularity = (VkExtent3D) { 1, 1, 1 },
-   };
+
+   if (!*pCount)
+   return;
+
+   idx = 0;
+   if (*pCount >= 1) {
+   pQueueFamilyProperties[idx] = (VkQueueFamilyProperties) {
+   .queueFlags = VK_QUEUE_GRAPHICS_BIT |
+   VK_QUEUE_COMPUTE_BIT |
+   VK_QUEUE_TRANSFER_BIT,
+   .queueCount = 1,
+   .timestampValidBits = 64,
+   .minImageTransferGranularity = (VkExtent3D) { 1, 1, 1 },
+   };
+   idx++;
+   }
+
+   if (!all_queues)
+   return;
+
+   if (pdevice->rad_info.compute_rings > 0 && pdevice->rad_info.chip_class 
>= CIK) {
+   if (*pCount > idx) {
+   pQueueFamilyProperties[idx] = (VkQueueFamilyProperties) 
{
+   .queueFlags = VK_QUEUE_COMPUTE_BIT | 
VK_QUEUE_TRANSFER_BIT,
+   .queueCount = pdevice->rad_info.compute_rings,
+   .timestampValidBits = 64,
+   .minImageTransferGranularity = (VkExtent3D) { 
1, 1, 1 },
+   };
+   idx++;
+   }
+   }
 }
 
 void radv_GetPhysicalDeviceMemoryProperties(
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 11/23] radv/meta: update header info

2016-12-18 Thread Bas Nieuwenhuizen
From: Dave Airlie 

Reviewed-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/radv_meta_bufimage.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/amd/vulkan/radv_meta_bufimage.c 
b/src/amd/vulkan/radv_meta_bufimage.c
index 984b3472e8..24e10b2518 100644
--- a/src/amd/vulkan/radv_meta_bufimage.c
+++ b/src/amd/vulkan/radv_meta_bufimage.c
@@ -25,7 +25,8 @@
 #include "nir/nir_builder.h"
 
 /*
- * Compute shader implementation of image->buffer copy.
+ * GFX queue: Compute shader implementation of image->buffer copy
+ * Compute queue: implementation also of buffer->image, image->image, and 
image clear.
  */
 
 static nir_shader *
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 15/23] radv: add semaphore support

2016-12-18 Thread Bas Nieuwenhuizen
From: Dave Airlie 

Reviewed-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/radv_device.c  | 29 -
 src/amd/vulkan/radv_radeon_winsys.h   |  9 ++
 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c | 45 +--
 3 files changed, 72 insertions(+), 11 deletions(-)

diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index fd0ef720d8..3aac247f8b 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -857,6 +857,10 @@ VkResult radv_QueueSubmit(
}
ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, 
cs_array,
   
pSubmits[i].commandBufferCount,
+  (struct radeon_winsys_sem 
**)pSubmits[i].pWaitSemaphores,
+  
pSubmits[i].waitSemaphoreCount,
+  (struct radeon_winsys_sem 
**)pSubmits[i].pSignalSemaphores,
+  
pSubmits[i].signalSemaphoreCount,
   can_patch, base_fence);
if (ret)
radv_loge("failed to submit CS %d\n", i);
@@ -866,7 +870,7 @@ VkResult radv_QueueSubmit(
if (fence) {
if (!submitCount)
ret = queue->device->ws->cs_submit(ctx, 
queue->queue_idx, &queue->device->empty_cs,
-  1, false, 
base_fence);
+  1, NULL, 0, NULL, 0, 
false, base_fence);
 
fence->submitted = true;
}
@@ -1270,25 +1274,34 @@ VkResult radv_GetFenceStatus(VkDevice _device, VkFence 
_fence)
 // Queue semaphore functions
 
 VkResult radv_CreateSemaphore(
-   VkDevicedevice,
+   VkDevice_device,
const VkSemaphoreCreateInfo*pCreateInfo,
const VkAllocationCallbacks*pAllocator,
VkSemaphore*pSemaphore)
 {
-   /* The DRM execbuffer ioctl always execute in-oder, even between 
different
-* rings. As such, there's nothing to do for the user space semaphore.
-*/
+   RADV_FROM_HANDLE(radv_device, device, _device);
+   struct radeon_winsys_sem *sem;
 
-   *pSemaphore = (VkSemaphore)1;
+   sem = device->ws->create_sem(device->ws);
+   if (!sem)
+   return VK_ERROR_OUT_OF_HOST_MEMORY;
 
+   *pSemaphore = (VkSemaphore)sem;
return VK_SUCCESS;
 }
 
 void radv_DestroySemaphore(
-   VkDevicedevice,
-   VkSemaphore semaphore,
+   VkDevice_device,
+   VkSemaphore _semaphore,
const VkAllocationCallbacks*pAllocator)
 {
+   RADV_FROM_HANDLE(radv_device, device, _device);
+   struct radeon_winsys_sem *sem;
+   if (!_semaphore)
+   return;
+
+   sem = (struct radeon_winsys_sem *)_semaphore;
+   device->ws->destroy_sem(sem);
 }
 
 VkResult radv_CreateEvent(
diff --git a/src/amd/vulkan/radv_radeon_winsys.h 
b/src/amd/vulkan/radv_radeon_winsys.h
index 38cb4408ff..4b738b8cf4 100644
--- a/src/amd/vulkan/radv_radeon_winsys.h
+++ b/src/amd/vulkan/radv_radeon_winsys.h
@@ -253,6 +253,7 @@ struct radeon_bo_metadata {
 
 struct radeon_winsys_bo;
 struct radeon_winsys_fence;
+struct radeon_winsys_sem;
 
 struct radeon_winsys {
void (*destroy)(struct radeon_winsys *ws);
@@ -304,6 +305,10 @@ struct radeon_winsys {
 int queue_index,
 struct radeon_winsys_cs **cs_array,
 unsigned cs_count,
+struct radeon_winsys_sem **wait_sem,
+unsigned wait_sem_count,
+struct radeon_winsys_sem **signal_sem,
+unsigned signal_sem_count,
 bool can_patch,
 struct radeon_winsys_fence *fence);
 
@@ -326,6 +331,10 @@ struct radeon_winsys {
   struct radeon_winsys_fence *fence,
   bool absolute,
   uint64_t timeout);
+
+   struct radeon_winsys_sem *(*create_sem)(struct radeon_winsys *ws);
+   void (*destroy_sem)(struct radeon_winsys_sem *sem);
+
 };
 
 static inline void radeon_emit(struct radeon_winsys_cs *cs, uint32_t value)
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c 
b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
index 7337918680..b24aa99749 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -739,20 +739,40 @@ static int radv_amdgpu_winsys_cs_submit(struct 

[Mesa-dev] [PATCH 12/23] radv/winsys: Make WaitIdle queue aware.

2016-12-18 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/radv_device.c  | 10 +--
 src/amd/vulkan/radv_radeon_winsys.h   |  3 +-
 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c | 34 +--
 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.h |  6 +++-
 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c |  6 ++--
 5 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 0cac5bc989..cc89387ff7 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -879,7 +879,9 @@ VkResult radv_QueueWaitIdle(
 {
RADV_FROM_HANDLE(radv_queue, queue, _queue);
 
-   queue->device->ws->ctx_wait_idle(queue->device->hw_ctx);
+   queue->device->ws->ctx_wait_idle(queue->device->hw_ctx,
+
radv_queue_family_to_ring(queue->queue_family_index),
+queue->queue_idx);
return VK_SUCCESS;
 }
 
@@ -888,7 +890,11 @@ VkResult radv_DeviceWaitIdle(
 {
RADV_FROM_HANDLE(radv_device, device, _device);
 
-   device->ws->ctx_wait_idle(device->hw_ctx);
+   for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) {
+   for (unsigned q = 0; q < device->queue_count[i]; q++) {
+   
radv_QueueWaitIdle(radv_queue_to_handle(&device->queues[i][q]));
+   }
+   }
return VK_SUCCESS;
 }
 
diff --git a/src/amd/vulkan/radv_radeon_winsys.h 
b/src/amd/vulkan/radv_radeon_winsys.h
index db7650c9db..f29071be94 100644
--- a/src/amd/vulkan/radv_radeon_winsys.h
+++ b/src/amd/vulkan/radv_radeon_winsys.h
@@ -286,7 +286,8 @@ struct radeon_winsys {
struct radeon_winsys_ctx *(*ctx_create)(struct radeon_winsys *ws);
void (*ctx_destroy)(struct radeon_winsys_ctx *ctx);
 
-   bool (*ctx_wait_idle)(struct radeon_winsys_ctx *ctx);
+   bool (*ctx_wait_idle)(struct radeon_winsys_ctx *ctx,
+ enum ring_type ring_type, int ring_index);
 
struct radeon_winsys_cs *(*cs_create)(struct radeon_winsys *ws,
  enum ring_type ring_type);
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c 
b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
index 325458f41b..fc02d49263 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -501,6 +501,14 @@ static int radv_amdgpu_create_bo_list(struct 
radv_amdgpu_winsys *ws,
return r;
 }
 
+static void radv_assign_last_submit(struct radv_amdgpu_ctx *ctx,
+   struct amdgpu_cs_request *request)
+{
+   radv_amdgpu_request_to_fence(ctx,
+
&ctx->last_submission[request->ip_type][request->ring],
+request);
+}
+
 static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx,
struct radeon_winsys_cs 
**cs_array,
unsigned cs_count,
@@ -560,7 +568,7 @@ static int radv_amdgpu_winsys_cs_submit_chained(struct 
radeon_winsys_ctx *_ctx,
if (fence)
radv_amdgpu_request_to_fence(ctx, fence, &request);
 
-   ctx->last_seq_no = request.seq_no;
+   radv_assign_last_submit(ctx, &request);
 
return r;
 }
@@ -625,7 +633,7 @@ static int radv_amdgpu_winsys_cs_submit_fallback(struct 
radeon_winsys_ctx *_ctx,
if (fence)
radv_amdgpu_request_to_fence(ctx, fence, &request);
 
-   ctx->last_seq_no = request.seq_no;
+   radv_assign_last_submit(ctx, &request);
 
return 0;
 }
@@ -715,7 +723,9 @@ static int radv_amdgpu_winsys_cs_submit_sysmem(struct 
radeon_winsys_ctx *_ctx,
}
if (fence)
radv_amdgpu_request_to_fence(ctx, fence, &request);
-   ctx->last_seq_no = request.seq_no;
+
+   radv_assign_last_submit(ctx, &request);
+
return 0;
 }
 
@@ -765,22 +775,16 @@ static void radv_amdgpu_ctx_destroy(struct 
radeon_winsys_ctx *rwctx)
FREE(ctx);
 }
 
-static bool radv_amdgpu_ctx_wait_idle(struct radeon_winsys_ctx *rwctx)
+static bool radv_amdgpu_ctx_wait_idle(struct radeon_winsys_ctx *rwctx,
+  enum ring_type ring_type, int ring_index)
 {
struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
+   int ip_type = ring_to_hw_ip(ring_type);
 
-   if (ctx->last_seq_no) {
+   if (ctx->last_submission[ip_type][ring_index].fence) {
uint32_t expired;
-   struct amdgpu_cs_fence fence;
-
-   fence.context = ctx->ctx;
-   fence.ip_type = AMDGPU_HW_IP_GFX;
-   fence.ip_instance = 0;
-   fence.ring = 0;
-   fence.fence = ctx->last_seq_no;
-
-   int ret = amdgpu_cs_query_fence_status(&fence, 10ull, 0,
-   

[Mesa-dev] [PATCH 17/23] radv: update vkCmdUpdateBuffer for the MEC.

2016-12-18 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/radv_meta_buffer.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/amd/vulkan/radv_meta_buffer.c 
b/src/amd/vulkan/radv_meta_buffer.c
index 42297b9ce9..cd2973fa4a 100644
--- a/src/amd/vulkan/radv_meta_buffer.c
+++ b/src/amd/vulkan/radv_meta_buffer.c
@@ -515,6 +515,7 @@ void radv_CmdUpdateBuffer(
 {
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
+   bool mec = radv_cmd_buffer_uses_mec(cmd_buffer);
uint64_t words = dataSize / 4;
uint64_t va = cmd_buffer->device->ws->buffer_get_va(dst_buffer->bo);
va += dstOffset + dst_buffer->offset;
@@ -528,7 +529,8 @@ void radv_CmdUpdateBuffer(
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 
words + 4);
 
radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + words, 
0));
-   radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEMORY_SYNC) |
+   radeon_emit(cmd_buffer->cs, S_370_DST_SEL(mec ?
+   V_370_MEM_ASYNC : 
V_370_MEMORY_SYNC) |
S_370_WR_CONFIRM(1) |
S_370_ENGINE_SEL(V_370_ME));
radeon_emit(cmd_buffer->cs, va);
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 16/23] radv: Implement cache flushing for the MEC.

2016-12-18 Thread Bas Nieuwenhuizen
Signed-off-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/si_cmd_buffer.c | 36 +---
 1 file changed, 29 insertions(+), 7 deletions(-)

diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index 5ac2a14809..4b2624cb8e 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -601,6 +601,16 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
 {
enum chip_class chip_class = 
cmd_buffer->device->instance->physicalDevice.rad_info.chip_class;
unsigned cp_coher_cntl = 0;
+   bool is_compute = cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE;
+
+   if (is_compute)
+   cmd_buffer->state.flush_bits &= 
~(RADV_CMD_FLAG_FLUSH_AND_INV_CB |
+ 
RADV_CMD_FLAG_FLUSH_AND_INV_CB_META |
+ 
RADV_CMD_FLAG_FLUSH_AND_INV_DB |
+ 
RADV_CMD_FLAG_FLUSH_AND_INV_DB_META |
+ 
RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
+ 
RADV_CMD_FLAG_VS_PARTIAL_FLUSH |
+ RADV_CMD_FLAG_VGT_FLUSH);
 
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 128);
 
@@ -679,7 +689,8 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
/* Make sure ME is idle (it executes most packets) before continuing.
 * This prevents read-after-write hazards between PFP and ME.
 */
-   if (cp_coher_cntl || (cmd_buffer->state.flush_bits & 
RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
+   if ((cp_coher_cntl || (cmd_buffer->state.flush_bits & 
RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) &&
+   !radv_cmd_buffer_uses_mec(cmd_buffer)) {
radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
radeon_emit(cmd_buffer->cs, 0);
}
@@ -688,12 +699,23 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
 * Therefore, it should be last. Done in PFP.
 */
if (cp_coher_cntl) {
-   /* ACQUIRE_MEM is only required on a compute ring. */
-   radeon_emit(cmd_buffer->cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
-   radeon_emit(cmd_buffer->cs, cp_coher_cntl);   /* CP_COHER_CNTL 
*/
-   radeon_emit(cmd_buffer->cs, 0xffffffff);  /* CP_COHER_SIZE 
*/
-   radeon_emit(cmd_buffer->cs, 0);   /* CP_COHER_BASE 
*/
-   radeon_emit(cmd_buffer->cs, 0x0000000A);  /* POLL_INTERVAL 
*/
+   if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
+   radeon_emit(cmd_buffer->cs, PKT3(PKT3_ACQUIRE_MEM, 5, 
0) |
+   PKT3_SHADER_TYPE_S(1));
+   radeon_emit(cmd_buffer->cs, cp_coher_cntl);   /* 
CP_COHER_CNTL */
+   radeon_emit(cmd_buffer->cs, 0xffffffff);  /* 
CP_COHER_SIZE */
+   radeon_emit(cmd_buffer->cs, 0xff);/* 
CP_COHER_SIZE_HI */
+   radeon_emit(cmd_buffer->cs, 0);   /* 
CP_COHER_BASE */
+   radeon_emit(cmd_buffer->cs, 0);   /* 
CP_COHER_BASE_HI */
+   radeon_emit(cmd_buffer->cs, 0x0000000A);  /* 
POLL_INTERVAL */
+   } else {
+   /* ACQUIRE_MEM is only required on a compute ring. */
+   radeon_emit(cmd_buffer->cs, PKT3(PKT3_SURFACE_SYNC, 3, 
0));
+   radeon_emit(cmd_buffer->cs, cp_coher_cntl);   /* 
CP_COHER_CNTL */
+   radeon_emit(cmd_buffer->cs, 0xffffffff);  /* 
CP_COHER_SIZE */
+   radeon_emit(cmd_buffer->cs, 0);   /* 
CP_COHER_BASE */
+   radeon_emit(cmd_buffer->cs, 0x0000000A);  /* 
POLL_INTERVAL */
+   }
}
 
cmd_buffer->state.flush_bits = 0;
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] radv: Use correct workgroup size limits.

2016-12-18 Thread Bas Nieuwenhuizen
Not sure where the 16k comes from, but pretty sure 2k is the max.

Signed-off-by: Bas Nieuwenhuizen 
---
 src/amd/vulkan/radv_device.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 6362e9ead1..7a64cc4473 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -475,11 +475,11 @@ void radv_GetPhysicalDeviceProperties(
.maxFragmentCombinedOutputResources   = 8,
.maxComputeSharedMemorySize   = 32768,
.maxComputeWorkGroupCount = { 65535, 65535, 
65535 },
-   .maxComputeWorkGroupInvocations   = 16 * 1024,
+   .maxComputeWorkGroupInvocations   = 2048,
.maxComputeWorkGroupSize = {
-   16 * 1024/*devinfo->max_cs_threads*/,
-   16 * 1024,
-   16 * 1024
+   2048,
+   2048,
+   2048
},
.subPixelPrecisionBits= 4 /* FIXME */,
.subTexelPrecisionBits= 4 /* FIXME */,
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] radv: fix dual source blending

2016-12-18 Thread Bas Nieuwenhuizen
Reviewed-by: Bas Nieuwenhuizen 

Does dual source blending work now with this patch? And do you need me
to commit it?

- Bas


On Fri, Dec 16, 2016 at 2:25 AM, Fredrik Höglund  wrote:
> Add the index to the location when assigning driver locations for
> output variables.
>
> Otherwise two fragment shader outputs declared as:
>
>layout (location = 0, index = 0) out vec4 output1;
>layout (location = 0, index = 1) out vec4 output2;
>
> will end up aliasing one another.
>
> Note that this patch will make the second output variable in the above
> example alias a possible third output variable with location = 1 and
> index = 0. But this shouldn't be a problem in practice since only one
> color attachment is supported when dual-source blending is used.
> ---
>  src/amd/common/ac_nir_to_llvm.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> index d66fefb..90ee917 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -4125,7 +4125,7 @@ static void
>  handle_shader_output_decl(struct nir_to_llvm_context *ctx,
>   struct nir_variable *variable)
>  {
> -   int idx = variable->data.location;
> +   int idx = variable->data.location + variable->data.index;
> unsigned attrib_count = glsl_count_attribute_slots(variable->type, 
> false);
>
> variable->data.driver_location = idx * 4;
> @@ -4155,7 +4155,7 @@ handle_shader_output_decl(struct nir_to_llvm_context 
> *ctx,
>si_build_alloca_undef(ctx, ctx->f32, 
> "");
> }
> }
> -   ctx->output_mask |= ((1ull << attrib_count) - 1) << 
> variable->data.location;
> +   ctx->output_mask |= ((1ull << attrib_count) - 1) << idx;
>  }
>
>  static void
> --
> 2.1.4
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] Mesa: Return GL error INVALID_OPERATION in case invalid format/type

2016-12-18 Thread Xu, Randy
Thanks, Ian

Yes, it's my local gitconfig issue, and it's gone after I add quotes around the 
name and re-commit the patch.

Thanks,
Randy

-Original Message-
From: Ian Romanick [mailto:i...@freedesktop.org] 
Sent: Saturday, December 17, 2016 6:07 AM
To: Xu, Randy 
Cc: mesa-dev@lists.freedesktop.org
Subject: Re: [Mesa-dev] [PATCH] Mesa: Return GL error INVALID_OPERATION in case 
invalid format/type

On 12/15/2016 05:25 PM, Xu, Randy wrote:
> Thanks, Matt
> 
> I have run "git config --global user.name "Randy Xu"" and see it in 
> ~/.gitconfig [user]
> email = randy...@intel.com
> name = Randy Xu

Perhaps you need quotes around the name?  My ~/.gitconfig has:

[user]
name = "Ian Romanick"
email = ian.d.roman...@intel.com

> While I don’t know why the "git send-email --smtp-server=smtp.intel.com 
> --to=mesa-dev@lists.freedesktop.org 0001-.patch" 
> command always cc the email to x...@freedesktop.org , which is not valid. Do 
> you know why?

It's possible something in the local .git/config is causing problems.
If you do 'git show --format=fuller' on the commit, what does it show for 
Author: and Commit: lines?

If you're using git-format-patch to prepare the patches, you can look at the 
patch 00??-*.patch files for problems too.

> Thanks,
> Randy
> 
> -Original Message-
> From: Matt Turner [mailto:matts...@gmail.com]
> Sent: Friday, December 16, 2016 9:20 AM
> To: Xu, Randy 
> Cc: mesa-dev@lists.freedesktop.org; x...@freedesktop.org
> Subject: Re: [Mesa-dev] [PATCH] Mesa: Return GL error 
> INVALID_OPERATION in case invalid format/type
> 
> On Wed, Dec 14, 2016 at 5:10 PM, Randy Xu  wrote:
>> From: "Xu,Randy" 
> 
> Reminder to fix your configured name.
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] Mesa: Fix error code for glTexImage3D in GLES

2016-12-18 Thread Xu, Randy
Hi, Chad & Ian

Thanks for your suggestion, and I understand and agree your point, while the 
texsubimage_error_check (in teximage.c) calls _mesa_error_check_format_and_type 
first, and if error happens, it will return immediately (in 2175) and not call 
texture_format_error_check_gles (in 2184). So I did the patch this way.

Follow your suggestion, we'd better move texture_format_error_check_gles ahead 
of _mesa_error_check_format_and_type, i.e. handle the GLES API ahead. Do you 
agree with that?

Thanks,
Randy 

2131 static GLboolean
2132 texsubimage_error_check(struct gl_context *ctx, GLuint dimensions,
2133 struct gl_texture_object *texObj,
2134 GLenum target, GLint level,
2135 GLint xoffset, GLint yoffset, GLint zoffset,
2136 GLint width, GLint height, GLint depth,
2137 GLenum format, GLenum type, const GLvoid *pixels,
2138 bool dsa, const char *callerName)
2139 {

2169err = _mesa_error_check_format_and_type(ctx, format, type);
2170if (err != GL_NO_ERROR) {
2171   _mesa_error(ctx, err,
2172   "%s(incompatible format = %s, type = %s)",
2173   callerName, _mesa_enum_to_string(format),
2174   _mesa_enum_to_string(type));
2175   return GL_TRUE;
2176}

2183if (_mesa_is_gles(ctx) &&
2184texture_format_error_check_gles(ctx, format, type,
2185texImage->InternalFormat,
2186dimensions, callerName)) {
2187   return GL_TRUE;
2188}



-Original Message-
From: Ian Romanick [mailto:i...@freedesktop.org] 
Sent: Saturday, December 17, 2016 6:02 AM
To: Chad Versace ; Xu, Randy ; 
mesa-dev@lists.freedesktop.org; mesa-sta...@lists.freedesktop.org; 
x...@freedesktop.org
Subject: Re: [Mesa-dev] [PATCH] Mesa: Fix error code for glTexImage3D in GLES

On 12/16/2016 12:49 PM, Chad Versace wrote:
> On Fri 16 Dec 2016, Chad Versace wrote:
>> On Fri 16 Dec 2016, Randy Xu wrote:
>>> From: "Xu,Randy" 
>>>
>>> The ES specification says that TexImage3D should return 
>>> GL_INVALID_OPERATION if the internal format is DEPTH_COMPONENT, 
>>> DEPTH_-STENCIL or STENCIL_INDEX.
>>> The current code returns INVALID_ENUM as 
>>> _mesa_error_check_format_and_type is used by glReadPixels also and 
>>> the GL specification defines "INVALID_ENUM is generated if format is 
>>> DEPTH_STENCIL and type is not UNSIGNED_INT_24_8 or
>>> FLOAT_32_UNSIGNED_INT_24_8_- REV".
>>>
>>> This patch only impacts GLES, which can generate 
>>> GL_INVALID_OPERATION because glReadPixels cannot be used to read depth or 
>>> stencil buffer.
>>> Fixes dEQP-GLES3.functional.negative_api.texture.teximage3d.
>>>
>>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=99076
>>>
>>> Signed-off-by: Xu,Randy 
>>> ---
>>>  src/mesa/main/glformats.c | 2 ++
>>>  1 file changed, 2 insertions(+)
>>
>> Thanks for fixing the dEQP failure. But I think your patch applies 
>> the fix to wrong portion of code.
>>
>> The commit message mentions the internalFormat, but the patch updates 
>> a function to which validates the *format* (not internalFormat).
>> I believe the change should instead be placed in 
>> teximage.c:texture_format_error_check_gles(), which is better for
>> 2 reasons:
>> - That function specifically checks GLES-specific requirements like
>>   this.
>> - It checks the *internalFormat* in addition to the *format*.
>>
>> Also, in the future, please remove the empty lines between the tags 
>> (Bugzilla:, Signed-of-by:) in the commit message. The empty lines can 
>> confuse scripts that parse those tags.
> 
> One more thing: Please insert any relevant quotes from the GLES spec 
> into the code itself as comments. It's ok to put those quotes in the

I was going to suggest the same thing.

> commit message, but they should also go into the code. If it's in the 
> code, developers will easily find the quote without needing to use 
> git-blame.
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 02/19] glsl: Fix wonkey indentation left from previous commit

2016-12-18 Thread Timothy Arceri
Patches 1-2:

Reviewed-by: Timothy Arceri 
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] Mesa: Fix error code for glTexImage3D in GLES

2016-12-18 Thread Xu, Randy
Hi, Chad & Ian

I checked the code again; we cannot easily move texture_format_error_check_gles 
ahead of _mesa_error_check_format_and_type, as the 
texture_format_error_check_gles is based on _mesa_error_check_format_and_type 
to handle some additional restrictions from the GLES API.

I am afraid it's not a minor effort and we should be very careful. Please let 
me know your idea or suggestion. 

Thanks,
Randy

-Original Message-
From: Xu, Randy 
Sent: Monday, December 19, 2016 10:47 AM
To: 'Ian Romanick' ; Chad Versace 
; mesa-dev@lists.freedesktop.org; 
mesa-sta...@lists.freedesktop.org
Subject: RE: [Mesa-dev] [PATCH] Mesa: Fix error code for glTexImage3D in GLES

Hi, Chad & Ian

Thanks for your suggestion, and I understand and agree your point, while the 
texsubimage_error_check (in teximage.c) calls _mesa_error_check_format_and_type 
first, and if error happens, it will return immediately (in 2175) and not call 
texture_format_error_check_gles (in 2184). So I did the patch this way.

Follow your suggestion, we'd better move texture_format_error_check_gles ahead 
of _mesa_error_check_format_and_type, i.e. handle the GLES API ahead. Do you 
agree with that?

Thanks,
Randy 

2131 static GLboolean
2132 texsubimage_error_check(struct gl_context *ctx, GLuint dimensions,
2133 struct gl_texture_object *texObj,
2134 GLenum target, GLint level,
2135 GLint xoffset, GLint yoffset, GLint zoffset,
2136 GLint width, GLint height, GLint depth,
2137 GLenum format, GLenum type, const GLvoid *pixels,
2138 bool dsa, const char *callerName)
2139 {

2169err = _mesa_error_check_format_and_type(ctx, format, type);
2170if (err != GL_NO_ERROR) {
2171   _mesa_error(ctx, err,
2172   "%s(incompatible format = %s, type = %s)",
2173   callerName, _mesa_enum_to_string(format),
2174   _mesa_enum_to_string(type));
2175   return GL_TRUE;
2176}

2183if (_mesa_is_gles(ctx) &&
2184texture_format_error_check_gles(ctx, format, type,
2185texImage->InternalFormat,
2186dimensions, callerName)) {
2187   return GL_TRUE;
2188}



-Original Message-
From: Ian Romanick [mailto:i...@freedesktop.org]
Sent: Saturday, December 17, 2016 6:02 AM
To: Chad Versace ; Xu, Randy ; 
mesa-dev@lists.freedesktop.org; mesa-sta...@lists.freedesktop.org; 
x...@freedesktop.org
Subject: Re: [Mesa-dev] [PATCH] Mesa: Fix error code for glTexImage3D in GLES

On 12/16/2016 12:49 PM, Chad Versace wrote:
> On Fri 16 Dec 2016, Chad Versace wrote:
>> On Fri 16 Dec 2016, Randy Xu wrote:
>>> From: "Xu,Randy" 
>>>
>>> The ES specification says that TexImage3D should return 
>>> GL_INVALID_OPERATION if the internal format is DEPTH_COMPONENT, 
>>> DEPTH_-STENCIL or STENCIL_INDEX.
>>> The current code returns INVALID_ENUM as 
>>> _mesa_error_check_format_and_type is used by glReadPixels also and 
>>> the GL specification defines "INVALID_ENUM is generated if format is 
>>> DEPTH_STENCIL and type is not UNSIGNED_INT_24_8 or
>>> FLOAT_32_UNSIGNED_INT_24_8_- REV".
>>>
>>> This patch only impacts GLES, which can generate 
>>> GL_INVALID_OPERATION because glReadPixels cannot be used to read depth or 
>>> stencil buffer.
>>> Fixes dEQP-GLES3.functional.negative_api.texture.teximage3d.
>>>
>>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=99076
>>>
>>> Signed-off-by: Xu,Randy 
>>> ---
>>>  src/mesa/main/glformats.c | 2 ++
>>>  1 file changed, 2 insertions(+)
>>
>> Thanks for fixing the dEQP failure. But I think your patch applies 
>> the fix to wrong portion of code.
>>
>> The commit message mentions the internalFormat, but the patch updates 
>> a function to which validates the *format* (not internalFormat).
>> I believe the change should instead be placed in 
>> teximage.c:texture_format_error_check_gles(), which is better for
>> 2 reasons:
>> - That function specifically checks GLES-specific requirements like
>>   this.
>> - It checks the *internalFormat* in addition to the *format*.
>>
>> Also, in the future, please remove the empty lines between the tags 
>> (Bugzilla:, Signed-of-by:) in the commit message. The empty lines can 
>> confuse scripts that parse those tags.
> 
> One more thing: Please insert any relevant quotes from the GLES spec 
> into the code itself as comments. It's ok to put those quotes in the

I was going to suggest the same thing.

> commit message, but they should also go into the code. If it's in the 
> code, developers will easily find the quote without needing to use 
> git-blame.
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev

___
mesa-dev mailing list
mesa-d

[Mesa-dev] V7 Loop unrolling in NIR

2016-12-18 Thread Timothy Arceri
V7:
 - partially out of ssa in unrolling pass to avoid phis
 - lots of simplification/tidy ups in the analysis pass
 - if_uses bug fix (missing functionality) in lcssa fixed
 - better support for non trivial loop terminators
 - fixed all loop HURT except 1 that is too big to unroll.

total instructions in shared programs: 12584624 -> 12584621 (-0.00%)
instructions in affected programs: 68507 -> 68504 (-0.00%)
helped: 70
HURT: 170

total cycles in shared programs: 24146 -> 241476226 (-0.01%)
cycles in affected programs: 4060722 -> 4036952 (-0.59%)
helped: 1241
HURT: 1278

total loops in shared programs: 4245 -> 2948 (-30.55%)
loops in affected programs: 1535 -> 238 (-84.50%)
helped: 1142
HURT: 1

LOST:   26
GAINED: 16
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 01/12] i965: allow sampler indirects on all gens

2016-12-18 Thread Timothy Arceri
Without this we will regress the max-samplers piglit test on Gen6
and lower when loop unrolling is done in NIR. There is a check
in the GLSL IR linker that errors when it finds indirects and
EmitNoIndirectSampler is set.

As far as I can tell there is no reason for not enabling this for
all gens regardless of whether they fully support ARB_gpu_shader5
or not.
---
 src/mesa/drivers/dri/i965/brw_compiler.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c 
b/src/mesa/drivers/dri/i965/brw_compiler.c
index 1aa72bc..6a73719 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.c
+++ b/src/mesa/drivers/dri/i965/brw_compiler.c
@@ -133,10 +133,6 @@ brw_compiler_create(void *mem_ctx, const struct 
gen_device_info *devinfo)
   compiler->glsl_compiler_options[i].EmitNoIndirectTemp = is_scalar;
   compiler->glsl_compiler_options[i].OptimizeForAOS = !is_scalar;
 
-  /* !ARB_gpu_shader5 */
-  if (devinfo->gen < 7)
- compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true;
-
   if (is_scalar) {
  compiler->glsl_compiler_options[i].NirOptions = &scalar_nir_options;
   } else {
-- 
2.9.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 03/12] nir: Add a loop analysis pass

2016-12-18 Thread Timothy Arceri
From: Thomas Helland 

This pass detects induction variables and calculates the
trip count of loops to be used for loop unrolling.

I've removed support for float induction values for now, for the
simple reason that they don't appear in my shader-db collection,
and so I don't see it as common enough that we want to pollute the
pass with this in the initial version.

V2: Rebase, adapt to removal of function overloads

V3: (Timothy Arceri)
 - don't try to find trip count if loop terminator conditional is a phi
 - fix trip count for do-while loops
 - replace conditional type != alu assert with return
 - disable unrolling of loops with continues
 - multiple fixes to memory allocation, stop leaking and don't destroy
   structs we want to use for unrolling.
 - fix iteration count bugs when induction var not on RHS of condition
 - add FIXME for && conditions
 - calculate trip count for unsigned induction/limit vars

V4: (Timothy Arceri)
- count instructions in a loop
- set the limiting_terminator even if we can't find the trip count for
 all terminators. This is needed for complex unrolling where we handle
 2 terminators and the trip count is unknown for one of them.
- restructure structs so we don't keep information not required after
 analysis and remove dead fields.
- force unrolling in some cases as per the rules in the GLSL IR pass

V5: (Timothy Arceri)
- fix metadata mask value 0x10 vs 0x16

V6: (Timothy Arceri)
- merge loop_variable and nir_loop_variable structs and lists suggested by Jason
- remove induction var hash table and store pointer to induction information in
  the loop_variable suggested by Jason.
- use lowercase list_addtail() suggested by Jason.
- tidy up init_loop_block() as per Jasons suggestions.
- replace switch with nir_op_infos[alu->op].num_inputs == 2 in
  is_var_basic_induction_var() as suggested by Jason.
- use nir_block_last_instr() in and rename foreach_cf_node_ex_loop() as 
suggested
  by Jason.
- fix else check for is_trivial_loop_terminator() as per Connors suggestions.
- simplify offset for induction variables incremented before the exit
  condition is checked.
- replace nir_op_isub check with assert() as it should have been lowered away.

V7: (Timothy Arceri)
- use rzalloc() on nir_loop struct creation. Worked previously because ralloc()
  was broken and always zeroed the struct.
- fix cf_node_find_loop_jumps() to find jumps when loops contain
  nested if statements. Code is tidier as a result.

V8: (Timothy Arceri)
- move is_trivial_loop_terminator() to nir.h so we can use it to assert in
  the loop unroll pass
- fix analysis to not bail when looking for terminator when the break is in
  the else rather than the if
- added new loop terminator fields: break_block, continue_from_block and
  continue_from_then so we don't have to gather these when doing unrolling.
- get correct array length when forcing unrolling of variables
  indexed arrays that are the same size as the iteration count
- add support for induction variables of type float
- update trivial loop terminator check to allow an if containing
  instructions as long as both branches contain only a single
  block.

V9:
 - bunch of tidy ups and simplifications suggested by Jason.
 - rewrote trivial terminator detection, now the only restriction is there
   must be no nested jumps, anything else goes.
 - rewrote the iteration test to use nir_eval_const_opcode().
 - count instruction properly even when forcing an unroll.
 - bunch of other tidy ups and simplifications.
---
 src/compiler/Makefile.sources   |   2 +
 src/compiler/nir/nir.c  |   2 +-
 src/compiler/nir/nir.h  |  41 +-
 src/compiler/nir/nir_loop_analyze.c | 852 
 src/compiler/nir/nir_loop_analyze.h |  92 
 src/compiler/nir/nir_metadata.c |   8 +-
 6 files changed, 994 insertions(+), 3 deletions(-)
 create mode 100644 src/compiler/nir/nir_loop_analyze.c
 create mode 100644 src/compiler/nir/nir_loop_analyze.h

diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index 17b15de..ca8a056 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -193,6 +193,8 @@ NIR_FILES = \
nir/nir_intrinsics.c \
nir/nir_intrinsics.h \
nir/nir_liveness.c \
+   nir/nir_loop_analyze.c \
+   nir/nir_loop_analyze.h \
nir/nir_lower_alu_to_scalar.c \
nir/nir_lower_atomics.c \
nir/nir_lower_bitmap.c \
diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c
index 2d882f7..2c3531c 100644
--- a/src/compiler/nir/nir.c
+++ b/src/compiler/nir/nir.c
@@ -393,7 +393,7 @@ nir_if_create(nir_shader *shader)
 nir_loop *
 nir_loop_create(nir_shader *shader)
 {
-   nir_loop *loop = ralloc(shader, nir_loop);
+   nir_loop *loop = rzalloc(shader, nir_loop);
 
cf_init(&loop->cf_node, nir_cf_node_loop);
 
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index ba88a24..28010aa 100644
--- a/src/compiler/nir/nir.h
+++ b/src/c

[Mesa-dev] [PATCH 04/12] nir: Add a LCSSA-pass

2016-12-18 Thread Timothy Arceri
From: Thomas Helland 

V2: Do a "depth first search" to convert to LCSSA

V3: Small comment fixup

V4: Rebase, adapt to removal of function overloads

V5: Rebase, adapt to relocation of nir to compiler/nir
Still need to adapt to potential if-uses
Work around nir_validate issue

V6 (Timothy):
 - tidy lcssa and stop leaking memory
 - dont rewrite the src for the lcssa phi node
 - validate lcssa phi srcs to avoid postvalidate assert
 - don't add new phi if one already exists
 - more lcssa phi validation fixes
 - Rather than marking ssa defs inside a loop just mark blocks inside
   a loop. This is simpler and fixes lcssa for intrinsics which do
   not have a destination.
 - don't create LCSSA phis for loops we won't unroll
 - require loop metadata for lcssa pass
 - handle case where the ssa def's use outside the loop is already a phi

V7: (Timothy)
- pass indirect mask to metadata call

v8: (Timothy)
- make convert to lcssa a helper function rather than a nir pass
- replace inside loop bitset with on the fly block index logic.
- remove lcssa phi validation special cases
- inline code from useless helpers, suggested by Jason.
- always do lcssa on loops, suggested by Jason.
- stop making lcssa phis special. Add as many source as the block
  has predecessors, suggested by Jason.

V9: (Timothy)
- fix regression with the is_lcssa_phi field not being initialised
  to false now that ralloc() doesn't zero out memory.

V10: (Timothy)
- remove extra braces in SSA example, pointed out by Topi

V11: (Timothy)
- add missing support for LCSSA phis in if conditions.
---
 src/compiler/Makefile.sources   |   1 +
 src/compiler/nir/nir.c  |   1 +
 src/compiler/nir/nir.h  |   4 +
 src/compiler/nir/nir_to_lcssa.c | 215 
 4 files changed, 221 insertions(+)
 create mode 100644 src/compiler/nir/nir_to_lcssa.c

diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index ca8a056..e8f7b02 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -254,6 +254,7 @@ NIR_FILES = \
nir/nir_split_var_copies.c \
nir/nir_sweep.c \
nir/nir_to_ssa.c \
+   nir/nir_to_lcssa.c \
nir/nir_validate.c \
nir/nir_vla.h \
nir/nir_worklist.c \
diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c
index 2c3531c..e522a67 100644
--- a/src/compiler/nir/nir.c
+++ b/src/compiler/nir/nir.c
@@ -561,6 +561,7 @@ nir_phi_instr_create(nir_shader *shader)
 {
nir_phi_instr *instr = ralloc(shader, nir_phi_instr);
instr_init(&instr->instr, nir_instr_type_phi);
+   instr->is_lcssa_phi = false;
 
dest_init(&instr->dest);
exec_list_make_empty(&instr->srcs);
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 28010aa..75a91ea 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -1360,6 +1360,8 @@ typedef struct {
struct exec_list srcs; /** < list of nir_phi_src */
 
nir_dest dest;
+
+   bool is_lcssa_phi;
 } nir_phi_instr;
 
 typedef struct {
@@ -2526,6 +2528,8 @@ void nir_convert_to_ssa(nir_shader *shader);
 bool nir_repair_ssa_impl(nir_function_impl *impl);
 bool nir_repair_ssa(nir_shader *shader);
 
+void nir_convert_loop_to_lcssa(nir_loop *loop);
+
 /* If phi_webs_only is true, only convert SSA values involved in phi nodes to
  * registers.  If false, convert all values (even those not involved in a phi
  * node) to registers.
diff --git a/src/compiler/nir/nir_to_lcssa.c b/src/compiler/nir/nir_to_lcssa.c
new file mode 100644
index 000..8afdc54
--- /dev/null
+++ b/src/compiler/nir/nir_to_lcssa.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright © 2015 Thomas Helland
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/*
+ * This pass converts the ssa-graph into "Loop Closed SSA form". This is
+ * done by placing phi nodes at the exits of the loop for all values
+ * that are used outside the loop. The resu

[Mesa-dev] [PATCH 05/12] nir: don't count removal of lcssa_phi as progress

2016-12-18 Thread Timothy Arceri
V2:
 - make the is_lcssa_phi bool const, suggested by Topi.
---
 src/compiler/nir/nir_opt_remove_phis.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/compiler/nir/nir_opt_remove_phis.c 
b/src/compiler/nir/nir_opt_remove_phis.c
index acaa6e1..99d5d35 100644
--- a/src/compiler/nir/nir_opt_remove_phis.c
+++ b/src/compiler/nir/nir_opt_remove_phis.c
@@ -73,6 +73,7 @@ remove_phis_block(nir_block *block, nir_builder *b)
  break;
 
   nir_phi_instr *phi = nir_instr_as_phi(instr);
+  const bool is_lcssa_phi = phi->is_lcssa_phi;
 
   nir_ssa_def *def = NULL;
   nir_alu_instr *mov = NULL;
@@ -133,7 +134,8 @@ remove_phis_block(nir_block *block, nir_builder *b)
   nir_ssa_def_rewrite_uses(&phi->dest.ssa, nir_src_for_ssa(def));
   nir_instr_remove(instr);
 
-  progress = true;
+  if (!is_lcssa_phi)
+ progress = true;
}
 
return progress;
-- 
2.9.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 07/12] nir: update fixup_phi_srcs() to handle registers

2016-12-18 Thread Timothy Arceri
We need to do this because we partially get out of SSA when unrolling
and cloning loops.
---
 src/compiler/nir/nir_clone.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/compiler/nir/nir_clone.c b/src/compiler/nir/nir_clone.c
index fb1558c..91ffe62 100644
--- a/src/compiler/nir/nir_clone.c
+++ b/src/compiler/nir/nir_clone.c
@@ -603,12 +603,17 @@ fixup_phi_srcs(clone_state *state)
 {
list_for_each_entry_safe(nir_phi_src, src, &state->phi_srcs, src.use_link) {
   src->pred = remap_local(state, src->pred);
-  assert(src->src.is_ssa);
-  src->src.ssa = remap_local(state, src->src.ssa);
 
-  /* Remove from this list and place in the uses of the SSA def */
+  /* Remove from this list */
   list_del(&src->src.use_link);
-  list_addtail(&src->src.use_link, &src->src.ssa->uses);
+
+  if (src->src.is_ssa) {
+ src->src.ssa = remap_local(state, src->src.ssa);
+ list_addtail(&src->src.use_link, &src->src.ssa->uses);
+  } else {
+ src->src.reg.reg = remap_reg(state, src->src.reg.reg);
+ list_addtail(&src->src.use_link, &src->src.reg.reg->uses);
+  }
}
assert(list_empty(&state->phi_srcs));
 }
-- 
2.9.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 10/12] nir: pass compiler rather than devinfo to functions that call nir_optimize

2016-12-18 Thread Timothy Arceri
Later we will pass compiler to nir_optimize to be used by the loop unroll
pass.
---
 src/mesa/drivers/dri/i965/brw_fs.cpp  | 10 --
 src/mesa/drivers/dri/i965/brw_nir.c   |  7 ---
 src/mesa/drivers/dri/i965/brw_nir.h   |  4 ++--
 src/mesa/drivers/dri/i965/brw_shader.cpp  |  4 ++--
 src/mesa/drivers/dri/i965/brw_vec4.cpp|  5 ++---
 src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp |  5 ++---
 src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp|  4 ++--
 7 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 671b44b..c8a0693 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -6396,14 +6396,13 @@ brw_compile_fs(const struct brw_compiler *compiler, 
void *log_data,
char **error_str)
 {
nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
-   shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
-  true);
+   shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
brw_nir_lower_fs_inputs(shader, vue_map, prog, compiler->devinfo, key);
brw_nir_lower_fs_outputs(shader);
if (!key->multisample_fbo)
   NIR_PASS_V(shader, demote_sample_qualifiers);
NIR_PASS_V(shader, move_interpolation_to_top);
-   shader = brw_postprocess_nir(shader, compiler->devinfo, true);
+   shader = brw_postprocess_nir(shader, compiler, true);
 
/* key->alpha_test_func means simulating alpha testing via discards,
 * so the shader definitely kills pixels.
@@ -6628,8 +6627,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void 
*log_data,
char **error_str)
 {
nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
-   shader = brw_nir_apply_sampler_key(shader, compiler->devinfo, &key->tex,
-  true);
+   shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
brw_nir_lower_cs_shared(shader);
prog_data->base.total_shared += shader->num_shared;
 
@@ -6642,7 +6640,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void 
*log_data,
(unsigned)4 * (prog_data->thread_local_id_index + 1));
 
brw_nir_lower_intrinsics(shader, &prog_data->base);
-   shader = brw_postprocess_nir(shader, compiler->devinfo, true);
+   shader = brw_postprocess_nir(shader, compiler, true);
 
prog_data->local_size[0] = shader->info->cs.local_size[0];
prog_data->local_size[1] = shader->info->cs.local_size[1];
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c 
b/src/mesa/drivers/dri/i965/brw_nir.c
index 7624126..b44cbe8 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -514,10 +514,10 @@ brw_preprocess_nir(const struct brw_compiler *compiler, 
nir_shader *nir)
  * will not work.
  */
 nir_shader *
-brw_postprocess_nir(nir_shader *nir,
-const struct gen_device_info *devinfo,
+brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
 bool is_scalar)
 {
+   const struct gen_device_info *devinfo = compiler->devinfo;
bool debug_enabled =
   (INTEL_DEBUG & intel_debug_flag_for_shader_stage(nir->stage));
 
@@ -579,10 +579,11 @@ brw_postprocess_nir(nir_shader *nir,
 
 nir_shader *
 brw_nir_apply_sampler_key(nir_shader *nir,
-  const struct gen_device_info *devinfo,
+  const struct brw_compiler *compiler,
   const struct brw_sampler_prog_key_data *key_tex,
   bool is_scalar)
 {
+   const struct gen_device_info *devinfo = compiler->devinfo;
nir_lower_tex_options tex_options = { 0 };
 
/* Iron Lake and prior require lowering of all rectangle textures */
diff --git a/src/mesa/drivers/dri/i965/brw_nir.h 
b/src/mesa/drivers/dri/i965/brw_nir.h
index 3c774d0..8cfb6c1 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.h
+++ b/src/mesa/drivers/dri/i965/brw_nir.h
@@ -114,7 +114,7 @@ void brw_nir_lower_fs_outputs(nir_shader *nir);
 void brw_nir_lower_cs_shared(nir_shader *nir);
 
 nir_shader *brw_postprocess_nir(nir_shader *nir,
-const struct gen_device_info *devinfo,
+const struct brw_compiler *compiler,
 bool is_scalar);
 
 bool brw_nir_apply_attribute_workarounds(nir_shader *nir,
@@ -126,7 +126,7 @@ bool brw_nir_apply_trig_workarounds(nir_shader *nir);
 void brw_nir_apply_tcs_quads_workaround(nir_shader *nir);
 
 nir_shader *brw_nir_apply_sampler_key(nir_shader *nir,
-  const struct gen_device_info *devinfo,
+  const struct brw_compiler *compiler,
   const struct brw_sampler_prog_key_data 
*key,
   bool is_scalar);
 
d

[Mesa-dev] [PATCH 08/12] nir: add helper for cloning nir_cf_list

2016-12-18 Thread Timothy Arceri
V2:
- updated to create a generic list clone helper nir_cf_list_clone()
- continue to assert on clone when fallback flag not set as suggested
  by Jason.

Reviewed-by: Jason Ekstrand 
---
 src/compiler/nir/nir_clone.c| 62 +++--
 src/compiler/nir/nir_control_flow.h |  3 ++
 2 files changed, 56 insertions(+), 9 deletions(-)

diff --git a/src/compiler/nir/nir_clone.c b/src/compiler/nir/nir_clone.c
index 91ffe62..a0ba8f7 100644
--- a/src/compiler/nir/nir_clone.c
+++ b/src/compiler/nir/nir_clone.c
@@ -22,7 +22,7 @@
  */
 
 #include "nir.h"
-#include "nir_control_flow_private.h"
+#include "nir_control_flow.h"
 
 /* Secret Decoder Ring:
  *   clone_foo():
@@ -35,6 +35,13 @@ typedef struct {
/* True if we are cloning an entire shader. */
bool global_clone;
 
+   /* If true allows the clone operation to fall back to the original pointer
+* if no clone pointer is found in the remap table.  This allows us to
+* clone a loop body without having to add srcs from outside the loop to
+* the remap table. This is useful for loop unrolling.
+*/
+   bool allow_remap_fallback;
+
/* maps orig ptr -> cloned ptr: */
struct hash_table *remap_table;
 
@@ -46,11 +53,19 @@ typedef struct {
 } clone_state;
 
 static void
-init_clone_state(clone_state *state, bool global)
+init_clone_state(clone_state *state, struct hash_table *remap_table,
+ bool global, bool allow_remap_fallback)
 {
state->global_clone = global;
-   state->remap_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
-_mesa_key_pointer_equal);
+   state->allow_remap_fallback = allow_remap_fallback;
+
+   if (remap_table) {
+  state->remap_table = remap_table;
+   } else {
+  state->remap_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
+   _mesa_key_pointer_equal);
+   }
+
list_inithead(&state->phi_srcs);
 }
 
@@ -72,9 +87,10 @@ _lookup_ptr(clone_state *state, const void *ptr, bool global)
   return (void *)ptr;
 
entry = _mesa_hash_table_search(state->remap_table, ptr);
-   assert(entry && "Failed to find pointer!");
-   if (!entry)
-  return NULL;
+   if (!entry) {
+  assert(state->allow_remap_fallback);
+  return (void *)ptr;
+   }
 
return entry->data;
 }
@@ -618,6 +634,34 @@ fixup_phi_srcs(clone_state *state)
assert(list_empty(&state->phi_srcs));
 }
 
+void
+nir_cf_list_clone(nir_cf_list *dst, nir_cf_list *src, nir_cf_node *parent,
+  struct hash_table *remap_table)
+{
+   exec_list_make_empty(&dst->list);
+   dst->impl = src->impl;
+
+   if (exec_list_is_empty(&src->list))
+  return;
+
+   clone_state state;
+   init_clone_state(&state, remap_table, false, true);
+
+   /* We use the same shader */
+   state.ns = src->impl->function->shader;
+
+   /* The control-flow code assumes that the list of cf_nodes always starts
+* and ends with a block.  We start by adding an empty block.
+*/
+   nir_block *nblk = nir_block_create(state.ns);
+   nblk->cf_node.parent = parent;
+   exec_list_push_tail(&dst->list, &nblk->cf_node.node);
+
+   clone_cf_list(&state, &dst->list, &src->list);
+
+   fixup_phi_srcs(&state);
+}
+
 static nir_function_impl *
 clone_function_impl(clone_state *state, const nir_function_impl *fi)
 {
@@ -651,7 +695,7 @@ nir_function_impl *
 nir_function_impl_clone(const nir_function_impl *fi)
 {
clone_state state;
-   init_clone_state(&state, false);
+   init_clone_state(&state, NULL, false, false);
 
/* We use the same shader */
state.ns = fi->function->shader;
@@ -691,7 +735,7 @@ nir_shader *
 nir_shader_clone(void *mem_ctx, const nir_shader *s)
 {
clone_state state;
-   init_clone_state(&state, true);
+   init_clone_state(&state, NULL, true, false);
 
nir_shader *ns = nir_shader_create(mem_ctx, s->stage, s->options, NULL);
state.ns = ns;
diff --git a/src/compiler/nir/nir_control_flow.h 
b/src/compiler/nir/nir_control_flow.h
index b71382f..b496aec 100644
--- a/src/compiler/nir/nir_control_flow.h
+++ b/src/compiler/nir/nir_control_flow.h
@@ -141,6 +141,9 @@ void nir_cf_reinsert(nir_cf_list *cf_list, nir_cursor 
cursor);
 
 void nir_cf_delete(nir_cf_list *cf_list);
 
+void nir_cf_list_clone(nir_cf_list *dst, nir_cf_list *src, nir_cf_node *parent,
+   struct hash_table *remap_table);
+
 static inline void
 nir_cf_list_extract(nir_cf_list *extracted, struct exec_list *cf_list)
 {
-- 
2.9.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 11/12] i965: use nir loop unrolling pass

2016-12-18 Thread Timothy Arceri
shader-db results for BDW:

total instructions in shared programs: 12589614 -> 12590119 (0.00%)
instructions in affected programs: 50525 -> 51030 (1.00%)
helped: 7
HURT: 145

total cycles in shared programs: 241524604 -> 241490502 (-0.01%)
cycles in affected programs: 1941404 -> 1907302 (-1.76%)
helped: 302
HURT: 449

total loops in shared programs: 4245 -> 2947 (-30.58%)
loops in affected programs: 1535 -> 237 (-84.56%)
helped: 1142
HURT: 0

total spills in shared programs: 14453 -> 14453 (0.00%)
spills in affected programs: 0 -> 0
helped: 0
HURT: 0

total fills in shared programs: 18984 -> 18984 (0.00%)
fills in affected programs: 0 -> 0
helped: 0
HURT: 0

LOST:   26
GAINED: 15
---
 src/mesa/drivers/dri/i965/brw_compiler.c |  3 +++
 src/mesa/drivers/dri/i965/brw_nir.c  | 22 +-
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c 
b/src/mesa/drivers/dri/i965/brw_compiler.c
index 6a73719..d7900a7 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.c
+++ b/src/mesa/drivers/dri/i965/brw_compiler.c
@@ -55,6 +55,7 @@ static const struct nir_shader_compiler_options 
scalar_nir_options = {
.lower_unpack_snorm_4x8 = true,
.lower_unpack_unorm_2x16 = true,
.lower_unpack_unorm_4x8 = true,
+   .max_unroll_iterations = 32,
 };
 
 static const struct nir_shader_compiler_options vector_nir_options = {
@@ -75,6 +76,7 @@ static const struct nir_shader_compiler_options 
vector_nir_options = {
.lower_unpack_unorm_2x16 = true,
.lower_extract_byte = true,
.lower_extract_word = true,
+   .max_unroll_iterations = 32,
 };
 
 static const struct nir_shader_compiler_options vector_nir_options_gen6 = {
@@ -92,6 +94,7 @@ static const struct nir_shader_compiler_options 
vector_nir_options_gen6 = {
.lower_unpack_unorm_2x16 = true,
.lower_extract_byte = true,
.lower_extract_word = true,
+   .max_unroll_iterations = 32,
 };
 
 struct brw_compiler *
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c 
b/src/mesa/drivers/dri/i965/brw_nir.c
index b44cbe8..0c1fb44 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -396,8 +396,17 @@ brw_nir_lower_cs_shared(nir_shader *nir)
 #define OPT_V(pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
 
 static nir_shader *
-nir_optimize(nir_shader *nir, bool is_scalar)
+nir_optimize(nir_shader *nir, const struct brw_compiler *compiler,
+ bool is_scalar)
 {
+   nir_variable_mode indirect_mask = 0;
+   if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectInput)
+  indirect_mask |= nir_var_shader_in;
+   if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectOutput)
+  indirect_mask |= nir_var_shader_out;
+   if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectTemp)
+  indirect_mask |= nir_var_local;
+
bool progress;
do {
   progress = false;
@@ -420,6 +429,9 @@ nir_optimize(nir_shader *nir, bool is_scalar)
   OPT(nir_opt_algebraic);
   OPT(nir_opt_constant_folding);
   OPT(nir_opt_dead_cf);
+  if (nir->options->max_unroll_iterations != 0) {
+ OPT(nir_opt_loop_unroll, indirect_mask);
+  }
   OPT(nir_opt_remove_phis);
   OPT(nir_opt_undef);
   OPT_V(nir_lower_doubles, nir_lower_drcp |
@@ -477,7 +489,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, 
nir_shader *nir)
 
OPT(nir_split_var_copies);
 
-   nir = nir_optimize(nir, is_scalar);
+   nir = nir_optimize(nir, compiler, is_scalar);
 
if (is_scalar) {
   OPT_V(nir_lower_load_const_to_scalar);
@@ -497,7 +509,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, 
nir_shader *nir)
nir_lower_indirect_derefs(nir, indirect_mask);
 
/* Get rid of split copies */
-   nir = nir_optimize(nir, is_scalar);
+   nir = nir_optimize(nir, compiler, is_scalar);
 
OPT_V(nir_lower_clip_cull_distance_arrays);
 
@@ -524,7 +536,7 @@ brw_postprocess_nir(nir_shader *nir, const struct 
brw_compiler *compiler,
bool progress; /* Written by OPT and OPT_V */
(void)progress;
 
-   nir = nir_optimize(nir, is_scalar);
+   nir = nir_optimize(nir, compiler, is_scalar);
 
if (devinfo->gen >= 6) {
   /* Try and fuse multiply-adds */
@@ -616,7 +628,7 @@ brw_nir_apply_sampler_key(nir_shader *nir,
 
if (nir_lower_tex(nir, &tex_options)) {
   nir_validate_shader(nir);
-  nir = nir_optimize(nir, is_scalar);
+  nir = nir_optimize(nir, compiler, is_scalar);
}
 
return nir;
-- 
2.9.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 02/12] i965: use nir_lower_indirect_derefs() for GLSL

2016-12-18 Thread Timothy Arceri
This moves the nir_lower_indirect_derefs() call into
brw_preprocess_nir() so that it is called by both OpenGL and Vulkan,
and removes the call to the old GLSL IR pass
lower_variable_index_to_cond_assign().

We want to do this pass in nir to be able to move loop unrolling
to nir.

There is an increase of 1-3 instructions in a small number of shaders,
and 2 Kerbal Space Program shaders that increase by 32 instructions.
The changes seem to be caused by the difference in the GLSL IR vs
NIR variable index lowering passes. The GLSL IR pass creates a
simple if ladder for arrays of size 4 or less, while the NIR pass
implements a binary search for all arrays regardless of size.

Shader-db results BDW:

total instructions in shared programs: 13021176 -> 13021819 (0.00%)
instructions in affected programs: 57693 -> 58336 (1.11%)
helped: 20
HURT: 190

total cycles in shared programs: 299805580 -> 299750826 (-0.02%)
cycles in affected programs: 2290024 -> 2235270 (-2.39%)
helped: 337
HURT: 442

total fills in shared programs: 19984 -> 19984 (0.00%)
fills in affected programs: 0 -> 0
helped: 0
HURT: 0

LOST:   4
GAINED: 0

V2: remove the do_copy_propagation() call from the i965 GLSL IR
linking code. This call was added in f7741c52111 but since we are
moving the variable index lowering to NIR we no longer need it and
can just rely on the nir copy propagation pass.

Reviewed-by: Kenneth Graunke 
---
 src/intel/vulkan/anv_pipeline.c| 10 --
 src/mesa/drivers/dri/i965/brw_link.cpp | 15 ---
 src/mesa/drivers/dri/i965/brw_nir.c| 10 ++
 3 files changed, 10 insertions(+), 25 deletions(-)

diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 9104267..e2fbcab 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -190,16 +190,6 @@ anv_shader_compile_to_nir(struct anv_device *device,
 
nir_shader_gather_info(nir, entry_point->impl);
 
-   nir_variable_mode indirect_mask = 0;
-   if (compiler->glsl_compiler_options[stage].EmitNoIndirectInput)
-  indirect_mask |= nir_var_shader_in;
-   if (compiler->glsl_compiler_options[stage].EmitNoIndirectOutput)
-  indirect_mask |= nir_var_shader_out;
-   if (compiler->glsl_compiler_options[stage].EmitNoIndirectTemp)
-  indirect_mask |= nir_var_local;
-
-   nir_lower_indirect_derefs(nir, indirect_mask);
-
return nir;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp 
b/src/mesa/drivers/dri/i965/brw_link.cpp
index 6f37428..0d8a626 100644
--- a/src/mesa/drivers/dri/i965/brw_link.cpp
+++ b/src/mesa/drivers/dri/i965/brw_link.cpp
@@ -134,21 +134,6 @@ process_glsl_ir(struct brw_context *brw,
lower_noise(shader->ir);
lower_quadop_vector(shader->ir, false);
 
-   do_copy_propagation(shader->ir);
-
-   bool lowered_variable_indexing =
-  lower_variable_index_to_cond_assign(shader->Stage, shader->ir,
-  options->EmitNoIndirectInput,
-  options->EmitNoIndirectOutput,
-  options->EmitNoIndirectTemp,
-  options->EmitNoIndirectUniform);
-
-   if (unlikely(brw->perf_debug && lowered_variable_indexing)) {
-  perf_debug("Unsupported form of variable indexing in %s; falling "
- "back to very inefficient code generation\n",
- _mesa_shader_stage_to_abbrev(shader->Stage));
-   }
-
bool progress;
do {
   progress = false;
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c 
b/src/mesa/drivers/dri/i965/brw_nir.c
index 55b16cf..7624126 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -486,6 +486,16 @@ brw_preprocess_nir(const struct brw_compiler *compiler, 
nir_shader *nir)
/* Lower a bunch of stuff */
OPT_V(nir_lower_var_copies);
 
+   nir_variable_mode indirect_mask = 0;
+   if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectInput)
+  indirect_mask |= nir_var_shader_in;
+   if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectOutput)
+  indirect_mask |= nir_var_shader_out;
+   if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectTemp)
+  indirect_mask |= nir_var_local;
+
+   nir_lower_indirect_derefs(nir, indirect_mask);
+
/* Get rid of split copies */
nir = nir_optimize(nir, is_scalar);
 
-- 
2.9.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 12/12] i965: disable loop unrolling in GLSL IR

2016-12-18 Thread Timothy Arceri
There is a single regression in loop unrolling which is:

loops HURT:   shaders/orbital_explorer.shader_test GS SIMD8:0 -> 1

However the loop is huge so it seems reasonable not to unroll it. It's
surprising that GLSL IR does unroll it.

shader-db results BDW:

total instructions in shared programs: 13037455 -> 13036947 (-0.00%)
instructions in affected programs: 17982 -> 17474 (-2.83%)
helped: 63
HURT: 25

total cycles in shared programs: 262217870 -> 262227990 (0.00%)
cycles in affected programs: 2287046 -> 2297166 (0.44%)
helped: 969
HURT: 844

total loops in shared programs: 2951 -> 2952 (0.03%)
loops in affected programs: 0 -> 1
helped: 0
HURT: 1

LOST:   0
GAINED: 1
---
 src/mesa/drivers/dri/i965/brw_compiler.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c 
b/src/mesa/drivers/dri/i965/brw_compiler.c
index d7900a7..fa8a772 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.c
+++ b/src/mesa/drivers/dri/i965/brw_compiler.c
@@ -122,7 +122,7 @@ brw_compiler_create(void *mem_ctx, const struct 
gen_device_info *devinfo)
 
/* We want the GLSL compiler to emit code that uses condition codes */
for (int i = 0; i < MESA_SHADER_STAGES; i++) {
-  compiler->glsl_compiler_options[i].MaxUnrollIterations = 32;
+  compiler->glsl_compiler_options[i].MaxUnrollIterations = 0;
   compiler->glsl_compiler_options[i].MaxIfDepth =
  devinfo->gen < 6 ? 16 : UINT_MAX;
 
-- 
2.9.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 06/12] nir: create helper for fixing phi srcs when cloning

2016-12-18 Thread Timothy Arceri
This will be useful for fixing phi srcs when cloning a loop body
during loop unrolling.

Reviewed-by: Jason Ekstrand 
---
 src/compiler/nir/nir_clone.c | 36 +---
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/src/compiler/nir/nir_clone.c b/src/compiler/nir/nir_clone.c
index be89426..fb1558c 100644
--- a/src/compiler/nir/nir_clone.c
+++ b/src/compiler/nir/nir_clone.c
@@ -593,6 +593,26 @@ clone_cf_list(clone_state *state, struct exec_list *dst,
}
 }
 
+/* After we've cloned almost everything, we have to walk the list of phi
+ * sources and fix them up.  Thanks to loops, the block and SSA value for a
+ * phi source may not be defined when we first encounter it.  Instead, we
+ * add it to the phi_srcs list and we fix it up here.
+ */
+static void
+fixup_phi_srcs(clone_state *state)
+{
+   list_for_each_entry_safe(nir_phi_src, src, &state->phi_srcs, src.use_link) {
+  src->pred = remap_local(state, src->pred);
+  assert(src->src.is_ssa);
+  src->src.ssa = remap_local(state, src->src.ssa);
+
+  /* Remove from this list and place in the uses of the SSA def */
+  list_del(&src->src.use_link);
+  list_addtail(&src->src.use_link, &src->src.ssa->uses);
+   }
+   assert(list_empty(&state->phi_srcs));
+}
+
 static nir_function_impl *
 clone_function_impl(clone_state *state, const nir_function_impl *fi)
 {
@@ -614,21 +634,7 @@ clone_function_impl(clone_state *state, const 
nir_function_impl *fi)
 
clone_cf_list(state, &nfi->body, &fi->body);
 
-   /* After we've cloned almost everything, we have to walk the list of phi
-* sources and fix them up.  Thanks to loops, the block and SSA value for a
-* phi source may not be defined when we first encounter it.  Instead, we
-* add it to the phi_srcs list and we fix it up here.
-*/
-   list_for_each_entry_safe(nir_phi_src, src, &state->phi_srcs, src.use_link) {
-  src->pred = remap_local(state, src->pred);
-  assert(src->src.is_ssa);
-  src->src.ssa = remap_local(state, src->src.ssa);
-
-  /* Remove from this list and place in the uses of the SSA def */
-  list_del(&src->src.use_link);
-  list_addtail(&src->src.use_link, &src->src.ssa->uses);
-   }
-   assert(list_empty(&state->phi_srcs));
+   fixup_phi_srcs(state);
 
/* All metadata is invalidated in the cloning process */
nfi->valid_metadata = 0;
-- 
2.9.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 09/12] nir: add a loop unrolling pass

2016-12-18 Thread Timothy Arceri
V2:
- tidy ups suggested by Connor.
- tidy up cloning logic and handle copy propagation
 based on a suggestion by Connor.
- use nir_ssa_def_rewrite_uses to fix up lcssa phis
  suggested by Connor.
- add support for complex loop unrolling (two terminators)
- handle the case where the ssa def's use outside the loop is already a phi
- support unrolling loops with multiple terminators when trip count
  is known for each terminator

V3:
- set correct num_components when creating phi in complex unroll
- rewrite update remap table based on Jason's suggestions.
- remove unrequired extract_loop_body() helper as suggested by Jason.
- simplify the lcssa phi fix up code for simple loops as per Jason's suggestions.
- use mem context to keep track of hash table memory as suggested by Jason.
- move is_{complex,simple}_loop helpers to the unroll code
- require nir_metadata_block_index
- partially rewrote complex unroll to be simpler and easier to follow.

V4:
- use rzalloc() when creating nir_phi_src but not setting pred right away;
 fixes a regression caused by ralloc() no longer zeroing memory.

V5:
- simplify calling of complex_unroll()
- use new loop terminator fields to get the break/continue from blocks
  and simplify loop unrolling code
- handle slightly less trivial loop terminators. if branches can
  now have instructions but can only contain a single block.
- use nir print type IR snippets in unroll function descriptions
- add better explanation and variable for why we need to clone
  additional times when the second terminator is the limiting
  terminator.
- partially convert out of ssa before unrolling loops (suggested by Jason)
---
 src/compiler/Makefile.sources  |   1 +
 src/compiler/nir/nir.h |   2 +
 src/compiler/nir/nir_opt_loop_unroll.c | 559 +
 3 files changed, 562 insertions(+)
 create mode 100644 src/compiler/nir/nir_opt_loop_unroll.c

diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index e8f7b02..ae3e5f0 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -239,6 +239,7 @@ NIR_FILES = \
nir/nir_opt_dead_cf.c \
nir/nir_opt_gcm.c \
nir/nir_opt_global_to_local.c \
+   nir/nir_opt_loop_unroll.c \
nir/nir_opt_peephole_select.c \
nir/nir_opt_remove_phis.c \
nir/nir_opt_undef.c \
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 75a91ea..51bc6b2 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2552,6 +2552,8 @@ bool nir_opt_dead_cf(nir_shader *shader);
 
 bool nir_opt_gcm(nir_shader *shader, bool value_number);
 
+bool nir_opt_loop_unroll(nir_shader *shader, nir_variable_mode indirect_mask);
+
 bool nir_opt_peephole_select(nir_shader *shader, unsigned limit);
 
 bool nir_opt_remove_phis(nir_shader *shader);
diff --git a/src/compiler/nir/nir_opt_loop_unroll.c 
b/src/compiler/nir/nir_opt_loop_unroll.c
new file mode 100644
index 000..7eb44cb
--- /dev/null
+++ b/src/compiler/nir/nir_opt_loop_unroll.c
@@ -0,0 +1,559 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_builder.h"
+#include "nir_control_flow.h"
+#include "nir_loop_analyze.h"
+
+/* Convert all phis in the give block to regs, here we insert a mov in the
+ * pred block of the phi source to copy the src to the reg, then we rewrite
+ * all uses of the phi to the new reg.
+ */
+static void
+convert_phis_to_regs(nir_builder *b, nir_block *block)
+{
+   nir_foreach_instr_safe(instr, block) {
+  if (instr->type != nir_instr_type_phi)
+ break;
+
+  nir_phi_instr *phi = nir_instr_as_phi(instr);
+
+  nir_register *reg = nir_local_reg_create(b->impl);
+  reg->num_components = phi->dest.ssa.num_components;
+  reg->bit_size = phi->dest.ssa.bit_size;
+
+  nir_foreach_phi_src(src, phi) {
+ nir_alu_i

Re: [Mesa-dev] [PATCH 03/19] glsl: Track the linearized array index for each UBO instance array element

2016-12-18 Thread Timothy Arceri
On Thu, 2016-12-15 at 20:10 -0800, Ian Romanick wrote:
> From: Ian Romanick 
> 
> Signed-off-by: Ian Romanick 
> Cc: mesa-sta...@lists.freedesktop.org
> ---
>  src/compiler/glsl/link_uniform_blocks.cpp | 17 ++---
>  src/mesa/main/mtypes.h| 15 +++
>  2 files changed, 29 insertions(+), 3 deletions(-)
> 
> diff --git a/src/compiler/glsl/link_uniform_blocks.cpp
> b/src/compiler/glsl/link_uniform_blocks.cpp
> index 41b26e7..9adfbd5 100644
> --- a/src/compiler/glsl/link_uniform_blocks.cpp
> +++ b/src/compiler/glsl/link_uniform_blocks.cpp
> @@ -209,13 +209,19 @@ static void process_block_array_leaf(char
> **name, gl_uniform_block *blocks,
>   struct gl_context *ctx,
>   struct gl_shader_program
> *prog);
>  
> +/**
> + *
> + * \param first_index Value of \c block_index for the first element
> of the
> + *array.
> + */
>  static void
>  process_block_array(struct uniform_block_array_elements *ub_array,
> char **name,
>  size_t name_length, gl_uniform_block *blocks,
>  ubo_visitor *parcel, gl_uniform_buffer_variable
> *variables,
>  const struct link_uniform_block_active *const b,
>  unsigned *block_index, unsigned *binding_offset,
> -struct gl_context *ctx, struct gl_shader_program
> *prog)
> +struct gl_context *ctx, struct gl_shader_program
> *prog,
> +unsigned first_index)
>  {
> for (unsigned j = 0; j < ub_array->num_array_elements; j++) {
>    size_t new_length = name_length;
> @@ -227,11 +233,15 @@ process_block_array(struct
> uniform_block_array_elements *ub_array, char **name,
>    if (ub_array->array) {
>   process_block_array(ub_array->array, name, new_length,
> blocks,
>   parcel, variables, b, block_index,
> - binding_offset, ctx, prog);
> + binding_offset, ctx, prog,
> first_index);
>    } else {
> + const unsigned i = *block_index;
> +
>   process_block_array_leaf(name, blocks,
>    parcel, variables, b, block_index,
>    binding_offset, ctx, prog);
> +
> + blocks[i].linearized_array_index = i - first_index;

Shouldn't this go in the new process_block_array_leaf() too?

Otherwise this patch is:

Reviewed-by: Timothy Arceri 


>    }
> }
>  }
> @@ -359,7 +369,8 @@ create_buffer_blocks(void *mem_ctx, struct
> gl_context *ctx,
>  
>  assert(b->has_instance_name);
>  process_block_array(b->array, &name, name_length,
> blocks, &parcel,
> -variables, b, &i, &binding_offset,
> ctx, prog);
> +variables, b, &i, &binding_offset,
> ctx, prog,
> +i);
>  ralloc_free(name);
>   } else {
>  blocks[i].Name = ralloc_strdup(blocks, block_type-
> >name);
> diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
> index 36d48e2..ac4cac0 100644
> --- a/src/mesa/main/mtypes.h
> +++ b/src/mesa/main/mtypes.h
> @@ -2493,6 +2493,21 @@ struct gl_uniform_block
> uint8_t stageref;
>  
> /**
> +* Linearized array index for uniform block instance arrays
> +*
> +* Given a uniform block instance array declared with size
> +* blk[s_0][s_1]..[s_m], the block referenced by
> blk[i_0][i_1]..[i_m] will
> +* have the linearized array index
> +*
> +*   m-1   m
> +* i_m + ∑   i_j * ∏ s_k
> +*   j=0   k=j+1
> +*
> +* For a uniform block instance that is not an array, this is
> always 0.
> +*/
> +   uint8_t linearized_array_index;
> +
> +   /**
>  * Layout specified in the shader
>  *
>  * This isn't accessible through the API, but it is used while
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] V7 Loop unrolling in NIR

2016-12-18 Thread Jason Ekstrand
On Dec 18, 2016 9:48 PM, "Timothy Arceri" 
wrote:

V7:
 - partially convert out of ssa in unrolling pass to avoid phis
 - lots of simplification/tidy ups in the analysis pass
 - if_uses bug fix (missing functionality) in lcssa fixed
 - better support for non trivial loop terminators
 - fixed all loop HURT except 1 that is too big to unroll.

total instructions in shared programs: 12584624 -> 12584621 (-0.00%)
instructions in affected programs: 68507 -> 68504 (-0.00%)
helped: 70
HURT: 170

total cycles in shared programs: 24146 -> 241476226 (-0.01%)
cycles in affected programs: 4060722 -> 4036952 (-0.59%)
helped: 1241
HURT: 1278

total loops in shared programs: 4245 -> 2948 (-30.55%)
loops in affected programs: 1535 -> 238 (-84.50%)
helped: 1142
HURT: 1


That is a *lot* of loops that we were leaving intact for no good reason.
Any idea how many of those were because the glsl pass didn't know what to
do with them vs. the heuristic subtly changing?


LOST:   26
GAINED: 16
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 04/19] glsl: Use simpler visitor to determine which UBO and SSBO blocks are used

2016-12-18 Thread Timothy Arceri
On Thu, 2016-12-15 at 20:10 -0800, Ian Romanick wrote:
> From: Ian Romanick 
> 
> Very soon this visitor will get more complicated.  The users of the
> existing ir_variable_refcount visitor won't need the coming
> functionality, and this use doesn't need much of the functionality of
> ir_variable_refcount.
> 
> Signed-off-by: Ian Romanick 
> Cc: mesa-sta...@lists.freedesktop.org
> ---
>  src/compiler/Makefile.sources   |   2 +
>  src/compiler/glsl/ir_array_refcount.cpp | 100
> 
>  src/compiler/glsl/ir_array_refcount.h   |  65 +
>  src/compiler/glsl/link_uniforms.cpp |  10 ++--
>  4 files changed, 172 insertions(+), 5 deletions(-)
>  create mode 100644 src/compiler/glsl/ir_array_refcount.cpp
>  create mode 100644 src/compiler/glsl/ir_array_refcount.h
> 
> diff --git a/src/compiler/Makefile.sources
> b/src/compiler/Makefile.sources
> index 17b15de..15f410f 100644
> --- a/src/compiler/Makefile.sources
> +++ b/src/compiler/Makefile.sources
> @@ -29,6 +29,8 @@ LIBGLSL_FILES = \
>   glsl/glsl_to_nir.cpp \
>   glsl/glsl_to_nir.h \
>   glsl/hir_field_selection.cpp \
> + glsl/ir_array_refcount.cpp \
> + glsl/ir_array_refcount.h \
>   glsl/ir_basic_block.cpp \
>   glsl/ir_basic_block.h \
>   glsl/ir_builder.cpp \
> diff --git a/src/compiler/glsl/ir_array_refcount.cpp
> b/src/compiler/glsl/ir_array_refcount.cpp
> new file mode 100644
> index 000..41a0914
> --- /dev/null
> +++ b/src/compiler/glsl/ir_array_refcount.cpp
> @@ -0,0 +1,100 @@
> +/*
> + * Copyright © 2016 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person
> obtaining a
> + * copy of this software and associated documentation files (the
> "Software"),
> + * to deal in the Software without restriction, including without
> limitation
> + * the rights to use, copy, modify, merge, publish, distribute,
> sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom
> the
> + * Software is furnished to do so, subject to the following
> conditions:
> + *
> + * The above copyright notice and this permission notice (including
> the next
> + * paragraph) shall be included in all copies or substantial
> portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
> EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
> OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +/**
> + * \file ir_array_refcount.cpp
> + *
> + * Provides a visitor which produces a list of variables referenced.
> + */
> +
> +#include "ir.h"
> +#include "ir_visitor.h"
> +#include "ir_array_refcount.h"
> +#include "compiler/glsl_types.h"
> +#include "util/hash_table.h"
> +
> +ir_array_refcount_visitor::ir_array_refcount_visitor()
> +{
> +   this->mem_ctx = ralloc_context(NULL);
> +   this->ht = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
> +  _mesa_key_pointer_equal);
> +}
> +
> +static void
> +free_entry(struct hash_entry *entry)
> +{
> +   ir_array_refcount_entry *ivre = (ir_array_refcount_entry *)
> entry->data;
> +   delete ivre;
> +}
> +
> +ir_array_refcount_visitor::~ir_array_refcount_visitor()
> +{
> +   ralloc_free(this->mem_ctx);
> +   _mesa_hash_table_destroy(this->ht, free_entry);
> +}
> +
> +ir_array_refcount_entry::ir_array_refcount_entry(ir_variable *var)
> +   : var(var), is_referenced(false)
> +{
> +   /* empty */
> +}
> +
> +
> +ir_array_refcount_entry *
> +ir_array_refcount_visitor::get_variable_entry(ir_variable *var)
> +{
> +   assert(var);
> +
> +   struct hash_entry *e = _mesa_hash_table_search(this->ht, var);
> +   if (e)
> +  return (ir_array_refcount_entry *)e->data;
> +
> +   ir_array_refcount_entry *entry = new
> ir_array_refcount_entry(var);
> +   _mesa_hash_table_insert(this->ht, var, entry);
> +
> +   return entry;
> +}
> +
> +
> +ir_visitor_status
> +ir_array_refcount_visitor::visit(ir_dereference_variable *ir)
> +{
> +   ir_variable *const var = ir->variable_referenced();
> +   ir_array_refcount_entry *entry = this->get_variable_entry(var);
> +
> +   if (entry)

I don't think this can ever be false. Maybe just assert(entry). If
new fails it will throw an exception rather than return null, as far as
I understand.

Otherwise seems fine:

Reviewed-by: Timothy Arceri 

> +  entry->is_referenced = true;
> +
> +   return visit_continue;
> +}
> +
> +
> +ir_visitor_status
> +ir_array_refcount_visitor::visit_enter(ir_function_signature *ir)
> +{
> +   /* We don't want to descend into the function parameters and
> +* dead-code eliminate them, so just accept the body here.
>

[Mesa-dev] [AppVeyor] mesa fdo_master #2952 failed

2016-12-18 Thread AppVeyor



Build mesa 2952 failed


Commit e2610bf165 by Liu Zhiquan on 12/9/2016 11:29 AM:

EGL/android: Enhance pbuffer implementation\n\nSome dri drivers will pass multiple bits in buffer_mask parameter\nto droid_image_get_buffer(), more than the actual supported buffer\ntype combination. For such case, will go through all the bits, and\nwill not return error when unsupported buffer is requested, only\nreturn error when the allocation for supported buffer failed.\n\nv2: coding style and log changes\nv3: coding style changes and update patch format\n\nSigned-off-by: Liu Zhiquan \nSigned-off-by: Long, Zhifang \nReviewed-by: Tomasz Figa 


Configure your notification preferences

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [AppVeyor] mesa master #2953 completed

2016-12-18 Thread AppVeyor


Build mesa 2953 completed



Commit e2610bf165 by Liu Zhiquan on 12/9/2016 11:29 AM:

EGL/android: Enhance pbuffer implementation\n\nSome dri drivers will pass multiple bits in buffer_mask parameter\nto droid_image_get_buffer(), more than the actual supported buffer\ntype combination. For such case, will go through all the bits, and\nwill not return error when unsupported buffer is requested, only\nreturn error when the allocation for supported buffer failed.\n\nv2: coding style and log changes\nv3: coding style changes and update patch format\n\nSigned-off-by: Liu Zhiquan \nSigned-off-by: Long, Zhifang \nReviewed-by: Tomasz Figa 


Configure your notification preferences

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] V7 Loop unrolling in NIR

2016-12-18 Thread Timothy Arceri
On Sun, 2016-12-18 at 21:58 -0800, Jason Ekstrand wrote:
> On Dec 18, 2016 9:48 PM, "Timothy Arceri" wrote:
> V7:
>  - partially out of ssa in unrolling pass to avoid phis
>  - lots of simplification/tidy ups in the analysis pass
>  - if_uses bug fix (missing functionality) in lcssa fixed
>  - better support for non trivial loop terminators
>  - fixed all loop HURT except 1 that is too big to unroll.
> 
> total instructions in shared programs: 12584624 -> 12584621 (-0.00%)
> instructions in affected programs: 68507 -> 68504 (-0.00%)
> helped: 70
> HURT: 170
> 
> total cycles in shared programs: 24146 -> 241476226 (-0.01%)
> cycles in affected programs: 4060722 -> 4036952 (-0.59%)
> helped: 1241
> HURT: 1278
> 
> total loops in shared programs: 4245 -> 2948 (-30.55%)
> loops in affected programs: 1535 -> 238 (-84.50%)
> helped: 1142
> HURT: 1
> 
> That is a *lot* of loops that we were leaving intact for no good
> reason.  Any idea how many of those were because the glsl pass didn't
> know what to do with them vs. the heuristic subtly changing?

Have you been sending HTML emails lately? Your comments have been
indistinguishable from the original email in a number of recent emails.

I disabled instruction/iteration unrolling restrictions and it helped
only 13 loops.

For the record V7 unrolls around 500 more loops than V6. I think
handling phis as a conditional might give us another good jump. I would
need to dig a bit deeper to be sure why so many loops remain.

I'm happy to work on improving things further once we land the initial
version.

> 
> 
> LOST:   26
> GAINED: 16
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
> 
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] egl: Check config's surface types in eglCreate*Surface()

2016-12-18 Thread Tapani Pälli

Reviewed-by: Tapani Pälli 

On 12/17/2016 03:59 AM, Chad Versace wrote:

If the provided EGLConfig does not support the requested surface type,
then emit EGL_BAD_MATCH.

Fixes dEQP-EGL.functional.negative_api.create_pbuffer_surface
on GBM.

Cc: "13.0" 
---
 src/egl/main/eglapi.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/src/egl/main/eglapi.c b/src/egl/main/eglapi.c
index a349992c46..cab05c2301 100644
--- a/src/egl/main/eglapi.c
+++ b/src/egl/main/eglapi.c
@@ -875,6 +875,9 @@ _eglCreateWindowSurfaceCommon(_EGLDisplay *disp, EGLConfig 
config,

_EGL_CHECK_CONFIG(disp, conf, EGL_NO_SURFACE, drv);

+   if ((conf->SurfaceType & EGL_WINDOW_BIT) == 0)
+  RETURN_EGL_ERROR(disp, EGL_BAD_MATCH, EGL_NO_SURFACE);
+
surf = drv->API.CreateWindowSurface(drv, disp, conf, native_window,
attrib_list);
ret = (surf) ? _eglLinkSurface(surf) : EGL_NO_SURFACE;
@@ -993,6 +996,10 @@ _eglCreatePixmapSurfaceCommon(_EGLDisplay *disp, EGLConfig 
config,
 #endif

_EGL_CHECK_CONFIG(disp, conf, EGL_NO_SURFACE, drv);
+
+   if ((conf->SurfaceType & EGL_PIXMAP_BIT) == 0)
+  RETURN_EGL_ERROR(disp, EGL_BAD_MATCH, EGL_NO_SURFACE);
+
surf = drv->API.CreatePixmapSurface(drv, disp, conf, native_pixmap,
attrib_list);
ret = (surf) ? _eglLinkSurface(surf) : EGL_NO_SURFACE;
@@ -1063,6 +1070,9 @@ eglCreatePbufferSurface(EGLDisplay dpy, EGLConfig config,
_EGL_FUNC_START(disp, EGL_OBJECT_DISPLAY_KHR, NULL, EGL_NO_SURFACE);
_EGL_CHECK_CONFIG(disp, conf, EGL_NO_SURFACE, drv);

+   if ((conf->SurfaceType & EGL_PBUFFER_BIT) == 0)
+  RETURN_EGL_ERROR(disp, EGL_BAD_MATCH, EGL_NO_SURFACE);
+
surf = drv->API.CreatePbufferSurface(drv, disp, conf, attrib_list);
ret = (surf) ? _eglLinkSurface(surf) : EGL_NO_SURFACE;



___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] New GBM backend for dEQP

2016-12-18 Thread Tapani Pälli



On 12/17/2016 03:58 AM, Chad Versace wrote:

Happy Christmas to everyone who's busy squashing dEQP bugs.

I wrote a new GBM backend for dEQP. I even submitted it to dEQP's
upstream Gerrit.  Pyry, dEQP's maintainer, told me over beer earlier
this year that he would accept it if I submitted it, and if it wasn't
too crazy. So, maybe it'll be upstream soon.

If you want to try it out, you can either fetch the patch from Gerrit:
$ git fetch https://android.googlesource.com/platform/external/deqp 
refs/changes/43/315743/1

View it on Gerrit:
https://android-review.googlesource.com/#/c/315743/

Fetch from my personal work-in-progress branch:
$ git fetch git://git.kiwitree.net/~chadv/deqp refs/heads/wip/gbm

View it on my cgit:
http://git.kiwitree.net/cgit/~chadv/deqp/log/?h=wip/gbm

GBM today does not support pixmaps nor pbuffers (eglCreatePixmapSurface
and eglCreatePbufferSurface), so the dEQP test coverage with GBM does
not have parity with X11. But, on the other hand, you get to run dEQP
without the headache of X11.

There are probably bugs. No surprises there.


Branch did not work 'out of the box' for me:

"No rule to make target 'framework/qphelper/.git/index', needed by 
'framework/qphelper/qpReleaseInfo.inl'.  Stop."


(attached patch makes it work for me)


What prompted my painful misadventures into dEQP's circus of abstract
factory registries?

- Some dEQP tests intermittently fail on X11, most likely due to
  dEQP's misuse of multi-threaded Xlib. See
  .
  On the new GBM backend, the test results are consistent. They
  consistently fail ;)

- Some tests run much faster without the overhead of X11.
  The dEQP-EGL.functional.negative_api.* tests run almost instantly.

- Intel's Jenkins prefers headless testing over X11.



One issue is that real users will use X11, Wayland or Android. It would be
cool to have a 'switch' to toggle CI to use a particular backend so that
most of the time we would run against GBM but then sometimes check that
X11 still works, etc.


// Tapani
>From 22e7a45eb9f3707ef902d17802de79ba9924cdf0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= 
Date: Mon, 19 Dec 2016 09:38:23 +0200
Subject: [PATCH] append ${CMAKE_SOURCE_DIR} to ${DE_GIT_DIR}
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes issues for me as now we use absolute path for setting
the dependency and dir is found.

Signed-off-by: Tapani Pälli 
---
 framework/qphelper/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/framework/qphelper/CMakeLists.txt b/framework/qphelper/CMakeLists.txt
index 8f38ee9..eeac439 100644
--- a/framework/qphelper/CMakeLists.txt
+++ b/framework/qphelper/CMakeLists.txt
@@ -54,8 +54,8 @@ elseif (EXISTS "${CMAKE_SOURCE_DIR}/.git")
 		OUTPUT_VARIABLE DE_GIT_DIR
 		OUTPUT_STRIP_TRAILING_WHITESPACE)
 		add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/qpReleaseInfo.inl
-			COMMAND ${PYTHON_EXECUTABLE} -B ${CMAKE_CURRENT_SOURCE_DIR}/gen_release_info.py --git --git-dir="${DE_GIT_DIR}" --out=${CMAKE_CURRENT_BINARY_DIR}/qpReleaseInfo.inl
-		   DEPENDS gen_release_info.py ${DE_GIT_DIR}/HEAD ${DE_GIT_DIR}/index) # \note HEAD updated only when changing branches
+			COMMAND ${PYTHON_EXECUTABLE} -B ${CMAKE_CURRENT_SOURCE_DIR}/gen_release_info.py --git --git-dir="${CMAKE_SOURCE_DIR}/${DE_GIT_DIR}" --out=${CMAKE_CURRENT_BINARY_DIR}/qpReleaseInfo.inl
+		   DEPENDS gen_release_info.py ${CMAKE_SOURCE_DIR}/${DE_GIT_DIR}/HEAD ${CMAKE_SOURCE_DIR}/${DE_GIT_DIR}/index) # \note HEAD updated only when changing branches
 		add_custom_target(git-rel-info DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/qpReleaseInfo.inl)
 		add_dependencies(qphelper git-rel-info)
 		include_directories(${CMAKE_CURRENT_BINARY_DIR})
-- 
2.9.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev