date:20180609

[Mesa-dev] [PATCH v3 1/3] gallium/util: Fix build error due to cast to different size

2018-06-09 Thread Robert Foss

Signed-off-by: Robert Foss 
---
 src/gallium/auxiliary/util/u_debug_stack_android.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_debug_stack_android.cpp 
b/src/gallium/auxiliary/util/u_debug_stack_android.cpp
index b3d56aebe6..395a1fe911 100644
--- a/src/gallium/auxiliary/util/u_debug_stack_android.cpp
+++ b/src/gallium/auxiliary/util/u_debug_stack_android.cpp
@@ -49,10 +49,10 @@ debug_backtrace_capture(debug_stack_frame *mesa_backtrace,
   backtrace_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
 _mesa_key_pointer_equal);
 
-   backtrace_entry = _mesa_hash_table_search(backtrace_table, (void*) tid);
+   backtrace_entry = _mesa_hash_table_search(backtrace_table, (void*) 
(uintptr_t)tid);
if (!backtrace_entry) {
   backtrace = Backtrace::Create(getpid(), tid);
-  _mesa_hash_table_insert(backtrace_table, (void*) tid, backtrace);
+  _mesa_hash_table_insert(backtrace_table, (void*) (uintptr_t)tid, 
backtrace);
} else {
   backtrace = (Backtrace *) backtrace_entry->data;
}
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v3 2/3] egl/android: #ifdef out flink name support

2018-06-09 Thread Robert Foss

From: Rob Herring 

Maintaining both flink names and prime fd support which are provided by
2 different gralloc implementations is problematic because we have a
dependency on a specific gralloc implementation header.

This mostly disables the dependency on the gralloc implementation and
headers. The dependency on GRALLOC_MODULE_PERFORM_GET_DRM_FD remains for
now, but the definition is added locally to remove the header
dependency.

drm_gralloc support can be enabled by setting
BOARD_USES_DRM_GRALLOC=true in BoardConfig.mk.

Signed-off-by: Rob Herring 
Signed-off-by: Robert Foss 
---

Changes since RFC:
 - Instead of removing code, #ifdef it out.

 src/egl/Android.mk  |  6 ++-
 src/egl/drivers/dri2/egl_dri2.h |  2 -
 src/egl/drivers/dri2/platform_android.c | 56 +++--
 3 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/src/egl/Android.mk b/src/egl/Android.mk
index 11818694f4..8412aeb798 100644
--- a/src/egl/Android.mk
+++ b/src/egl/Android.mk
@@ -57,9 +57,13 @@ LOCAL_SHARED_LIBRARIES := \
libhardware \
liblog \
libcutils \
-   libgralloc_drm \
libsync
 
+ifeq ($(BOARD_USES_DRM_GRALLOC),true)
+   LOCAL_CFLAGS += -DHAVE_DRM_GRALLOC
+   LOCAL_SHARED_LIBRARIES += libgralloc_drm
+endif
+
 ifeq ($(filter $(MESA_ANDROID_MAJOR_VERSION), 4 5 6 7),)
 LOCAL_SHARED_LIBRARIES += libnativewindow
 endif
diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h
index adabc527f8..5d8fbfa235 100644
--- a/src/egl/drivers/dri2/egl_dri2.h
+++ b/src/egl/drivers/dri2/egl_dri2.h
@@ -67,8 +67,6 @@ struct zwp_linux_dmabuf_v1;
 
 #include 
 #include 
-#include 
-
 #endif /* HAVE_ANDROID_PLATFORM */
 
 #include "eglconfig.h"
diff --git a/src/egl/drivers/dri2/platform_android.c 
b/src/egl/drivers/dri2/platform_android.c
index 1d6ed92bd6..4ba96aad90 100644
--- a/src/egl/drivers/dri2/platform_android.c
+++ b/src/egl/drivers/dri2/platform_android.c
@@ -37,7 +37,11 @@
 #include "loader.h"
 #include "egl_dri2.h"
 #include "egl_dri2_fallbacks.h"
+
+#ifdef HAVE_DRM_GRALLOC
+#include 
 #include "gralloc_drm.h"
+#endif /* HAVE_DRM_GRALLOC */
 
 #define ALIGN(val, align)  (((val) + (align) - 1) & ~((align) - 1))
 
@@ -164,11 +168,13 @@ get_native_buffer_fd(struct ANativeWindowBuffer *buf)
return (handle && handle->numFds) ? handle->data[0] : -1;
 }
 
+#ifdef HAVE_DRM_GRALLOC
 static int
 get_native_buffer_name(struct ANativeWindowBuffer *buf)
 {
return gralloc_drm_get_gem_handle(buf->handle);
 }
+#endif /* HAVE_DRM_GRALLOC */
 
 static EGLBoolean
 droid_window_dequeue_buffer(struct dri2_egl_surface *dri2_surf)
@@ -838,6 +844,7 @@ droid_create_image_from_prime_fd(_EGLDisplay *disp, 
_EGLContext *ctx,
return dri2_create_image_dma_buf(disp, ctx, NULL, attr_list);
 }
 
+#ifdef HAVE_DRM_GRALLOC
 static _EGLImage *
 droid_create_image_from_name(_EGLDisplay *disp, _EGLContext *ctx,
  struct ANativeWindowBuffer *buf)
@@ -881,6 +888,7 @@ droid_create_image_from_name(_EGLDisplay *disp, _EGLContext 
*ctx,
 
return _img->base;
 }
+#endif /* HAVE_DRM_GRALLOC */
 
 static EGLBoolean
 droid_query_surface(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surf,
@@ -937,7 +945,11 @@ dri2_create_image_android_native_buffer(_EGLDisplay *disp,
if (fd >= 0)
   return droid_create_image_from_prime_fd(disp, ctx, buf, fd);
 
+#ifdef HAVE_DRM_GRALLOC
return droid_create_image_from_name(disp, ctx, buf);
+#else
+   return NULL;
+#endif
 }
 
 static _EGLImage *
@@ -959,6 +971,7 @@ droid_flush_front_buffer(__DRIdrawable * driDrawable, void 
*loaderPrivate)
 {
 }
 
+#ifdef HAVE_DRM_GRALLOC
 static int
 droid_get_buffers_parse_attachments(struct dri2_egl_surface *dri2_surf,
 unsigned int *attachments, int count)
@@ -1034,6 +1047,7 @@ droid_get_buffers_with_format(__DRIdrawable * driDrawable,
 
return dri2_surf->buffers;
 }
+#endif /* HAVE_DRM_GRALLOC */
 
 static unsigned
 droid_get_capability(void *loaderPrivate, enum dri_loader_cap cap)
@@ -1116,6 +1130,14 @@ droid_add_configs_for_visuals(_EGLDriver *drv, 
_EGLDisplay *dpy)
return (config_count != 0);
 }
 
+enum {
+/* perform(const struct gralloc_module_t *mod,
+ * int op,
+ * int *fd);
+ */
+GRALLOC_MODULE_PERFORM_GET_DRM_FD = 0x4002,
+};
+
 static int
 droid_open_device(struct dri2_egl_display *dri2_dpy)
 {
@@ -1158,6 +1180,7 @@ static const struct dri2_egl_display_vtbl 
droid_display_vtbl = {
.get_dri_drawable = dri2_surface_get_dri_drawable,
 };
 
+#ifdef HAVE_DRM_GRALLOC
 static const __DRIdri2LoaderExtension droid_dri2_loader_extension = {
.base = { __DRI_DRI2_LOADER, 4 },
 
@@ -1166,6 +1189,7 @@ static const __DRIdri2LoaderExtension 
droid_dri2_loader_extension = {
.getBuffersWithFormat = droid_get_buffers_with_format,
.getCapability= droid_get_capability,
 };
+#endif /* HAVE_DRM_GRALLOC */
 
 static const

[Mesa-dev] [PATCH v3 3/3] egl/android: Add DRM node probing and filtering

2018-06-09 Thread Robert Foss

This patch both adds support for probing & filtering DRM nodes
and switches away from using the GRALLOC_MODULE_PERFORM_GET_DRM_FD
gralloc call.

Currently the filtering is based just on the driver name,
and the desired name is supplied using the "drm.gpu.vendor_name"
Android property.

Signed-off-by: Robert Foss 
---

Changes since v2:
 - Switch from drmGetDevices2 to manual renderD node iteration
 - Add probe_res enum to communicate probing results better
 - Avoid using _eglError() in internal static functions
 - Avoid actually loading the driver while probing, just verify
   that it exists.
 - Replace strlen call with the assumed length PROPERTY_VALUE_MAX

Changes since v1:
 - Do not rely on libdrm for probing
 - Distinguish between errors and when no drm devices are found

Changes since RFC:
 - Rebased on newer libdrm drmHandleMatch patch
 - Added support for driver probing


 src/egl/drivers/dri2/platform_android.c | 222 ++--
 1 file changed, 169 insertions(+), 53 deletions(-)

diff --git a/src/egl/drivers/dri2/platform_android.c 
b/src/egl/drivers/dri2/platform_android.c
index 4ba96aad90..a2cbe92d93 100644
--- a/src/egl/drivers/dri2/platform_android.c
+++ b/src/egl/drivers/dri2/platform_android.c
@@ -27,12 +27,16 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
+#include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
+#include 
 #include 
+#include 
 
 #include "loader.h"
 #include "egl_dri2.h"
@@ -1130,31 +1134,6 @@ droid_add_configs_for_visuals(_EGLDriver *drv, 
_EGLDisplay *dpy)
return (config_count != 0);
 }
 
-enum {
-/* perform(const struct gralloc_module_t *mod,
- * int op,
- * int *fd);
- */
-GRALLOC_MODULE_PERFORM_GET_DRM_FD = 0x4002,
-};
-
-static int
-droid_open_device(struct dri2_egl_display *dri2_dpy)
-{
-   int fd = -1, err = -EINVAL;
-
-   if (dri2_dpy->gralloc->perform)
- err = dri2_dpy->gralloc->perform(dri2_dpy->gralloc,
-  GRALLOC_MODULE_PERFORM_GET_DRM_FD,
-  );
-   if (err || fd < 0) {
-  _eglLog(_EGL_WARNING, "fail to get drm fd");
-  fd = -1;
-   }
-
-   return (fd >= 0) ? fcntl(fd, F_DUPFD_CLOEXEC, 3) : -1;
-}
-
 static const struct dri2_egl_display_vtbl droid_display_vtbl = {
.authenticate = NULL,
.create_window_surface = droid_create_window_surface,
@@ -1215,6 +1194,168 @@ static const __DRIextension 
*droid_image_loader_extensions[] = {
NULL,
 };
 
+EGLBoolean
+droid_load_driver(_EGLDisplay *disp)
+{
+   struct dri2_egl_display *dri2_dpy = disp->DriverData;
+   const char *err;
+
+   dri2_dpy->driver_name = loader_get_driver_for_fd(dri2_dpy->fd);
+   if (dri2_dpy->driver_name == NULL)
+  return false;
+
+   dri2_dpy->is_render_node = drmGetNodeTypeFromFd(dri2_dpy->fd) == 
DRM_NODE_RENDER;
+
+   if (!dri2_dpy->is_render_node) {
+   #ifdef HAVE_DRM_GRALLOC
+   /* Handle control nodes using __DRI_DRI2_LOADER extension and GEM names
+* for backwards compatibility with drm_gralloc. (Do not use on new
+* systems.) */
+   dri2_dpy->loader_extensions = droid_dri2_loader_extensions;
+   if (!dri2_load_driver(disp)) {
+  err = "DRI2: failed to load driver";
+  goto error;
+   }
+   #else
+   err = "DRI2: handle is not for a render node";
+   goto error;
+   #endif
+   } else {
+   dri2_dpy->loader_extensions = droid_image_loader_extensions;
+   if (!dri2_load_driver_dri3(disp)) {
+  err = "DRI3: failed to load driver";
+  goto error;
+   }
+}
+
+   return true;
+
+error:
+   free(dri2_dpy->driver_name);
+   dri2_dpy->driver_name = NULL;
+   return false;
+}
+
+static bool
+droid_probe_driver(int fd)
+{
+   char *driver_name;
+
+   driver_name = loader_get_driver_for_fd(fd);
+   if (driver_name == NULL)
+  return false;
+
+   free(driver_name);
+   return true;
+}
+
+typedef enum {
+   probe_error = -1,
+   probe_success = 0,
+   probe_filtered_out = 1,
+   probe_no_driver = 2
+} probe_ret_t;
+
+static probe_ret_t
+droid_probe_device(_EGLDisplay *disp, int fd, char *vendor)
+{
+   int ret;
+
+   drmVersionPtr ver = drmGetVersion(fd);
+   if (!ver)
+  return probe_error;
+
+   if (vendor != NULL && ver->name != NULL &&
+   strncmp(vendor, ver->name, PROPERTY_VALUE_MAX) != 0) {
+  ret = probe_filtered_out;
+  goto cleanup;
+   }
+
+
+   if (!droid_probe_driver(fd)) {
+  ret = probe_no_driver;
+  goto cleanup;
+   }
+
+   ret = probe_success;
+
+cleanup:
+   drmFreeVersion(ver);
+   return ret;
+}
+
+static int
+droid_open_device(_EGLDisplay *disp)
+{
+   const int MAX_DRM_DEVICES = 32;
+   int prop_set, num_devices;
+   int fd = -1, fallback_fd = -1;
+
+   char *vendor_name = NULL;
+   char vendor_buf[PROPERTY_VALUE_MAX];
+   if (property_get("drm.gpu.vendor_name", vendor_buf, NULL) > 0);
+  vendor_name = vendor_buf;
+
+   const char *drm_dir_name =

[Mesa-dev] [PATCH v3 0/3] egl/android: Remove dependencies on specific grallocs

2018-06-09 Thread Robert Foss

This series replaces the dependency on
GRALLOC_MODULE_PERFORM_GET_DRM_FD with DRM node
probing and disables the support for drm_gralloc.

The series has been tested on Qemu+AOSP, where a
virtio gpu was successfully probed for and
opened.

Changes since v2:
 - Fixed whitespace issue
 - Diversified return codes from probing functions
 - Switched away from using drmGetDevices2, to iterating over
   /dev/dir/renderD nodes manually

Changes since v1:
 - Added fix for build issue
 - Do not rely on libdrm for probing
 - Distinguish between errors and when no drm devices are found

Changes since RFC:
 - Rebased work on the libdrm patch [2].
 - Included patch from Rob Herring disabling drm_gralloc/flink
   support by default.
 - Added device handler driver probing.


Rob Herring (1):
  egl/android: #ifdef out flink name support

Robert Foss (2):
  gallium/util: Fix build error due to cast to different size
  egl/android: Add DRM node probing and filtering

 src/egl/Android.mk|   6 +-
 src/egl/drivers/dri2/egl_dri2.h   |   2 -
 src/egl/drivers/dri2/platform_android.c   | 226 +++---
 .../auxiliary/util/u_debug_stack_android.cpp  |   4 +-
 4 files changed, 194 insertions(+), 44 deletions(-)

-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 0/3] egl/android: Remove dependencies on specific grallocs

2018-06-09 Thread Robert Foss


Hey,

On 2018-05-25 17:38, Rob Herring wrote:

On Fri, May 25, 2018 at 9:25 AM, Tomasz Figa  wrote:

On Fri, May 25, 2018 at 10:59 PM Rob Herring  wrote:


On Fri, May 25, 2018 at 4:15 AM, Robert Foss 

wrote:



On 2018-05-25 10:38, Tomasz Figa wrote:


On Fri, May 25, 2018 at 5:33 PM Robert Foss 
wrote:


Hey,




On 2018-05-25 02:17, Rob Herring wrote:


On Thu, May 24, 2018 at 6:23 AM, Robert Foss <

robert.f...@collabora.com>


wrote:


Hey,

I don't think I've received any feedback on this version yet.
If anyone has some time to spare, it would be nice to get it merged.

Just to be clear about the libdrm branch linked in the cover letter,
it is not required. Only for virgl platforms which happens to be

what

I tested on.



virgl will still fallback to using the first render node without

those

libdrm changes, right? If not, I don't think we should apply until
we're not breaking a platform...




No it will not fall back. I agree that holding off makes more sense.



What's the reason of this problems? Is it because of drmGetDevices()?
Since
we don't really use it for anything other than getting the list of

render

nodes in the system, maybe we could just iterate over any /dev/renderD*
nodes explicitly and avoid introducing new problems?



That's exactly the problem, and yes we could 100% solve by iterating

over

/dev/renderD* nodes. I originally assumed we wouldn't want to do that,

but

rather use the libdrm interfaces.

But for the next spin I could avoid using libdrm, should I?



I don't have an opinion on libdrm really, but I do think we should
fallback to the 1st (only) render node rather than just fail.


We do, even with libdrm.

AFAICT, the problem with virgl seems to be that drmGetDevices() doesn't
include devices on virtio bus in the results, which means that there likely
wouldn't be any render node returned.


Okay. I still don't get why we search by bus in the first place. Who
cares what bus the gpu sits on.

Now I have an opinion. We should just iterate over render nodes
matching by name or use the first node if we don't have a set name.


For v3 I implemented a version with drmGetDevices2 replaced with iteration over 
the /dev/dri/renderD* nodes.


That still means doing all of the other filtering, however.


Rob.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 3/3] egl/android: Add DRM node probing and filtering

2018-06-09 Thread Robert Foss


Hey Tomasz,

On 2018-05-25 09:27, Tomasz Figa wrote:
> Hi Rob,
>
> Finally got to review this. Please see my comments inline.
>
> On Fri, May 11, 2018 at 10:48 PM Robert Foss 
> wrote:
> [snip]
>> +EGLBoolean
>> +droid_load_driver(_EGLDisplay *disp)
>
> Since this is not EGL-facing, I'd personally use bool.
>
>> +{
>> +   struct dri2_egl_display *dri2_dpy = disp->DriverData;
>> +   const char *err;
>> +
>> +   dri2_dpy->driver_name = loader_get_driver_for_fd(dri2_dpy->fd);
>> +   if (dri2_dpy->driver_name == NULL) {
>> +  err = "DRI2: failed to get driver name";
>> +  goto error;
>
> It shouldn't be an error if there is no driver for given render node. We
> should just skip it and try next one, which I believe would be achieved by
> just returning false here.
>
>> +   }
>> +
>> +   dri2_dpy->is_render_node = drmGetNodeTypeFromFd(dri2_dpy->fd) ==
> DRM_NODE_RENDER;
>> +
>> +   if (!dri2_dpy->is_render_node) {
>> +   #ifdef HAVE_DRM_GRALLOC
>> +   /* Handle control nodes using __DRI_DRI2_LOADER extension and GEM
> names
>> +* for backwards compatibility with drm_gralloc. (Do not use on
> new
>> +* systems.) */
>> +   dri2_dpy->loader_extensions = droid_dri2_loader_extensions;
>> +   if (!dri2_load_driver(disp)) {
>> +  err = "DRI2: failed to load driver";
>> +  goto error;
>> +   }
>> +   #else
>> +   err = "DRI2: handle is not for a render node";
>> +   goto error;
>> +   #endif
>> +   } else {
>> +   dri2_dpy->loader_extensions = droid_image_loader_extensions;
>> +   if (!dri2_load_driver_dri3(disp)) {
>> +  err = "DRI3: failed to load driver";
>> +  goto error;
>> +   }
>> +}
>> +
>> +   return EGL_TRUE;
>> +
>> +error:
>> +   free(dri2_dpy->driver_name);
>> +   dri2_dpy->driver_name = NULL;
>> +   return _eglError(EGL_NOT_INITIALIZED, err);
>
> Hmm, if we signal EGL error here, we should break the probing loop and just
> bail out. This would suggest that a boolean is not the right type for this
> function to return. Perhaps something like negative error, 0 for skip and 1
> for success would make sense?
>
> Also, how does it play with the _eglError() called from the error path of
> dri2_initialize_android()?

I can't say I put any though into that aspect, but dri2_initialize_android() 
would overwrite it. So maybe completely avoiding _eglError() in 
droid_load_driver() is the way to go.


>
>> +}
>> +
>> +static int
>> +droid_probe_driver(_EGLDisplay *disp, int fd)
>> +{
>> +   struct dri2_egl_display *dri2_dpy = disp->DriverData;
>> +   dri2_dpy->fd = fd;
>> +
>> +   if (!droid_load_driver(disp))
>> +  return false;
>
> Given my other suggestion about distinguishing failure, render node skip
> and success, I think it should be more like this:
>
>  int ret = droid_load_driver(disp);
>  if (ret <= 0)
> return ret;
>
> Or actually, maybe we don't really need to go as far as loading the driver.
> I'd say it should be enough to just check if we have a driver for the
> device by looking at what loader_get_driver_for_fd() returns. (In that
> case, we can ignore my comment about returning error on
> loader_get_driver_for_fd() failure in droid_load_driver(), since the
> skipping would be handling only here.)

If we don't need to actually load the driver, I think all of the above
comments can be fixed by just removing chunks out of dri related setup.

>
>> +
>> +   /* Since this probe can succeed, but another filter may fail,
>
> What another filter could fail? I can see the vendor name being checked
> before calling this function.
>
> The free() below is actually needed, just the comment is off. We need to
> free the name to be able to probe remaining nodes, without leaking the name.

Ack, fixed by the above change.

>
>> +  this string needs to be deallocated either way.
>> +  Once an FD has been found, this string will be set a second time.
> */
>> +   free(dri2_dpy->driver_name);
>
> Don't we also need to unload the driver?

Ack, fixed by the above change.

>
>> +   dri2_dpy->driver_name = NULL;
>> +   return true;
>
> To match the change above:
>
>  return 1;
>

Ack, fixed by the above change.

>> +}
>> +
>> +static int
>> +droid_probe_device(_EGLDisplay *disp, int fd, drmDevicePtr dev, char
> *vendor)
>> +{
>> +   drmVersionPtr ver = drmGetVersion(fd);
>> +   if (!ver)
>> +   goto fail;
>
> Something wrong with indentation here.

Ack.

>
>> +
>> +   size_t vendor_len = strlen(vendor);
>> +   if (vendor_len != 0 && strncmp(vendor, ver->name, vendor_len))
>> +  goto fail;
>
> Maybe it's just me, but I don't see any point in using strncmp() if the
> length argument is obtained by calling strlen() first. Especially if the
> strlen() call is on a string that comes from some external code
> (property_get()).
>
> Perhaps we could just call strncmp() with PROPERTY_VALUE_MAX? This would
> actually play nice with my other comment about using NULL for vendor
> string, if

[Mesa-dev] [PATCH v2] nv50/ir: improve performance of signed division by powers of two

2018-06-09 Thread Rhys Perry

Changes in v2:
- Stylistic changes
- Use OP_SLCT instead of OP_SELP which only worked by luck
- Fix issues in edge cases

Signed-off-by: Rhys Perry 
---
 .../drivers/nouveau/codegen/nv50_ir_peephole.cpp   | 30 +++---
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 39177bd044..d636eb130a 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -1095,10 +1095,36 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue 
, int s)
  i->op = OP_MOV;
  i->setSrc(1, NULL);
   } else
+  if (imm0.reg.data.s32 == -1) {
+ i->op = OP_NEG;
+ i->setSrc(1, NULL);
+  } else
   if (i->dType == TYPE_U32 && imm0.isPow2()) {
  i->op = OP_SHR;
  i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
   } else
+  if (i->dType == TYPE_S32 && 
util_is_power_of_two_or_zero(llabs(imm0.reg.data.s32))) {
+ Value *a = i->getSrc(0);
+ int64_t absb = llabs(imm0.reg.data.s32);
+
+ Value *sign = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), a, 
bld.mkImm(31));
+ Value *adjusted = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), a,
+  bld.loadImm(bld.getSSA(), 
(uint32_t)(absb - 1)));
+
+ Value *selected = bld.getSSA();
+ bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, selected, TYPE_U32, adjusted, a, 
sign);
+
+ if (imm0.reg.data.s32 < 0) {
+i->op = OP_NEG;
+i->setSrc(0, bld.mkOp2v(
+   OP_SHR, TYPE_S32, bld.getSSA(), selected, 
bld.mkImm(util_logbase2(absb;
+i->setSrc(1, NULL);
+ } else {
+i->op = OP_SHR;
+i->setSrc(0, selected);
+i->setSrc(1, bld.mkImm(util_logbase2(absb)));
+ }
+  } else
   if (i->dType == TYPE_U32) {
  Instruction *mul;
  Value *tA, *tB;
@@ -1129,10 +1155,6 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue 
, int s)
 bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));
 
  delete_Instruction(prog, i);
-  } else
-  if (imm0.reg.data.s32 == -1) {
- i->op = OP_NEG;
- i->setSrc(1, NULL);
   } else {
  LValue *tA, *tB;
  LValue *tD;
-- 
2.14.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [Bug 105613] Compute shader locks up within nested "for" loop

2018-06-09 Thread bugzilla-daemon

https://bugs.freedesktop.org/show_bug.cgi?id=105613

Samuel Pitoiset  changed:

   What|Removed |Added

 Status|NEW |RESOLVED
 Resolution|--- |FIXED

--- Comment #9 from Samuel Pitoiset  ---
Fixed with
https://cgit.freedesktop.org/mesa/mesa/commit/?id=135e4d434f622fa1d7275bdb72f859e1c1b1976e

-- 
You are receiving this mail because:
You are the assignee for the bug.
You are the QA Contact for the bug.___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 13/14] intel/compiler: use new shuffle_32bit_write for all 64-bit storage writes

2018-06-09 Thread Jose Maria Casanova Crespo

---
 src/intel/compiler/brw_fs_nir.cpp | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 2521f3c001b..833fad4247a 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2839,8 +2839,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder ,
 * for that.
 */
unsigned channel = iter * 2 + i;
-   fs_reg dest = shuffle_64bit_data_for_32bit_write(bld,
-  offset(value, bld, channel), 1);
+   fs_reg dest = shuffle_for_32bit_write(bld, value, channel, 1);
 
srcs[header_regs + (i + first_component) * 2] = dest;
srcs[header_regs + (i + first_component) * 2 + 1] =
@@ -3694,8 +3693,8 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder ,
   unsigned type_size = 4;
   if (nir_src_bit_size(instr->src[0]) == 64) {
  type_size = 8;
- val_reg = shuffle_64bit_data_for_32bit_write(bld,
-val_reg, instr->num_components);
+ val_reg = shuffle_for_32bit_write(bld, val_reg, 0,
+   instr->num_components);
   }
 
   unsigned type_slots = type_size / 4;
@@ -4236,8 +4235,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
  * iteration handle the rest.
  */
 num_components = MIN2(2, num_components);
-write_src = shuffle_64bit_data_for_32bit_write(bld, write_src,
-   num_components);
+write_src = shuffle_for_32bit_write(bld, write_src, 0,
+num_components);
  } else if (type_size < 4) {
 assert(type_size == 2);
 /* For 16-bit types we pack two consecutive values into a 32-bit
@@ -4333,7 +4332,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   unsigned num_components = instr->num_components;
   unsigned first_component = nir_intrinsic_component(instr);
   if (nir_src_bit_size(instr->src[0]) == 64) {
- src = shuffle_64bit_data_for_32bit_write(bld, src, num_components);
+ src = shuffle_for_32bit_write(bld, src, 0, num_components);
  num_components *= 2;
   }
 
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 14/14] intel/compiler: shuffle_64bit_data_for_32bit_write is not used anymore

2018-06-09 Thread Jose Maria Casanova Crespo

---
 src/intel/compiler/brw_fs.h   |  4 
 src/intel/compiler/brw_fs_nir.cpp | 32 ---
 2 files changed, 36 deletions(-)

diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 1f86f17ccbb..17b1368d522 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -499,10 +499,6 @@ private:
void *mem_ctx;
 };
 
-fs_reg shuffle_64bit_data_for_32bit_write(const brw::fs_builder ,
-  const fs_reg ,
-  uint32_t components);
-
 void shuffle_from_32bit_read(const brw::fs_builder ,
  const fs_reg ,
  const fs_reg ,
diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 833fad4247a..f68fe5f1d1a 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -5187,38 +5187,6 @@ fs_visitor::nir_emit_jump(const fs_builder , 
nir_jump_instr *instr)
}
 }
 
-/**
- * This helper does the inverse operation of
- * SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA.
- *
- * We need to do this when we are going to use untyped write messsages that
- * operate with 32-bit components in order to arrange our 64-bit data to be
- * in the expected layout.
- *
- * Notice that callers of this function, unlike in the case of the inverse
- * operation, would typically need to call this with dst and src being
- * different registers, since they would otherwise corrupt the original
- * 64-bit data they are about to write. Because of this the function checks
- * that the src and dst regions involved in the operation do not overlap.
- */
-fs_reg
-shuffle_64bit_data_for_32bit_write(const fs_builder ,
-   const fs_reg ,
-   uint32_t components)
-{
-   assert(type_sz(src.type) == 8);
-
-   fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D, 2 * components);
-
-   for (unsigned i = 0; i < components; i++) {
-  const fs_reg component_i = offset(src, bld, i);
-  bld.MOV(offset(dst, bld, 2 * i), subscript(component_i, dst.type, 0));
-  bld.MOV(offset(dst, bld, 2 * i + 1), subscript(component_i, dst.type, 
1));
-   }
-
-   return dst;
-}
-
 /*
  * This helper takes a source register and un/shuffles it into the destination
  * register.
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 09/14] intel/compiler: Use shuffle_from_32bit_read at VS load_input

2018-06-09 Thread Jose Maria Casanova Crespo

shuffle_from_32bit_read manages 32-bit reads to 32-bit destination
in the same way that the previous loop so now we just call the new
function for all bitsizes, simplifying also the 64-bit load_input.
---
 src/intel/compiler/brw_fs_nir.cpp | 12 ++--
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 6abc7c0174d..fedf3bf5a83 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2483,16 +2483,8 @@ fs_visitor::nir_emit_vs_intrinsic(const fs_builder ,
   if (type_sz(dest.type) == 8)
  first_component /= 2;
 
-  for (unsigned j = 0; j < num_components; j++) {
- bld.MOV(offset(dest, bld, j), offset(src, bld, j + first_component));
-  }
-
-  if (type_sz(dest.type) == 8) {
- shuffle_32bit_load_result_to_64bit_data(bld,
- dest,
- retype(dest, 
BRW_REGISTER_TYPE_F),
- instr->num_components);
-  }
+  shuffle_from_32bit_read(bld, dest, retype(src, BRW_REGISTER_TYPE_D),
+  first_component, num_components);
   break;
}
 
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 10/14] intel/compiler: shuffle_from_32bit_read at load_per_vertex_input at TCS/TES

2018-06-09 Thread Jose Maria Casanova Crespo

Previously, the shuffle function had a source/destination overlap that
needs to be avoided to use shuffle_from_32bit_read. As we can use for
the shuffle destination the destination of removed MOVs.

This change also avoids the internal MOVs done by the previous shuffle
to deal with possible overlaps.
---
 src/intel/compiler/brw_fs_nir.cpp | 22 --
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index fedf3bf5a83..11b707e57a8 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2662,13 +2662,10 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder 
,
   * or SSBOs.
   */
  if (type_sz(dst.type) == 8) {
-shuffle_32bit_load_result_to_64bit_data(
-   bld, dst, retype(dst, BRW_REGISTER_TYPE_F), num_components);
-
-for (unsigned c = 0; c < num_components; c++) {
-   bld.MOV(offset(orig_dst, bld, iter * 2 + c),
-   offset(dst, bld, c));
-}
+shuffle_from_32bit_read(bld,
+offset(orig_dst, bld, iter * 2),
+retype(dst, BRW_REGISTER_TYPE_D),
+0, num_components);
  }
 
  /* Copy the temporary to the destination to deal with writemasking.
@@ -3011,13 +3008,10 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder 
,
  * or SSBOs.
  */
 if (type_sz(dest.type) == 8) {
-   shuffle_32bit_load_result_to_64bit_data(
-  bld, dest, retype(dest, BRW_REGISTER_TYPE_F), 
num_components);
-
-   for (unsigned c = 0; c < num_components; c++) {
-  bld.MOV(offset(orig_dest, bld, iter * 2 + c),
-  offset(dest, bld, c));
-   }
+   shuffle_from_32bit_read(bld,
+   offset(orig_dest, bld, iter * 2),
+   retype(dest, BRW_REGISTER_TYPE_D),
+   0, num_components);
 }
 
 /* If we are loading double data and we need a second read message
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 12/14] intel/compiler: shuffle_32bit_load_result_to_64bit_data is not used anymore

2018-06-09 Thread Jose Maria Casanova Crespo

---
 src/intel/compiler/brw_fs.h   |  5 ---
 src/intel/compiler/brw_fs_nir.cpp | 53 ---
 2 files changed, 58 deletions(-)

diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index d72164ae0b6..1f86f17ccbb 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -499,11 +499,6 @@ private:
void *mem_ctx;
 };
 
-void shuffle_32bit_load_result_to_64bit_data(const brw::fs_builder ,
- const fs_reg ,
- const fs_reg ,
- uint32_t components);
-
 fs_reg shuffle_64bit_data_for_32bit_write(const brw::fs_builder ,
   const fs_reg ,
   uint32_t components);
diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 7e0ef2f34a9..2521f3c001b 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -5188,59 +5188,6 @@ fs_visitor::nir_emit_jump(const fs_builder , 
nir_jump_instr *instr)
}
 }
 
-/**
- * This helper takes the result of a load operation that reads 32-bit elements
- * in this format:
- *
- * x x x x x x x x
- * y y y y y y y y
- * z z z z z z z z
- * w w w w w w w w
- *
- * and shuffles the data to get this:
- *
- * x y x y x y x y
- * x y x y x y x y
- * z w z w z w z w
- * z w z w z w z w
- *
- * Which is exactly what we want if the load is reading 64-bit components
- * like doubles, where x represents the low 32-bit of the x double component
- * and y represents the high 32-bit of the x double component (likewise with
- * z and w for double component y). The parameter @components represents
- * the number of 64-bit components present in @src. This would typically be
- * 2 at most, since we can only fit 2 double elements in the result of a
- * vec4 load.
- *
- * Notice that @dst and @src can be the same register.
- */
-void
-shuffle_32bit_load_result_to_64bit_data(const fs_builder ,
-const fs_reg ,
-const fs_reg ,
-uint32_t components)
-{
-   assert(type_sz(src.type) == 4);
-   assert(type_sz(dst.type) == 8);
-
-   /* A temporary that we will use to shuffle the 32-bit data of each
-* component in the vector into valid 64-bit data. We can't write directly
-* to dst because dst can be (and would usually be) the same as src
-* and in that case the first MOV in the loop below would overwrite the
-* data read in the second MOV.
-*/
-   fs_reg tmp = bld.vgrf(dst.type);
-
-   for (unsigned i = 0; i < components; i++) {
-  const fs_reg component_i = offset(src, bld, 2 * i);
-
-  bld.MOV(subscript(tmp, src.type, 0), component_i);
-  bld.MOV(subscript(tmp, src.type, 1), offset(component_i, bld, 1));
-
-  bld.MOV(offset(dst, bld, i), tmp);
-   }
-}
-
 /**
  * This helper does the inverse operation of
  * SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA.
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 11/14] intel/compiler: use shuffle_from_32bit_read for 64-bit FS load_input

2018-06-09 Thread Jose Maria Casanova Crespo

As the previous use of shuffle_32bit_load_result_to_64bit_data
had a source/destination overlap for 64-bit. Now a temporal destination
is used for 64-bit cases to use shuffle_from_32bit_read that doesn't
handle src/dst overlaps.
---
 src/intel/compiler/brw_fs_nir.cpp | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 11b707e57a8..7e0ef2f34a9 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -3350,6 +3350,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder ,
   unsigned base = nir_intrinsic_base(instr);
   unsigned comp = nir_intrinsic_component(instr);
   unsigned num_components = instr->num_components;
+  fs_reg orig_dest = dest;
   enum brw_reg_type type = dest.type;
 
   /* Special case fields in the VUE header */
@@ -3365,6 +3366,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder ,
   */
  type = BRW_REGISTER_TYPE_F;
  num_components *= 2;
+ dest = bld.vgrf(type, num_components);
   }
 
   for (unsigned int i = 0; i < num_components; i++) {
@@ -3373,10 +3375,8 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder ,
   }
 
   if (nir_dest_bit_size(instr->dest) == 64) {
- shuffle_32bit_load_result_to_64bit_data(bld,
- dest,
- retype(dest, type),
- instr->num_components);
+ shuffle_from_32bit_read(bld, orig_dest, dest, 0,
+ instr->num_components);
   }
   break;
}
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 04/14] intel/compiler: Use shuffle_from_32bit_read to read 16-bit SSBO

2018-06-09 Thread Jose Maria Casanova Crespo

Using shuffle_from_32bit_read instead of 16-bit shuffle functions
avoids the need of retype. At the same time new function are
ready for 8-bit type SSBO reads.
---
 src/intel/compiler/brw_fs_nir.cpp | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 1f684149fd5..ef7895262b8 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2372,10 +2372,8 @@ do_untyped_vector_read(const fs_builder ,
   1 /* dims */,
   num_components_32bit,
   BRW_PREDICATE_NONE);
- shuffle_32bit_load_result_to_16bit_data(bld,
-   retype(dest, BRW_REGISTER_TYPE_W),
-   retype(read_result, BRW_REGISTER_TYPE_D),
-   first_component, num_components);
+ shuffle_from_32bit_read(bld, dest, read_result, first_component,
+ num_components);
   } else {
  fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
  for (unsigned i = 0; i < num_components; i++) {
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 06/14] intel/compiler: remove old 16-bit shuffle/unshuffle functions

2018-06-09 Thread Jose Maria Casanova Crespo

---
 src/intel/compiler/brw_fs.h   | 11 --
 src/intel/compiler/brw_fs_nir.cpp | 62 ---
 2 files changed, 73 deletions(-)

diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 779170ecc95..d72164ae0b6 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -508,17 +508,6 @@ fs_reg shuffle_64bit_data_for_32bit_write(const 
brw::fs_builder ,
   const fs_reg ,
   uint32_t components);
 
-void shuffle_32bit_load_result_to_16bit_data(const brw::fs_builder ,
- const fs_reg ,
- const fs_reg ,
- uint32_t first_component,
- uint32_t components);
-
-void shuffle_16bit_data_for_32bit_write(const brw::fs_builder ,
-const fs_reg ,
-const fs_reg ,
-uint32_t components);
-
 void shuffle_from_32bit_read(const brw::fs_builder ,
  const fs_reg ,
  const fs_reg ,
diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index a54935f7049..7e738ade82e 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -5263,40 +5263,6 @@ shuffle_32bit_load_result_to_64bit_data(const fs_builder 
,
}
 }
 
-void
-shuffle_32bit_load_result_to_16bit_data(const fs_builder ,
-const fs_reg ,
-const fs_reg ,
-uint32_t first_component,
-uint32_t components)
-{
-   assert(type_sz(src.type) == 4);
-   assert(type_sz(dst.type) == 2);
-
-   /* A temporary is used to un-shuffle the 32-bit data of each component in
-* into a valid 16-bit vector. We can't write directly to dst because it
-* can be the same register as src and in that case the first MOV in the
-* loop below would overwrite the data read in the second MOV.
-*/
-   fs_reg tmp = retype(bld.vgrf(src.type), dst.type);
-
-   for (unsigned i = 0; i < components; i++) {
-  const fs_reg component_i =
- subscript(offset(src, bld, (first_component + i) / 2), dst.type,
-   (first_component + i) % 2);
-
-  bld.MOV(offset(tmp, bld, i % 2), component_i);
-
-  if (i % 2) {
- bld.MOV(offset(dst, bld, i -1), offset(tmp, bld, 0));
- bld.MOV(offset(dst, bld, i), offset(tmp, bld, 1));
-  }
-   }
-   if (components % 2) {
-  bld.MOV(offset(dst, bld, components - 1), tmp);
-   }
-}
-
 /**
  * This helper does the inverse operation of
  * SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA.
@@ -5329,34 +5295,6 @@ shuffle_64bit_data_for_32bit_write(const fs_builder ,
return dst;
 }
 
-void
-shuffle_16bit_data_for_32bit_write(const fs_builder ,
-   const fs_reg ,
-   const fs_reg ,
-   uint32_t components)
-{
-   assert(type_sz(src.type) == 2);
-   assert(type_sz(dst.type) == 4);
-
-   /* A temporary is used to shuffle the 16-bit data of each component in the
-* 32-bit data vector. We can't write directly to dst because it can be the
-* same register as src and in that case the first MOV in the loop below
-* would overwrite the data read in the second MOV.
-*/
-   fs_reg tmp = bld.vgrf(dst.type);
-
-   for (unsigned i = 0; i < components; i++) {
-  const fs_reg component_i = offset(src, bld, i);
-  bld.MOV(subscript(tmp, src.type, i % 2), component_i);
-  if (i % 2) {
- bld.MOV(offset(dst, bld, i / 2), tmp);
-  }
-   }
-   if (components % 2) {
-  bld.MOV(offset(dst, bld, components / 2), tmp);
-   }
-}
-
 /*
  * This helper takes a source register and un/shuffles it into the destination
  * register.
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 01/14] intel/compiler: general 8/16/32/64-bit shuffle_src_to_dst function

2018-06-09 Thread Jose Maria Casanova Crespo

This new function takes care of shuffle/unshuffle components of a
particular bit-size in components with a different bit-size.

If source type size is smaller than destination type size the operation
needed is a component shuffle. The opposite case would be an unshuffle.

The operation allows to skip first_component number of components from
the source.

Shuffle MOVs are retyped using integer types avoiding problems with denorms
and float types. This allows to simplify uses of shuffle functions that are
dealing with these retypes individually.

Now there is a new restriction so source and destination can not overlap
anymore when calling this suffle function. Following patches that migrate
to use this new function will take care individually of avoiding source
and destination overlaps.
---
 src/intel/compiler/brw_fs_nir.cpp | 92 +++
 1 file changed, 92 insertions(+)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 166da0aa6d7..1a9d3c41d1d 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -5362,6 +5362,98 @@ shuffle_16bit_data_for_32bit_write(const fs_builder ,
}
 }
 
+/*
+ * This helper takes a source register and un/shuffles it into the destination
+ * register.
+ *
+ * If source type size is smaller than destination type size the operation
+ * needed is a component shuffle. The opposite case would be an unshuffle. If
+ * source/destination type size is equal a shuffle is done that would be
+ * equivalent to a simple MOV.
+ *
+ * For example, if source is a 16-bit type and destination is 32-bit. A 3
+ * components .xyz 16-bit vector on SIMD8 would be.
+ *
+ *|x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
+ *|z1|z2|z3|z4|z5|z6|z7|z8|  |  |  |  |  |  |  |  |
+ *
+ * This helper will return the following 2 32-bit components with the 16-bit
+ * values shuffled:
+ *
+ *|x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
+ *|z1   |z2   |z3   |z4   |z5   |z6   |z7   |z8   |
+ *
+ * For unshuffle, the example would be the opposite, a 64-bit type source
+ * and a 32-bit destination. A 2 component .xy 64-bit vector on SIMD8
+ * would be:
+ *
+ *| x1l   x1h | x2l   x2h | x3l   x3h | x4l   x4h |
+ *| x5l   x5h | x6l   x6h | x7l   x7h | x8l   x8h |
+ *| y1l   y1h | y2l   y2h | y3l   y3h | y4l   y4h |
+ *| y5l   y5h | y6l   y6h | y7l   y7h | y8l   y8h |
+ *
+ * The returned result would be the following 4 32-bit components unshuffled:
+ *
+ *| x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
+ *| x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
+ *| y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
+ *| y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
+ *
+ * - Source and destination register must not be overlapped.
+ * - first_component parameter allows skipping source components.
+ */
+void
+shuffle_src_to_dst(const fs_builder ,
+   const fs_reg ,
+   const fs_reg ,
+   uint32_t first_component,
+   uint32_t components)
+{
+   if (type_sz(src.type) <= type_sz(dst.type)) {
+  /* Source is shuffled into destination */
+  unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
+#ifndef NDEBUG
+  boolean src_dst_overlap = regions_overlap(dst,
+ type_sz(dst.type) * bld.dispatch_width() * components,
+ offset(src, bld, first_component * size_ratio),
+ type_sz(src.type) * bld.dispatch_width() * components * size_ratio);
+#endif
+  assert(!src_dst_overlap);
+
+  brw_reg_type shuffle_type =
+ brw_reg_type_from_bit_size(8 * type_sz(src.type),
+BRW_REGISTER_TYPE_D);
+  for (unsigned i = 0; i < components; i++) {
+ fs_reg shuffle_component_i =
+subscript(offset(dst, bld, i / size_ratio),
+  shuffle_type, i % size_ratio);
+ bld.MOV(shuffle_component_i,
+ retype(offset(src, bld, i + first_component), shuffle_type));
+  }
+   } else {
+  /* Source is unshuffled into destination */
+  unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
+#ifndef NDEBUG
+  boolean src_dst_overlap = regions_overlap(dst,
+ type_sz(dst.type) * bld.dispatch_width() * components,
+ offset(src, bld, first_component / size_ratio),
+ type_sz(src.type) * bld.dispatch_width() *
+ DIV_ROUND_UP(components + first_component % size_ratio, size_ratio));
+#endif
+  assert(!src_dst_overlap);
+  brw_reg_type shuffle_type =
+ brw_reg_type_from_bit_size(8 * type_sz(dst.type),
+BRW_REGISTER_TYPE_D);
+  for (unsigned i = 0; i < components; i++) {
+ fs_reg shuffle_component_i =
+subscript(offset(src, bld, (first_component + i) / size_ratio),
+  shuffle_type, (first_component + i) % size_ratio);
+ bld.MOV(retype(offset(dst, bld, i),

[Mesa-dev] [PATCH 03/14] intel/compiler: use shuffle_from_32bit_read at VARYING_PULL_CONSTANT_LOAD

2018-06-09 Thread Jose Maria Casanova Crespo

shuffle_from_32bit_read can manage the shuffle/unshuffle needed
for different 8/16/32/64 bit-sizes at VARYING PULL CONSTANT LOAD.
To get the specific component the first_component parameter is used.

In the case of the previous 16-bit shuffle, the shuffle operation was
generating not needed MOVs where its results where never used. This
behaviour passed unnoticed on SIMD16 because dead_code_eliminate
pass removed the generated instructions but for SIMD8 they cound't be
removed because of being partial writes.
---
 src/intel/compiler/brw_fs.cpp | 17 ++---
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index d67c0a41922..410768ef927 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -191,21 +191,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder 
,
 vec4_result, surf_index, vec4_offset);
inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
 
-   fs_reg dw = offset(vec4_result, bld, (const_offset & 0xf) / 4);
-   switch (type_sz(dst.type)) {
-   case 2:
-  shuffle_32bit_load_result_to_16bit_data(bld, dst, dw, 0, 1);
-  bld.MOV(dst, subscript(dw, dst.type, (const_offset / 2) & 1));
-  break;
-   case 4:
-  bld.MOV(dst, retype(dw, dst.type));
-  break;
-   case 8:
-  shuffle_32bit_load_result_to_64bit_data(bld, dst, dw, 1);
-  break;
-   default:
-  unreachable("Unsupported bit_size");
-   }
+   shuffle_from_32bit_read(bld, dst, vec4_result,
+   (const_offset & 0xf) / type_sz(dst.type), 1);
 }
 
 /**
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 05/14] intel/compiler: Use shuffle_from_32bit_write for 16-bits store_ssbo

2018-06-09 Thread Jose Maria Casanova Crespo

---
 src/intel/compiler/brw_fs_nir.cpp | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index ef7895262b8..a54935f7049 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4297,11 +4297,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
  * aligned. Shuffling only one component would be the same as
  * striding it.
  */
-fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D,
-  DIV_ROUND_UP(num_components, 2));
-shuffle_16bit_data_for_32bit_write(bld, tmp, write_src,
-   num_components);
-write_src = tmp;
+write_src = shuffle_for_32bit_write(bld, write_src, 0,
+num_components);
  }
 
  fs_reg offset_reg;
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 07/14] intel/compiler: shuffle_from_32bit_read for 64-bit do_untyped_vector_read

2018-06-09 Thread Jose Maria Casanova Crespo

do_untyped_vector_read is used at load_ssbo and load_shared.

The previous MOVs are removed because shuffle_from_32bit_read
can handle storing the shuffle results in the expected destination
just using the proper offset.
---
 src/intel/compiler/brw_fs_nir.cpp | 12 ++--
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 7e738ade82e..780a9e228de 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2434,16 +2434,8 @@ do_untyped_vector_read(const fs_builder ,
 BRW_PREDICATE_NONE);
 
  /* Shuffle the 32-bit load result into valid 64-bit data */
- const fs_reg packed_result = bld.vgrf(dest.type, iter_components);
- shuffle_32bit_load_result_to_64bit_data(
-bld, packed_result, read_result, iter_components);
-
- /* Move each component to its destination */
- read_result = retype(read_result, BRW_REGISTER_TYPE_DF);
- for (int c = 0; c < iter_components; c++) {
-bld.MOV(offset(dest, bld, it * 2 + c),
-offset(packed_result, bld, c));
- }
+ shuffle_from_32bit_read(bld, offset(dest, bld, it * 2),
+ read_result, 0, iter_components);
 
  bld.ADD(read_offset, read_offset, brw_imm_ud(16));
   }
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 08/14] intel/compiler: enable shuffle_from_32bit_read at 64-bit gs_input_load

2018-06-09 Thread Jose Maria Casanova Crespo

This implementation avoids two unneeded MOVs for each 64-bit
component. One was done in the old shuffle, to avoid cases of
src/dst overlap but this is not the case. And the removed MOV
was already being being done in the shuffle.

Copy propagation wasn't able to remove them because shuffle
destination values are defined with partial writes because they
have stride == 2.
---
 src/intel/compiler/brw_fs_nir.cpp | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 780a9e228de..6abc7c0174d 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2305,11 +2305,11 @@ fs_visitor::emit_gs_input_load(const fs_reg ,
   }
 
   if (type_sz(dst.type) == 8) {
- shuffle_32bit_load_result_to_64bit_data(
-bld, tmp_dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), 
num_components);
-
- for (unsigned c = 0; c < num_components; c++)
-bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c));
+ shuffle_from_32bit_read(bld,
+ offset(dst, bld, iter * 2),
+ retype(tmp_dst, BRW_REGISTER_TYPE_D),
+ 0,
+ num_components);
   }
 
   if (num_iterations > 1) {
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 00/14] intel/compiler: unshuffle/shuffle functions refactoring

2018-06-09 Thread Jose Maria Casanova Crespo

64-bit types support required un/shuffle operations to deal with load/store
operations that use 32-bit components. The introduction of VK_KHR_16bit_storage
support also needed new dedicated shuffle/unshuffle functions.

This series generalizes the different shuffle/unshuffle operations in a new
helper shuffle_src_to_dst for 64/32/16/8-bit cases an creates two specific
functions shuffle_from_32bit_read and shuffle_for_32bit_write to be used
after and before load/store operations.

I revisited all shuffle/unshuffle uses and avoids the emission of
unneeded instructions caused by previously allowing source/destination
overlaps in shuffles. The series refactors that cases to avoid them.

As most of the changes affects 64-bit uses, I used the piglit test shaders to
check the impact of the series with shader-db that is mainly possitive.

Skylake
total instructions in shared programs: 1688394 -> 1661420 (-1.60%)
instructions in affected programs: 370480 -> 343506 (-7.28%)
helped: 1048
HURT: 0

total cycles in shared programs: 52697457 -> 52527275 (-0.32%)
cycles in affected programs: 1783581 -> 1613399 (-9.54%)
helped: 1004
HURT: 41

Cc: Jason Ekstrand 
Cc: Samuel Iglesias 
Cc: Iago Toral 

Jose Maria Casanova Crespo (14):
  intel/compiler: general 8/16/32/64-bit shuffle_src_to_dst function
  intel/compiler: new shuffle_for_32bit_write and shuffle_from_32bit_read
  intel/compiler: use shuffle_from_32bit_read at VARYING_PULL_CONSTANT_LOAD
  intel/compiler: Use shuffle_from_32bit_read to read 16-bit SSBO
  intel/compiler: Use shuffle_from_32bit_write for 16-bits store_ssbo
  intel/compiler: remove old 16-bit shuffle/unshuffle functions
  intel/compiler: shuffle_from_32bit_read for 64-bit do_untyped_vector_read
  intel/compiler: enable shuffle_from_32bit_read at 64-bit gs_input_load
  intel/compiler: Use shuffle_from_32bit_read at VS load_input
  intel/compiler: shuffle_from_32bit_read at load_per_vertex_input at TCS/TES
  intel/compiler: use shuffle_from_32bit_read for 64-bit FS load_input
  intel/compiler: shuffle_32bit_load_result_to_64bit_data is not used anymore
  intel/compiler: use new shuffle_32bit_write for all 64-bit storage writes
  intel/compiler: shuffle_64bit_data_for_32bit_write is not used anymore

 src/intel/compiler/brw_fs.cpp |  17 +-
 src/intel/compiler/brw_fs.h   |  29 +--
 src/intel/compiler/brw_fs_nir.cpp | 315 +-
 3 files changed, 147 insertions(+), 214 deletions(-)

-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 02/14] intel/compiler: new shuffle_for_32bit_write and shuffle_from_32bit_read

2018-06-09 Thread Jose Maria Casanova Crespo

These new shuffle functions deal with the shuffle/unshuffle operations
needed for read/write operations using 32-bit components when the
read/written components have a different bit-size (8, 16, 64-bits).
Shuffle from 32-bit to 32-bit becomes a simple MOV.

As the new function shuffle_src_to_dst takes of doing a shuffle or an
unshuffle based on the different type_sz of source an destination this
generic functions work with any source/destination assuming that writes
use a 32-bit destination or reads use a 32-bit source.

To enable this new functions it is needed than there is no
source/destination overlap in the case of shuffle_from_32bit_read.
That never happens on shuffle_for_32bit_write as it allocates a new
destination register as it was at shuffle_64bit_data_for_32bit_write.
---
 src/intel/compiler/brw_fs.h   | 11 +
 src/intel/compiler/brw_fs_nir.cpp | 38 +++
 2 files changed, 49 insertions(+)

diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index faf51568637..779170ecc95 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -519,6 +519,17 @@ void shuffle_16bit_data_for_32bit_write(const 
brw::fs_builder ,
 const fs_reg ,
 uint32_t components);
 
+void shuffle_from_32bit_read(const brw::fs_builder ,
+ const fs_reg ,
+ const fs_reg ,
+ uint32_t first_component,
+ uint32_t components);
+
+fs_reg shuffle_for_32bit_write(const brw::fs_builder ,
+   const fs_reg ,
+   uint32_t first_component,
+   uint32_t components);
+
 fs_reg setup_imm_df(const brw::fs_builder ,
 double v);
 
diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 1a9d3c41d1d..1f684149fd5 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -5454,6 +5454,44 @@ shuffle_src_to_dst(const fs_builder ,
}
 }
 
+void
+shuffle_from_32bit_read(const fs_builder ,
+const fs_reg ,
+const fs_reg ,
+uint32_t first_component,
+uint32_t components)
+{
+   assert(type_sz(src.type) == 4);
+
+   if (type_sz(dst.type) > 4) {
+  assert(type_sz(dst.type) == 8);
+  first_component *= 2;
+  components *= 2;
+   }
+
+   shuffle_src_to_dst(bld, dst, src, first_component, components);
+}
+
+fs_reg
+shuffle_for_32bit_write(const fs_builder ,
+const fs_reg ,
+uint32_t first_component,
+uint32_t components)
+{
+   fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D,
+ DIV_ROUND_UP (components * type_sz(src.type), 4));
+
+   if (type_sz(src.type) > 4) {
+  assert(type_sz(src.type) == 8);
+  first_component *= 2;
+  components *= 2;
+   }
+
+   shuffle_src_to_dst(bld, dst, src, first_component, components);
+
+   return dst;
+}
+
 fs_reg
 setup_imm_df(const fs_builder , double v)
 {
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v3 1/3] gallium/util: Fix build error due to cast to different size

[Mesa-dev] [PATCH v3 2/3] egl/android: #ifdef out flink name support

[Mesa-dev] [PATCH v3 3/3] egl/android: Add DRM node probing and filtering

[Mesa-dev] [PATCH v3 0/3] egl/android: Remove dependencies on specific grallocs

Re: [Mesa-dev] [PATCH 0/3] egl/android: Remove dependencies on specific grallocs

Re: [Mesa-dev] [PATCH 3/3] egl/android: Add DRM node probing and filtering

[Mesa-dev] [PATCH v2] nv50/ir: improve performance of signed division by powers of two

[Mesa-dev] [Bug 105613] Compute shader locks up within nested "for" loop

[Mesa-dev] [PATCH 13/14] intel/compiler: use new shuffle_32bit_write for all 64-bit storage writes

[Mesa-dev] [PATCH 14/14] intel/compiler: shuffle_64bit_data_for_32bit_write is not used anymore

[Mesa-dev] [PATCH 09/14] intel/compiler: Use shuffle_from_32bit_read at VS load_input

[Mesa-dev] [PATCH 10/14] intel/compiler: shuffle_from_32bit_read at load_per_vertex_input at TCS/TES

[Mesa-dev] [PATCH 12/14] intel/compiler: shuffle_32bit_load_result_to_64bit_data is not used anymore

[Mesa-dev] [PATCH 11/14] intel/compiler: use shuffle_from_32bit_read for 64-bit FS load_input

[Mesa-dev] [PATCH 04/14] intel/compiler: Use shuffle_from_32bit_read to read 16-bit SSBO

[Mesa-dev] [PATCH 06/14] intel/compiler: remove old 16-bit shuffle/unshuffle functions

[Mesa-dev] [PATCH 01/14] intel/compiler: general 8/16/32/64-bit shuffle_src_to_dst function

[Mesa-dev] [PATCH 03/14] intel/compiler: use shuffle_from_32bit_read at VARYING_PULL_CONSTANT_LOAD

[Mesa-dev] [PATCH 05/14] intel/compiler: Use shuffle_from_32bit_write for 16-bits store_ssbo

[Mesa-dev] [PATCH 07/14] intel/compiler: shuffle_from_32bit_read for 64-bit do_untyped_vector_read

[Mesa-dev] [PATCH 08/14] intel/compiler: enable shuffle_from_32bit_read at 64-bit gs_input_load

[Mesa-dev] [PATCH 00/14] intel/compiler: unshuffle/shuffle functions refactoring

[Mesa-dev] [PATCH 02/14] intel/compiler: new shuffle_for_32bit_write and shuffle_from_32bit_read

23 matches

Site Navigation

Mail list logo

Footer information