[Mesa-dev] [PATCH v3 1/3] gallium/util: Fix build error due to cast to different size
Signed-off-by: Robert Foss --- src/gallium/auxiliary/util/u_debug_stack_android.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gallium/auxiliary/util/u_debug_stack_android.cpp b/src/gallium/auxiliary/util/u_debug_stack_android.cpp index b3d56aebe6..395a1fe911 100644 --- a/src/gallium/auxiliary/util/u_debug_stack_android.cpp +++ b/src/gallium/auxiliary/util/u_debug_stack_android.cpp @@ -49,10 +49,10 @@ debug_backtrace_capture(debug_stack_frame *mesa_backtrace, backtrace_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); - backtrace_entry = _mesa_hash_table_search(backtrace_table, (void*) tid); + backtrace_entry = _mesa_hash_table_search(backtrace_table, (void*) (uintptr_t)tid); if (!backtrace_entry) { backtrace = Backtrace::Create(getpid(), tid); - _mesa_hash_table_insert(backtrace_table, (void*) tid, backtrace); + _mesa_hash_table_insert(backtrace_table, (void*) (uintptr_t)tid, backtrace); } else { backtrace = (Backtrace *) backtrace_entry->data; } -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH v3 2/3] egl/android: #ifdef out flink name support
From: Rob Herring Maintaining both flink names and prime fd support which are provided by 2 different gralloc implementations is problematic because we have a dependency on a specific gralloc implementation header. This mostly disables the dependency on the gralloc implementation and headers. The dependency on GRALLOC_MODULE_PERFORM_GET_DRM_FD remains for now, but the definition is added locally to remove the header dependency. drm_gralloc support can be enabled by setting BOARD_USES_DRM_GRALLOC=true in BoardConfig.mk. Signed-off-by: Rob Herring Signed-off-by: Robert Foss --- Changes since RFC: - Instead of removing code, #ifdef it out. src/egl/Android.mk | 6 ++- src/egl/drivers/dri2/egl_dri2.h | 2 - src/egl/drivers/dri2/platform_android.c | 56 +++-- 3 files changed, 49 insertions(+), 15 deletions(-) diff --git a/src/egl/Android.mk b/src/egl/Android.mk index 11818694f4..8412aeb798 100644 --- a/src/egl/Android.mk +++ b/src/egl/Android.mk @@ -57,9 +57,13 @@ LOCAL_SHARED_LIBRARIES := \ libhardware \ liblog \ libcutils \ - libgralloc_drm \ libsync +ifeq ($(BOARD_USES_DRM_GRALLOC),true) + LOCAL_CFLAGS += -DHAVE_DRM_GRALLOC + LOCAL_SHARED_LIBRARIES += libgralloc_drm +endif + ifeq ($(filter $(MESA_ANDROID_MAJOR_VERSION), 4 5 6 7),) LOCAL_SHARED_LIBRARIES += libnativewindow endif diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h index adabc527f8..5d8fbfa235 100644 --- a/src/egl/drivers/dri2/egl_dri2.h +++ b/src/egl/drivers/dri2/egl_dri2.h @@ -67,8 +67,6 @@ struct zwp_linux_dmabuf_v1; #include #include -#include - #endif /* HAVE_ANDROID_PLATFORM */ #include "eglconfig.h" diff --git a/src/egl/drivers/dri2/platform_android.c b/src/egl/drivers/dri2/platform_android.c index 1d6ed92bd6..4ba96aad90 100644 --- a/src/egl/drivers/dri2/platform_android.c +++ b/src/egl/drivers/dri2/platform_android.c @@ -37,7 +37,11 @@ #include "loader.h" #include "egl_dri2.h" #include "egl_dri2_fallbacks.h" + +#ifdef HAVE_DRM_GRALLOC +#include #include "gralloc_drm.h" +#endif /* HAVE_DRM_GRALLOC */ #define ALIGN(val, align) (((val) + (align) - 1) & ~((align) - 1)) @@ -164,11 +168,13 @@ get_native_buffer_fd(struct ANativeWindowBuffer *buf) return (handle && handle->numFds) ? handle->data[0] : -1; } +#ifdef HAVE_DRM_GRALLOC static int get_native_buffer_name(struct ANativeWindowBuffer *buf) { return gralloc_drm_get_gem_handle(buf->handle); } +#endif /* HAVE_DRM_GRALLOC */ static EGLBoolean droid_window_dequeue_buffer(struct dri2_egl_surface *dri2_surf) @@ -838,6 +844,7 @@ droid_create_image_from_prime_fd(_EGLDisplay *disp, _EGLContext *ctx, return dri2_create_image_dma_buf(disp, ctx, NULL, attr_list); } +#ifdef HAVE_DRM_GRALLOC static _EGLImage * droid_create_image_from_name(_EGLDisplay *disp, _EGLContext *ctx, struct ANativeWindowBuffer *buf) @@ -881,6 +888,7 @@ droid_create_image_from_name(_EGLDisplay *disp, _EGLContext *ctx, return _img->base; } +#endif /* HAVE_DRM_GRALLOC */ static EGLBoolean droid_query_surface(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surf, @@ -937,7 +945,11 @@ dri2_create_image_android_native_buffer(_EGLDisplay *disp, if (fd >= 0) return droid_create_image_from_prime_fd(disp, ctx, buf, fd); +#ifdef HAVE_DRM_GRALLOC return droid_create_image_from_name(disp, ctx, buf); +#else + return NULL; +#endif } static _EGLImage * @@ -959,6 +971,7 @@ droid_flush_front_buffer(__DRIdrawable * driDrawable, void *loaderPrivate) { } +#ifdef HAVE_DRM_GRALLOC static int droid_get_buffers_parse_attachments(struct dri2_egl_surface *dri2_surf, unsigned int *attachments, int count) @@ -1034,6 +1047,7 @@ droid_get_buffers_with_format(__DRIdrawable * driDrawable, return dri2_surf->buffers; } +#endif /* HAVE_DRM_GRALLOC */ static unsigned droid_get_capability(void *loaderPrivate, enum dri_loader_cap cap) @@ -1116,6 +1130,14 @@ droid_add_configs_for_visuals(_EGLDriver *drv, _EGLDisplay *dpy) return (config_count != 0); } +enum { +/* perform(const struct gralloc_module_t *mod, + * int op, + * int *fd); + */ +GRALLOC_MODULE_PERFORM_GET_DRM_FD = 0x4002, +}; + static int droid_open_device(struct dri2_egl_display *dri2_dpy) { @@ -1158,6 +1180,7 @@ static const struct dri2_egl_display_vtbl droid_display_vtbl = { .get_dri_drawable = dri2_surface_get_dri_drawable, }; +#ifdef HAVE_DRM_GRALLOC static const __DRIdri2LoaderExtension droid_dri2_loader_extension = { .base = { __DRI_DRI2_LOADER, 4 }, @@ -1166,6 +1189,7 @@ static const __DRIdri2LoaderExtension droid_dri2_loader_extension = { .getBuffersWithFormat = droid_get_buffers_with_format, .getCapability= droid_get_capability, }; +#endif /* HAVE_DRM_GRALLOC */ static const
[Mesa-dev] [PATCH v3 3/3] egl/android: Add DRM node probing and filtering
This patch both adds support for probing & filtering DRM nodes and switches away from using the GRALLOC_MODULE_PERFORM_GET_DRM_FD gralloc call. Currently the filtering is based just on the driver name, and the desired name is supplied using the "drm.gpu.vendor_name" Android property. Signed-off-by: Robert Foss --- Changes since v2: - Switch from drmGetDevices2 to manual renderD node iteration - Add probe_res enum to communicate probing results better - Avoid using _eglError() in internal static functions - Avoid actually loading the driver while probing, just verify that it exists. - Replace strlen call with the assumed length PROPERTY_VALUE_MAX Changes since v1: - Do not rely on libdrm for probing - Distinguish between errors and when no drm devices are found Changes since RFC: - Rebased on newer libdrm drmHandleMatch patch - Added support for driver probing src/egl/drivers/dri2/platform_android.c | 222 ++-- 1 file changed, 169 insertions(+), 53 deletions(-) diff --git a/src/egl/drivers/dri2/platform_android.c b/src/egl/drivers/dri2/platform_android.c index 4ba96aad90..a2cbe92d93 100644 --- a/src/egl/drivers/dri2/platform_android.c +++ b/src/egl/drivers/dri2/platform_android.c @@ -27,12 +27,16 @@ * DEALINGS IN THE SOFTWARE. */ +#include #include +#include #include #include #include #include +#include #include +#include #include "loader.h" #include "egl_dri2.h" @@ -1130,31 +1134,6 @@ droid_add_configs_for_visuals(_EGLDriver *drv, _EGLDisplay *dpy) return (config_count != 0); } -enum { -/* perform(const struct gralloc_module_t *mod, - * int op, - * int *fd); - */ -GRALLOC_MODULE_PERFORM_GET_DRM_FD = 0x4002, -}; - -static int -droid_open_device(struct dri2_egl_display *dri2_dpy) -{ - int fd = -1, err = -EINVAL; - - if (dri2_dpy->gralloc->perform) - err = dri2_dpy->gralloc->perform(dri2_dpy->gralloc, - GRALLOC_MODULE_PERFORM_GET_DRM_FD, - ); - if (err || fd < 0) { - _eglLog(_EGL_WARNING, "fail to get drm fd"); - fd = -1; - } - - return (fd >= 0) ? fcntl(fd, F_DUPFD_CLOEXEC, 3) : -1; -} - static const struct dri2_egl_display_vtbl droid_display_vtbl = { .authenticate = NULL, .create_window_surface = droid_create_window_surface, @@ -1215,6 +1194,168 @@ static const __DRIextension *droid_image_loader_extensions[] = { NULL, }; +EGLBoolean +droid_load_driver(_EGLDisplay *disp) +{ + struct dri2_egl_display *dri2_dpy = disp->DriverData; + const char *err; + + dri2_dpy->driver_name = loader_get_driver_for_fd(dri2_dpy->fd); + if (dri2_dpy->driver_name == NULL) + return false; + + dri2_dpy->is_render_node = drmGetNodeTypeFromFd(dri2_dpy->fd) == DRM_NODE_RENDER; + + if (!dri2_dpy->is_render_node) { + #ifdef HAVE_DRM_GRALLOC + /* Handle control nodes using __DRI_DRI2_LOADER extension and GEM names +* for backwards compatibility with drm_gralloc. (Do not use on new +* systems.) */ + dri2_dpy->loader_extensions = droid_dri2_loader_extensions; + if (!dri2_load_driver(disp)) { + err = "DRI2: failed to load driver"; + goto error; + } + #else + err = "DRI2: handle is not for a render node"; + goto error; + #endif + } else { + dri2_dpy->loader_extensions = droid_image_loader_extensions; + if (!dri2_load_driver_dri3(disp)) { + err = "DRI3: failed to load driver"; + goto error; + } +} + + return true; + +error: + free(dri2_dpy->driver_name); + dri2_dpy->driver_name = NULL; + return false; +} + +static bool +droid_probe_driver(int fd) +{ + char *driver_name; + + driver_name = loader_get_driver_for_fd(fd); + if (driver_name == NULL) + return false; + + free(driver_name); + return true; +} + +typedef enum { + probe_error = -1, + probe_success = 0, + probe_filtered_out = 1, + probe_no_driver = 2 +} probe_ret_t; + +static probe_ret_t +droid_probe_device(_EGLDisplay *disp, int fd, char *vendor) +{ + int ret; + + drmVersionPtr ver = drmGetVersion(fd); + if (!ver) + return probe_error; + + if (vendor != NULL && ver->name != NULL && + strncmp(vendor, ver->name, PROPERTY_VALUE_MAX) != 0) { + ret = probe_filtered_out; + goto cleanup; + } + + + if (!droid_probe_driver(fd)) { + ret = probe_no_driver; + goto cleanup; + } + + ret = probe_success; + +cleanup: + drmFreeVersion(ver); + return ret; +} + +static int +droid_open_device(_EGLDisplay *disp) +{ + const int MAX_DRM_DEVICES = 32; + int prop_set, num_devices; + int fd = -1, fallback_fd = -1; + + char *vendor_name = NULL; + char vendor_buf[PROPERTY_VALUE_MAX]; + if (property_get("drm.gpu.vendor_name", vendor_buf, NULL) > 0); + vendor_name = vendor_buf; + + const char *drm_dir_name =
[Mesa-dev] [PATCH v3 0/3] egl/android: Remove dependencies on specific grallocs
This series replaces the dependency on GRALLOC_MODULE_PERFORM_GET_DRM_FD with DRM node probing and disables the support for drm_gralloc. The series has been tested on Qemu+AOSP, where a virtio gpu was successfully probed for and opened. Changes since v2: - Fixed whitespace issue - Diversified return codes from probing functions - Switched away from using drmGetDevices2, to iterating over /dev/dir/renderD nodes manually Changes since v1: - Added fix for build issue - Do not rely on libdrm for probing - Distinguish between errors and when no drm devices are found Changes since RFC: - Rebased work on the libdrm patch [2]. - Included patch from Rob Herring disabling drm_gralloc/flink support by default. - Added device handler driver probing. Rob Herring (1): egl/android: #ifdef out flink name support Robert Foss (2): gallium/util: Fix build error due to cast to different size egl/android: Add DRM node probing and filtering src/egl/Android.mk| 6 +- src/egl/drivers/dri2/egl_dri2.h | 2 - src/egl/drivers/dri2/platform_android.c | 226 +++--- .../auxiliary/util/u_debug_stack_android.cpp | 4 +- 4 files changed, 194 insertions(+), 44 deletions(-) -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 0/3] egl/android: Remove dependencies on specific grallocs
Hey, On 2018-05-25 17:38, Rob Herring wrote: On Fri, May 25, 2018 at 9:25 AM, Tomasz Figa wrote: On Fri, May 25, 2018 at 10:59 PM Rob Herring wrote: On Fri, May 25, 2018 at 4:15 AM, Robert Foss wrote: On 2018-05-25 10:38, Tomasz Figa wrote: On Fri, May 25, 2018 at 5:33 PM Robert Foss wrote: Hey, On 2018-05-25 02:17, Rob Herring wrote: On Thu, May 24, 2018 at 6:23 AM, Robert Foss < robert.f...@collabora.com> wrote: Hey, I don't think I've received any feedback on this version yet. If anyone has some time to spare, it would be nice to get it merged. Just to be clear about the libdrm branch linked in the cover letter, it is not required. Only for virgl platforms which happens to be what I tested on. virgl will still fallback to using the first render node without those libdrm changes, right? If not, I don't think we should apply until we're not breaking a platform... No it will not fall back. I agree that holding off makes more sense. What's the reason of this problems? Is it because of drmGetDevices()? Since we don't really use it for anything other than getting the list of render nodes in the system, maybe we could just iterate over any /dev/renderD* nodes explicitly and avoid introducing new problems? That's exactly the problem, and yes we could 100% solve by iterating over /dev/renderD* nodes. I originally assumed we wouldn't want to do that, but rather use the libdrm interfaces. But for the next spin I could avoid using libdrm, should I? I don't have an opinion on libdrm really, but I do think we should fallback to the 1st (only) render node rather than just fail. We do, even with libdrm. AFAICT, the problem with virgl seems to be that drmGetDevices() doesn't include devices on virtio bus in the results, which means that there likely wouldn't be any render node returned. Okay. I still don't get why we search by bus in the first place. Who cares what bus the gpu sits on. Now I have an opinion. We should just iterate over render nodes matching by name or use the first node if we don't have a set name. For v3 I implemented a version with drmGetDevices2 replaced with iteration over the /dev/dri/renderD* nodes. That still means doing all of the other filtering, however. Rob. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 3/3] egl/android: Add DRM node probing and filtering
Hey Tomasz, On 2018-05-25 09:27, Tomasz Figa wrote: > Hi Rob, > > Finally got to review this. Please see my comments inline. > > On Fri, May 11, 2018 at 10:48 PM Robert Foss > wrote: > [snip] >> +EGLBoolean >> +droid_load_driver(_EGLDisplay *disp) > > Since this is not EGL-facing, I'd personally use bool. > >> +{ >> + struct dri2_egl_display *dri2_dpy = disp->DriverData; >> + const char *err; >> + >> + dri2_dpy->driver_name = loader_get_driver_for_fd(dri2_dpy->fd); >> + if (dri2_dpy->driver_name == NULL) { >> + err = "DRI2: failed to get driver name"; >> + goto error; > > It shouldn't be an error if there is no driver for given render node. We > should just skip it and try next one, which I believe would be achieved by > just returning false here. > >> + } >> + >> + dri2_dpy->is_render_node = drmGetNodeTypeFromFd(dri2_dpy->fd) == > DRM_NODE_RENDER; >> + >> + if (!dri2_dpy->is_render_node) { >> + #ifdef HAVE_DRM_GRALLOC >> + /* Handle control nodes using __DRI_DRI2_LOADER extension and GEM > names >> +* for backwards compatibility with drm_gralloc. (Do not use on > new >> +* systems.) */ >> + dri2_dpy->loader_extensions = droid_dri2_loader_extensions; >> + if (!dri2_load_driver(disp)) { >> + err = "DRI2: failed to load driver"; >> + goto error; >> + } >> + #else >> + err = "DRI2: handle is not for a render node"; >> + goto error; >> + #endif >> + } else { >> + dri2_dpy->loader_extensions = droid_image_loader_extensions; >> + if (!dri2_load_driver_dri3(disp)) { >> + err = "DRI3: failed to load driver"; >> + goto error; >> + } >> +} >> + >> + return EGL_TRUE; >> + >> +error: >> + free(dri2_dpy->driver_name); >> + dri2_dpy->driver_name = NULL; >> + return _eglError(EGL_NOT_INITIALIZED, err); > > Hmm, if we signal EGL error here, we should break the probing loop and just > bail out. This would suggest that a boolean is not the right type for this > function to return. Perhaps something like negative error, 0 for skip and 1 > for success would make sense? > > Also, how does it play with the _eglError() called from the error path of > dri2_initialize_android()? I can't say I put any though into that aspect, but dri2_initialize_android() would overwrite it. So maybe completely avoiding _eglError() in droid_load_driver() is the way to go. > >> +} >> + >> +static int >> +droid_probe_driver(_EGLDisplay *disp, int fd) >> +{ >> + struct dri2_egl_display *dri2_dpy = disp->DriverData; >> + dri2_dpy->fd = fd; >> + >> + if (!droid_load_driver(disp)) >> + return false; > > Given my other suggestion about distinguishing failure, render node skip > and success, I think it should be more like this: > > int ret = droid_load_driver(disp); > if (ret <= 0) > return ret; > > Or actually, maybe we don't really need to go as far as loading the driver. > I'd say it should be enough to just check if we have a driver for the > device by looking at what loader_get_driver_for_fd() returns. (In that > case, we can ignore my comment about returning error on > loader_get_driver_for_fd() failure in droid_load_driver(), since the > skipping would be handling only here.) If we don't need to actually load the driver, I think all of the above comments can be fixed by just removing chunks out of dri related setup. > >> + >> + /* Since this probe can succeed, but another filter may fail, > > What another filter could fail? I can see the vendor name being checked > before calling this function. > > The free() below is actually needed, just the comment is off. We need to > free the name to be able to probe remaining nodes, without leaking the name. Ack, fixed by the above change. > >> + this string needs to be deallocated either way. >> + Once an FD has been found, this string will be set a second time. > */ >> + free(dri2_dpy->driver_name); > > Don't we also need to unload the driver? Ack, fixed by the above change. > >> + dri2_dpy->driver_name = NULL; >> + return true; > > To match the change above: > > return 1; > Ack, fixed by the above change. >> +} >> + >> +static int >> +droid_probe_device(_EGLDisplay *disp, int fd, drmDevicePtr dev, char > *vendor) >> +{ >> + drmVersionPtr ver = drmGetVersion(fd); >> + if (!ver) >> + goto fail; > > Something wrong with indentation here. Ack. > >> + >> + size_t vendor_len = strlen(vendor); >> + if (vendor_len != 0 && strncmp(vendor, ver->name, vendor_len)) >> + goto fail; > > Maybe it's just me, but I don't see any point in using strncmp() if the > length argument is obtained by calling strlen() first. Especially if the > strlen() call is on a string that comes from some external code > (property_get()). > > Perhaps we could just call strncmp() with PROPERTY_VALUE_MAX? This would > actually play nice with my other comment about using NULL for vendor > string, if
[Mesa-dev] [PATCH v2] nv50/ir: improve performance of signed division by powers of two
Changes in v2: - Stylistic changes - Use OP_SLCT instead of OP_SELP which only worked by luck - Fix issues in edge cases Signed-off-by: Rhys Perry --- .../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 30 +++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 39177bd044..d636eb130a 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -1095,10 +1095,36 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue , int s) i->op = OP_MOV; i->setSrc(1, NULL); } else + if (imm0.reg.data.s32 == -1) { + i->op = OP_NEG; + i->setSrc(1, NULL); + } else if (i->dType == TYPE_U32 && imm0.isPow2()) { i->op = OP_SHR; i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32))); } else + if (i->dType == TYPE_S32 && util_is_power_of_two_or_zero(llabs(imm0.reg.data.s32))) { + Value *a = i->getSrc(0); + int64_t absb = llabs(imm0.reg.data.s32); + + Value *sign = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), a, bld.mkImm(31)); + Value *adjusted = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), a, + bld.loadImm(bld.getSSA(), (uint32_t)(absb - 1))); + + Value *selected = bld.getSSA(); + bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, selected, TYPE_U32, adjusted, a, sign); + + if (imm0.reg.data.s32 < 0) { +i->op = OP_NEG; +i->setSrc(0, bld.mkOp2v( + OP_SHR, TYPE_S32, bld.getSSA(), selected, bld.mkImm(util_logbase2(absb; +i->setSrc(1, NULL); + } else { +i->op = OP_SHR; +i->setSrc(0, selected); +i->setSrc(1, bld.mkImm(util_logbase2(absb))); + } + } else if (i->dType == TYPE_U32) { Instruction *mul; Value *tA, *tB; @@ -1129,10 +1155,6 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue , int s) bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s)); delete_Instruction(prog, i); - } else - if (imm0.reg.data.s32 == -1) { - i->op = OP_NEG; - i->setSrc(1, NULL); } else { LValue *tA, *tB; LValue *tD; -- 2.14.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [Bug 105613] Compute shader locks up within nested "for" loop
https://bugs.freedesktop.org/show_bug.cgi?id=105613 Samuel Pitoiset changed: What|Removed |Added Status|NEW |RESOLVED Resolution|--- |FIXED --- Comment #9 from Samuel Pitoiset --- Fixed with https://cgit.freedesktop.org/mesa/mesa/commit/?id=135e4d434f622fa1d7275bdb72f859e1c1b1976e -- You are receiving this mail because: You are the assignee for the bug. You are the QA Contact for the bug.___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 13/14] intel/compiler: use new shuffle_32bit_write for all 64-bit storage writes
--- src/intel/compiler/brw_fs_nir.cpp | 13 ++--- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 2521f3c001b..833fad4247a 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -2839,8 +2839,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder , * for that. */ unsigned channel = iter * 2 + i; - fs_reg dest = shuffle_64bit_data_for_32bit_write(bld, - offset(value, bld, channel), 1); + fs_reg dest = shuffle_for_32bit_write(bld, value, channel, 1); srcs[header_regs + (i + first_component) * 2] = dest; srcs[header_regs + (i + first_component) * 2 + 1] = @@ -3694,8 +3693,8 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder , unsigned type_size = 4; if (nir_src_bit_size(instr->src[0]) == 64) { type_size = 8; - val_reg = shuffle_64bit_data_for_32bit_write(bld, -val_reg, instr->num_components); + val_reg = shuffle_for_32bit_write(bld, val_reg, 0, + instr->num_components); } unsigned type_slots = type_size / 4; @@ -4236,8 +4235,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , nir_intrinsic_instr *instr * iteration handle the rest. */ num_components = MIN2(2, num_components); -write_src = shuffle_64bit_data_for_32bit_write(bld, write_src, - num_components); +write_src = shuffle_for_32bit_write(bld, write_src, 0, +num_components); } else if (type_size < 4) { assert(type_size == 2); /* For 16-bit types we pack two consecutive values into a 32-bit @@ -4333,7 +4332,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , nir_intrinsic_instr *instr unsigned num_components = instr->num_components; unsigned first_component = nir_intrinsic_component(instr); if (nir_src_bit_size(instr->src[0]) == 64) { - src = shuffle_64bit_data_for_32bit_write(bld, src, num_components); + src = shuffle_for_32bit_write(bld, src, 0, num_components); num_components *= 2; } -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 14/14] intel/compiler: shuffle_64bit_data_for_32bit_write is not used anymore
--- src/intel/compiler/brw_fs.h | 4 src/intel/compiler/brw_fs_nir.cpp | 32 --- 2 files changed, 36 deletions(-) diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 1f86f17ccbb..17b1368d522 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -499,10 +499,6 @@ private: void *mem_ctx; }; -fs_reg shuffle_64bit_data_for_32bit_write(const brw::fs_builder , - const fs_reg , - uint32_t components); - void shuffle_from_32bit_read(const brw::fs_builder , const fs_reg , const fs_reg , diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 833fad4247a..f68fe5f1d1a 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -5187,38 +5187,6 @@ fs_visitor::nir_emit_jump(const fs_builder , nir_jump_instr *instr) } } -/** - * This helper does the inverse operation of - * SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA. - * - * We need to do this when we are going to use untyped write messsages that - * operate with 32-bit components in order to arrange our 64-bit data to be - * in the expected layout. - * - * Notice that callers of this function, unlike in the case of the inverse - * operation, would typically need to call this with dst and src being - * different registers, since they would otherwise corrupt the original - * 64-bit data they are about to write. Because of this the function checks - * that the src and dst regions involved in the operation do not overlap. - */ -fs_reg -shuffle_64bit_data_for_32bit_write(const fs_builder , - const fs_reg , - uint32_t components) -{ - assert(type_sz(src.type) == 8); - - fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D, 2 * components); - - for (unsigned i = 0; i < components; i++) { - const fs_reg component_i = offset(src, bld, i); - bld.MOV(offset(dst, bld, 2 * i), subscript(component_i, dst.type, 0)); - bld.MOV(offset(dst, bld, 2 * i + 1), subscript(component_i, dst.type, 1)); - } - - return dst; -} - /* * This helper takes a source register and un/shuffles it into the destination * register. -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 09/14] intel/compiler: Use shuffle_from_32bit_read at VS load_input
shuffle_from_32bit_read manages 32-bit reads to 32-bit destination in the same way that the previous loop so now we just call the new function for all bitsizes, simplifying also the 64-bit load_input. --- src/intel/compiler/brw_fs_nir.cpp | 12 ++-- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 6abc7c0174d..fedf3bf5a83 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -2483,16 +2483,8 @@ fs_visitor::nir_emit_vs_intrinsic(const fs_builder , if (type_sz(dest.type) == 8) first_component /= 2; - for (unsigned j = 0; j < num_components; j++) { - bld.MOV(offset(dest, bld, j), offset(src, bld, j + first_component)); - } - - if (type_sz(dest.type) == 8) { - shuffle_32bit_load_result_to_64bit_data(bld, - dest, - retype(dest, BRW_REGISTER_TYPE_F), - instr->num_components); - } + shuffle_from_32bit_read(bld, dest, retype(src, BRW_REGISTER_TYPE_D), + first_component, num_components); break; } -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 10/14] intel/compiler: shuffle_from_32bit_read at load_per_vertex_input at TCS/TES
Previously, the shuffle function had a source/destination overlap that needs to be avoided to use shuffle_from_32bit_read. As we can use for the shuffle destination the destination of removed MOVs. This change also avoids the internal MOVs done by the previous shuffle to deal with possible overlaps. --- src/intel/compiler/brw_fs_nir.cpp | 22 -- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index fedf3bf5a83..11b707e57a8 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -2662,13 +2662,10 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder , * or SSBOs. */ if (type_sz(dst.type) == 8) { -shuffle_32bit_load_result_to_64bit_data( - bld, dst, retype(dst, BRW_REGISTER_TYPE_F), num_components); - -for (unsigned c = 0; c < num_components; c++) { - bld.MOV(offset(orig_dst, bld, iter * 2 + c), - offset(dst, bld, c)); -} +shuffle_from_32bit_read(bld, +offset(orig_dst, bld, iter * 2), +retype(dst, BRW_REGISTER_TYPE_D), +0, num_components); } /* Copy the temporary to the destination to deal with writemasking. @@ -3011,13 +3008,10 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder , * or SSBOs. */ if (type_sz(dest.type) == 8) { - shuffle_32bit_load_result_to_64bit_data( - bld, dest, retype(dest, BRW_REGISTER_TYPE_F), num_components); - - for (unsigned c = 0; c < num_components; c++) { - bld.MOV(offset(orig_dest, bld, iter * 2 + c), - offset(dest, bld, c)); - } + shuffle_from_32bit_read(bld, + offset(orig_dest, bld, iter * 2), + retype(dest, BRW_REGISTER_TYPE_D), + 0, num_components); } /* If we are loading double data and we need a second read message -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 12/14] intel/compiler: shuffle_32bit_load_result_to_64bit_data is not used anymore
--- src/intel/compiler/brw_fs.h | 5 --- src/intel/compiler/brw_fs_nir.cpp | 53 --- 2 files changed, 58 deletions(-) diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index d72164ae0b6..1f86f17ccbb 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -499,11 +499,6 @@ private: void *mem_ctx; }; -void shuffle_32bit_load_result_to_64bit_data(const brw::fs_builder , - const fs_reg , - const fs_reg , - uint32_t components); - fs_reg shuffle_64bit_data_for_32bit_write(const brw::fs_builder , const fs_reg , uint32_t components); diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 7e0ef2f34a9..2521f3c001b 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -5188,59 +5188,6 @@ fs_visitor::nir_emit_jump(const fs_builder , nir_jump_instr *instr) } } -/** - * This helper takes the result of a load operation that reads 32-bit elements - * in this format: - * - * x x x x x x x x - * y y y y y y y y - * z z z z z z z z - * w w w w w w w w - * - * and shuffles the data to get this: - * - * x y x y x y x y - * x y x y x y x y - * z w z w z w z w - * z w z w z w z w - * - * Which is exactly what we want if the load is reading 64-bit components - * like doubles, where x represents the low 32-bit of the x double component - * and y represents the high 32-bit of the x double component (likewise with - * z and w for double component y). The parameter @components represents - * the number of 64-bit components present in @src. This would typically be - * 2 at most, since we can only fit 2 double elements in the result of a - * vec4 load. - * - * Notice that @dst and @src can be the same register. - */ -void -shuffle_32bit_load_result_to_64bit_data(const fs_builder , -const fs_reg , -const fs_reg , -uint32_t components) -{ - assert(type_sz(src.type) == 4); - assert(type_sz(dst.type) == 8); - - /* A temporary that we will use to shuffle the 32-bit data of each -* component in the vector into valid 64-bit data. We can't write directly -* to dst because dst can be (and would usually be) the same as src -* and in that case the first MOV in the loop below would overwrite the -* data read in the second MOV. -*/ - fs_reg tmp = bld.vgrf(dst.type); - - for (unsigned i = 0; i < components; i++) { - const fs_reg component_i = offset(src, bld, 2 * i); - - bld.MOV(subscript(tmp, src.type, 0), component_i); - bld.MOV(subscript(tmp, src.type, 1), offset(component_i, bld, 1)); - - bld.MOV(offset(dst, bld, i), tmp); - } -} - /** * This helper does the inverse operation of * SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA. -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 11/14] intel/compiler: use shuffle_from_32bit_read for 64-bit FS load_input
As the previous use of shuffle_32bit_load_result_to_64bit_data had a source/destination overlap for 64-bit. Now a temporal destination is used for 64-bit cases to use shuffle_from_32bit_read that doesn't handle src/dst overlaps. --- src/intel/compiler/brw_fs_nir.cpp | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 11b707e57a8..7e0ef2f34a9 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -3350,6 +3350,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder , unsigned base = nir_intrinsic_base(instr); unsigned comp = nir_intrinsic_component(instr); unsigned num_components = instr->num_components; + fs_reg orig_dest = dest; enum brw_reg_type type = dest.type; /* Special case fields in the VUE header */ @@ -3365,6 +3366,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder , */ type = BRW_REGISTER_TYPE_F; num_components *= 2; + dest = bld.vgrf(type, num_components); } for (unsigned int i = 0; i < num_components; i++) { @@ -3373,10 +3375,8 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder , } if (nir_dest_bit_size(instr->dest) == 64) { - shuffle_32bit_load_result_to_64bit_data(bld, - dest, - retype(dest, type), - instr->num_components); + shuffle_from_32bit_read(bld, orig_dest, dest, 0, + instr->num_components); } break; } -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 04/14] intel/compiler: Use shuffle_from_32bit_read to read 16-bit SSBO
Using shuffle_from_32bit_read instead of 16-bit shuffle functions avoids the need of retype. At the same time new function are ready for 8-bit type SSBO reads. --- src/intel/compiler/brw_fs_nir.cpp | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 1f684149fd5..ef7895262b8 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -2372,10 +2372,8 @@ do_untyped_vector_read(const fs_builder , 1 /* dims */, num_components_32bit, BRW_PREDICATE_NONE); - shuffle_32bit_load_result_to_16bit_data(bld, - retype(dest, BRW_REGISTER_TYPE_W), - retype(read_result, BRW_REGISTER_TYPE_D), - first_component, num_components); + shuffle_from_32bit_read(bld, dest, read_result, first_component, + num_components); } else { fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); for (unsigned i = 0; i < num_components; i++) { -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 06/14] intel/compiler: remove old 16-bit shuffle/unshuffle functions
--- src/intel/compiler/brw_fs.h | 11 -- src/intel/compiler/brw_fs_nir.cpp | 62 --- 2 files changed, 73 deletions(-) diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 779170ecc95..d72164ae0b6 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -508,17 +508,6 @@ fs_reg shuffle_64bit_data_for_32bit_write(const brw::fs_builder , const fs_reg , uint32_t components); -void shuffle_32bit_load_result_to_16bit_data(const brw::fs_builder , - const fs_reg , - const fs_reg , - uint32_t first_component, - uint32_t components); - -void shuffle_16bit_data_for_32bit_write(const brw::fs_builder , -const fs_reg , -const fs_reg , -uint32_t components); - void shuffle_from_32bit_read(const brw::fs_builder , const fs_reg , const fs_reg , diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index a54935f7049..7e738ade82e 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -5263,40 +5263,6 @@ shuffle_32bit_load_result_to_64bit_data(const fs_builder , } } -void -shuffle_32bit_load_result_to_16bit_data(const fs_builder , -const fs_reg , -const fs_reg , -uint32_t first_component, -uint32_t components) -{ - assert(type_sz(src.type) == 4); - assert(type_sz(dst.type) == 2); - - /* A temporary is used to un-shuffle the 32-bit data of each component in -* into a valid 16-bit vector. We can't write directly to dst because it -* can be the same register as src and in that case the first MOV in the -* loop below would overwrite the data read in the second MOV. -*/ - fs_reg tmp = retype(bld.vgrf(src.type), dst.type); - - for (unsigned i = 0; i < components; i++) { - const fs_reg component_i = - subscript(offset(src, bld, (first_component + i) / 2), dst.type, - (first_component + i) % 2); - - bld.MOV(offset(tmp, bld, i % 2), component_i); - - if (i % 2) { - bld.MOV(offset(dst, bld, i -1), offset(tmp, bld, 0)); - bld.MOV(offset(dst, bld, i), offset(tmp, bld, 1)); - } - } - if (components % 2) { - bld.MOV(offset(dst, bld, components - 1), tmp); - } -} - /** * This helper does the inverse operation of * SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA. @@ -5329,34 +5295,6 @@ shuffle_64bit_data_for_32bit_write(const fs_builder , return dst; } -void -shuffle_16bit_data_for_32bit_write(const fs_builder , - const fs_reg , - const fs_reg , - uint32_t components) -{ - assert(type_sz(src.type) == 2); - assert(type_sz(dst.type) == 4); - - /* A temporary is used to shuffle the 16-bit data of each component in the -* 32-bit data vector. We can't write directly to dst because it can be the -* same register as src and in that case the first MOV in the loop below -* would overwrite the data read in the second MOV. -*/ - fs_reg tmp = bld.vgrf(dst.type); - - for (unsigned i = 0; i < components; i++) { - const fs_reg component_i = offset(src, bld, i); - bld.MOV(subscript(tmp, src.type, i % 2), component_i); - if (i % 2) { - bld.MOV(offset(dst, bld, i / 2), tmp); - } - } - if (components % 2) { - bld.MOV(offset(dst, bld, components / 2), tmp); - } -} - /* * This helper takes a source register and un/shuffles it into the destination * register. -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 01/14] intel/compiler: general 8/16/32/64-bit shuffle_src_to_dst function
This new function takes care of shuffle/unshuffle components of a particular bit-size in components with a different bit-size. If source type size is smaller than destination type size the operation needed is a component shuffle. The opposite case would be an unshuffle. The operation allows to skip first_component number of components from the source. Shuffle MOVs are retyped using integer types avoiding problems with denorms and float types. This allows to simplify uses of shuffle functions that are dealing with these retypes individually. Now there is a new restriction so source and destination can not overlap anymore when calling this suffle function. Following patches that migrate to use this new function will take care individually of avoiding source and destination overlaps. --- src/intel/compiler/brw_fs_nir.cpp | 92 +++ 1 file changed, 92 insertions(+) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 166da0aa6d7..1a9d3c41d1d 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -5362,6 +5362,98 @@ shuffle_16bit_data_for_32bit_write(const fs_builder , } } +/* + * This helper takes a source register and un/shuffles it into the destination + * register. + * + * If source type size is smaller than destination type size the operation + * needed is a component shuffle. The opposite case would be an unshuffle. If + * source/destination type size is equal a shuffle is done that would be + * equivalent to a simple MOV. + * + * For example, if source is a 16-bit type and destination is 32-bit. A 3 + * components .xyz 16-bit vector on SIMD8 would be. + * + *|x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8| + *|z1|z2|z3|z4|z5|z6|z7|z8| | | | | | | | | + * + * This helper will return the following 2 32-bit components with the 16-bit + * values shuffled: + * + *|x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8| + *|z1 |z2 |z3 |z4 |z5 |z6 |z7 |z8 | + * + * For unshuffle, the example would be the opposite, a 64-bit type source + * and a 32-bit destination. A 2 component .xy 64-bit vector on SIMD8 + * would be: + * + *| x1l x1h | x2l x2h | x3l x3h | x4l x4h | + *| x5l x5h | x6l x6h | x7l x7h | x8l x8h | + *| y1l y1h | y2l y2h | y3l y3h | y4l y4h | + *| y5l y5h | y6l y6h | y7l y7h | y8l y8h | + * + * The returned result would be the following 4 32-bit components unshuffled: + * + *| x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l | + *| x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h | + *| y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l | + *| y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h | + * + * - Source and destination register must not be overlapped. + * - first_component parameter allows skipping source components. + */ +void +shuffle_src_to_dst(const fs_builder , + const fs_reg , + const fs_reg , + uint32_t first_component, + uint32_t components) +{ + if (type_sz(src.type) <= type_sz(dst.type)) { + /* Source is shuffled into destination */ + unsigned size_ratio = type_sz(dst.type) / type_sz(src.type); +#ifndef NDEBUG + boolean src_dst_overlap = regions_overlap(dst, + type_sz(dst.type) * bld.dispatch_width() * components, + offset(src, bld, first_component * size_ratio), + type_sz(src.type) * bld.dispatch_width() * components * size_ratio); +#endif + assert(!src_dst_overlap); + + brw_reg_type shuffle_type = + brw_reg_type_from_bit_size(8 * type_sz(src.type), +BRW_REGISTER_TYPE_D); + for (unsigned i = 0; i < components; i++) { + fs_reg shuffle_component_i = +subscript(offset(dst, bld, i / size_ratio), + shuffle_type, i % size_ratio); + bld.MOV(shuffle_component_i, + retype(offset(src, bld, i + first_component), shuffle_type)); + } + } else { + /* Source is unshuffled into destination */ + unsigned size_ratio = type_sz(src.type) / type_sz(dst.type); +#ifndef NDEBUG + boolean src_dst_overlap = regions_overlap(dst, + type_sz(dst.type) * bld.dispatch_width() * components, + offset(src, bld, first_component / size_ratio), + type_sz(src.type) * bld.dispatch_width() * + DIV_ROUND_UP(components + first_component % size_ratio, size_ratio)); +#endif + assert(!src_dst_overlap); + brw_reg_type shuffle_type = + brw_reg_type_from_bit_size(8 * type_sz(dst.type), +BRW_REGISTER_TYPE_D); + for (unsigned i = 0; i < components; i++) { + fs_reg shuffle_component_i = +subscript(offset(src, bld, (first_component + i) / size_ratio), + shuffle_type, (first_component + i) % size_ratio); + bld.MOV(retype(offset(dst, bld, i),
[Mesa-dev] [PATCH 03/14] intel/compiler: use shuffle_from_32bit_read at VARYING_PULL_CONSTANT_LOAD
shuffle_from_32bit_read can manage the shuffle/unshuffle needed for different 8/16/32/64 bit-sizes at VARYING PULL CONSTANT LOAD. To get the specific component the first_component parameter is used. In the case of the previous 16-bit shuffle, the shuffle operation was generating not needed MOVs where its results where never used. This behaviour passed unnoticed on SIMD16 because dead_code_eliminate pass removed the generated instructions but for SIMD8 they cound't be removed because of being partial writes. --- src/intel/compiler/brw_fs.cpp | 17 ++--- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index d67c0a41922..410768ef927 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -191,21 +191,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder , vec4_result, surf_index, vec4_offset); inst->size_written = 4 * vec4_result.component_size(inst->exec_size); - fs_reg dw = offset(vec4_result, bld, (const_offset & 0xf) / 4); - switch (type_sz(dst.type)) { - case 2: - shuffle_32bit_load_result_to_16bit_data(bld, dst, dw, 0, 1); - bld.MOV(dst, subscript(dw, dst.type, (const_offset / 2) & 1)); - break; - case 4: - bld.MOV(dst, retype(dw, dst.type)); - break; - case 8: - shuffle_32bit_load_result_to_64bit_data(bld, dst, dw, 1); - break; - default: - unreachable("Unsupported bit_size"); - } + shuffle_from_32bit_read(bld, dst, vec4_result, + (const_offset & 0xf) / type_sz(dst.type), 1); } /** -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 05/14] intel/compiler: Use shuffle_from_32bit_write for 16-bits store_ssbo
--- src/intel/compiler/brw_fs_nir.cpp | 7 ++- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index ef7895262b8..a54935f7049 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -4297,11 +4297,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , nir_intrinsic_instr *instr * aligned. Shuffling only one component would be the same as * striding it. */ -fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D, - DIV_ROUND_UP(num_components, 2)); -shuffle_16bit_data_for_32bit_write(bld, tmp, write_src, - num_components); -write_src = tmp; +write_src = shuffle_for_32bit_write(bld, write_src, 0, +num_components); } fs_reg offset_reg; -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 07/14] intel/compiler: shuffle_from_32bit_read for 64-bit do_untyped_vector_read
do_untyped_vector_read is used at load_ssbo and load_shared. The previous MOVs are removed because shuffle_from_32bit_read can handle storing the shuffle results in the expected destination just using the proper offset. --- src/intel/compiler/brw_fs_nir.cpp | 12 ++-- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 7e738ade82e..780a9e228de 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -2434,16 +2434,8 @@ do_untyped_vector_read(const fs_builder , BRW_PREDICATE_NONE); /* Shuffle the 32-bit load result into valid 64-bit data */ - const fs_reg packed_result = bld.vgrf(dest.type, iter_components); - shuffle_32bit_load_result_to_64bit_data( -bld, packed_result, read_result, iter_components); - - /* Move each component to its destination */ - read_result = retype(read_result, BRW_REGISTER_TYPE_DF); - for (int c = 0; c < iter_components; c++) { -bld.MOV(offset(dest, bld, it * 2 + c), -offset(packed_result, bld, c)); - } + shuffle_from_32bit_read(bld, offset(dest, bld, it * 2), + read_result, 0, iter_components); bld.ADD(read_offset, read_offset, brw_imm_ud(16)); } -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 08/14] intel/compiler: enable shuffle_from_32bit_read at 64-bit gs_input_load
This implementation avoids two unneeded MOVs for each 64-bit component. One was done in the old shuffle, to avoid cases of src/dst overlap but this is not the case. And the removed MOV was already being being done in the shuffle. Copy propagation wasn't able to remove them because shuffle destination values are defined with partial writes because they have stride == 2. --- src/intel/compiler/brw_fs_nir.cpp | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 780a9e228de..6abc7c0174d 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -2305,11 +2305,11 @@ fs_visitor::emit_gs_input_load(const fs_reg , } if (type_sz(dst.type) == 8) { - shuffle_32bit_load_result_to_64bit_data( -bld, tmp_dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), num_components); - - for (unsigned c = 0; c < num_components; c++) -bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c)); + shuffle_from_32bit_read(bld, + offset(dst, bld, iter * 2), + retype(tmp_dst, BRW_REGISTER_TYPE_D), + 0, + num_components); } if (num_iterations > 1) { -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 00/14] intel/compiler: unshuffle/shuffle functions refactoring
64-bit types support required un/shuffle operations to deal with load/store operations that use 32-bit components. The introduction of VK_KHR_16bit_storage support also needed new dedicated shuffle/unshuffle functions. This series generalizes the different shuffle/unshuffle operations in a new helper shuffle_src_to_dst for 64/32/16/8-bit cases an creates two specific functions shuffle_from_32bit_read and shuffle_for_32bit_write to be used after and before load/store operations. I revisited all shuffle/unshuffle uses and avoids the emission of unneeded instructions caused by previously allowing source/destination overlaps in shuffles. The series refactors that cases to avoid them. As most of the changes affects 64-bit uses, I used the piglit test shaders to check the impact of the series with shader-db that is mainly possitive. Skylake total instructions in shared programs: 1688394 -> 1661420 (-1.60%) instructions in affected programs: 370480 -> 343506 (-7.28%) helped: 1048 HURT: 0 total cycles in shared programs: 52697457 -> 52527275 (-0.32%) cycles in affected programs: 1783581 -> 1613399 (-9.54%) helped: 1004 HURT: 41 Cc: Jason Ekstrand Cc: Samuel Iglesias Cc: Iago Toral Jose Maria Casanova Crespo (14): intel/compiler: general 8/16/32/64-bit shuffle_src_to_dst function intel/compiler: new shuffle_for_32bit_write and shuffle_from_32bit_read intel/compiler: use shuffle_from_32bit_read at VARYING_PULL_CONSTANT_LOAD intel/compiler: Use shuffle_from_32bit_read to read 16-bit SSBO intel/compiler: Use shuffle_from_32bit_write for 16-bits store_ssbo intel/compiler: remove old 16-bit shuffle/unshuffle functions intel/compiler: shuffle_from_32bit_read for 64-bit do_untyped_vector_read intel/compiler: enable shuffle_from_32bit_read at 64-bit gs_input_load intel/compiler: Use shuffle_from_32bit_read at VS load_input intel/compiler: shuffle_from_32bit_read at load_per_vertex_input at TCS/TES intel/compiler: use shuffle_from_32bit_read for 64-bit FS load_input intel/compiler: shuffle_32bit_load_result_to_64bit_data is not used anymore intel/compiler: use new shuffle_32bit_write for all 64-bit storage writes intel/compiler: shuffle_64bit_data_for_32bit_write is not used anymore src/intel/compiler/brw_fs.cpp | 17 +- src/intel/compiler/brw_fs.h | 29 +-- src/intel/compiler/brw_fs_nir.cpp | 315 +- 3 files changed, 147 insertions(+), 214 deletions(-) -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 02/14] intel/compiler: new shuffle_for_32bit_write and shuffle_from_32bit_read
These new shuffle functions deal with the shuffle/unshuffle operations needed for read/write operations using 32-bit components when the read/written components have a different bit-size (8, 16, 64-bits). Shuffle from 32-bit to 32-bit becomes a simple MOV. As the new function shuffle_src_to_dst takes of doing a shuffle or an unshuffle based on the different type_sz of source an destination this generic functions work with any source/destination assuming that writes use a 32-bit destination or reads use a 32-bit source. To enable this new functions it is needed than there is no source/destination overlap in the case of shuffle_from_32bit_read. That never happens on shuffle_for_32bit_write as it allocates a new destination register as it was at shuffle_64bit_data_for_32bit_write. --- src/intel/compiler/brw_fs.h | 11 + src/intel/compiler/brw_fs_nir.cpp | 38 +++ 2 files changed, 49 insertions(+) diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index faf51568637..779170ecc95 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -519,6 +519,17 @@ void shuffle_16bit_data_for_32bit_write(const brw::fs_builder , const fs_reg , uint32_t components); +void shuffle_from_32bit_read(const brw::fs_builder , + const fs_reg , + const fs_reg , + uint32_t first_component, + uint32_t components); + +fs_reg shuffle_for_32bit_write(const brw::fs_builder , + const fs_reg , + uint32_t first_component, + uint32_t components); + fs_reg setup_imm_df(const brw::fs_builder , double v); diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 1a9d3c41d1d..1f684149fd5 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -5454,6 +5454,44 @@ shuffle_src_to_dst(const fs_builder , } } +void +shuffle_from_32bit_read(const fs_builder , +const fs_reg , +const fs_reg , +uint32_t first_component, +uint32_t components) +{ + assert(type_sz(src.type) == 4); + + if (type_sz(dst.type) > 4) { + assert(type_sz(dst.type) == 8); + first_component *= 2; + components *= 2; + } + + shuffle_src_to_dst(bld, dst, src, first_component, components); +} + +fs_reg +shuffle_for_32bit_write(const fs_builder , +const fs_reg , +uint32_t first_component, +uint32_t components) +{ + fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D, + DIV_ROUND_UP (components * type_sz(src.type), 4)); + + if (type_sz(src.type) > 4) { + assert(type_sz(src.type) == 8); + first_component *= 2; + components *= 2; + } + + shuffle_src_to_dst(bld, dst, src, first_component, components); + + return dst; +} + fs_reg setup_imm_df(const fs_builder , double v) { -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev