mesa: Changes to 'debian-experimental'

Timo Aaltonen Tue, 19 Apr 2016 01:24:56 -0700

 VERSION                                                   |    2 
 configure.ac                                              |    4 
 debian/changelog                                          |    7 
 debian/control                                            |    4 
 debian/rules                                              |    2 
 docs/relnotes/11.2.0.html                                 |    3 
 docs/relnotes/11.2.1.html                                 |  118 ++++++++
 include/D3D9/d3d9types.h                                  |   16 -
 src/compiler/glsl/glsl_lexer.ll                           |    2 
 src/compiler/glsl/lower_variable_index_to_cond_assign.cpp |   20 +
 src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp  |    2 
 src/gallium/drivers/radeonsi/si_state.c                   |    2 
 src/gallium/state_trackers/va/context.c                   |    4 
 src/gallium/state_trackers/va/image.c                     |    4 
 src/mesa/drivers/dri/common/drirc                         |    8 
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp              |   10 
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp            |   14 
 src/mesa/drivers/dri/i965/gen7_blorp.cpp                  |    5 
 src/mesa/drivers/dri/i965/intel_tiled_memcpy.c            |  198 ++++++--------
 src/mesa/state_tracker/st_cb_drawpixels.c                 |   23 +
 src/mesa/state_tracker/st_cb_fbo.c                        |    2 
 21 files changed, 319 insertions(+), 131 deletions(-)


New commits:
commit abf028947440577413c49feffbb15192701d991e
Author: Timo Aaltonen <tjaal...@debian.org>
Date:   Mon Apr 18 11:25:26 2016 +0300

    release to experimental

diff --git a/debian/changelog b/debian/changelog
index 0328b2c..9c11bba 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,9 +1,9 @@
-mesa (11.2.1-1) UNRELEASED; urgency=medium
+mesa (11.2.1-1) experimental; urgency=medium
 
   * New upstream release.
   * control, rules: Use llvm/clang 3.8.
 
- -- Timo Aaltonen <tjaal...@debian.org>  Mon, 18 Apr 2016 11:20:48 +0300
+ -- Timo Aaltonen <tjaal...@debian.org>  Mon, 18 Apr 2016 11:25:00 +0300
 
 mesa (11.2.0-1) experimental; urgency=medium
 

commit 436b3472adde14b22e9ce204820dab417cfe00c6
Author: Timo Aaltonen <tjaal...@debian.org>
Date:   Mon Apr 18 11:24:58 2016 +0300

    control, rules: Use llvm/clang 3.8.

diff --git a/debian/changelog b/debian/changelog
index cf98bea..0328b2c 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,6 +1,7 @@
 mesa (11.2.1-1) UNRELEASED; urgency=medium
 
   * New upstream release.
+  * control, rules: Use llvm/clang 3.8.
 
  -- Timo Aaltonen <tjaal...@debian.org>  Mon, 18 Apr 2016 11:20:48 +0300
 
diff --git a/debian/control b/debian/control
index e495555..19c802f 100644
--- a/debian/control
+++ b/debian/control
@@ -41,10 +41,10 @@ Build-Depends:
  libudev-dev [linux-any],
  flex,
  bison,
- llvm-3.7-dev (>= 1:3.7~+rc2) [amd64 i386 kfreebsd-amd64 kfreebsd-i386 armhf 
ppc64el],
+ llvm-3.8-dev (>= 1:3.8) [amd64 i386 kfreebsd-amd64 kfreebsd-i386 armhf 
ppc64el],
  libelf-dev [amd64 i386 kfreebsd-amd64 kfreebsd-i386 armhf ppc64el],
  libwayland-dev (>= 1.2.0) [linux-any],
- libclang-3.7-dev (>= 1:3.7~+rc2) [amd64 i386 armhf],
+ libclang-3.8-dev (>= 1:3.8) [amd64 i386 armhf],
  libclc-dev (>= 0.2.0+git20150813) [amd64 i386 armhf],
 Vcs-Git: https://anonscm.debian.org/git/pkg-xorg/lib/mesa.git
 Vcs-Browser: https://anonscm.debian.org/cgit/pkg-xorg/lib/mesa.git
diff --git a/debian/rules b/debian/rules
index cf40811..ff216dd 100755
--- a/debian/rules
+++ b/debian/rules
@@ -89,7 +89,7 @@ else
   ifneq (,$(filter $(DEB_HOST_ARCH),amd64 i386 kfreebsd-amd64 kfreebsd-i386 
armhf ppc64el))
        GALLIUM_DRIVERS += radeonsi
        confflags_GALLIUM += --enable-gallium-llvm
-       confflags_GALLIUM += ac_cv_path_LLVM_CONFIG=llvm-config-3.7
+       confflags_GALLIUM += ac_cv_path_LLVM_CONFIG=llvm-config-3.8
        GALLIUM_DRIVERS += swrast
   else
        DRI_DRIVERS += swrast

commit 6faab1eb3ae95da1b71883be526ba520d2ad36f9
Author: Timo Aaltonen <tjaal...@debian.org>
Date:   Mon Apr 18 11:21:58 2016 +0300

    bump changelog

diff --git a/debian/changelog b/debian/changelog
index 76bb0a3..cf98bea 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+mesa (11.2.1-1) UNRELEASED; urgency=medium
+
+  * New upstream release.
+
+ -- Timo Aaltonen <tjaal...@debian.org>  Mon, 18 Apr 2016 11:20:48 +0300
+
 mesa (11.2.0-1) experimental; urgency=medium
 
   [ Andreas Boll ]

commit 21e6440e82808364a6c2cc38ea92651c99b69aad
Author: Emil Velikov <emil.veli...@collabora.com>
Date:   Sun Apr 17 16:03:34 2016 +0100

    docs: add release notes for 11.2.1
    
    Signed-off-by: Emil Velikov <emil.veli...@collabora.com>

diff --git a/docs/relnotes/11.2.1.html b/docs/relnotes/11.2.1.html
new file mode 100644
index 0000000..3100ebb
--- /dev/null
+++ b/docs/relnotes/11.2.1.html
@@ -0,0 +1,118 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 11.2.1 Release Notes / April 17, 2016</h1>
+
+<p>
+Mesa 11.2.1 is a bug fix release which fixes bugs found since the 11.2.0 release.
+</p>
+<p>
+Mesa 11.2.1 implements the OpenGL 4.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.1.  OpenGL
+4.1 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+TBD
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93962">Bug 93962</a> - [HSW, regression, bisected, CTS] ES2-CTS.gtf.GL2FixedTests.scissor.scissor - segfault/asserts</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Brian Paul (2):</p>
+<ul>
+  <li>st/mesa: fix glReadBuffer() assertion failure</li>
+  <li>st/mesa: fix memleak in glDrawPixels cache code</li>
+</ul>
+
+<p>Christian Schmidbauer (1):</p>
+<ul>
+  <li>st/nine: specify WINAPI only for i386 and amd64</li>
+</ul>
+
+<p>Emil Velikov (3):</p>
+<ul>
+  <li>docs: add sha256 checksums for 11.2.0</li>
+  <li>configure.ac: update the path of the generated files</li>
+  <li>Update version to 11.2.1</li>
+</ul>
+
+<p>Ilia Mirkin (1):</p>
+<ul>
+  <li>glsl: allow usage of the keyword buffer before GLSL 430 / ESSL 310</li>
+</ul>
+
+<p>Iurie Salomov (1):</p>
+<ul>
+  <li>va: check null context in vlVaDestroyContext</li>
+</ul>
+
+<p>Jason Ekstrand (2):</p>
+<ul>
+  <li>i965/tiled_memcopy: Add aligned mem_copy parameters to the [de]tiling 
functions</li>
+  <li>i965/tiled_memcpy: Rework the RGBA -&gt; BGRA mem_copy functions</li>
+</ul>
+
+<p>Kenneth Graunke (3):</p>
+<ul>
+  <li>i965: Fix textureSize() depth value for 1 layer surfaces on Gen4-6.</li>
+  <li>i965: Use brw-&gt;urb.min_vs_urb_entries instead of 32 for BLORP.</li>
+  <li>glsl: Lower variable indexing of system value arrays 
unconditionally.</li>
+</ul>
+
+<p>Marek Olšák (1):</p>
+<ul>
+  <li>drirc: add a workaround for blackness in Warsow</li>
+</ul>
+
+<p>Nicolai Hähnle (1):</p>
+<ul>
+  <li>radeonsi: fix bounds check in si_create_vertex_elements</li>
+</ul>
+
+<p>Samuel Pitoiset (1):</p>
+<ul>
+  <li>nv50/ir: do not try to attach JOIN ops to ATOM</li>
+</ul>
+
+<p>Thomas Hindoe Paaboel Andersen (1):</p>
+<ul>
+  <li>st/va: avoid dereference after free in vlVaDestroyImage</li>
+</ul>
+
+
+</div>
+</body>
+</html>

commit 696614cbff849dc3644a35307d3f82ea1ec66266
Author: Emil Velikov <emil.veli...@collabora.com>
Date:   Sun Apr 17 14:51:55 2016 +0100

    Update version to 11.2.1
    
    Signed-off-by: Emil Velikov <emil.veli...@collabora.com>

diff --git a/VERSION b/VERSION
index b85c6c7..dc170a1 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-11.2.0
+11.2.1

commit 106c1facae8f8e2e27d670397d1276958f0143c4
Author: Iurie Salomov <iur...@gmail.com>
Date:   Tue Apr 12 23:24:30 2016 +0100

    va: check null context in vlVaDestroyContext
    
    Signed-off-by: Iurie Salomov <iur...@gmail.com>
    Reviewed-by: Julien Isorce <j.iso...@samsung.com>
    (cherry picked from commit 047e3264f67bc54365be7b0e163b6910a9e9de3a)
    Nominated-by: Emil Velikov <emil.veli...@collabora.com>

diff --git a/src/gallium/state_trackers/va/context.c 
b/src/gallium/state_trackers/va/context.c
index b25c381..25d587a 100644
--- a/src/gallium/state_trackers/va/context.c
+++ b/src/gallium/state_trackers/va/context.c
@@ -283,6 +283,10 @@ vlVaDestroyContext(VADriverContextP ctx, VAContextID 
context_id)
    drv = VL_VA_DRIVER(ctx);
    pipe_mutex_lock(drv->mutex);
    context = handle_table_get(drv->htab, context_id);
+   if (!context) {
+      pipe_mutex_unlock(drv->mutex);
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+   }
 
    if (context->decoder) {
       if (u_reduce_video_profile(context->decoder->profile) ==

commit fbdd845ed213871c221f1076ea83621ebe779f4e
Author: Nicolai Hähnle <nicolai.haeh...@amd.com>
Date:   Tue Apr 12 12:23:31 2016 -0500

    radeonsi: fix bounds check in si_create_vertex_elements
    
    This was triggered by
    dEQP-GLES3.functional.vertex_array_objects.all_attributes
    
    Cc: "11.1 11.2" <mesa-sta...@lists.freedesktop.org>
    Reviewed-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
    Reviewed-by: Marek Olšák <marek.ol...@amd.com>
    (cherry picked from commit a191e6b719848a17963f185954f1696fa5a2bcb1)

diff --git a/src/gallium/drivers/radeonsi/si_state.c 
b/src/gallium/drivers/radeonsi/si_state.c
index 663ebb3..58573ac 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -3284,7 +3284,7 @@ static void *si_create_vertex_elements(struct 
pipe_context *ctx,
        struct si_vertex_element *v = CALLOC_STRUCT(si_vertex_element);
        int i;
 
-       assert(count < SI_MAX_ATTRIBS);
+       assert(count <= SI_MAX_ATTRIBS);
        if (!v)
                return NULL;
 

commit 389d5bbc98f04740bb64ec2e9c94dcab7adcbd6b
Author: Brian Paul <bri...@vmware.com>
Date:   Mon Apr 11 18:54:28 2016 -0600

    st/mesa: fix memleak in glDrawPixels cache code
    
    If the glDrawPixels size changed, we leaked the previously cached
    texture, if there was one.  This patch fixes the reference counting,
    adds a refcount assertion check, and better handles potential malloc()
    failures.
    
    Tested with a modified version of the drawpix Mesa demo which changed
    the image size for each glDrawPixels call.
    
    Cc: "11.2" <mesa-sta...@lists.freedesktop.org>
    Reviewed-by: José Fonseca <jfons...@vmware.com>
    Reviewed-by: Charmaine Lee <charmai...@vmware.com>
    (cherry picked from commit 6c014782138634d5d36e1484bf498cef2b2d888f)

diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c 
b/src/mesa/state_tracker/st_cb_drawpixels.c
index 51d4ae5..4f39ed7 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -374,7 +374,7 @@ make_texture(struct st_context *st,
    struct gl_context *ctx = st->ctx;
    struct pipe_context *pipe = st->pipe;
    mesa_format mformat;
-   struct pipe_resource *pt;
+   struct pipe_resource *pt = NULL;
    enum pipe_format pipeFormat;
    GLenum baseInternalFormat;
 
@@ -393,10 +393,18 @@ make_texture(struct st_context *st,
        unpack->SkipRows == 0 &&
        unpack->SwapBytes == GL_FALSE &&
        st->drawpix_cache.image) {
+      assert(st->drawpix_cache.texture);
+
       /* check if the pixel data is the same */
       if (memcmp(pixels, st->drawpix_cache.image, width * height * bpp) == 0) {
          /* OK, re-use the cached texture */
-         return st->drawpix_cache.texture;
+         pipe_resource_reference(&pt, st->drawpix_cache.texture);
+         /* refcount of returned texture should be at least two here.  One
+          * reference for the cache to hold on to, one for the caller (which
+          * it will release), and possibly more held by the driver.
+          */
+         assert(pt->reference.count >= 2);
+         return pt;
       }
    }
 
@@ -515,8 +523,14 @@ make_texture(struct st_context *st,
       st->drawpix_cache.image = malloc(width * height * bpp);
       if (st->drawpix_cache.image) {
          memcpy(st->drawpix_cache.image, pixels, width * height * bpp);
+         pipe_resource_reference(&st->drawpix_cache.texture, pt);
+      }
+      else {
+         /* out of memory, free/disable cached texture */
+         st->drawpix_cache.width = 0;
+         st->drawpix_cache.height = 0;
+         pipe_resource_reference(&st->drawpix_cache.texture, NULL);
       }
-      st->drawpix_cache.texture = pt;
    }
 #endif
 
@@ -1150,9 +1164,8 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y,
    if (num_sampler_view > 1)
       pipe_sampler_view_reference(&sv[1], NULL);
 
-#if !USE_DRAWPIXELS_CACHE
+   /* free the texture (but may persist in the cache) */
    pipe_resource_reference(&pt, NULL);
-#endif
 }
 
 

commit 3840e67bf5a5c89fc644816e721d4527557ab3ef
Author: Emil Velikov <emil.veli...@collabora.com>
Date:   Tue Apr 5 14:29:41 2016 +0100

    configure.ac: update the path of the generated files
    
    ... in order to determine if we need bison/flex. Failing to locate the
    files will lead to mandating bison/flex even when building from a
    release tarball.
    
    CC: "11.2" <mesa-sta...@lists.freedesktop.org>
    Signed-off-by: Emil Velikov <emil.veli...@collabora.com>
    Acked-by: Jason Ekstrand <ja...@jlekstrand.net>
    Reviewed-by: Matt Turner <matts...@gmail.com>
    (cherry picked from commit c481c8f7f135d4cf17e35bb5126bdcf6b5611940)

diff --git a/configure.ac b/configure.ac
index d4ce99a..8109ac4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -110,10 +110,10 @@ LT_INIT([disable-static])
 AC_CHECK_PROG(RM, rm, [rm -f])
 
 AX_PROG_BISON([],
-              AS_IF([test ! -f "$srcdir/src/glsl/glcpp/glcpp-parse.c"],
+              AS_IF([test ! -f 
"$srcdir/src/compiler/glsl/glcpp/glcpp-parse.c"],
                     [AC_MSG_ERROR([bison not found - unable to compile 
glcpp-parse.y])]))
 AX_PROG_FLEX([],
-             AS_IF([test ! -f "$srcdir/src/glsl/glcpp/glcpp-lex.c"],
+             AS_IF([test ! -f "$srcdir/src/compiler/glsl/glcpp/glcpp-lex.c"],
                    [AC_MSG_ERROR([flex not found - unable to compile 
glcpp-lex.l])]))
 
 AC_CHECK_PROG(INDENT, indent, indent, cat)

commit 5b644f3a06da76cdd6d404c98696e6d78c39c0cc
Author: Ilia Mirkin <imir...@alum.mit.edu>
Date:   Sat Apr 9 13:11:42 2016 -0400

    glsl: allow usage of the keyword buffer before GLSL 430 / ESSL 310
    
    The GLSL 4.20 and ESSL 3.00 specs don't list 'buffer' as a reserved
    keyword. Make the parser ignore it unless GLSL 4.30 / ESSL 3.10 are
    used, or ARB_shader_storage_buffer_objects is enabled.
    
    Signed-off-by: Ilia Mirkin <imir...@alum.mit.edu>
    Reviewed-by: Timothy Arceri <timothy.arc...@collabora.com>
    Cc: mesa-sta...@lists.freedesktop.org
    (cherry picked from commit 9b5bd20eb2d09e1ec2319b55c83ad7f28b6fefee)

diff --git a/src/compiler/glsl/glsl_lexer.ll b/src/compiler/glsl/glsl_lexer.ll
index 9704fc7..50e862a 100644
--- a/src/compiler/glsl/glsl_lexer.ll
+++ b/src/compiler/glsl/glsl_lexer.ll
@@ -304,7 +304,7 @@ in          return IN_TOK;
 out            return OUT_TOK;
 inout          return INOUT_TOK;
 uniform                return UNIFORM;
-buffer         return BUFFER;
+buffer         KEYWORD_WITH_ALT(0, 0, 430, 310, 
yyextra->ARB_shader_storage_buffer_object_enable, BUFFER);
 varying                DEPRECATED_ES_KEYWORD(VARYING);
 centroid       KEYWORD(120, 300, 120, 300, CENTROID);
 invariant      KEYWORD(120, 100, 120, 100, INVARIANT);

commit bbbcad4f0d915c31f25b1beee19cea216ced4796
Author: Jason Ekstrand <jason.ekstr...@intel.com>
Date:   Thu Apr 7 11:21:19 2016 -0700

    i965/tiled_memcpy: Rework the RGBA -> BGRA mem_copy functions
    
    This splits the two copy functions into three: One for unaligned copies,
    one for aligned sources, and one for aligned destinations.  Thanks to the
    previous commit, we are now guaranteed that the aligned ones will *only*
    operate on aligned memory so they should be safe.
    
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93962
    Cc: "11.1 11.2" <mesa-sta...@lists.freedesktop.org>
    Reviewed-by: Matt Turner <matts...@gmail.com>
    Reviewed-by: Roland Scheidegger <srol...@vmware.com>
    Reviewed-by: Chad Versace <chad.vers...@intel.com>
    (cherry picked from commit d2b32656e18607f5807b3f4d4dde02568370b9bf)

diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c 
b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
index 89de594..8082b9d 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -56,21 +56,32 @@ static const uint32_t ytile_width = 128;
 static const uint32_t ytile_height = 32;
 static const uint32_t ytile_span = 16;
 
+/**
+ * Copy RGBA to BGRA - swap R and B.
+ */
+static inline void *
+rgba8_copy(void *dst, const void *src, size_t bytes)
+{
+   uint8_t *d = dst;
+   uint8_t const *s = src;
+
+   assert(bytes % 4 == 0);
+
+   while (bytes >= 4) {
+      d[0] = s[2];
+      d[1] = s[1];
+      d[2] = s[0];
+      d[3] = s[3];
+      d += 4;
+      s += 4;
+      bytes -= 4;
+   }
+   return dst;
+}
+
 #ifdef __SSSE3__
 static const uint8_t rgba8_permutation[16] =
    { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
-
-/* NOTE: dst must be 16-byte aligned. src may be unaligned. */
-#define rgba8_copy_16_aligned_dst(dst, src)                            \
-   _mm_store_si128((__m128i *)(dst),                                   \
-                   _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(src)), \
-                                    *(__m128i *) rgba8_permutation))
-
-/* NOTE: src must be 16-byte aligned. dst may be unaligned. */
-#define rgba8_copy_16_aligned_src(dst, src)                            \
-   _mm_storeu_si128((__m128i *)(dst),                                  \
-                    _mm_shuffle_epi8(_mm_load_si128((__m128i *)(src)), \
-                                     *(__m128i *) rgba8_permutation))
 #endif
 
 /**
@@ -82,32 +93,21 @@ rgba8_copy_aligned_dst(void *dst, const void *src, size_t 
bytes)
    uint8_t *d = dst;
    uint8_t const *s = src;
 
-#ifdef __SSSE3__
-   if (bytes == 16) {
-      assert(!(((uintptr_t)dst) & 0xf));
-      rgba8_copy_16_aligned_dst(d+ 0, s+ 0);
-      return dst;
-   }
+   assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));
 
-   if (bytes == 64) {
-      assert(!(((uintptr_t)dst) & 0xf));
-      rgba8_copy_16_aligned_dst(d+ 0, s+ 0);
-      rgba8_copy_16_aligned_dst(d+16, s+16);
-      rgba8_copy_16_aligned_dst(d+32, s+32);
-      rgba8_copy_16_aligned_dst(d+48, s+48);
-      return dst;
+#ifdef __SSSE3__
+   while (bytes >= 16) {
+      _mm_store_si128((__m128i *)d,
+                      _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)s),
+                                       *(__m128i *) rgba8_permutation));
+      s += 16;
+      d += 16;
+      bytes -= 16;
    }
 #endif
 
-   while (bytes >= 4) {
-      d[0] = s[2];
-      d[1] = s[1];
-      d[2] = s[0];
-      d[3] = s[3];
-      d += 4;
-      s += 4;
-      bytes -= 4;
-   }
+   rgba8_copy(d, s, bytes);
+
    return dst;
 }
 
@@ -120,32 +120,21 @@ rgba8_copy_aligned_src(void *dst, const void *src, size_t 
bytes)
    uint8_t *d = dst;
    uint8_t const *s = src;
 
-#ifdef __SSSE3__
-   if (bytes == 16) {
-      assert(!(((uintptr_t)src) & 0xf));
-      rgba8_copy_16_aligned_src(d+ 0, s+ 0);
-      return dst;
-   }
+   assert(bytes == 0 || !(((uintptr_t)src) & 0xf));
 
-   if (bytes == 64) {
-      assert(!(((uintptr_t)src) & 0xf));
-      rgba8_copy_16_aligned_src(d+ 0, s+ 0);
-      rgba8_copy_16_aligned_src(d+16, s+16);
-      rgba8_copy_16_aligned_src(d+32, s+32);
-      rgba8_copy_16_aligned_src(d+48, s+48);
-      return dst;
+#ifdef __SSSE3__
+   while (bytes >= 16) {
+      _mm_storeu_si128((__m128i *)d,
+                       _mm_shuffle_epi8(_mm_load_si128((__m128i *)s),
+                                        *(__m128i *) rgba8_permutation));
+      s += 16;
+      d += 16;
+      bytes -= 16;
    }
 #endif
 
-   while (bytes >= 4) {
-      d[0] = s[2];
-      d[1] = s[1];
-      d[2] = s[0];
-      d[3] = s[3];
-      d += 4;
-      s += 4;
-      bytes -= 4;
-   }
+   rgba8_copy(d, s, bytes);
+
    return dst;
 }
 
@@ -404,10 +393,10 @@ linear_to_xtiled_faster(uint32_t x0, uint32_t x1, 
uint32_t x2, uint32_t x3,
       if (mem_copy == memcpy)
          return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, 
xtile_height,
                                  dst, src, src_pitch, swizzle_bit, memcpy, 
memcpy);
-      else if (mem_copy == rgba8_copy_aligned_dst)
+      else if (mem_copy == rgba8_copy)
          return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, 
xtile_height,
                                  dst, src, src_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_dst, 
rgba8_copy_aligned_dst);
+                                 rgba8_copy, rgba8_copy_aligned_dst);
       else
          unreachable("not reached");
    } else {
@@ -415,10 +404,10 @@ linear_to_xtiled_faster(uint32_t x0, uint32_t x1, 
uint32_t x2, uint32_t x3,
          return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                  dst, src, src_pitch, swizzle_bit,
                                  memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_dst)
+      else if (mem_copy == rgba8_copy)
          return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                  dst, src, src_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_dst, 
rgba8_copy_aligned_dst);
+                                 rgba8_copy, rgba8_copy_aligned_dst);
       else
          unreachable("not reached");
    }
@@ -447,20 +436,20 @@ linear_to_ytiled_faster(uint32_t x0, uint32_t x1, 
uint32_t x2, uint32_t x3,
       if (mem_copy == memcpy)
          return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, 
ytile_height,
                                  dst, src, src_pitch, swizzle_bit, memcpy, 
memcpy);
-      else if (mem_copy == rgba8_copy_aligned_dst)
+      else if (mem_copy == rgba8_copy)
          return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, 
ytile_height,
                                  dst, src, src_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_dst, 
rgba8_copy_aligned_dst);
+                                 rgba8_copy, rgba8_copy_aligned_dst);
       else
          unreachable("not reached");
    } else {
       if (mem_copy == memcpy)
          return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                  dst, src, src_pitch, swizzle_bit, memcpy, 
memcpy);
-      else if (mem_copy == rgba8_copy_aligned_dst)
+      else if (mem_copy == rgba8_copy)
          return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                  dst, src, src_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_dst, 
rgba8_copy_aligned_dst);
+                                 rgba8_copy, rgba8_copy_aligned_dst);
       else
          unreachable("not reached");
    }
@@ -489,20 +478,20 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, 
uint32_t x2, uint32_t x3,
       if (mem_copy == memcpy)
          return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, 
xtile_height,
                                  dst, src, dst_pitch, swizzle_bit, memcpy, 
memcpy);
-      else if (mem_copy == rgba8_copy_aligned_src)
+      else if (mem_copy == rgba8_copy)
          return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, 
xtile_height,
                                  dst, src, dst_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_src, 
rgba8_copy_aligned_src);
+                                 rgba8_copy, rgba8_copy_aligned_src);
       else
          unreachable("not reached");
    } else {
       if (mem_copy == memcpy)
          return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                  dst, src, dst_pitch, swizzle_bit, memcpy, 
memcpy);
-      else if (mem_copy == rgba8_copy_aligned_src)
+      else if (mem_copy == rgba8_copy)
          return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                  dst, src, dst_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_src, 
rgba8_copy_aligned_src);
+                                 rgba8_copy, rgba8_copy_aligned_src);
       else
          unreachable("not reached");
    }
@@ -531,20 +520,20 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, 
uint32_t x2, uint32_t x3,
       if (mem_copy == memcpy)
          return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, 
ytile_height,
                                  dst, src, dst_pitch, swizzle_bit, memcpy, 
memcpy);
-      else if (mem_copy == rgba8_copy_aligned_src)
+      else if (mem_copy == rgba8_copy)
          return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, 
ytile_height,
                                  dst, src, dst_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_src, 
rgba8_copy_aligned_src);
+                                 rgba8_copy, rgba8_copy_aligned_src);
       else
          unreachable("not reached");
    } else {
       if (mem_copy == memcpy)
          return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                  dst, src, dst_pitch, swizzle_bit, memcpy, 
memcpy);
-      else if (mem_copy == rgba8_copy_aligned_src)
+      else if (mem_copy == rgba8_copy)
          return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                  dst, src, dst_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_src, 
rgba8_copy_aligned_src);
+                                 rgba8_copy, rgba8_copy_aligned_src);
       else
          unreachable("not reached");
    }
@@ -773,8 +762,7 @@ bool intel_get_memcpy(mesa_format tiledFormat, GLenum 
format,
       if (format == GL_BGRA) {
          *mem_copy = memcpy;
       } else if (format == GL_RGBA) {
-         *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst
-                                               : rgba8_copy_aligned_src;
+         *mem_copy = rgba8_copy;
       }
    } else if ((tiledFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
               (tiledFormat == MESA_FORMAT_R8G8B8X8_UNORM)) {
@@ -783,8 +771,7 @@ bool intel_get_memcpy(mesa_format tiledFormat, GLenum 
format,
          /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
           * use the same function.
           */
-         *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst
-                                               : rgba8_copy_aligned_src;
+         *mem_copy = rgba8_copy;
       } else if (format == GL_RGBA) {
          *mem_copy = memcpy;
       }

commit b7bd6944c8ccfe2b6c0b1c68e16980fcac5cf7f0
Author: Jason Ekstrand <jason.ekstr...@intel.com>
Date:   Thu Apr 7 10:52:28 2016 -0700

    i965/tiled_memcopy: Add aligned mem_copy parameters to the [de]tiling 
functions
    
    Each of the [de]tiling functions has three mem_copy calls:
    
     1) Left edge to tile boundary
     2) Tile boundary to tile boundary in a loop
     3) Tile boundary to right edge
    
    Copies 2 and 3 start at a tile edge so the pointer to tiled memory is
    guaranteed to be at least 16-byte aligned.  Copy 1, on the other hand,
    starts at some arbitrary place in the tile so it doesn't have any such
    alignment guarantees.
    
    Cc: "11.1 11.2" <mesa-sta...@lists.freedesktop.org>
    Reviewed-by: Matt Turner <matts...@gmail.com>
    Reviewed-by: Roland Scheidegger <srol...@vmware.com>
    Reviewed-by: Chad Versace <chad.vers...@intel.com>
    (cherry picked from commit f6f54a29ca9bc8c1a1a994ff4e3ee09772de78e4)

diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c 
b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
index 2383401..89de594 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -172,6 +172,12 @@ typedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, 
uint32_t x2, uint32_t x3,
  * Copy texture data from linear to X tile layout.
  *
  * \copydoc tile_copy_fn
+ *
+ * The mem_copy parameters allow the user to specify an alternative mem_copy
+ * function that, for instance, may do RGBA -> BGRA swizzling.  The first
+ * function must handle any memory alignment while the second function must
+ * only handle 16-byte alignment in whichever side (source or destination) is
+ * tiled.
  */
 static inline void
 linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
@@ -179,7 +185,8 @@ linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, 
uint32_t x3,
                  char *dst, const char *src,
                  int32_t src_pitch,
                  uint32_t swizzle_bit,
-                 mem_copy_fn mem_copy)
+                 mem_copy_fn mem_copy,
+                 mem_copy_fn mem_copy_align16)
 {
    /* The copy destination offset for each range copied is the sum of
     * an X offset 'x0' or 'xo' and a Y offset 'yo.'
@@ -200,10 +207,10 @@ linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, 
uint32_t x3,
       mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);
 
       for (xo = x1; xo < x2; xo += xtile_span) {
-         mem_copy(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
+         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
       }
 
-      mem_copy(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
+      mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
 
       src += src_pitch;
    }
@@ -220,7 +227,8 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, 
uint32_t x3,
                  char *dst, const char *src,
                  int32_t src_pitch,
                  uint32_t swizzle_bit,
-                 mem_copy_fn mem_copy)
+                 mem_copy_fn mem_copy,
+                 mem_copy_fn mem_copy_align16)
 {
    /* Y tiles consist of columns that are 'ytile_span' wide (and the same 
height
     * as the tile).  Thus the destination offset for (x,y) is the sum of:
@@ -259,12 +267,12 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, 
uint32_t x3,
        * at each step so we don't need to calculate it explicitly.
        */
       for (x = x1; x < x2; x += ytile_span) {
-         mem_copy(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
+         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
          xo += bytes_per_column;
          swizzle ^= swizzle_bit;
       }
 
-      mem_copy(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
+      mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
 
       src += src_pitch;
    }
@@ -281,7 +289,8 @@ xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, 
uint32_t x3,
                  char *dst, const char *src,
                  int32_t dst_pitch,
                  uint32_t swizzle_bit,
-                 mem_copy_fn mem_copy)
+                 mem_copy_fn mem_copy,
+                 mem_copy_fn mem_copy_align16)
 {
    /* The copy destination offset for each range copied is the sum of
     * an X offset 'x0' or 'xo' and a Y offset 'yo.'
@@ -302,10 +311,10 @@ xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
       mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);
 
       for (xo = x1; xo < x2; xo += xtile_span) {
-         mem_copy(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
+         mem_copy_align16(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
       }
 
-      mem_copy(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
+      mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
 
       dst += dst_pitch;
    }
@@ -322,7 +331,8 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                  char *dst, const char *src,
                  int32_t dst_pitch,
                  uint32_t swizzle_bit,
-                 mem_copy_fn mem_copy)
+                 mem_copy_fn mem_copy,
+                 mem_copy_fn mem_copy_align16)
 {
    /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
     * as the tile).  Thus the destination offset for (x,y) is the sum of:
@@ -361,12 +371,12 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
        * at each step so we don't need to calculate it explicitly.
        */
       for (x = x1; x < x2; x += ytile_span) {
-         mem_copy(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
+         mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
          xo += bytes_per_column;
          swizzle ^= swizzle_bit;
       }
 
-      mem_copy(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
+      mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
 
       dst += dst_pitch;
    }
@@ -393,26 +403,27 @@ linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
    if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
       if (mem_copy == memcpy)
          return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
-                                 dst, src, src_pitch, swizzle_bit, memcpy);
+                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
       else if (mem_copy == rgba8_copy_aligned_dst)
          return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                  dst, src, src_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_dst);
+                                 rgba8_copy_aligned_dst, rgba8_copy_aligned_dst);
       else
          unreachable("not reached");
    } else {
       if (mem_copy == memcpy)
          return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
-                                 dst, src, src_pitch, swizzle_bit, memcpy);
+                                 dst, src, src_pitch, swizzle_bit,
+                                 memcpy, memcpy);
       else if (mem_copy == rgba8_copy_aligned_dst)
          return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                  dst, src, src_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_dst);
+                                 rgba8_copy_aligned_dst, rgba8_copy_aligned_dst);
       else
          unreachable("not reached");
    }
    linear_to_xtiled(x0, x1, x2, x3, y0, y1,
-                    dst, src, src_pitch, swizzle_bit, mem_copy);
+                    dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
 }
 
 /**
@@ -435,26 +446,26 @@ linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
    if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
       if (mem_copy == memcpy)
          return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
-                                 dst, src, src_pitch, swizzle_bit, memcpy);
+                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
       else if (mem_copy == rgba8_copy_aligned_dst)
          return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                  dst, src, src_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_dst);
+                                 rgba8_copy_aligned_dst, rgba8_copy_aligned_dst);
       else
          unreachable("not reached");
    } else {
       if (mem_copy == memcpy)
          return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
-                                 dst, src, src_pitch, swizzle_bit, memcpy);
+                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
       else if (mem_copy == rgba8_copy_aligned_dst)
          return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                  dst, src, src_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_dst);
+                                 rgba8_copy_aligned_dst, rgba8_copy_aligned_dst);
       else
          unreachable("not reached");
    }
    linear_to_ytiled(x0, x1, x2, x3, y0, y1,
-                    dst, src, src_pitch, swizzle_bit, mem_copy);
+                    dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
 }
 
 /**
@@ -477,26 +488,26 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
    if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
       if (mem_copy == memcpy)
          return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
-                                 dst, src, dst_pitch, swizzle_bit, memcpy);
+                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
       else if (mem_copy == rgba8_copy_aligned_src)
          return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                  dst, src, dst_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_src);
+                                 rgba8_copy_aligned_src, rgba8_copy_aligned_src);
       else
          unreachable("not reached");
    } else {
       if (mem_copy == memcpy)
          return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
-                                 dst, src, dst_pitch, swizzle_bit, memcpy);
+                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
       else if (mem_copy == rgba8_copy_aligned_src)
          return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                  dst, src, dst_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_src);
+                                 rgba8_copy_aligned_src, rgba8_copy_aligned_src);
       else
          unreachable("not reached");
    }
    xtiled_to_linear(x0, x1, x2, x3, y0, y1,
-                    dst, src, dst_pitch, swizzle_bit, mem_copy);
+                    dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
 }
 
 /**
@@ -519,26 +530,26 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
    if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
       if (mem_copy == memcpy)
          return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
-                                 dst, src, dst_pitch, swizzle_bit, memcpy);
+                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
       else if (mem_copy == rgba8_copy_aligned_src)
          return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                  dst, src, dst_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_src);
+                                 rgba8_copy_aligned_src, rgba8_copy_aligned_src);
       else
          unreachable("not reached");
    } else {
       if (mem_copy == memcpy)
          return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
-                                 dst, src, dst_pitch, swizzle_bit, memcpy);
+                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
       else if (mem_copy == rgba8_copy_aligned_src)
          return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                  dst, src, dst_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_src);
+                                 rgba8_copy_aligned_src, rgba8_copy_aligned_src);
       else
          unreachable("not reached");
    }

mesa: Changes to 'debian-experimental'

Reply via email to