X11 and GL compositor performance on VC4 has been terrible because of our
SHARED-usage buffers all being forced to linear.  This swaps SHARED &&
!LINEAR buffers over to being tiled.

This is an expected win for all GL compositors during rendering (avoiding a
full copy of each shared texture per draw call), allows X11 to be used with
decent performance without a GL compositor, and improves X11 windowed
swapbuffers performance as well.  It also halves the memory usage of
shared buffers that get textured from.  The only cost should be on idle
systems with a scanout-only buffer that isn't flagged as LINEAR, in which
case the memory bandwidth cost of scanout goes up ~25%.
---
 src/gallium/drivers/vc4/vc4_bufmgr.c    |   7 ++
 src/gallium/drivers/vc4/vc4_resource.c  | 111 ++++++++++++++++++++++----------
 src/gallium/drivers/vc4/vc4_simulator.c |   8 +++
 3 files changed, 93 insertions(+), 33 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c
index 12af7f8a9ef2..25e95ff3c50f 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.c
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.c
@@ -27,6 +27,7 @@
 #include <fcntl.h>
 #include <xf86drm.h>
 #include <xf86drmMode.h>
+#include <drm_fourcc.h>
 
 #include "util/u_hash_table.h"
 #include "util/u_memory.h"
@@ -282,6 +283,12 @@ vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time)
                 return;
         }
 
+        struct drm_vc4_set_tiling set_tiling = {
+                .handle = bo->handle,
+                .modifier = DRM_FORMAT_MOD_NONE,
+        };
+        (void)vc4_ioctl(screen->fd, DRM_IOCTL_VC4_SET_TILING, &set_tiling);
+
         if (cache->size_list_size <= page_index) {
                 struct list_head *new_list =
                         ralloc_array(screen, struct list_head, page_index + 1);
diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c
index 5aaa31d6e67d..2ff7f8b08dbe 100644
--- a/src/gallium/drivers/vc4/vc4_resource.c
+++ b/src/gallium/drivers/vc4/vc4_resource.c
@@ -29,10 +29,12 @@
 #include "util/u_surface.h"
 #include "util/u_upload_mgr.h"
 
+#include "vc4_drm.h"
 #include "vc4_screen.h"
 #include "vc4_context.h"
 #include "vc4_resource.h"
 #include "vc4_tiling.h"
+#include "drm_fourcc.h"
 
 static bool miptree_debug = false;
 
@@ -575,27 +577,67 @@ vc4_resource_create(struct pipe_screen *pscreen,
         struct vc4_resource *rsc = vc4_resource_setup(pscreen, tmpl);
         struct pipe_resource *prsc = &rsc->base;
 
-        /* We have to make shared be untiled, since we don't have any way to
-         * communicate metadata about tiling currently.
+        /* Use a tiled layout if we can, for better 3D performance. */
+        rsc->tiled = true;
+
+        /* VBOs/PBOs are untiled (and 1 height). */
+        if (tmpl->target == PIPE_BUFFER)
+                rsc->tiled = false;
+
+        /* MSAA buffers are linear. */
+        if (tmpl->nr_samples > 1)
+                rsc->tiled = false;
+
+        /* No tiling when we're sharing with another device (pl111). */
+        if (screen->ro && (tmpl->bind & PIPE_BIND_SCANOUT))
+                rsc->tiled = false;
+
+        /* Cursors are always linear, and the user can request linear as
+         * well.
          */
-        if (tmpl->target == PIPE_BUFFER ||
-            tmpl->nr_samples > 1 ||
-            (tmpl->bind & (PIPE_BIND_SCANOUT |
-                           PIPE_BIND_LINEAR |
-                           PIPE_BIND_SHARED |
-                           PIPE_BIND_CURSOR))) {
+        if (tmpl->bind & (PIPE_BIND_LINEAR |
+                          PIPE_BIND_CURSOR)) {
                 rsc->tiled = false;
-        } else {
-                rsc->tiled = true;
         }
 
-        if (tmpl->target != PIPE_BUFFER)
-                rsc->vc4_format = get_resource_texture_format(prsc);
+        /* No shared objects with LT format -- the kernel only has T-format
+         * metadata.  LT objects are small enough it's not worth the trouble
+         * to give them metadata to tile.
+         */
+        if ((tmpl->bind & PIPE_BIND_SHARED) &&
+            vc4_size_is_lt(prsc->width0, prsc->height0, rsc->cpp)) {
+                rsc->tiled = false;
+        }
 
         vc4_setup_slices(rsc);
         if (!vc4_resource_bo_alloc(rsc))
                 goto fail;
 
+        if (tmpl->bind & PIPE_BIND_SHARED) {
+                assert(rsc->slices[0].tiling == VC4_TILING_FORMAT_T);
+
+                struct drm_vc4_set_tiling set_tiling = {
+                        .handle = rsc->bo->handle,
+                        .modifier = DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED,
+                };
+                int ret = vc4_ioctl(screen->fd,
+                                    DRM_IOCTL_VC4_SET_TILING,
+                                    &set_tiling);
+
+                /* If we hit this, we're probably on an old kernel.  Fall back
+                 * to linear.
+                 */
+                if (ret != 0) {
+                        rsc->tiled = false;
+                        vc4_setup_slices(rsc);
+                        if (!vc4_resource_bo_alloc(rsc))
+                                goto fail;
+                }
+        }
+
+        if (tmpl->target != PIPE_BUFFER)
+                rsc->vc4_format = get_resource_texture_format(prsc);
+
         if (screen->ro && tmpl->bind & PIPE_BIND_SCANOUT) {
                 rsc->scanout =
                         renderonly_scanout_for_resource(prsc, screen->ro);
@@ -619,29 +661,10 @@ vc4_resource_from_handle(struct pipe_screen *pscreen,
         struct vc4_resource *rsc = vc4_resource_setup(pscreen, tmpl);
         struct pipe_resource *prsc = &rsc->base;
         struct vc4_resource_slice *slice = &rsc->slices[0];
-        uint32_t expected_stride =
-            align(prsc->width0, vc4_utile_width(rsc->cpp)) * rsc->cpp;
 
         if (!rsc)
                 return NULL;
 
-        if (whandle->stride != expected_stride) {
-                static bool warned = false;
-                if (!warned) {
-                        warned = true;
-                        fprintf(stderr,
-                                "Attempting to import %dx%d %s with "
-                                "unsupported stride %d instead of %d\n",
-                                prsc->width0, prsc->height0,
-                                util_format_short_name(prsc->format),
-                                whandle->stride,
-                                expected_stride);
-                }
-                goto fail;
-        }
-
-        rsc->tiled = false;
-
         if (whandle->offset != 0) {
                 fprintf(stderr,
                         "Attempt to import unsupported winsys offset %u\n",
@@ -667,10 +690,17 @@ vc4_resource_from_handle(struct pipe_screen *pscreen,
         if (!rsc->bo)
                 goto fail;
 
-        slice->stride = whandle->stride;
-        slice->tiling = VC4_TILING_FORMAT_LINEAR;
+        struct drm_vc4_get_tiling get_tiling = {
+                .handle = rsc->bo->handle,
+        };
+        int ret = vc4_ioctl(screen->fd, DRM_IOCTL_VC4_GET_TILING, &get_tiling);
+        if (ret == 0 &&
+            get_tiling.modifier == DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED) {
+                rsc->tiled = true;
+        }
 
         rsc->vc4_format = get_resource_texture_format(prsc);
+        vc4_setup_slices(rsc);
 
         if (screen->ro) {
                 /* Make sure that renderonly has a handle to our buffer in the
@@ -693,6 +723,21 @@ vc4_resource_from_handle(struct pipe_screen *pscreen,
                         slice->stride, slice->offset);
         }
 
+        if (whandle->stride != rsc->slices[0].stride) {
+                static bool warned = false;
+                if (!warned) {
+                        warned = true;
+                        fprintf(stderr,
+                                "Attempting to import %dx%d %s with "
+                                "unsupported stride %d instead of %d\n",
+                                prsc->width0, prsc->height0,
+                                util_format_short_name(prsc->format),
+                                whandle->stride,
+                                rsc->slices[0].stride);
+                }
+                goto fail;
+        }
+
         return prsc;
 
 fail:
diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c
index ab701ab56093..bd063a843267 100644
--- a/src/gallium/drivers/vc4/vc4_simulator.c
+++ b/src/gallium/drivers/vc4/vc4_simulator.c
@@ -658,9 +658,17 @@ vc4_simulator_ioctl(int fd, unsigned long request, void *args)
         case DRM_IOCTL_GEM_CLOSE:
                 return vc4_simulator_gem_close_ioctl(fd, args);
 
+        case DRM_IOCTL_VC4_GET_TILING:
+        case DRM_IOCTL_VC4_SET_TILING:
+                /* Disable these for now, since the sharing with i965 requires
+                 * linear buffers.
+                 */
+                return -1;
+
         case DRM_IOCTL_GEM_OPEN:
         case DRM_IOCTL_GEM_FLINK:
                 return drmIoctl(fd, request, args);
+
         default:
                 fprintf(stderr, "Unknown ioctl 0x%08x\n", (int)request);
                 abort();
-- 
2.11.0

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to