On 12.10.2016 15:54, Marek Olšák wrote:
From: Marek Olšák <marek.ol...@amd.com>

so that decompress blits aren't needed and depth texturing needs less
memory bandwidth.

Z16 and Z24 are promoted to Z32_FLOAT by the driver, because TC-compatible
HTILE only supports Z32_FLOAT. This doubles the memory footprint for Z16.
The format promotion is not visible to state trackers.

This is part of TC-compatible renderbuffer compression, which has 3 parts:
DCC, HTILE, FMASK. Only TC-compatible FMASK compression is missing now.

I don't see a measurable increase in performance, though.

(I tested Talos Principle and DiRT: Showdown. The latter improved by
 0.5%, which is almost noise. It originally used layered Z16,
 so at least we know that Z16 promoted to Z32F isn't slower now.)

Reviewed-by: Nicolai Hähnle <nicolai.haeh...@amd.com>

---
 src/gallium/drivers/radeon/r600_pipe_common.h  |  3 ++
 src/gallium/drivers/radeon/r600_texture.c      | 67 ++++++++++++++++++++++----
 src/gallium/drivers/radeon/radeon_winsys.h     |  4 ++
 src/gallium/drivers/radeonsi/si_blit.c         | 11 ++++-
 src/gallium/drivers/radeonsi/si_descriptors.c  |  7 ++-
 src/gallium/drivers/radeonsi/si_shader.c       | 18 ++++++-
 src/gallium/drivers/radeonsi/si_state.c        | 39 +++++++++++++--
 src/gallium/drivers/radeonsi/si_state_draw.c   |  3 +-
 src/gallium/winsys/amdgpu/drm/amdgpu_surface.c | 57 ++++++++++++++++++++--
 9 files changed, 185 insertions(+), 24 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h 
b/src/gallium/drivers/radeon/r600_pipe_common.h
index 290b228..5cfcad6 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -238,27 +238,29 @@ struct r600_cmask_info {
        unsigned yalign;
        unsigned slice_tile_max;
        unsigned base_address_reg;
 };

 struct r600_htile_info {
        unsigned pitch;
        unsigned height;
        unsigned xalign;
        unsigned yalign;
+       unsigned alignment;
 };

 struct r600_texture {
        struct r600_resource            resource;

        uint64_t                        size;
        unsigned                        num_level0_transfers;
+       enum pipe_format                db_render_format;
        bool                            is_depth;
        bool                            db_compatible;
        bool                            can_sample_z;
        bool                            can_sample_s;
        unsigned                        dirty_level_mask; /* each bit says if 
that mipmap is compressed */
        unsigned                        stencil_dirty_level_mask; /* each bit 
says if that mipmap is compressed */
        struct r600_texture             *flushed_depth_texture;
        struct radeon_surf              surface;

        /* Colorbuffer compression and fast clear. */
@@ -266,20 +268,21 @@ struct r600_texture {
        struct r600_cmask_info          cmask;
        struct r600_resource            *cmask_buffer;
        uint64_t                        dcc_offset; /* 0 = disabled */
        unsigned                        cb_color_info; /* fast clear enable bit 
*/
        unsigned                        color_clear_value[2];
        unsigned                        last_msaa_resolve_target_micro_mode;

        /* Depth buffer compression and fast clear. */
        struct r600_htile_info          htile;
        struct r600_resource            *htile_buffer;
+       bool                            tc_compatible_htile;
        bool                            depth_cleared; /* if it was cleared at 
least once */
        float                           depth_clear_value;
        bool                            stencil_cleared; /* if it was cleared 
at least once */
        uint8_t                         stencil_clear_value;

        bool                            non_disp_tiling; /* R600-Cayman only */

        /* Whether the texture is a displayable back buffer and needs DCC
         * decompression, which is expensive. Therefore, it's enabled only
         * if statistics suggest that it will pay off and it's allocated
diff --git a/src/gallium/drivers/radeon/r600_texture.c 
b/src/gallium/drivers/radeon/r600_texture.c
index 57cdbcf..625d091 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -185,21 +185,22 @@ static unsigned r600_texture_get_offset(struct 
r600_texture *rtex, unsigned leve
        return rtex->surface.level[level].offset +
               box->z * rtex->surface.level[level].slice_size +
               box->y / util_format_get_blockheight(format) * 
rtex->surface.level[level].pitch_bytes +
               box->x / util_format_get_blockwidth(format) * 
util_format_get_blocksize(format);
 }

 static int r600_init_surface(struct r600_common_screen *rscreen,
                             struct radeon_surf *surface,
                             const struct pipe_resource *ptex,
                             unsigned array_mode,
-                            bool is_flushed_depth)
+                            bool is_flushed_depth,
+                            bool tc_compatible_htile)
 {
        const struct util_format_description *desc =
                util_format_description(ptex->format);
        bool is_depth, is_stencil;

        is_depth = util_format_has_depth(desc);
        is_stencil = util_format_has_stencil(desc);

        surface->npix_x = ptex->width0;
        surface->npix_y = ptex->height0;
@@ -249,25 +250,36 @@ static int r600_init_surface(struct r600_common_screen 
*rscreen,
                surface->flags |= RADEON_SURF_SET(RADEON_SURF_TYPE_CUBEMAP, 
TYPE);
                break;
        case PIPE_BUFFER:
        default:
                return -EINVAL;
        }

        if (!is_flushed_depth && is_depth) {
                surface->flags |= RADEON_SURF_ZBUFFER;

+               if (tc_compatible_htile &&
+                   array_mode == RADEON_SURF_MODE_2D) {
+                       /* TC-compatible HTILE only supports Z32_FLOAT.
+                        * Promote Z16 to Z32. DB->CB copies will convert
+                        * the format for transfers.
+                        */
+                       surface->bpe = 4;
+                       surface->flags |= RADEON_SURF_TC_COMPATIBLE_HTILE;
+               }
+
                if (is_stencil) {
                        surface->flags |= RADEON_SURF_SBUFFER |
                                          RADEON_SURF_HAS_SBUFFER_MIPTREE;
                }
        }
+
        if (rscreen->chip_class >= SI) {
                surface->flags |= RADEON_SURF_HAS_TILE_MODE_INDEX;
        }

        if (rscreen->chip_class >= VI &&
            (ptex->flags & R600_RESOURCE_FLAG_DISABLE_DCC ||
             ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT))
                surface->flags |= RADEON_SURF_DISABLE_DCC;

        if (ptex->bind & PIPE_BIND_SCANOUT) {
@@ -897,43 +909,57 @@ static unsigned r600_texture_get_htile_size(struct 
r600_common_screen *rscreen,
        slice_elements = (width * height) / (8 * 8);
        slice_bytes = slice_elements * 4;

        pipe_interleave_bytes = rscreen->info.pipe_interleave_bytes;
        base_align = num_pipes * pipe_interleave_bytes;

        rtex->htile.pitch = width;
        rtex->htile.height = height;
        rtex->htile.xalign = cl_width * 8;
        rtex->htile.yalign = cl_height * 8;
+       rtex->htile.alignment = base_align;

        return (util_max_layer(&rtex->resource.b.b, 0) + 1) *
                align(slice_bytes, base_align);
 }

 static void r600_texture_allocate_htile(struct r600_common_screen *rscreen,
                                        struct r600_texture *rtex)
 {
-       unsigned htile_size = r600_texture_get_htile_size(rscreen, rtex);
+       uint64_t htile_size, alignment;
+       uint32_t clear_value;
+
+       if (rtex->tc_compatible_htile) {
+               htile_size = rtex->surface.htile_size;
+               alignment = rtex->surface.htile_alignment;
+               clear_value = 0x0000030F;
+       } else {
+               htile_size = r600_texture_get_htile_size(rscreen, rtex);
+               alignment = rtex->htile.alignment;
+               clear_value = 0;
+       }

        if (!htile_size)
                return;

        rtex->htile_buffer = (struct r600_resource*)
-                            pipe_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM,
-                                               PIPE_USAGE_DEFAULT, htile_size);
+                            r600_aligned_buffer_create(&rscreen->b, 
PIPE_BIND_CUSTOM,
+                                                       PIPE_USAGE_DEFAULT,
+                                                       htile_size, alignment);
        if (rtex->htile_buffer == NULL) {
                /* this is not a fatal error as we can still keep rendering
                 * without htile buffer */
                R600_ERR("Failed to create buffer object for htile buffer.\n");
        } else {
-               r600_screen_clear_buffer(rscreen, &rtex->htile_buffer->b.b, 0,
-                                        htile_size, 0, R600_COHERENCY_NONE);
+               r600_screen_clear_buffer(rscreen, &rtex->htile_buffer->b.b,
+                                        0, htile_size, clear_value,
+                                        R600_COHERENCY_NONE);
        }
 }

 void r600_print_texture_info(struct r600_texture *rtex, FILE *f)
 {
        int i;

        fprintf(f, "  Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, "
                "blk_h=%u, blk_d=%u, array_size=%u, last_level=%u, "
                "bpe=%u, nsamples=%u, flags=0x%x, %s\n",
@@ -960,24 +986,25 @@ void r600_print_texture_info(struct r600_texture *rtex, 
FILE *f)

        if (rtex->cmask.size)
                fprintf(f, "  CMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, 
pitch=%u, "
                        "height=%u, xalign=%u, yalign=%u, slice_tile_max=%u\n",
                        rtex->cmask.offset, rtex->cmask.size, 
rtex->cmask.alignment,
                        rtex->cmask.pitch, rtex->cmask.height, 
rtex->cmask.xalign,
                        rtex->cmask.yalign, rtex->cmask.slice_tile_max);

        if (rtex->htile_buffer)
                fprintf(f, "  HTile: size=%u, alignment=%u, pitch=%u, height=%u, 
"
-                       "xalign=%u, yalign=%u\n",
+                       "xalign=%u, yalign=%u, TC_compatible = %u\n",
                        rtex->htile_buffer->b.b.width0,
                        rtex->htile_buffer->buf->alignment, rtex->htile.pitch,
-                       rtex->htile.height, rtex->htile.xalign, 
rtex->htile.yalign);
+                       rtex->htile.height, rtex->htile.xalign, 
rtex->htile.yalign,
+                       rtex->tc_compatible_htile);

        if (rtex->dcc_offset) {
                fprintf(f, "  DCC: offset=%"PRIu64", size=%"PRIu64", 
alignment=%"PRIu64"\n",
                        rtex->dcc_offset, rtex->surface.dcc_size,
                        rtex->surface.dcc_alignment);
                for (i = 0; i <= rtex->surface.last_level; i++)
                        fprintf(f, "  DCCLevel[%i]: enabled=%u, offset=%"PRIu64", 
"
                                "fast_clear_size=%"PRIu64"\n",
                                i, rtex->surface.level[i].dcc_enabled,
                                rtex->surface.level[i].dcc_offset,
@@ -1047,20 +1074,30 @@ r600_texture_create_object(struct pipe_screen *screen,

        /* don't include stencil-only formats which we don't support for 
rendering */
        rtex->is_depth = 
util_format_has_depth(util_format_description(rtex->resource.b.b.format));

        rtex->surface = *surface;
        if (r600_setup_surface(screen, rtex, pitch_in_bytes_override, offset)) {
                FREE(rtex);
                return NULL;
        }

+       rtex->tc_compatible_htile = rtex->surface.htile_size != 0;
+       assert(!!(rtex->surface.flags & RADEON_SURF_TC_COMPATIBLE_HTILE) ==
+              rtex->tc_compatible_htile);
+
+       /* TC-compatible HTILE only supports Z32_FLOAT. */
+       if (rtex->tc_compatible_htile)
+               rtex->db_render_format = PIPE_FORMAT_Z32_FLOAT;
+       else
+               rtex->db_render_format = base->format;
+
        /* Tiled depth textures utilize the non-displayable tile order.
         * This must be done after r600_setup_surface.
         * Applies to R600-Cayman. */
        rtex->non_disp_tiling = rtex->is_depth && rtex->surface.level[0].mode 
>= RADEON_SURF_MODE_1D;
        /* Applies to GCN. */
        rtex->last_msaa_resolve_target_micro_mode = 
rtex->surface.micro_tile_mode;

        /* Disable separate DCC at the beginning. DRI2 doesn't reuse buffers
         * between frames, so the only thing that can enable separate DCC
         * with DRI2 is multiple slow clears within a frame.
@@ -1234,25 +1271,34 @@ static unsigned r600_choose_tiling(struct 
r600_common_screen *rscreen,

        /* The allocator will switch to 1D if needed. */
        return RADEON_SURF_MODE_2D;
 }

 struct pipe_resource *r600_texture_create(struct pipe_screen *screen,
                                          const struct pipe_resource *templ)
 {
        struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
        struct radeon_surf surface = {0};
+       bool is_flushed_depth = templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH;
+       bool tc_compatible_htile =
+               rscreen->chip_class >= VI &&
+               (templ->flags & PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY) &&
+               !(rscreen->debug_flags & DBG_NO_HYPERZ) &&
+               !is_flushed_depth &&
+               templ->nr_samples <= 1 && /* TC-compat HTILE is less efficient 
with MSAA */
+               util_format_is_depth_or_stencil(templ->format);
+
        int r;

        r = r600_init_surface(rscreen, &surface, templ,
                              r600_choose_tiling(rscreen, templ),
-                             templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH);
+                             is_flushed_depth, tc_compatible_htile);
        if (r) {
                return NULL;
        }
        r = rscreen->ws->surface_best(rscreen->ws, &surface);
        if (r) {
                return NULL;
        }
        return (struct pipe_resource *)r600_texture_create_object(screen, 
templ, 0,
                                                                  0, NULL, 
&surface);
 }
@@ -1289,21 +1335,22 @@ static struct pipe_resource 
*r600_texture_from_handle(struct pipe_screen *screen
        surface.mtilea = metadata.mtilea;
        surface.num_banks = metadata.num_banks;

        if (metadata.macrotile == RADEON_LAYOUT_TILED)
                array_mode = RADEON_SURF_MODE_2D;
        else if (metadata.microtile == RADEON_LAYOUT_TILED)
                array_mode = RADEON_SURF_MODE_1D;
        else
                array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;

-       r = r600_init_surface(rscreen, &surface, templ, array_mode, false);
+       r = r600_init_surface(rscreen, &surface, templ, array_mode,
+                             false, false);
        if (r) {
                return NULL;
        }

        if (metadata.scanout)
                surface.flags |= RADEON_SURF_SCANOUT;

        rtex = r600_texture_create_object(screen, templ, stride,
                                          offset, buf, &surface);
        if (!rtex)
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h 
b/src/gallium/drivers/radeon/radeon_winsys.h
index 7146737..8946209 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -271,20 +271,21 @@ enum radeon_feature_id {
 #define     RADEON_SURF_MODE_1D                     2
 #define     RADEON_SURF_MODE_2D                     3
 #define RADEON_SURF_SCANOUT                     (1 << 16)
 #define RADEON_SURF_ZBUFFER                     (1 << 17)
 #define RADEON_SURF_SBUFFER                     (1 << 18)
 #define RADEON_SURF_Z_OR_SBUFFER                (RADEON_SURF_ZBUFFER | 
RADEON_SURF_SBUFFER)
 #define RADEON_SURF_HAS_SBUFFER_MIPTREE         (1 << 19)
 #define RADEON_SURF_HAS_TILE_MODE_INDEX         (1 << 20)
 #define RADEON_SURF_FMASK                       (1 << 21)
 #define RADEON_SURF_DISABLE_DCC                 (1 << 22)
+#define RADEON_SURF_TC_COMPATIBLE_HTILE         (1 << 23)

 #define RADEON_SURF_GET(v, field)   (((v) >> RADEON_SURF_ ## field ## _SHIFT) 
& RADEON_SURF_ ## field ## _MASK)
 #define RADEON_SURF_SET(v, field)   (((v) & RADEON_SURF_ ## field ## _MASK) << 
RADEON_SURF_ ## field ## _SHIFT)
 #define RADEON_SURF_CLR(v, field)   ((v) & ~(RADEON_SURF_ ## field ## _MASK << 
RADEON_SURF_ ## field ## _SHIFT))

 struct radeon_surf_level {
     uint64_t                    offset;
     uint64_t                    slice_size;
     uint32_t                    npix_x;
     uint32_t                    npix_y;
@@ -337,20 +338,23 @@ struct radeon_surf {
     /* Whether the depth miptree or stencil miptree as used by the DB are
      * adjusted from their TC compatible form to ensure depth/stencil
      * compatibility. If either is true, the corresponding plane cannot be
      * sampled from.
      */
     bool                        depth_adjusted;
     bool                        stencil_adjusted;

     uint64_t                    dcc_size;
     uint64_t                    dcc_alignment;
+    /* TC-compatible HTILE only. */
+    uint64_t                    htile_size;
+    uint64_t                    htile_alignment;
 };

 struct radeon_bo_list_item {
     uint64_t bo_size;
     uint64_t vm_address;
     uint64_t priority_usage; /* mask of (1 << RADEON_PRIO_*) */
 };

 struct radeon_winsys {
     /**
diff --git a/src/gallium/drivers/radeonsi/si_blit.c 
b/src/gallium/drivers/radeonsi/si_blit.c
index c143601..db41f56 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -325,20 +325,22 @@ si_flush_depth_texture(struct si_context *sctx,
                levels_s = level_mask & tex->stencil_dirty_level_mask;

                if (levels_s) {
                        if (r600_can_sample_zs(tex, true))
                                inplace_planes |= PIPE_MASK_S;
                        else
                                copy_planes |= PIPE_MASK_S;
                }
        }

+       assert(!tex->tc_compatible_htile || levels_z == 0);
+
        /* We may have to allocate the flushed texture here when called from
         * si_decompress_subresource.
         */
        if (copy_planes &&
            (tex->flushed_depth_texture ||
             r600_init_flushed_depth_texture(&sctx->b.b, &tex->resource.b.b, 
NULL))) {
                struct r600_texture *dst = tex->flushed_depth_texture;
                unsigned fully_copied_levels;
                unsigned levels = 0;

@@ -692,35 +694,40 @@ static void si_clear(struct pipe_context *ctx, unsigned 
buffers,
                        tex = (struct r600_texture *)fb->cbufs[i]->texture;
                        if (tex->fmask.size == 0)
                                tex->dirty_level_mask &= ~(1 << 
fb->cbufs[i]->u.tex.level);
                }
        }

        if (zstex && zstex->htile_buffer &&
            zsbuf->u.tex.level == 0 &&
            zsbuf->u.tex.first_layer == 0 &&
            zsbuf->u.tex.last_layer == util_max_layer(&zstex->resource.b.b, 0)) 
{
-               if (buffers & PIPE_CLEAR_DEPTH) {
+               /* TC-compatible HTILE only supports depth clears to 0 or 1. */
+               if (buffers & PIPE_CLEAR_DEPTH &&
+                   (!zstex->tc_compatible_htile ||
+                    depth == 0 || depth == 1)) {
                        /* Need to disable EXPCLEAR temporarily if clearing
                         * to a new value. */
                        if (!zstex->depth_cleared || zstex->depth_clear_value 
!= depth) {
                                sctx->db_depth_disable_expclear = true;
                        }

                        zstex->depth_clear_value = depth;
                        sctx->framebuffer.dirty_zsbuf = true;
                        si_mark_atom_dirty(sctx, &sctx->framebuffer.atom); /* 
updates DB_DEPTH_CLEAR */
                        sctx->db_depth_clear = true;
                        si_mark_atom_dirty(sctx, &sctx->db_render_state);
                }

-               if (buffers & PIPE_CLEAR_STENCIL) {
+               /* TC-compatible HTILE only supports stencil clears to 0. */
+               if (buffers & PIPE_CLEAR_STENCIL &&
+                   (!zstex->tc_compatible_htile || stencil == 0)) {
                        stencil &= 0xff;

                        /* Need to disable EXPCLEAR temporarily if clearing
                         * to a new value. */
                        if (!zstex->stencil_cleared || 
zstex->stencil_clear_value != stencil) {
                                sctx->db_stencil_disable_expclear = true;
                        }

                        zstex->stencil_clear_value = stencil;
                        sctx->framebuffer.dirty_zsbuf = true;
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index 350242a..19cae65 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -392,20 +392,23 @@ void si_set_mutable_tex_desc_fields(struct r600_texture 
*tex,
        state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40);
        state[3] |= S_008F1C_TILING_INDEX(si_tile_mode_index(tex, base_level,
                                                             is_stencil));
        state[4] |= S_008F20_PITCH(pitch - 1);

        if (tex->dcc_offset && tex->surface.level[first_level].dcc_enabled) {
                state[6] |= S_008F28_COMPRESSION_EN(1);
                state[7] = ((!tex->dcc_separate_buffer ? 
tex->resource.gpu_address : 0) +
                            tex->dcc_offset +
                            base_level_info->dcc_offset) >> 8;
+       } else if (tex->tc_compatible_htile) {
+               state[6] |= S_008F28_COMPRESSION_EN(1);
+               state[7] = tex->htile_buffer->gpu_address >> 8;
        }
 }

 static void si_set_sampler_view(struct si_context *sctx,
                                unsigned shader,
                                unsigned slot, struct pipe_sampler_view *view,
                                bool disallow_early_out)
 {
        struct si_sampler_views *views = &sctx->samplers[shader].views;
        struct si_sampler_view *rview = (struct si_sampler_view*)view;
@@ -501,22 +504,24 @@ static void si_set_sampler_views(struct pipe_context *ctx,
                        samplers->compressed_colortex_mask &= ~(1u << slot);
                        si_set_sampler_view(sctx, shader, slot, NULL, false);
                        continue;
                }

                si_set_sampler_view(sctx, shader, slot, views[i], false);

                if (views[i]->texture && views[i]->texture->target != 
PIPE_BUFFER) {
                        struct r600_texture *rtex =
                                (struct r600_texture*)views[i]->texture;
+                       struct si_sampler_view *rview = (struct si_sampler_view 
*)views[i];

-                       if (rtex->db_compatible) {
+                       if (rtex->db_compatible &&
+                           (!rtex->tc_compatible_htile || 
rview->is_stencil_sampler)) {
                                samplers->depth_texture_mask |= 1u << slot;
                        } else {
                                samplers->depth_texture_mask &= ~(1u << slot);
                        }
                        if (is_compressed_colortex(rtex)) {
                                samplers->compressed_colortex_mask |= 1u << 
slot;
                        } else {
                                samplers->compressed_colortex_mask &= ~(1u << 
slot);
                        }

diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index 49d4121..621d57f 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -4535,26 +4535,40 @@ static void tex_fetch_args(
        }

        /* Pack LOD bias value */
        if (opcode == TGSI_OPCODE_TXB)
                address[count++] = coords[3];
        if (opcode == TGSI_OPCODE_TXB2)
                address[count++] = lp_build_emit_fetch(bld_base, inst, 1, 
TGSI_CHAN_X);

        /* Pack depth comparison value */
        if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
+               LLVMValueRef z;
+
                if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
-                       address[count++] = lp_build_emit_fetch(bld_base, inst, 
1, TGSI_CHAN_X);
+                       z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
                } else {
                        assert(ref_pos >= 0);
-                       address[count++] = coords[ref_pos];
+                       z = coords[ref_pos];
                }
+
+               /* TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
+                * so the depth comparison value isn't clamped for Z16 and
+                * Z24 anymore. Do it manually here.
+                *
+                * It's unnecessary if the original texture format was
+                * Z32_FLOAT, but we don't know that here.
+                */
+               if (ctx->screen->b.chip_class == VI)
+                       z = radeon_llvm_saturate(bld_base, z);
+
+               address[count++] = z;
        }

        /* Pack user derivatives */
        if (opcode == TGSI_OPCODE_TXD) {
                int param, num_src_deriv_channels;

                switch (target) {
                case TGSI_TEXTURE_3D:
                        num_src_deriv_channels = 3;
                        num_deriv_channels = 3;
diff --git a/src/gallium/drivers/radeonsi/si_state.c 
b/src/gallium/drivers/radeonsi/si_state.c
index ad65fc2..b23749c 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -679,20 +679,23 @@ static void si_emit_clip_regs(struct si_context *sctx, 
struct r600_atom *atom)
 /*
  * inferred state between framebuffer and rasterizer
  */
 static void si_update_poly_offset_state(struct si_context *sctx)
 {
        struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;

        if (!rs || !rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf)
                return;

+       /* Use the user format, not db_render_format, so that the polygon
+        * offset behaves as expected by applications.
+        */
        switch (sctx->framebuffer.state.zsbuf->texture->format) {
        case PIPE_FORMAT_Z16_UNORM:
                si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[0]);
                break;
        default: /* 24-bit */
                si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[1]);
                break;
        case PIPE_FORMAT_Z32_FLOAT:
        case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
                si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[2]);
@@ -2133,32 +2136,32 @@ static void si_init_depth_surface(struct si_context 
*sctx,
                                  struct r600_surface *surf)
 {
        struct r600_texture *rtex = (struct r600_texture*)surf->base.texture;
        unsigned level = surf->base.u.tex.level;
        struct radeon_surf_level *levelinfo = &rtex->surface.level[level];
        unsigned format;
        uint32_t z_info, s_info, db_depth_info;
        uint64_t z_offs, s_offs;
        uint32_t db_htile_data_base, db_htile_surface;

-       format = si_translate_dbformat(rtex->resource.b.b.format);
+       format = si_translate_dbformat(rtex->db_render_format);

        if (format == V_028040_Z_INVALID) {
                R600_ERR("Invalid DB format: %d, disabling DB.\n", 
rtex->resource.b.b.format);
        }
        assert(format != V_028040_Z_INVALID);

        s_offs = z_offs = rtex->resource.gpu_address;
        z_offs += rtex->surface.level[level].offset;
        s_offs += rtex->surface.stencil_level[level].offset;

-       db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(1);
+       db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!rtex->tc_compatible_htile);

        z_info = S_028040_FORMAT(format);
        if (rtex->resource.b.b.nr_samples > 1) {
                z_info |= 
S_028040_NUM_SAMPLES(util_logbase2(rtex->resource.b.b.nr_samples));
        }

        if (rtex->surface.flags & RADEON_SURF_SBUFFER)
                s_info = S_028044_FORMAT(V_028044_STENCIL_8);
        else
                s_info = S_028044_FORMAT(V_028044_STENCIL_INVALID);
@@ -2201,27 +2204,51 @@ static void si_init_depth_surface(struct si_context 
*sctx,
                         * uses. Problem was reproduced on Verde, Bonaire,
                         * Tonga, and Carrizo.
                         *
                         * Disabling EXPCLEAR works around the problem.
                         *
                         * Check piglit's arb_texture_multisample-stencil-clear
                         * test if you want to try changing this.
                         */
                        if (rtex->resource.b.b.nr_samples <= 1)
                                s_info |= S_028044_ALLOW_EXPCLEAR(1);
-               } else
-                       /* Use all of the htile_buffer for depth if there's no 
stencil. */
+               } else if (!rtex->tc_compatible_htile) {
+                       /* Use all of the htile_buffer for depth if there's no 
stencil.
+                        * This must not be set when TC-compatible HTILE is 
enabled
+                        * due to a hw bug.
+                        */
                        s_info |= S_028044_TILE_STENCIL_DISABLE(1);
+               }

                uint64_t va = rtex->htile_buffer->gpu_address;
                db_htile_data_base = va >> 8;
                db_htile_surface = S_028ABC_FULL_CACHE(1);
+
+               if (rtex->tc_compatible_htile) {
+                       db_htile_surface |= S_028ABC_TC_COMPATIBLE(1);
+
+                       switch (rtex->resource.b.b.nr_samples) {
+                       case 0:
+                       case 1:
+                               z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(5);
+                               break;
+                       case 2:
+                       case 4:
+                               z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(3);
+                               break;
+                       case 8:
+                               z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(2);
+                               break;
+                       default:
+                               assert(0);
+                       }
+               }
        } else {
                db_htile_data_base = 0;
                db_htile_surface = 0;
        }

        assert(levelinfo->nblk_x % 8 == 0 && levelinfo->nblk_y % 8 == 0);

        surf->db_depth_view = 
S_028008_SLICE_START(surf->base.u.tex.first_layer) |
                              S_028008_SLICE_MAX(surf->base.u.tex.last_layer);
        surf->db_htile_data_base = db_htile_data_base;
@@ -2349,20 +2376,21 @@ static void si_set_framebuffer_state(struct 
pipe_context *ctx,

                if (rtex->dcc_gather_statistics) {
                        /* Dirty tracking must be enabled for DCC usage 
analysis. */
                        sctx->framebuffer.compressed_cb_mask |= 1 << i;
                        vi_separate_dcc_start_query(ctx, rtex);
                }
        }

        if (state->zsbuf) {
                surf = (struct r600_surface*)state->zsbuf;
+               rtex = (struct r600_texture*)surf->base.texture;

                if (!surf->depth_initialized) {
                        si_init_depth_surface(sctx, surf);
                }
                r600_context_add_resource_size(ctx, surf->base.texture);
        }

        si_update_poly_offset_state(sctx);
        si_mark_atom_dirty(sctx, &sctx->cb_render_state);
        si_mark_atom_dirty(sctx, &sctx->framebuffer.atom);
@@ -3014,20 +3042,23 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
                 */
                if (tmp->flushed_depth_texture->resource.b.b.format != tmp->resource.b.b.format)
                        pipe_format = tmp->flushed_depth_texture->resource.b.b.format;

                tmp = tmp->flushed_depth_texture;
        }

        surflevel = tmp->surface.level;

        if (tmp->db_compatible) {
+               if (!view->is_stencil_sampler)
+                       pipe_format = tmp->db_render_format;
+
                switch (pipe_format) {
                case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
                        pipe_format = PIPE_FORMAT_Z32_FLOAT;
                        break;
                case PIPE_FORMAT_X8Z24_UNORM:
                case PIPE_FORMAT_S8_UINT_Z24_UNORM:
                        /* Z24 is always stored like this for DB
                         * compatibility.
                         */
                        pipe_format = PIPE_FORMAT_Z24X8_UNORM;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index c14e852..d18137b 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -1111,21 +1111,22 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
             sctx->b.family == CHIP_FIJI) &&
            r600_get_strmout_en(&sctx->b)) {
                sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
        }

        /* Set the depth buffer as dirty. */
        if (sctx->framebuffer.state.zsbuf) {
                struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
                struct r600_texture *rtex = (struct r600_texture *)surf->texture;

-               rtex->dirty_level_mask |= 1 << surf->u.tex.level;
+               if (!rtex->tc_compatible_htile)
+                       rtex->dirty_level_mask |= 1 << surf->u.tex.level;

                if (rtex->surface.flags & RADEON_SURF_SBUFFER)
                        rtex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;
        }
        if (sctx->framebuffer.compressed_cb_mask) {
                struct pipe_surface *surf;
                struct r600_texture *rtex;
                unsigned mask = sctx->framebuffer.compressed_cb_mask;

                do {
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
index 8bfea45..1bf07a7 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
@@ -130,20 +130,21 @@ ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws)
       regValue.pMacroTileConfig = NULL;
       regValue.noOfMacroEntries = 0;
    } else {
       regValue.pMacroTileConfig = ws->amdinfo.gb_macro_tile_mode;
       regValue.noOfMacroEntries = ARRAY_SIZE(ws->amdinfo.gb_macro_tile_mode);
    }

    createFlags.value = 0;
    createFlags.useTileIndex = 1;
    createFlags.degradeBaseLevel = 1;
+   createFlags.useHtileSliceAlign = 1;

    addrCreateInput.chipEngine = CIASICIDGFXENGINE_SOUTHERNISLAND;
    addrCreateInput.chipFamily = ws->family;
    addrCreateInput.chipRevision = ws->rev_id;
    addrCreateInput.createFlags = createFlags;
    addrCreateInput.callbacks.allocSysMem = allocSysMem;
    addrCreateInput.callbacks.freeSysMem = freeSysMem;
    addrCreateInput.callbacks.debugPrint = 0;
    addrCreateInput.regValue = regValue;

@@ -153,21 +154,23 @@ ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws)

    return addrCreateOutput.hLib;
 }

 static int compute_level(struct amdgpu_winsys *ws,
                          struct radeon_surf *surf, bool is_stencil,
                          unsigned level, unsigned type, bool compressed,
                          ADDR_COMPUTE_SURFACE_INFO_INPUT *AddrSurfInfoIn,
                          ADDR_COMPUTE_SURFACE_INFO_OUTPUT *AddrSurfInfoOut,
                          ADDR_COMPUTE_DCCINFO_INPUT *AddrDccIn,
-                         ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut)
+                         ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut,
+                         ADDR_COMPUTE_HTILE_INFO_INPUT *AddrHtileIn,
+                         ADDR_COMPUTE_HTILE_INFO_OUTPUT *AddrHtileOut)
 {
    struct radeon_surf_level *surf_level;
    ADDR_E_RETURNCODE ret;

    AddrSurfInfoIn->mipLevel = level;
    AddrSurfInfoIn->width = u_minify(surf->npix_x, level);
    AddrSurfInfoIn->height = u_minify(surf->npix_y, level);

    if (type == RADEON_SURF_TYPE_3D)
       AddrSurfInfoIn->numSlices = u_minify(surf->npix_z, level);
@@ -250,20 +253,46 @@ static int compute_level(struct amdgpu_winsys *ws,

       if (ret == ADDR_OK) {
          surf_level->dcc_offset = surf->dcc_size;
          surf_level->dcc_fast_clear_size = AddrDccOut->dccFastClearSize;
          surf_level->dcc_enabled = true;
          surf->dcc_size = surf_level->dcc_offset + AddrDccOut->dccRamSize;
          surf->dcc_alignment = MAX2(surf->dcc_alignment, AddrDccOut->dccRamBaseAlign);
       }
    }

+   /* TC-compatible HTILE. */
+   if (!is_stencil &&
+       AddrSurfInfoIn->flags.depth &&
+       AddrSurfInfoIn->flags.tcCompatible &&
+       surf_level->mode == RADEON_SURF_MODE_2D &&
+       level == 0) {
+      AddrHtileIn->flags.tcCompatible = 1;
+      AddrHtileIn->pitch = AddrSurfInfoOut->pitch;
+      AddrHtileIn->height = AddrSurfInfoOut->height;
+      AddrHtileIn->numSlices = AddrSurfInfoOut->depth;
+      AddrHtileIn->blockWidth = ADDR_HTILE_BLOCKSIZE_8;
+      AddrHtileIn->blockHeight = ADDR_HTILE_BLOCKSIZE_8;
+      AddrHtileIn->pTileInfo = AddrSurfInfoOut->pTileInfo;
+      AddrHtileIn->tileIndex = AddrSurfInfoOut->tileIndex;
+      AddrHtileIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex;
+
+      ret = AddrComputeHtileInfo(ws->addrlib,
+                                 AddrHtileIn,
+                                 AddrHtileOut);
+
+      if (ret == ADDR_OK) {
+         surf->htile_size = AddrHtileOut->htileBytes;
+         surf->htile_alignment = AddrHtileOut->baseAlign;
+      }
+   }
+
    return 0;
 }

 #define   G_009910_MICRO_TILE_MODE(x)          (((x) >> 0) & 0x03)
 #define   G_009910_MICRO_TILE_MODE_NEW(x)      (((x) >> 22) & 0x07)

 static void set_micro_tile_mode(struct radeon_surf *surf,
                                 struct radeon_info *info)
 {
    uint32_t tile_mode = info->si_tile_mode_array[surf->tiling_index[0]];
@@ -277,32 +306,36 @@ static void set_micro_tile_mode(struct radeon_surf *surf,
 static int amdgpu_surface_init(struct radeon_winsys *rws,
                                struct radeon_surf *surf)
 {
    struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
    unsigned level, mode, type;
    bool compressed;
    ADDR_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0};
    ADDR_COMPUTE_SURFACE_INFO_OUTPUT AddrSurfInfoOut = {0};
    ADDR_COMPUTE_DCCINFO_INPUT AddrDccIn = {0};
    ADDR_COMPUTE_DCCINFO_OUTPUT AddrDccOut = {0};
+   ADDR_COMPUTE_HTILE_INFO_INPUT AddrHtileIn = {0};
+   ADDR_COMPUTE_HTILE_INFO_OUTPUT AddrHtileOut = {0};
    ADDR_TILEINFO AddrTileInfoIn = {0};
    ADDR_TILEINFO AddrTileInfoOut = {0};
    int r;

    r = amdgpu_surface_sanity(surf);
    if (r)
       return r;

    AddrSurfInfoIn.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_INPUT);
    AddrSurfInfoOut.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_OUTPUT);
    AddrDccIn.size = sizeof(ADDR_COMPUTE_DCCINFO_INPUT);
    AddrDccOut.size = sizeof(ADDR_COMPUTE_DCCINFO_OUTPUT);
+   AddrHtileIn.size = sizeof(ADDR_COMPUTE_HTILE_INFO_INPUT);
+   AddrHtileOut.size = sizeof(ADDR_COMPUTE_HTILE_INFO_OUTPUT);
    AddrSurfInfoOut.pTileInfo = &AddrTileInfoOut;

    type = RADEON_SURF_GET(surf->flags, TYPE);
    mode = RADEON_SURF_GET(surf->flags, MODE);
    compressed = surf->blk_w == 4 && surf->blk_h == 4;

    /* MSAA and FMASK require 2D tiling. */
    if (surf->nsamples > 1 ||
        (surf->flags & RADEON_SURF_FMASK))
       mode = RADEON_SURF_MODE_2D;
@@ -354,21 +387,26 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
    else if (surf->flags & RADEON_SURF_Z_OR_SBUFFER)
       AddrSurfInfoIn.tileType = ADDR_DEPTH_SAMPLE_ORDER;
    else
       AddrSurfInfoIn.tileType = ADDR_NON_DISPLAYABLE;

    AddrSurfInfoIn.flags.color = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER);
    AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0;
    AddrSurfInfoIn.flags.cube = type == RADEON_SURF_TYPE_CUBEMAP;
    AddrSurfInfoIn.flags.display = (surf->flags & RADEON_SURF_SCANOUT) != 0;
    AddrSurfInfoIn.flags.pow2Pad = surf->last_level > 0;
-   AddrSurfInfoIn.flags.degrade4Space = 1;
+   AddrSurfInfoIn.flags.tcCompatible = (surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE) != 0;
+
+   /* Only degrade the tile mode for space if TC-compatible HTILE hasn't been
+    * requested, because TC-compatible HTILE requires 2D tiling.
+    */
+   AddrSurfInfoIn.flags.degrade4Space = !AddrSurfInfoIn.flags.tcCompatible;

    /* DCC notes:
     * - If we add MSAA support, keep in mind that CB can't decompress 8bpp
     *   with samples >= 4.
     * - Mipmapped array textures have low performance (discovered by a closed
     *   driver team).
     */
    AddrSurfInfoIn.flags.dccCompatible = ws->info.chip_class >= VI &&
                                         !(surf->flags & RADEON_SURF_Z_OR_SBUFFER) &&
                                         !(surf->flags & RADEON_SURF_DISABLE_DCC) &&
@@ -436,25 +474,28 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
          if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE)
             AddrSurfInfoIn.tileIndex = 10; /* 2D displayable */
          else
             AddrSurfInfoIn.tileIndex = 14; /* 2D non-displayable */
       }
    }

    surf->bo_size = 0;
    surf->dcc_size = 0;
    surf->dcc_alignment = 1;
+   surf->htile_size = 0;
+   surf->htile_alignment = 1;

    /* Calculate texture layout information. */
    for (level = 0; level <= surf->last_level; level++) {
       r = compute_level(ws, surf, false, level, type, compressed,
-                        &AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut);
+                        &AddrSurfInfoIn, &AddrSurfInfoOut,
+                        &AddrDccIn, &AddrDccOut, &AddrHtileIn, &AddrHtileOut);
       if (r)
          return r;

       if (level == 0) {
          surf->bo_alignment = AddrSurfInfoOut.baseAlign;
          surf->pipe_config = AddrSurfInfoOut.pTileInfo->pipeConfig - 1;
          set_micro_tile_mode(surf, &ws->info);

          /* For 2D modes only. */
          if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) {
@@ -468,26 +509,28 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
             surf->macro_tile_index = 0;
          }
       }
    }

    /* Calculate texture layout information for stencil. */
    if (surf->flags & RADEON_SURF_SBUFFER) {
       AddrSurfInfoIn.bpp = 8;
       AddrSurfInfoIn.flags.depth = 0;
       AddrSurfInfoIn.flags.stencil = 1;
+      AddrSurfInfoIn.flags.tcCompatible = 0;
       /* This will be ignored if AddrSurfInfoIn.pTileInfo is NULL. */
       AddrTileInfoIn.tileSplitBytes = surf->stencil_tile_split;

       for (level = 0; level <= surf->last_level; level++) {
          r = compute_level(ws, surf, true, level, type, compressed,
-                           &AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut);
+                           &AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut,
+                           NULL, NULL);
          if (r)
             return r;

          /* DB uses the depth pitch for both stencil and depth. */
          if (surf->stencil_level[level].nblk_x != surf->level[level].nblk_x)
             surf->stencil_adjusted = true;

          if (level == 0) {
             /* For 2D modes only. */
             if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) {
@@ -501,20 +544,26 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
    /* Recalculate the whole DCC miptree size including disabled levels.
     * This is what addrlib does, but calling addrlib would be a lot more
     * complicated.
     */
    if (surf->dcc_size && surf->last_level > 0) {
       surf->dcc_size = align64(surf->bo_size >> 8,
                                ws->info.pipe_interleave_bytes *
                                ws->info.num_tile_pipes);
    }

+   /* Make sure HTILE covers the whole miptree, because the shader reads
+    * TC-compatible HTILE even for levels where it's disabled by DB.
+    */
+   if (surf->htile_size && surf->last_level)
+          surf->htile_size *= 2;
+
    return 0;
 }

 static int amdgpu_surface_best(struct radeon_winsys *rws,
                                struct radeon_surf *surf)
 {
    return 0;
 }

 void amdgpu_surface_init_functions(struct amdgpu_winsys *ws)

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to