Module: Mesa Branch: main Commit: 2ac6816b70d7bb4658ed3236ede78a4dfa104e58 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=2ac6816b70d7bb4658ed3236ede78a4dfa104e58
Author: Marek Olšák <marek.ol...@amd.com> Date: Wed Oct 25 04:13:28 2023 -0400 radeonsi/gfx11: use SET_CONTEXT_REG_PAIRS_PACKED for other states It's used where registers are non-contiguous. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-pra...@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25941> --- src/gallium/drivers/radeonsi/si_build_pm4.h | 59 ++++ src/gallium/drivers/radeonsi/si_state.c | 332 +++++++++++++++++++--- src/gallium/drivers/radeonsi/si_state_msaa.c | 31 +- src/gallium/drivers/radeonsi/si_state_shaders.cpp | 104 ++++++- src/gallium/drivers/radeonsi/si_state_viewport.c | 34 ++- 5 files changed, 492 insertions(+), 68 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h index cfbd27a8d87..6ce53255cfe 100644 --- a/src/gallium/drivers/radeonsi/si_build_pm4.h +++ b/src/gallium/drivers/radeonsi/si_build_pm4.h @@ -278,6 +278,28 @@ } \ } while (0) +#define gfx11_opt_push_reg4(reg, reg_enum, v1, v2, v3, v4, prefix_name, category, buffer, reg_count) do { \ + unsigned __v1 = (v1); \ + unsigned __v2 = (v2); \ + unsigned __v3 = (v3); \ + unsigned __v4 = (v4); \ + if (((sctx->tracked_regs.category##_reg_saved_mask >> (reg_enum)) & 0xf) != 0xf || \ + sctx->tracked_regs.category##_reg_value[(reg_enum)] != __v1 || \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] != __v2 || \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] != __v3 || \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 3] != __v4) { \ + gfx11_push_reg((reg), __v1, prefix_name, buffer, reg_count); \ + gfx11_push_reg((reg) + 4, __v2, prefix_name, buffer, reg_count); \ + gfx11_push_reg((reg) + 8, __v3, prefix_name, buffer, reg_count); \ + gfx11_push_reg((reg) + 12, __v4, prefix_name, buffer, reg_count); \ + sctx->tracked_regs.category##_reg_saved_mask |= BITFIELD64_RANGE((reg_enum), 4); \ + sctx->tracked_regs.category##_reg_value[(reg_enum)] = __v1; \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] = __v2; \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] = __v3; \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 3] = __v4; \ + } \ +} while (0) + /* GFX11 packet building helpers for buffered SH registers. */ #define gfx11_push_gfx_sh_reg(reg, value) \ gfx11_push_reg(reg, value, SI_SH, sctx->gfx11.buffered_gfx_sh_regs, \ @@ -295,6 +317,43 @@ gfx11_opt_push_reg(reg, reg_enum, value, SI_SH, other, sctx->gfx11.buffered_compute_sh_regs, \ sctx->num_buffered_compute_sh_regs) +/* GFX11 packet building helpers for SET_CONTEXT_REG_PAIRS_PACKED. + * Registers are buffered on the stack and then copied to the command buffer at the end. + */ +#define gfx11_begin_packed_context_regs() \ + struct gfx11_reg_pair __cs_context_regs[50]; \ + unsigned __cs_context_reg_count = 0; + +#define gfx11_set_context_reg(reg, value) \ + gfx11_push_reg(reg, value, SI_CONTEXT, __cs_context_regs, __cs_context_reg_count) + +#define gfx11_opt_set_context_reg(reg, reg_enum, value) \ + gfx11_opt_push_reg(reg, reg_enum, value, SI_CONTEXT, context, __cs_context_regs, \ + __cs_context_reg_count) + +#define gfx11_opt_set_context_reg4(reg, reg_enum, v1, v2, v3, v4) \ + gfx11_opt_push_reg4(reg, reg_enum, v1, v2, v3, v4, SI_CONTEXT, context, __cs_context_regs, \ + __cs_context_reg_count) + +#define gfx11_end_packed_context_regs() do { \ + if (__cs_context_reg_count >= 2) { \ + /* Align the count to 2 by duplicating the first register. */ \ + if (__cs_context_reg_count % 2 == 1) { \ + gfx11_set_context_reg(__cs_context_regs[0].reg_offset[0] + SI_CONTEXT_REG_OFFSET, \ + __cs_context_regs[0].reg_value[0]); \ + } \ + assert(__cs_context_reg_count % 2 == 0); \ + unsigned __num_dw = (__cs_context_reg_count / 2) * 3; \ + radeon_emit(PKT3(PKT3_SET_CONTEXT_REG_PAIRS_PACKED, __num_dw, 0) | PKT3_RESET_FILTER_CAM_S(1)); \ + radeon_emit(__cs_context_reg_count); \ + radeon_emit_array(__cs_context_regs, __num_dw); \ + } else if (__cs_context_reg_count == 1) { \ + radeon_emit(PKT3(PKT3_SET_CONTEXT_REG, 1, 0)); \ + radeon_emit(__cs_context_regs[0].reg_offset[0]); \ + radeon_emit(__cs_context_regs[0].reg_value[0]); \ + } \ +} while (0) + #define radeon_set_or_push_gfx_sh_reg(reg, value) do { \ if (GFX_VERSION >= GFX11 && HAS_SH_PAIRS_PACKED) { \ gfx11_push_gfx_sh_reg(reg, value); \ diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 909500c82ea..3d5cac6e6da 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -249,18 +249,35 @@ static void si_emit_cb_render_state(struct si_context *sctx, unsigned index) sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_R; } - radeon_begin(cs); - radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK, SI_TRACKED_CB_TARGET_MASK, - cb_target_mask); - if (sctx->gfx_level >= GFX8) { - radeon_opt_set_context_reg(sctx, R_028424_CB_DCC_CONTROL, SI_TRACKED_CB_DCC_CONTROL, - cb_dcc_control); - } - if (sctx->screen->info.rbplus_allowed) { - radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT, SI_TRACKED_SX_PS_DOWNCONVERT, - sx_ps_downconvert, sx_blend_opt_epsilon, sx_blend_opt_control); + if (sctx->screen->info.has_set_context_pairs_packed) { + radeon_begin(cs); + gfx11_begin_packed_context_regs(); + gfx11_opt_set_context_reg(R_028238_CB_TARGET_MASK, SI_TRACKED_CB_TARGET_MASK, + cb_target_mask); + gfx11_opt_set_context_reg(R_028424_CB_DCC_CONTROL, SI_TRACKED_CB_DCC_CONTROL, + cb_dcc_control); + gfx11_opt_set_context_reg(R_028754_SX_PS_DOWNCONVERT, SI_TRACKED_SX_PS_DOWNCONVERT, + sx_ps_downconvert); + gfx11_opt_set_context_reg(R_028758_SX_BLEND_OPT_EPSILON, SI_TRACKED_SX_BLEND_OPT_EPSILON, + sx_blend_opt_epsilon); + gfx11_opt_set_context_reg(R_02875C_SX_BLEND_OPT_CONTROL, SI_TRACKED_SX_BLEND_OPT_CONTROL, + sx_blend_opt_control); + gfx11_end_packed_context_regs(); + radeon_end(); /* don't track context rolls on GFX11 */ + } else { + radeon_begin(cs); + radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK, SI_TRACKED_CB_TARGET_MASK, + cb_target_mask); + if (sctx->gfx_level >= GFX8) { + radeon_opt_set_context_reg(sctx, R_028424_CB_DCC_CONTROL, SI_TRACKED_CB_DCC_CONTROL, + cb_dcc_control); + } + if (sctx->screen->info.rbplus_allowed) { + radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT, SI_TRACKED_SX_PS_DOWNCONVERT, + sx_ps_downconvert, sx_blend_opt_epsilon, sx_blend_opt_control); + } + radeon_end_update_context_roll(sctx); } - radeon_end_update_context_roll(sctx); } /* @@ -883,12 +900,27 @@ static void si_emit_clip_regs(struct si_context *sctx, unsigned index) S_02881C_BYPASS_PRIM_RATE_COMBINER(sctx->gfx_level >= GFX10_3) | clipdist_mask | (culldist_mask << 8); - radeon_begin(&sctx->gfx_cs); - radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL, - pa_cl_cntl | vs->pa_cl_vs_out_cntl); - radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL, - rs->pa_cl_clip_cntl | ucp_mask | S_028810_CLIP_DISABLE(window_space)); - radeon_end_update_context_roll(sctx); + unsigned pa_cl_clip_cntl = rs->pa_cl_clip_cntl | ucp_mask | + S_028810_CLIP_DISABLE(window_space); + unsigned pa_cl_vs_out_cntl = pa_cl_cntl | vs->pa_cl_vs_out_cntl; + + if (sctx->screen->info.has_set_context_pairs_packed) { + radeon_begin(&sctx->gfx_cs); + gfx11_begin_packed_context_regs(); + gfx11_opt_set_context_reg(R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL, + pa_cl_clip_cntl); + gfx11_opt_set_context_reg(R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL, + pa_cl_vs_out_cntl); + gfx11_end_packed_context_regs(); + radeon_end(); /* don't track context rolls on GFX11 */ + } else { + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL, + pa_cl_clip_cntl); + radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL, + pa_cl_vs_out_cntl); + radeon_end_update_context_roll(sctx); + } } /* @@ -1660,22 +1692,39 @@ static void si_emit_db_render_state(struct si_context *sctx, unsigned index) S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4) | S_028010_CENTROID_COMPUTATION_MODE(sctx->gfx_level >= GFX10_3 ? 1 : 0); - radeon_begin(&sctx->gfx_cs); - radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL, SI_TRACKED_DB_RENDER_CONTROL, - db_render_control, db_count_control); - radeon_opt_set_context_reg(sctx, R_028010_DB_RENDER_OVERRIDE2, - SI_TRACKED_DB_RENDER_OVERRIDE2, db_render_override2); - radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL, SI_TRACKED_DB_SHADER_CONTROL, - db_shader_control); + if (sctx->screen->info.has_set_context_pairs_packed) { + radeon_begin(&sctx->gfx_cs); + gfx11_begin_packed_context_regs(); + gfx11_opt_set_context_reg(R_028000_DB_RENDER_CONTROL, SI_TRACKED_DB_RENDER_CONTROL, + db_render_control); + gfx11_opt_set_context_reg(R_028004_DB_COUNT_CONTROL, SI_TRACKED_DB_COUNT_CONTROL, + db_count_control); + gfx11_opt_set_context_reg(R_028010_DB_RENDER_OVERRIDE2, SI_TRACKED_DB_RENDER_OVERRIDE2, + db_render_override2); + gfx11_opt_set_context_reg(R_02880C_DB_SHADER_CONTROL, SI_TRACKED_DB_SHADER_CONTROL, + db_shader_control); + gfx11_opt_set_context_reg(R_0283D0_PA_SC_VRS_OVERRIDE_CNTL, + SI_TRACKED_DB_PA_SC_VRS_OVERRIDE_CNTL, vrs_override_cntl); + gfx11_end_packed_context_regs(); + radeon_end(); /* don't track context rolls on GFX11 */ + } else { + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL, SI_TRACKED_DB_RENDER_CONTROL, + db_render_control, db_count_control); + radeon_opt_set_context_reg(sctx, R_028010_DB_RENDER_OVERRIDE2, + SI_TRACKED_DB_RENDER_OVERRIDE2, db_render_override2); + radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL, SI_TRACKED_DB_SHADER_CONTROL, + db_shader_control); - if (sctx->gfx_level >= GFX11) { - radeon_opt_set_context_reg(sctx, R_0283D0_PA_SC_VRS_OVERRIDE_CNTL, - SI_TRACKED_DB_PA_SC_VRS_OVERRIDE_CNTL, vrs_override_cntl); - } else if (sctx->gfx_level >= GFX10_3) { - radeon_opt_set_context_reg(sctx, R_028064_DB_VRS_OVERRIDE_CNTL, - SI_TRACKED_DB_PA_SC_VRS_OVERRIDE_CNTL, vrs_override_cntl); + if (sctx->gfx_level >= GFX11) { + radeon_opt_set_context_reg(sctx, R_0283D0_PA_SC_VRS_OVERRIDE_CNTL, + SI_TRACKED_DB_PA_SC_VRS_OVERRIDE_CNTL, vrs_override_cntl); + } else if (sctx->gfx_level >= GFX10_3) { + radeon_opt_set_context_reg(sctx, R_028064_DB_VRS_OVERRIDE_CNTL, + SI_TRACKED_DB_PA_SC_VRS_OVERRIDE_CNTL, vrs_override_cntl); + } + radeon_end_update_context_roll(sctx); } - radeon_end_update_context_roll(sctx); } /* @@ -3583,6 +3632,189 @@ static void si_emit_framebuffer_state(struct si_context *sctx, unsigned index) sctx->framebuffer.dirty_zsbuf = false; } +static void gfx11_dgpu_emit_framebuffer_state(struct si_context *sctx, unsigned index) +{ + struct radeon_cmdbuf *cs = &sctx->gfx_cs; + struct pipe_framebuffer_state *state = &sctx->framebuffer.state; + unsigned i, nr_cbufs = state->nr_cbufs; + struct si_texture *tex = NULL; + struct si_surface *cb = NULL; + bool is_msaa_resolve = state->nr_cbufs == 2 && + state->cbufs[0] && state->cbufs[0]->texture->nr_samples > 1 && + state->cbufs[1] && state->cbufs[1]->texture->nr_samples <= 1; + + /* CB can't do MSAA resolve on gfx11. */ + assert(!is_msaa_resolve); + + radeon_begin(cs); + gfx11_begin_packed_context_regs(); + + /* Colorbuffers. */ + for (i = 0; i < nr_cbufs; i++) { + if (!(sctx->framebuffer.dirty_cbufs & (1 << i))) + continue; + + /* RB+ depth-only rendering. See the comment where we set rbplus_depth_only_opt for more + * information. + */ + if (i == 0 && + sctx->screen->info.rbplus_allowed && + !sctx->queued.named.blend->cb_target_mask) { + gfx11_set_context_reg(R_028C70_CB_COLOR0_INFO + i * 0x3C, + S_028C70_FORMAT_GFX11(V_028C70_COLOR_32) | + S_028C70_NUMBER_TYPE(V_028C70_NUMBER_FLOAT)); + continue; + } + + cb = (struct si_surface *)state->cbufs[i]; + if (!cb) { + gfx11_set_context_reg(R_028C70_CB_COLOR0_INFO + i * 0x3C, + S_028C70_FORMAT_GFX11(V_028C70_COLOR_INVALID)); + continue; + } + + tex = (struct si_texture *)cb->base.texture; + radeon_add_to_buffer_list( + sctx, &sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE | RADEON_USAGE_NEEDS_IMPLICIT_SYNC | + (tex->buffer.b.b.nr_samples > 1 ? RADEON_PRIO_COLOR_BUFFER_MSAA : RADEON_PRIO_COLOR_BUFFER)); + + if (tex->cmask_buffer && tex->cmask_buffer != &tex->buffer) { + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, tex->cmask_buffer, + RADEON_USAGE_READWRITE | RADEON_USAGE_NEEDS_IMPLICIT_SYNC | + RADEON_PRIO_SEPARATE_META); + } + + /* Compute mutable surface parameters. */ + uint64_t cb_color_base = tex->buffer.gpu_address >> 8; + uint64_t cb_dcc_base = 0; + unsigned cb_color_info = cb->cb_color_info | tex->cb_color_info; + + /* Set up DCC. */ + if (vi_dcc_enabled(tex, cb->base.u.tex.level)) { + cb_dcc_base = (tex->buffer.gpu_address + tex->surface.meta_offset) >> 8; + + unsigned dcc_tile_swizzle = tex->surface.tile_swizzle; + dcc_tile_swizzle &= ((1 << tex->surface.meta_alignment_log2) - 1) >> 8; + cb_dcc_base |= dcc_tile_swizzle; + } + + unsigned cb_color_attrib3, cb_fdcc_control; + + /* Set mutable surface parameters. */ + cb_color_base += tex->surface.u.gfx9.surf_offset >> 8; + cb_color_base |= tex->surface.tile_swizzle; + + cb_color_attrib3 = cb->cb_color_attrib3 | + S_028EE0_COLOR_SW_MODE(tex->surface.u.gfx9.swizzle_mode) | + S_028EE0_DCC_PIPE_ALIGNED(tex->surface.u.gfx9.color.dcc.pipe_aligned); + cb_fdcc_control = cb->cb_dcc_control | + S_028C78_DISABLE_CONSTANT_ENCODE_REG(1) | + S_028C78_FDCC_ENABLE(vi_dcc_enabled(tex, cb->base.u.tex.level)); + + if (sctx->family >= CHIP_GFX1103_R2) { + cb_fdcc_control |= S_028C78_ENABLE_MAX_COMP_FRAG_OVERRIDE(1) | + S_028C78_MAX_COMP_FRAGS(cb->base.texture->nr_samples >= 4); + } + + gfx11_set_context_reg(R_028C60_CB_COLOR0_BASE + i * 0x3C, cb_color_base); + gfx11_set_context_reg(R_028C6C_CB_COLOR0_VIEW + i * 0x3C, cb->cb_color_view); + gfx11_set_context_reg(R_028C70_CB_COLOR0_INFO + i * 0x3C, cb_color_info); + gfx11_set_context_reg(R_028C74_CB_COLOR0_ATTRIB + i * 0x3C, cb->cb_color_attrib); + gfx11_set_context_reg(R_028C78_CB_COLOR0_DCC_CONTROL + i * 0x3C, cb_fdcc_control); + gfx11_set_context_reg(R_028C94_CB_COLOR0_DCC_BASE + i * 0x3C, cb_dcc_base); + gfx11_set_context_reg(R_028E40_CB_COLOR0_BASE_EXT + i * 4, cb_color_base >> 32); + gfx11_set_context_reg(R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, cb_dcc_base >> 32); + gfx11_set_context_reg(R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, cb->cb_color_attrib2); + gfx11_set_context_reg(R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, cb_color_attrib3); + } + for (; i < 8; i++) + if (sctx->framebuffer.dirty_cbufs & (1 << i)) + gfx11_set_context_reg(R_028C70_CB_COLOR0_INFO + i * 0x3C, 0); + + /* ZS buffer. */ + if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) { + struct si_surface *zb = (struct si_surface *)state->zsbuf; + struct si_texture *tex = (struct si_texture *)zb->base.texture; + unsigned db_z_info = zb->db_z_info; + unsigned db_stencil_info = zb->db_stencil_info; + unsigned db_htile_surface = zb->db_htile_surface; + + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE | + (zb->base.texture->nr_samples > 1 ? RADEON_PRIO_DEPTH_BUFFER_MSAA + : RADEON_PRIO_DEPTH_BUFFER)); + bool tc_compat_htile = vi_tc_compat_htile_enabled(tex, zb->base.u.tex.level, PIPE_MASK_ZS); + + /* Set fields dependent on tc_compatile_htile. */ + if (tc_compat_htile) { + unsigned max_zplanes = 4; + + if (tex->db_render_format == PIPE_FORMAT_Z16_UNORM && tex->buffer.b.b.nr_samples > 1) + max_zplanes = 2; + + bool iterate256 = tex->buffer.b.b.nr_samples >= 2; + db_z_info |= S_028040_ITERATE_FLUSH(1) | + S_028040_ITERATE_256(iterate256); + db_stencil_info |= S_028044_ITERATE_FLUSH(!tex->htile_stencil_disabled) | + S_028044_ITERATE_256(iterate256); + + /* Workaround for a DB hang when ITERATE_256 is set to 1. Only affects 4X MSAA D/S images. */ + if (sctx->screen->info.has_two_planes_iterate256_bug && iterate256 && + !tex->htile_stencil_disabled && tex->buffer.b.b.nr_samples == 4) + max_zplanes = 1; + + db_z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes + 1); + } + + unsigned level = zb->base.u.tex.level; + + gfx11_set_context_reg(R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); + gfx11_set_context_reg(R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size); + gfx11_set_context_reg(R_028040_DB_Z_INFO, db_z_info | + S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); + gfx11_set_context_reg(R_028044_DB_STENCIL_INFO, db_stencil_info); + gfx11_set_context_reg(R_028048_DB_Z_READ_BASE, zb->db_depth_base); + gfx11_set_context_reg(R_02804C_DB_STENCIL_READ_BASE, zb->db_stencil_base); + gfx11_set_context_reg(R_028050_DB_Z_WRITE_BASE, zb->db_depth_base); + gfx11_set_context_reg(R_028054_DB_STENCIL_WRITE_BASE, zb->db_stencil_base); + gfx11_set_context_reg(R_028068_DB_Z_READ_BASE_HI, zb->db_depth_base >> 32); + gfx11_set_context_reg(R_02806C_DB_STENCIL_READ_BASE_HI, zb->db_stencil_base >> 32); + gfx11_set_context_reg(R_028070_DB_Z_WRITE_BASE_HI, zb->db_depth_base >> 32); + gfx11_set_context_reg(R_028074_DB_STENCIL_WRITE_BASE_HI, zb->db_stencil_base >> 32); + gfx11_set_context_reg(R_028078_DB_HTILE_DATA_BASE_HI, zb->db_htile_data_base >> 32); + gfx11_set_context_reg(R_028028_DB_STENCIL_CLEAR, tex->stencil_clear_value[level]); + gfx11_set_context_reg(R_02802C_DB_DEPTH_CLEAR, fui(tex->depth_clear_value[level])); + gfx11_set_context_reg(R_028008_DB_DEPTH_VIEW, zb->db_depth_view); + gfx11_set_context_reg(R_028ABC_DB_HTILE_SURFACE, db_htile_surface); + } else if (sctx->framebuffer.dirty_zsbuf) { + /* Gfx11+: DB_Z_INFO.NUM_SAMPLES should match the framebuffer samples if no Z/S is bound. + * It determines the sample count for VRS, primitive-ordered pixel shading, and occlusion + * queries. + */ + gfx11_set_context_reg(R_028040_DB_Z_INFO, + S_028040_FORMAT(V_028040_Z_INVALID) | + S_028040_NUM_SAMPLES(sctx->framebuffer.log_samples)); + gfx11_set_context_reg(R_028044_DB_STENCIL_INFO, S_028044_FORMAT(V_028044_STENCIL_INVALID)); + } + + /* Framebuffer dimensions. */ + /* PA_SC_WINDOW_SCISSOR_TL is set to 0,0 in gfx*_init_gfx_preamble_state */ + gfx11_set_context_reg(R_028208_PA_SC_WINDOW_SCISSOR_BR, + S_028208_BR_X(state->width) | S_028208_BR_Y(state->height)); + gfx11_end_packed_context_regs(); + + if (sctx->screen->dpbb_allowed && + sctx->screen->pbb_context_states_per_bin > 1) { + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + } + radeon_end(); + + si_update_display_dcc_dirty(sctx); + + sctx->framebuffer.dirty_cbufs = 0; + sctx->framebuffer.dirty_zsbuf = false; +} + static bool si_out_of_order_rasterization(struct si_context *sctx) { struct si_state_blend *blend = sctx->queued.named.blend; @@ -3753,17 +3985,27 @@ static void si_emit_msaa_config(struct si_context *sctx, unsigned index) } } - radeon_begin(cs); - - /* R_028BDC_PA_SC_LINE_CNTL, R_028BE0_PA_SC_AA_CONFIG */ - radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL, SI_TRACKED_PA_SC_LINE_CNTL, - sc_line_cntl, sc_aa_config); - /* R_028804_DB_EQAA */ - radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA, db_eqaa); - /* R_028A4C_PA_SC_MODE_CNTL_1 */ - radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1, SI_TRACKED_PA_SC_MODE_CNTL_1, - sc_mode_cntl_1); - radeon_end_update_context_roll(sctx); + if (sctx->screen->info.has_set_context_pairs_packed) { + radeon_begin(cs); + gfx11_begin_packed_context_regs(); + gfx11_opt_set_context_reg(R_028BDC_PA_SC_LINE_CNTL, SI_TRACKED_PA_SC_LINE_CNTL, + sc_line_cntl); + gfx11_opt_set_context_reg(R_028BE0_PA_SC_AA_CONFIG, SI_TRACKED_PA_SC_AA_CONFIG, + sc_aa_config); + gfx11_opt_set_context_reg(R_028804_DB_EQAA, SI_TRACKED_DB_EQAA, db_eqaa); + gfx11_opt_set_context_reg(R_028A4C_PA_SC_MODE_CNTL_1, SI_TRACKED_PA_SC_MODE_CNTL_1, + sc_mode_cntl_1); + gfx11_end_packed_context_regs(); + radeon_end(); /* don't track context rolls on GFX11 */ + } else { + radeon_begin(cs); + radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL, SI_TRACKED_PA_SC_LINE_CNTL, + sc_line_cntl, sc_aa_config); + radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA, db_eqaa); + radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1, SI_TRACKED_PA_SC_MODE_CNTL_1, + sc_mode_cntl_1); + radeon_end_update_context_roll(sctx); + } } void si_update_ps_iter_samples(struct si_context *sctx) @@ -5447,7 +5689,11 @@ void si_init_state_functions(struct si_context *sctx) sctx->atoms.s.pm4_states[SI_STATE_IDX(vs)].emit = si_pm4_emit_shader; sctx->atoms.s.pm4_states[SI_STATE_IDX(ps)].emit = si_pm4_emit_shader; - sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state; + if (sctx->screen->info.has_set_context_pairs_packed) + sctx->atoms.s.framebuffer.emit = gfx11_dgpu_emit_framebuffer_state; + else + sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state; + sctx->atoms.s.db_render_state.emit = si_emit_db_render_state; sctx->atoms.s.dpbb_state.emit = si_emit_dpbb_state; sctx->atoms.s.msaa_config.emit = si_emit_msaa_config; diff --git a/src/gallium/drivers/radeonsi/si_state_msaa.c b/src/gallium/drivers/radeonsi/si_state_msaa.c index b69e4dbcbf3..4d2cc64998e 100644 --- a/src/gallium/drivers/radeonsi/si_state_msaa.c +++ b/src/gallium/drivers/radeonsi/si_state_msaa.c @@ -145,15 +145,28 @@ static void si_get_sample_position(struct pipe_context *ctx, unsigned sample_cou static void si_emit_max_4_sample_locs(struct si_context *sctx, uint64_t centroid_priority, uint32_t sample_locs) { - radeon_begin(&sctx->gfx_cs); - radeon_set_context_reg_seq(R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); - radeon_emit(centroid_priority); - radeon_emit(centroid_priority >> 32); - radeon_set_context_reg(R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs); - radeon_set_context_reg(R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs); - radeon_set_context_reg(R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs); - radeon_set_context_reg(R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs); - radeon_end(); + if (sctx->screen->info.has_set_context_pairs_packed) { + radeon_begin(&sctx->gfx_cs); + gfx11_begin_packed_context_regs(); + gfx11_set_context_reg(R_028BD4_PA_SC_CENTROID_PRIORITY_0, centroid_priority); + gfx11_set_context_reg(R_028BD8_PA_SC_CENTROID_PRIORITY_1, centroid_priority >> 32); + gfx11_set_context_reg(R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs); + gfx11_set_context_reg(R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs); + gfx11_set_context_reg(R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs); + gfx11_set_context_reg(R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs); + gfx11_end_packed_context_regs(); + radeon_end(); + } else { + radeon_begin(&sctx->gfx_cs); + radeon_set_context_reg_seq(R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); + radeon_emit(centroid_priority); + radeon_emit(centroid_priority >> 32); + radeon_set_context_reg(R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs); + radeon_set_context_reg(R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs); + radeon_set_context_reg(R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs); + radeon_set_context_reg(R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs); + radeon_end(); + } } static void si_emit_max_16_sample_locs(struct si_context *sctx, uint64_t centroid_priority, diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 9137c7a2503..5c14d49e335 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -1208,10 +1208,61 @@ static void gfx10_emit_shader_ngg(struct si_context *sctx, unsigned index) /* These don't cause a context roll. */ radeon_begin_again(&sctx->gfx_cs); + if (sctx->screen->info.uses_kernel_cu_mask) { + radeon_opt_set_sh_reg_idx(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + 3, shader->ngg.spi_shader_pgm_rsrc3_gs); + radeon_opt_set_sh_reg_idx(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + 3, shader->ngg.spi_shader_pgm_rsrc4_gs); + } else { + radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + shader->ngg.spi_shader_pgm_rsrc3_gs); + radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + shader->ngg.spi_shader_pgm_rsrc4_gs); + } radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC, shader->ngg.ge_pc_alloc); + radeon_end(); +} + +template <enum si_has_tess HAS_TESS> +static void gfx11_dgpu_emit_shader_ngg(struct si_context *sctx, unsigned index) +{ + struct si_shader *shader = sctx->queued.named.gs; + + SET_FIELD(sctx->current_gs_state, GS_STATE_ESGS_VERTEX_STRIDE, + shader->ngg.esgs_vertex_stride); + + radeon_begin(&sctx->gfx_cs); + gfx11_begin_packed_context_regs(); + if (HAS_TESS) { + gfx11_opt_set_context_reg(R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM, + shader->vgt_tf_param); + } + gfx11_opt_set_context_reg(R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP, + SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP, + shader->ngg.ge_max_output_per_subgroup); + gfx11_opt_set_context_reg(R_028B4C_GE_NGG_SUBGRP_CNTL, SI_TRACKED_GE_NGG_SUBGRP_CNTL, + shader->ngg.ge_ngg_subgrp_cntl); + gfx11_opt_set_context_reg(R_028A84_VGT_PRIMITIVEID_EN, SI_TRACKED_VGT_PRIMITIVEID_EN, + shader->ngg.vgt_primitiveid_en); + gfx11_opt_set_context_reg(R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT, + shader->ngg.vgt_gs_max_vert_out); + gfx11_opt_set_context_reg(R_028B90_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT, + shader->ngg.vgt_gs_instance_cnt); + gfx11_opt_set_context_reg(R_0286C4_SPI_VS_OUT_CONFIG, SI_TRACKED_SPI_VS_OUT_CONFIG, + shader->ngg.spi_vs_out_config); + gfx11_opt_set_context_reg(R_02870C_SPI_SHADER_POS_FORMAT, SI_TRACKED_SPI_SHADER_POS_FORMAT, + shader->ngg.spi_shader_pos_format); + gfx11_opt_set_context_reg(R_028818_PA_CL_VTE_CNTL, SI_TRACKED_PA_CL_VTE_CNTL, + shader->ngg.pa_cl_vte_cntl); + gfx11_end_packed_context_regs(); + + assert(!sctx->screen->info.uses_kernel_cu_mask); if (sctx->screen->info.has_set_sh_pairs_packed) { - assert(!sctx->screen->info.uses_kernel_cu_mask); gfx11_opt_push_gfx_sh_reg(R_00B21C_SPI_SHADER_PGM_RSRC3_GS, SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, shader->gs.spi_shader_pgm_rsrc3_gs); @@ -1235,6 +1286,9 @@ static void gfx10_emit_shader_ngg(struct si_context *sctx, unsigned index) shader->ngg.spi_shader_pgm_rsrc4_gs); } } + + radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC, + shader->ngg.ge_pc_alloc); radeon_end(); } @@ -1309,10 +1363,17 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader if (!pm4) return; - if (es_stage == MESA_SHADER_TESS_EVAL) - pm4->atom.emit = gfx10_emit_shader_ngg<TESS_ON>; - else - pm4->atom.emit = gfx10_emit_shader_ngg<TESS_OFF>; + if (sscreen->info.has_set_context_pairs_packed) { + if (es_stage == MESA_SHADER_TESS_EVAL) + pm4->atom.emit = gfx11_dgpu_emit_shader_ngg<TESS_ON>; + else + pm4->atom.emit = gfx11_dgpu_emit_shader_ngg<TESS_OFF>; + } else { + if (es_stage == MESA_SHADER_TESS_EVAL) + pm4->atom.emit = gfx10_emit_shader_ngg<TESS_ON>; + else + pm4->atom.emit = gfx10_emit_shader_ngg<TESS_OFF>; + } va = shader->bo->gpu_address; @@ -1735,7 +1796,7 @@ static unsigned si_get_spi_shader_col_format(struct si_shader *shader) return value; } -static void si_emit_shader_ps(struct si_context *sctx, unsigned index) +static void gfx6_emit_shader_ps(struct si_context *sctx, unsigned index) { struct si_shader *shader = sctx->queued.named.ps; @@ -1755,6 +1816,30 @@ static void si_emit_shader_ps(struct si_context *sctx, unsigned index) radeon_end_update_context_roll(sctx); } +static void gfx11_dgpu_emit_shader_ps(struct si_context *sctx, unsigned index) +{ + struct si_shader *shader = sctx->queued.named.ps; + + radeon_begin(&sctx->gfx_cs); + gfx11_begin_packed_context_regs(); + gfx11_opt_set_context_reg(R_0286CC_SPI_PS_INPUT_ENA, SI_TRACKED_SPI_PS_INPUT_ENA, + shader->ps.spi_ps_input_ena); + gfx11_opt_set_context_reg(R_0286D0_SPI_PS_INPUT_ADDR, SI_TRACKED_SPI_PS_INPUT_ADDR, + shader->ps.spi_ps_input_addr); + gfx11_opt_set_context_reg(R_0286E0_SPI_BARYC_CNTL, SI_TRACKED_SPI_BARYC_CNTL, + shader->ps.spi_baryc_cntl); + gfx11_opt_set_context_reg(R_0286D8_SPI_PS_IN_CONTROL, SI_TRACKED_SPI_PS_IN_CONTROL, + shader->ps.spi_ps_in_control); + gfx11_opt_set_context_reg(R_028710_SPI_SHADER_Z_FORMAT, SI_TRACKED_SPI_SHADER_Z_FORMAT, + shader->ps.spi_shader_z_format); + gfx11_opt_set_context_reg(R_028714_SPI_SHADER_COL_FORMAT, SI_TRACKED_SPI_SHADER_COL_FORMAT, + shader->ps.spi_shader_col_format); + gfx11_opt_set_context_reg(R_02823C_CB_SHADER_MASK, SI_TRACKED_CB_SHADER_MASK, + shader->ps.cb_shader_mask); + gfx11_end_packed_context_regs(); + radeon_end(); /* don't track context rolls on GFX11 */ +} + static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) { struct si_shader_info *info = &shader->selector->info; @@ -1923,10 +2008,15 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) S_0286D8_PARAM_GEN(param_gen) | S_0286D8_PS_W32_EN(shader->wave_size == 32); - struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader, si_emit_shader_ps); + struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader, NULL); if (!pm4) return; + if (sscreen->info.has_set_context_pairs_packed) + pm4->atom.emit = gfx11_dgpu_emit_shader_ps; + else + pm4->atom.emit = gfx6_emit_shader_ps; + /* If multiple state sets are allowed to be in a bin, break the batch on a new PS. */ if (sscreen->dpbb_allowed && (sscreen->pbb_context_states_per_bin > 1 || diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c index affd9301588..88b26580e9c 100644 --- a/src/gallium/drivers/radeonsi/si_state_viewport.c +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c @@ -373,15 +373,31 @@ static void si_emit_guardband(struct si_context *sctx, unsigned index) * R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, R_028BEC_PA_CL_GB_VERT_DISC_ADJ * R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ */ - radeon_begin(&sctx->gfx_cs); - radeon_opt_set_context_reg5(sctx, R_028BE4_PA_SU_VTX_CNTL, SI_TRACKED_PA_SU_VTX_CNTL, - pa_su_vtx_cntl, - fui(guardband_y), fui(discard_y), - fui(guardband_x), fui(discard_x)); - radeon_opt_set_context_reg(sctx, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET, - SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET, - pa_su_hardware_screen_offset); - radeon_end_update_context_roll(sctx); + if (sctx->screen->info.has_set_context_pairs_packed) { + radeon_begin(&sctx->gfx_cs); + gfx11_begin_packed_context_regs(); + gfx11_opt_set_context_reg(R_028BE4_PA_SU_VTX_CNTL, SI_TRACKED_PA_SU_VTX_CNTL, + pa_su_vtx_cntl); + gfx11_opt_set_context_reg4(R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, + SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, + fui(guardband_y), fui(discard_y), + fui(guardband_x), fui(discard_x)); + gfx11_opt_set_context_reg(R_028234_PA_SU_HARDWARE_SCREEN_OFFSET, + SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET, + pa_su_hardware_screen_offset); + gfx11_end_packed_context_regs(); + radeon_end(); /* don't track context rolls on GFX11 */ + } else { + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_context_reg5(sctx, R_028BE4_PA_SU_VTX_CNTL, SI_TRACKED_PA_SU_VTX_CNTL, + pa_su_vtx_cntl, + fui(guardband_y), fui(discard_y), + fui(guardband_x), fui(discard_x)); + radeon_opt_set_context_reg(sctx, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET, + SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET, + pa_su_hardware_screen_offset); + radeon_end_update_context_roll(sctx); + } } static void si_emit_scissors(struct si_context *ctx, unsigned index)