Series is Reviewed-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Not really a fan of yet another flag in the command buffer, but not sure what would be the best solution. On Mon, Apr 3, 2017 at 5:44 AM, Dave Airlie <airl...@gmail.com> wrote: > From: Dave Airlie <airl...@redhat.com> > > The current code was broken, and I decided to redesign it instead. > > This puts the sample positions for all samples into the queue > constant descriptor buffer after all the spill/ring descriptors. > > It then uses a single offset register to point how far into the > samples the samples for num_samples are. This saves one user sgpr > and means we only generate the sample position data in the rare > single case where we need it currently. > > This doesn't fix the failing CTS tests without the followup > fix. > > Signed-off-by: Dave Airlie <airl...@redhat.com> > --- > src/amd/common/ac_nir_to_llvm.c | 31 +++++++++++--------- > src/amd/common/ac_nir_to_llvm.h | 4 ++- > src/amd/vulkan/radv_cmd_buffer.c | 61 > ++++++++++++++++++++-------------------- > src/amd/vulkan/radv_device.c | 40 ++++++++++++++++++++++---- > src/amd/vulkan/radv_private.h | 2 ++ > 5 files changed, 87 insertions(+), 51 deletions(-) > > diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c > index 520e4cf..aabcabe 100644 > --- a/src/amd/common/ac_nir_to_llvm.c > +++ b/src/amd/common/ac_nir_to_llvm.c > @@ -109,7 +109,7 @@ struct nir_to_llvm_context { > LLVMValueRef hs_ring_tess_factor; > > LLVMValueRef prim_mask; > - LLVMValueRef sample_positions; > + LLVMValueRef sample_pos_offset; > LLVMValueRef persp_sample, persp_center, persp_centroid; > LLVMValueRef linear_sample, linear_center, linear_centroid; > LLVMValueRef front_face; > @@ -573,6 +573,7 @@ static void create_function(struct nir_to_llvm_context > *ctx) > ctx->stage == MESA_SHADER_VERTEX || > ctx->stage == MESA_SHADER_TESS_CTRL || > ctx->stage == MESA_SHADER_TESS_EVAL || > + ctx->stage == MESA_SHADER_FRAGMENT || > ctx->is_gs_copy_shader) > need_ring_offsets = true; > > @@ -584,7 +585,7 @@ static void create_function(struct nir_to_llvm_context > *ctx) > need_push_constants = false; > > if (need_ring_offsets && !ctx->options->supports_spill) { > - arg_types[arg_idx++] = const_array(ctx->v16i8, 8); /* address > of rings */ > + arg_types[arg_idx++] = const_array(ctx->v16i8, 16); /* > address of rings */ > } > > /* 1 for each descriptor set */ > @@ -679,7 +680,7 @@ static void create_function(struct nir_to_llvm_context > *ctx) > arg_types[arg_idx++] = ctx->i32; // GS instance id > break; > case MESA_SHADER_FRAGMENT: > - arg_types[arg_idx++] = const_array(ctx->f32, 32); /* sample > positions */ > + arg_types[arg_idx++] = ctx->i32; /* sample position offset */ > user_sgpr_count = arg_idx; > arg_types[arg_idx++] = ctx->i32; /* prim mask */ > sgpr_count = arg_idx; > @@ -735,7 +736,7 @@ static void create_function(struct nir_to_llvm_context > *ctx) > > LLVMPointerType(ctx->i8, CONST_ADDR_SPACE), > NULL, 0, > AC_FUNC_ATTR_READNONE); > ctx->ring_offsets = LLVMBuildBitCast(ctx->builder, > ctx->ring_offsets, > - > const_array(ctx->v16i8, 8), ""); > + > const_array(ctx->v16i8, 16), ""); > } else > ctx->ring_offsets = LLVMGetParam(ctx->main_function, > arg_idx++); > } > @@ -844,9 +845,9 @@ static void create_function(struct nir_to_llvm_context > *ctx) > ctx->gs_invocation_id = LLVMGetParam(ctx->main_function, > arg_idx++); > break; > case MESA_SHADER_FRAGMENT: > - set_userdata_location_shader(ctx, AC_UD_PS_SAMPLE_POS, > user_sgpr_idx, 2); > - user_sgpr_idx += 2; > - ctx->sample_positions = LLVMGetParam(ctx->main_function, > arg_idx++); > + set_userdata_location_shader(ctx, AC_UD_PS_SAMPLE_POS_OFFSET, > user_sgpr_idx, 1); > + user_sgpr_idx += 1; > + ctx->sample_pos_offset = LLVMGetParam(ctx->main_function, > arg_idx++); > ctx->prim_mask = LLVMGetParam(ctx->main_function, arg_idx++); > ctx->persp_sample = LLVMGetParam(ctx->main_function, > arg_idx++); > ctx->persp_center = LLVMGetParam(ctx->main_function, > arg_idx++); > @@ -3518,15 +3519,17 @@ static LLVMValueRef lookup_interp_param(struct > nir_to_llvm_context *ctx, > static LLVMValueRef load_sample_position(struct nir_to_llvm_context *ctx, > LLVMValueRef sample_id) > { > - /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */ > - LLVMValueRef offset0 = LLVMBuildMul(ctx->builder, sample_id, > LLVMConstInt(ctx->i32, 8, false), ""); > - LLVMValueRef offset1 = LLVMBuildAdd(ctx->builder, offset0, > LLVMConstInt(ctx->i32, 4, false), ""); > - LLVMValueRef result[2]; > + LLVMValueRef result; > + LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->ring_offsets, > LLVMConstInt(ctx->i32, RING_PS_SAMPLE_POSITIONS, false)); > > - result[0] = ac_build_indexed_load_const(&ctx->ac, > ctx->sample_positions, offset0); > - result[1] = ac_build_indexed_load_const(&ctx->ac, > ctx->sample_positions, offset1); > + ptr = LLVMBuildBitCast(ctx->builder, ptr, > + const_array(ctx->v2f32, 64), ""); > > - return ac_build_gather_values(&ctx->ac, result, 2); > + sample_id = LLVMBuildAdd(ctx->builder, sample_id, > ctx->sample_pos_offset, ""); > + result = ac_build_indexed_load(&ctx->ac, ptr, sample_id, false); > + > + ctx->shader_info->fs.uses_sample_positions = true; > + return result; > } > > static LLVMValueRef load_sample_pos(struct nir_to_llvm_context *ctx) > diff --git a/src/amd/common/ac_nir_to_llvm.h b/src/amd/common/ac_nir_to_llvm.h > index c468d93..3d0b456 100644 > --- a/src/amd/common/ac_nir_to_llvm.h > +++ b/src/amd/common/ac_nir_to_llvm.h > @@ -88,7 +88,7 @@ enum ac_ud_index { > AC_UD_VS_BASE_VERTEX_START_INSTANCE, > AC_UD_VS_LS_TCS_IN_LAYOUT, > AC_UD_VS_MAX_UD, > - AC_UD_PS_SAMPLE_POS = AC_UD_SHADER_START, > + AC_UD_PS_SAMPLE_POS_OFFSET = AC_UD_SHADER_START, > AC_UD_PS_MAX_UD, > AC_UD_CS_GRID_SIZE = AC_UD_SHADER_START, > AC_UD_CS_MAX_UD, > @@ -109,6 +109,7 @@ enum ac_ud_index { > #define RING_GSVS_GS 4 > #define RING_HS_TESS_FACTOR 5 > #define RING_HS_TESS_OFFCHIP 6 > +#define RING_PS_SAMPLE_POSITIONS 7 > > // Match MAX_SETS from radv_descriptor_set.h > #define AC_UD_MAX_SETS MAX_SETS > @@ -165,6 +166,7 @@ struct ac_shader_variant_info { > bool force_persample; > bool prim_id_input; > bool layer_input; > + bool uses_sample_positions; > } fs; > struct { > unsigned block_size[3]; > diff --git a/src/amd/vulkan/radv_cmd_buffer.c > b/src/amd/vulkan/radv_cmd_buffer.c > index c95388e..b0a6854 100644 > --- a/src/amd/vulkan/radv_cmd_buffer.c > +++ b/src/amd/vulkan/radv_cmd_buffer.c > @@ -222,6 +222,7 @@ static void radv_reset_cmd_buffer(struct radv_cmd_buffer > *cmd_buffer) > cmd_buffer->esgs_ring_size_needed = 0; > cmd_buffer->gsvs_ring_size_needed = 0; > cmd_buffer->tess_rings_needed = false; > + cmd_buffer->sample_positions_needed = false; > > if (cmd_buffer->upload.upload_bo) > cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, > @@ -452,37 +453,35 @@ radv_update_multisample_state(struct radv_cmd_buffer > *cmd_buffer, > > radv_cayman_emit_msaa_sample_locs(cmd_buffer->cs, num_samples); > > - uint32_t samples_offset; > - void *samples_ptr; > - void *src; > - radv_cmd_buffer_upload_alloc(cmd_buffer, num_samples * 4 * 2, 256, > &samples_offset, > - &samples_ptr); > - switch (num_samples) { > - case 1: > - src = cmd_buffer->device->sample_locations_1x; > - break; > - case 2: > - src = cmd_buffer->device->sample_locations_2x; > - break; > - case 4: > - src = cmd_buffer->device->sample_locations_4x; > - break; > - case 8: > - src = cmd_buffer->device->sample_locations_8x; > - break; > - case 16: > - src = cmd_buffer->device->sample_locations_16x; > - break; > - default: > - unreachable("unknown number of samples"); > - } > - memcpy(samples_ptr, src, num_samples * 4 * 2); > - > - uint64_t va = > cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo); > - va += samples_offset; > + if > (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.fs.uses_sample_positions) { > + uint32_t offset; > + struct ac_userdata_info *loc = > radv_lookup_user_sgpr(pipeline, MESA_SHADER_FRAGMENT, > AC_UD_PS_SAMPLE_POS_OFFSET); > + uint32_t base_reg = > shader_stage_to_user_data_0(MESA_SHADER_FRAGMENT, > radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline)); > + if (loc->sgpr_idx == -1) > + return; > + assert(loc->num_sgprs == 1); > + assert(!loc->indirect); > + switch (num_samples) { > + default: > + offset = 0; > + break; > + case 2: > + offset = 1; > + break; > + case 4: > + offset = 3; > + break; > + case 8: > + offset = 7; > + break; > + case 16: > + offset = 15; > + break; > + } > > - radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_FRAGMENT, > - AC_UD_PS_SAMPLE_POS, va); > + radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * > 4, offset); > + cmd_buffer->sample_positions_needed = true; > + } > } > > static void > @@ -2197,6 +2196,8 @@ void radv_CmdExecuteCommands( > primary->gsvs_ring_size_needed = > secondary->gsvs_ring_size_needed; > if (secondary->tess_rings_needed) > primary->tess_rings_needed = true; > + if (secondary->sample_positions_needed) > + primary->sample_positions_needed = true; > > if (secondary->ring_offsets_idx != -1) { > if (primary->ring_offsets_idx == -1) > diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c > index 5c48be1..24219e4 100644 > --- a/src/amd/vulkan/radv_device.c > +++ b/src/amd/vulkan/radv_device.c > @@ -1188,6 +1188,7 @@ static void radv_dump_trace(struct radv_device *device, > static void > fill_geom_tess_rings(struct radv_queue *queue, > uint32_t *map, > + bool add_sample_positions, > uint32_t esgs_ring_size, > struct radeon_winsys_bo *esgs_ring_bo, > uint32_t gsvs_ring_size, > @@ -1315,6 +1316,18 @@ fill_geom_tess_rings(struct radv_queue *queue, > S_008F0C_ELEMENT_SIZE(0) | > S_008F0C_INDEX_STRIDE(0) | > S_008F0C_ADD_TID_ENABLE(false); > + desc += 4; > + > + /* add sample positions after all rings */ > + memcpy(desc, queue->device->sample_locations_1x, 8); > + desc += 2; > + memcpy(desc, queue->device->sample_locations_2x, 16); > + desc += 4; > + memcpy(desc, queue->device->sample_locations_4x, 32); > + desc += 8; > + memcpy(desc, queue->device->sample_locations_8x, 64); > + desc += 16; > + memcpy(desc, queue->device->sample_locations_16x, 128); > } > > static unsigned > @@ -1374,6 +1387,7 @@ radv_get_preamble_cs(struct radv_queue *queue, > uint32_t esgs_ring_size, > uint32_t gsvs_ring_size, > bool needs_tess_rings, > + bool needs_sample_positions, > struct radeon_winsys_cs **initial_preamble_cs, > struct radeon_winsys_cs **continue_preamble_cs) > { > @@ -1385,7 +1399,7 @@ radv_get_preamble_cs(struct radv_queue *queue, > struct radeon_winsys_bo *tess_factor_ring_bo = NULL; > struct radeon_winsys_bo *tess_offchip_ring_bo = NULL; > struct radeon_winsys_cs *dest_cs[2] = {0}; > - bool add_tess_rings = false; > + bool add_tess_rings = false, add_sample_positions = false; > unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0; > unsigned max_offchip_buffers; > unsigned hs_offchip_param = 0; > @@ -1393,6 +1407,10 @@ radv_get_preamble_cs(struct radv_queue *queue, > if (needs_tess_rings) > add_tess_rings = true; > } > + if (!queue->has_sample_positions) { > + if (needs_sample_positions) > + add_sample_positions = true; > + } > tess_factor_ring_size = 32768 * > queue->device->physical_device->rad_info.max_se; > hs_offchip_param = radv_get_hs_offchip_param(queue->device, > &max_offchip_buffers); > @@ -1403,7 +1421,7 @@ radv_get_preamble_cs(struct radv_queue *queue, > compute_scratch_size <= queue->compute_scratch_size && > esgs_ring_size <= queue->esgs_ring_size && > gsvs_ring_size <= queue->gsvs_ring_size && > - !add_tess_rings && > + !add_tess_rings && !add_sample_positions && > queue->initial_preamble_cs) { > *initial_preamble_cs = queue->initial_preamble_cs; > *continue_preamble_cs = queue->continue_preamble_cs; > @@ -1485,11 +1503,14 @@ radv_get_preamble_cs(struct radv_queue *queue, > esgs_ring_bo != queue->esgs_ring_bo || > gsvs_ring_bo != queue->gsvs_ring_bo || > tess_factor_ring_bo != queue->tess_factor_ring_bo || > - tess_offchip_ring_bo != queue->tess_offchip_ring_bo) { > + tess_offchip_ring_bo != queue->tess_offchip_ring_bo || > add_sample_positions) { > uint32_t size = 0; > if (gsvs_ring_bo || esgs_ring_bo || > - tess_factor_ring_bo || tess_offchip_ring_bo) > + tess_factor_ring_bo || tess_offchip_ring_bo || > add_sample_positions) { > size = 112; /* 2 dword + 2 padding + 4 dword * 6 */ > + if (add_sample_positions) > + size += 256; /* 32+16+8+4+2+1 samples * 4 * 2 > = 248 bytes. */ > + } > else if (scratch_bo) > size = 8; /* 2 dword */ > > @@ -1541,8 +1562,9 @@ radv_get_preamble_cs(struct radv_queue *queue, > map[1] = rsrc1; > } > > - if (esgs_ring_bo || gsvs_ring_bo || > tess_factor_ring_bo || tess_offchip_ring_bo) > - fill_geom_tess_rings(queue, map, > + if (esgs_ring_bo || gsvs_ring_bo || > tess_factor_ring_bo || tess_offchip_ring_bo || > + add_sample_positions) > + fill_geom_tess_rings(queue, map, > add_sample_positions, > esgs_ring_size, > esgs_ring_bo, > gsvs_ring_size, > gsvs_ring_bo, > tess_factor_ring_size, > tess_factor_ring_bo, > @@ -1685,6 +1707,9 @@ radv_get_preamble_cs(struct radv_queue *queue, > queue->descriptor_bo = descriptor_bo; > } > > + if (add_sample_positions) > + queue->has_sample_positions = true; > + > *initial_preamble_cs = queue->initial_preamble_cs; > *continue_preamble_cs = queue->continue_preamble_cs; > if (!scratch_size && !compute_scratch_size && !esgs_ring_size && > !gsvs_ring_size) > @@ -1730,6 +1755,7 @@ VkResult radv_QueueSubmit( > VkResult result; > bool fence_emitted = false; > bool tess_rings_needed = false; > + bool sample_positions_needed = false; > > /* Do this first so failing to allocate scratch buffers can't result > in > * partially executed submissions. */ > @@ -1744,11 +1770,13 @@ VkResult radv_QueueSubmit( > esgs_ring_size = MAX2(esgs_ring_size, > cmd_buffer->esgs_ring_size_needed); > gsvs_ring_size = MAX2(gsvs_ring_size, > cmd_buffer->gsvs_ring_size_needed); > tess_rings_needed |= cmd_buffer->tess_rings_needed; > + sample_positions_needed |= > cmd_buffer->sample_positions_needed; > } > } > > result = radv_get_preamble_cs(queue, scratch_size, > compute_scratch_size, > esgs_ring_size, gsvs_ring_size, > tess_rings_needed, > + sample_positions_needed, > &initial_preamble_cs, > &continue_preamble_cs); > if (result != VK_SUCCESS) > return result; > diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h > index 3f92d59..a28c825 100644 > --- a/src/amd/vulkan/radv_private.h > +++ b/src/amd/vulkan/radv_private.h > @@ -460,6 +460,7 @@ struct radv_queue { > uint32_t esgs_ring_size; > uint32_t gsvs_ring_size; > bool has_tess_rings; > + bool has_sample_positions; > > struct radeon_winsys_bo *scratch_bo; > struct radeon_winsys_bo *descriptor_bo; > @@ -748,6 +749,7 @@ struct radv_cmd_buffer { > uint32_t esgs_ring_size_needed; > uint32_t gsvs_ring_size_needed; > bool tess_rings_needed; > + bool sample_positions_needed; > > int ring_offsets_idx; /* just used for verification */ > }; > -- > 2.7.4 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev