Re: [Mesa-dev] [PATCH 11/25] radeonsi: first bits for non-monolithic shaders
On 16.02.2016 11:39, Marek Olšák wrote: On Tue, Feb 16, 2016 at 5:01 PM, Nicolai Hähnle wrote: On 15.02.2016 18:59, Marek Olšák wrote: From: Marek Olšák --- src/gallium/drivers/radeonsi/si_pipe.c | 1 + src/gallium/drivers/radeonsi/si_pipe.h | 3 ++ src/gallium/drivers/radeonsi/si_shader.c | 53 src/gallium/drivers/radeonsi/si_shader.h | 2 +- 4 files changed, 45 insertions(+), 14 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index fa60732..448fe88 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -600,6 +600,7 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws) sscreen->b.has_cp_dma = true; sscreen->b.has_streamout = true; + sscreen->use_monolithic_shaders = true; if (debug_get_bool_option("RADEON_DUMP_SHADERS", FALSE)) sscreen->b.debug_flags |= DBG_FS | DBG_VS | DBG_GS | DBG_PS | DBG_CS; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index b5790d6..2a2455c 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -84,6 +84,9 @@ struct si_compute; struct si_screen { struct r600_common_screen b; unsignedgs_table_depth; + + /* Whether shaders are monolithic (1-part) or separate (3-part). */ + booluse_monolithic_shaders; }; struct si_blend_color { diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index b058019..b74ed1e 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -70,6 +70,12 @@ struct si_shader_context unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. 
*/ bool is_gs_copy_shader; + + /* Whether to generate the optimized shader variant compiled as a whole +* (without a prolog and epilog) +*/ + bool is_monolithic; + int param_streamout_config; int param_streamout_write_index; int param_streamout_offset[4]; @@ -3657,8 +3663,10 @@ static void create_function(struct si_shader_context *ctx) struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base; struct gallivm_state *gallivm = bld_base->base.gallivm; struct si_shader *shader = ctx->shader; - LLVMTypeRef params[SI_NUM_PARAMS], v2i32, v3i32; + LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v2i32, v3i32; + LLVMTypeRef returns[16+32*4]; This is a bit of a magic number, I guess something like max parameters plus attributes. Can you replace it by the appropriate defines? There is not a single definition that would express this clearly. The prolog has to return up to 16 input SGPRs and 4-20 input VGPRs. Additionally, the prolog returns other data in VGPRs. That's up to 4+16 VGPRs (16 vertex load addresses) for the VS and 20+8 VGPRs (2 vec4 colors) for the PS. The PS epilog returns one SGPR (but in s10 or so, so we need to allocate 11) and 9*4 VGPRs at most. This can all change in the future, who knows. 16+32*4 is much more than we'll ever need, but it shouldn't overflow at least. Assertions also check that we don't overflow. Hmm, I see. I guess I can live with it, as well as with the casts in patch 14. Nicolai Marek ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 11/25] radeonsi: first bits for non-monolithic shaders
On Tue, Feb 16, 2016 at 5:01 PM, Nicolai Hähnle wrote: > On 15.02.2016 18:59, Marek Olšák wrote: >> >> From: Marek Olšák >> >> --- >> src/gallium/drivers/radeonsi/si_pipe.c | 1 + >> src/gallium/drivers/radeonsi/si_pipe.h | 3 ++ >> src/gallium/drivers/radeonsi/si_shader.c | 53 >> >> src/gallium/drivers/radeonsi/si_shader.h | 2 +- >> 4 files changed, 45 insertions(+), 14 deletions(-) >> >> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c >> b/src/gallium/drivers/radeonsi/si_pipe.c >> index fa60732..448fe88 100644 >> --- a/src/gallium/drivers/radeonsi/si_pipe.c >> +++ b/src/gallium/drivers/radeonsi/si_pipe.c >> @@ -600,6 +600,7 @@ struct pipe_screen *radeonsi_screen_create(struct >> radeon_winsys *ws) >> >> sscreen->b.has_cp_dma = true; >> sscreen->b.has_streamout = true; >> + sscreen->use_monolithic_shaders = true; >> >> if (debug_get_bool_option("RADEON_DUMP_SHADERS", FALSE)) >> sscreen->b.debug_flags |= DBG_FS | DBG_VS | DBG_GS | >> DBG_PS | DBG_CS; >> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h >> b/src/gallium/drivers/radeonsi/si_pipe.h >> index b5790d6..2a2455c 100644 >> --- a/src/gallium/drivers/radeonsi/si_pipe.h >> +++ b/src/gallium/drivers/radeonsi/si_pipe.h >> @@ -84,6 +84,9 @@ struct si_compute; >> struct si_screen { >> struct r600_common_screen b; >> unsignedgs_table_depth; >> + >> + /* Whether shaders are monolithic (1-part) or separate (3-part). >> */ >> + booluse_monolithic_shaders; >> }; >> >> struct si_blend_color { >> diff --git a/src/gallium/drivers/radeonsi/si_shader.c >> b/src/gallium/drivers/radeonsi/si_shader.c >> index b058019..b74ed1e 100644 >> --- a/src/gallium/drivers/radeonsi/si_shader.c >> +++ b/src/gallium/drivers/radeonsi/si_shader.c >> @@ -70,6 +70,12 @@ struct si_shader_context >> >> unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. 
>> */ >> bool is_gs_copy_shader; >> + >> + /* Whether to generate the optimized shader variant compiled as a >> whole >> +* (without a prolog and epilog) >> +*/ >> + bool is_monolithic; >> + >> int param_streamout_config; >> int param_streamout_write_index; >> int param_streamout_offset[4]; >> @@ -3657,8 +3663,10 @@ static void create_function(struct >> si_shader_context *ctx) >> struct lp_build_tgsi_context *bld_base = >> &ctx->radeon_bld.soa.bld_base; >> struct gallivm_state *gallivm = bld_base->base.gallivm; >> struct si_shader *shader = ctx->shader; >> - LLVMTypeRef params[SI_NUM_PARAMS], v2i32, v3i32; >> + LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v2i32, >> v3i32; >> + LLVMTypeRef returns[16+32*4]; > > > This is a bit of a magic number, I guess something like max parameters plus > attributes. Can you replace it by the appropriate defines? There is not a single definition that would express this clearly. The prolog has to return up to 16 input SGPRs and 4-20 input VGPRs. Additionally, the prolog returns other data in VGPRs. That's up to 4+16 VGPRs (16 vertex load addresses) for the VS and 20+8 VGPRs (2 vec4 colors) for the PS. The PS epilog returns one SGPR (but in s10 or so, so we need to allocate 11) and 9*4 VGPRs at most. This can all change in the future, who knows. 16+32*4 is much more than we'll ever need, but it shouldn't overflow at least. Assertions also check that we don't overflow. Marek ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 11/25] radeonsi: first bits for non-monolithic shaders
On 15.02.2016 18:59, Marek Olšák wrote: From: Marek Olšák --- src/gallium/drivers/radeonsi/si_pipe.c | 1 + src/gallium/drivers/radeonsi/si_pipe.h | 3 ++ src/gallium/drivers/radeonsi/si_shader.c | 53 src/gallium/drivers/radeonsi/si_shader.h | 2 +- 4 files changed, 45 insertions(+), 14 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index fa60732..448fe88 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -600,6 +600,7 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws) sscreen->b.has_cp_dma = true; sscreen->b.has_streamout = true; + sscreen->use_monolithic_shaders = true; if (debug_get_bool_option("RADEON_DUMP_SHADERS", FALSE)) sscreen->b.debug_flags |= DBG_FS | DBG_VS | DBG_GS | DBG_PS | DBG_CS; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index b5790d6..2a2455c 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -84,6 +84,9 @@ struct si_compute; struct si_screen { struct r600_common_screen b; unsignedgs_table_depth; + + /* Whether shaders are monolithic (1-part) or separate (3-part). */ + booluse_monolithic_shaders; }; struct si_blend_color { diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index b058019..b74ed1e 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -70,6 +70,12 @@ struct si_shader_context unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. 
*/ bool is_gs_copy_shader; + + /* Whether to generate the optimized shader variant compiled as a whole +* (without a prolog and epilog) +*/ + bool is_monolithic; + int param_streamout_config; int param_streamout_write_index; int param_streamout_offset[4]; @@ -3657,8 +3663,10 @@ static void create_function(struct si_shader_context *ctx) struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base; struct gallivm_state *gallivm = bld_base->base.gallivm; struct si_shader *shader = ctx->shader; - LLVMTypeRef params[SI_NUM_PARAMS], v2i32, v3i32; + LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v2i32, v3i32; + LLVMTypeRef returns[16+32*4]; This is a bit of a magic number, I guess something like max parameters plus attributes. Can you replace it by the appropriate defines? Apart from this, patches 10-11 are Reviewed-by: Nicolai Hähnle unsigned i, last_array_pointer, last_sgpr, num_params; + unsigned num_returns = 0; v2i32 = LLVMVectorType(ctx->i32, 2); v3i32 = LLVMVectorType(ctx->i32, 3); @@ -3785,7 +3793,7 @@ static void create_function(struct si_shader_context *ctx) assert(num_params <= Elements(params)); - si_create_function(ctx, NULL, 0, params, + si_create_function(ctx, returns, num_returns, params, num_params, last_array_pointer, last_sgpr); shader->num_input_sgprs = 0; @@ -4492,9 +4500,11 @@ static void si_init_shader_ctx(struct si_shader_context *ctx, bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32"; } -int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, -struct si_shader *shader, -struct pipe_debug_callback *debug) +static int si_compile_tgsi_shader(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + bool is_monolithic, + struct pipe_debug_callback *debug) { struct si_shader_selector *sel = shader->selector; struct tgsi_token *tokens = sel->tokens; @@ -4524,6 +4534,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, si_init_shader_ctx(&ctx, 
sscreen, shader, tm, poly_stipple ? &stipple_shader_info : &sel->info); + ctx.is_monolithic = is_monolithic; shader->uses_instanceid = sel->info.uses_instanceid; @@ -4604,14 +4615,6 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, goto out; } - si_shader_dump(sscreen, shader, debug, ctx.type); - - r = si_shader_binary_upload(sscreen, shader); - if (r) { - fprintf(stderr, "LLVM failed to upload shader\n"); - goto out; - } - radeon_llvm_dispose(&ctx.radeon_bld); /* Calculate the number of fragment input VGPRs. */ @@ -4675,6 +4678,30 @@ out: return r; } +int si_shader_create(struct si_screen
[Mesa-dev] [PATCH 11/25] radeonsi: first bits for non-monolithic shaders
From: Marek Olšák --- src/gallium/drivers/radeonsi/si_pipe.c | 1 + src/gallium/drivers/radeonsi/si_pipe.h | 3 ++ src/gallium/drivers/radeonsi/si_shader.c | 53 src/gallium/drivers/radeonsi/si_shader.h | 2 +- 4 files changed, 45 insertions(+), 14 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index fa60732..448fe88 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -600,6 +600,7 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws) sscreen->b.has_cp_dma = true; sscreen->b.has_streamout = true; + sscreen->use_monolithic_shaders = true; if (debug_get_bool_option("RADEON_DUMP_SHADERS", FALSE)) sscreen->b.debug_flags |= DBG_FS | DBG_VS | DBG_GS | DBG_PS | DBG_CS; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index b5790d6..2a2455c 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -84,6 +84,9 @@ struct si_compute; struct si_screen { struct r600_common_screen b; unsignedgs_table_depth; + + /* Whether shaders are monolithic (1-part) or separate (3-part). */ + booluse_monolithic_shaders; }; struct si_blend_color { diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index b058019..b74ed1e 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -70,6 +70,12 @@ struct si_shader_context unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. 
*/ bool is_gs_copy_shader; + + /* Whether to generate the optimized shader variant compiled as a whole +* (without a prolog and epilog) +*/ + bool is_monolithic; + int param_streamout_config; int param_streamout_write_index; int param_streamout_offset[4]; @@ -3657,8 +3663,10 @@ static void create_function(struct si_shader_context *ctx) struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base; struct gallivm_state *gallivm = bld_base->base.gallivm; struct si_shader *shader = ctx->shader; - LLVMTypeRef params[SI_NUM_PARAMS], v2i32, v3i32; + LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v2i32, v3i32; + LLVMTypeRef returns[16+32*4]; unsigned i, last_array_pointer, last_sgpr, num_params; + unsigned num_returns = 0; v2i32 = LLVMVectorType(ctx->i32, 2); v3i32 = LLVMVectorType(ctx->i32, 3); @@ -3785,7 +3793,7 @@ static void create_function(struct si_shader_context *ctx) assert(num_params <= Elements(params)); - si_create_function(ctx, NULL, 0, params, + si_create_function(ctx, returns, num_returns, params, num_params, last_array_pointer, last_sgpr); shader->num_input_sgprs = 0; @@ -4492,9 +4500,11 @@ static void si_init_shader_ctx(struct si_shader_context *ctx, bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32"; } -int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, -struct si_shader *shader, -struct pipe_debug_callback *debug) +static int si_compile_tgsi_shader(struct si_screen *sscreen, + LLVMTargetMachineRef tm, + struct si_shader *shader, + bool is_monolithic, + struct pipe_debug_callback *debug) { struct si_shader_selector *sel = shader->selector; struct tgsi_token *tokens = sel->tokens; @@ -4524,6 +4534,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, si_init_shader_ctx(&ctx, sscreen, shader, tm, poly_stipple ? 
&stipple_shader_info : &sel->info); + ctx.is_monolithic = is_monolithic; shader->uses_instanceid = sel->info.uses_instanceid; @@ -4604,14 +4615,6 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, goto out; } - si_shader_dump(sscreen, shader, debug, ctx.type); - - r = si_shader_binary_upload(sscreen, shader); - if (r) { - fprintf(stderr, "LLVM failed to upload shader\n"); - goto out; - } - radeon_llvm_dispose(&ctx.radeon_bld); /* Calculate the number of fragment input VGPRs. */ @@ -4675,6 +4678,30 @@ out: return r; } +int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, +struct si_shader *shader, +struct pipe_debug_callback *debug) +{ + int r; + + /* Compile TGSI. */ + r = si_compile_tgsi_shader(sscreen, tm, shader, +