Re: [Mesa-dev] [PATCH 10/10] radeonsi/gfx9: add workarounds to avoid VGPR indexing completely

2017-07-16 Thread Nicolai Hähnle

Patches 2 - 10:

Reviewed-by: Nicolai Hähnle 


On 08.07.2017 02:42, Marek Olšák wrote:

From: Marek Olšák 

For inputs and outputs, indirect indexing is lowered by the GLSL compiler.
For temporaries, use alloca and disable the "promote-alloca" pass.

In the future, we could switch all codepaths to alloca permanently and
just rely on the "promote-alloca" pass.
---
  src/gallium/drivers/radeonsi/si_pipe.c | 25 +++++++++++++++++++------
  src/gallium/drivers/radeonsi/si_pipe.h |  1 +
  .../drivers/radeonsi/si_shader_tgsi_setup.c|  3 +--
  3 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index afb2bcb..8a4bc41 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -134,22 +134,23 @@ static void si_emit_string_marker(struct pipe_context *ctx,
dd_parse_apitrace_marker(string, len, &sctx->apitrace_call_number);
  }
  
  static LLVMTargetMachineRef

  si_create_llvm_target_machine(struct si_screen *sscreen)
  {
const char *triple = "amdgcn--";
char features[256];
  
  	snprintf(features, sizeof(features),

-"+DumpCode,+vgpr-spilling,-fp32-denormals,+fp64-denormals%s%s",
+"+DumpCode,+vgpr-spilling,-fp32-denormals,+fp64-denormals%s%s%s",
 sscreen->b.chip_class >= GFX9 ? ",+xnack" : ",-xnack",
+sscreen->llvm_has_working_vgpr_indexing ? "" : ",-promote-alloca",
 sscreen->b.debug_flags & DBG_SI_SCHED ? ",+si-scheduler" : "");
  
  	return LLVMCreateTargetMachine(ac_get_llvm_target(triple), triple,

   
r600_get_llvm_processor_name(sscreen->b.family),
   features,
   LLVMCodeGenLevelDefault,
   LLVMRelocDefault,
   LLVMCodeModelDefault);
  }
  
@@ -750,34 +751,41 @@ static int si_get_shader_param(struct pipe_screen* pscreen,

case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
return 32;
case PIPE_SHADER_CAP_PREFERRED_IR:
return PIPE_SHADER_IR_TGSI;
case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
return 3;
  
  	/* Supported boolean features. */

case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
-   case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
case PIPE_SHADER_CAP_INTEGERS:
case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
return 1;
  
  	case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:

-   /* TODO: Indirection of geometry shader input dimension is not
-* handled yet
-*/
-   return shader != PIPE_SHADER_GEOMETRY;
+   /* TODO: Indirect indexing of GS inputs is unimplemented. */
+   return shader != PIPE_SHADER_GEOMETRY &&
+  (sscreen->llvm_has_working_vgpr_indexing ||
+   /* TCS and TES load inputs directly from LDS or
+* offchip memory, so indirect indexing is trivial. */
+   shader == PIPE_SHADER_TESS_CTRL ||
+   shader == PIPE_SHADER_TESS_EVAL);
+
+   case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
+   return sscreen->llvm_has_working_vgpr_indexing ||
+  /* TCS stores outputs directly to memory. */
+  shader == PIPE_SHADER_TESS_CTRL;
  
  	/* Unsupported boolean features. */

case PIPE_SHADER_CAP_SUBROUTINES:
case PIPE_SHADER_CAP_SUPPORTED_IRS:
case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
return 0;
}
return 0;
  }
@@ -999,20 +1007,25 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
 sscreen->b.info.me_fw_version >= 173) ||
(sscreen->b.chip_class == SI &&
 sscreen->b.info.pfp_fw_version >= 121 &&
 sscreen->b.info.me_fw_version >= 87);
  
  	sscreen->has_ds_bpermute = sscreen->b.chip_class >= VI;

sscreen->has_msaa_sample_loc_bug = (sscreen->b.family >= CHIP_POLARIS10 &&
   sscreen->b.family <= CHIP_POLARIS12) ||
   sscreen->b.family == CHIP_VEGA10 ||
   sscreen->b.family == CHIP_RAVEN;
+   /* While it would be nice not to have this flag, we are constrained
+* by the reality that LLVM 5.0 doesn't have working VGPR indexing
+* on GFX9.
+*/
+   s

[Mesa-dev] [PATCH 10/10] radeonsi/gfx9: add workarounds to avoid VGPR indexing completely

2017-07-07 Thread Marek Olšák
From: Marek Olšák 

For inputs and outputs, indirect indexing is lowered by the GLSL compiler.
For temporaries, use alloca and disable the "promote-alloca" pass.

In the future, we could switch all codepaths to alloca permanently and
just rely on the "promote-alloca" pass.
---
 src/gallium/drivers/radeonsi/si_pipe.c | 25 +++++++++++++++++++------
 src/gallium/drivers/radeonsi/si_pipe.h |  1 +
 .../drivers/radeonsi/si_shader_tgsi_setup.c|  3 +--
 3 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index afb2bcb..8a4bc41 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -134,22 +134,23 @@ static void si_emit_string_marker(struct pipe_context *ctx,
dd_parse_apitrace_marker(string, len, &sctx->apitrace_call_number);
 }
 
 static LLVMTargetMachineRef
 si_create_llvm_target_machine(struct si_screen *sscreen)
 {
const char *triple = "amdgcn--";
char features[256];
 
snprintf(features, sizeof(features),
-"+DumpCode,+vgpr-spilling,-fp32-denormals,+fp64-denormals%s%s",
+"+DumpCode,+vgpr-spilling,-fp32-denormals,+fp64-denormals%s%s%s",
 sscreen->b.chip_class >= GFX9 ? ",+xnack" : ",-xnack",
+sscreen->llvm_has_working_vgpr_indexing ? "" : ",-promote-alloca",
 sscreen->b.debug_flags & DBG_SI_SCHED ? ",+si-scheduler" : "");
 
return LLVMCreateTargetMachine(ac_get_llvm_target(triple), triple,
   
r600_get_llvm_processor_name(sscreen->b.family),
   features,
   LLVMCodeGenLevelDefault,
   LLVMRelocDefault,
   LLVMCodeModelDefault);
 }
 
@@ -750,34 +751,41 @@ static int si_get_shader_param(struct pipe_screen* pscreen,
case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
return 32;
case PIPE_SHADER_CAP_PREFERRED_IR:
return PIPE_SHADER_IR_TGSI;
case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
return 3;
 
/* Supported boolean features. */
case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
-   case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
case PIPE_SHADER_CAP_INTEGERS:
case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
return 1;
 
case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
-   /* TODO: Indirection of geometry shader input dimension is not
-* handled yet
-*/
-   return shader != PIPE_SHADER_GEOMETRY;
+   /* TODO: Indirect indexing of GS inputs is unimplemented. */
+   return shader != PIPE_SHADER_GEOMETRY &&
+  (sscreen->llvm_has_working_vgpr_indexing ||
+   /* TCS and TES load inputs directly from LDS or
+* offchip memory, so indirect indexing is trivial. */
+   shader == PIPE_SHADER_TESS_CTRL ||
+   shader == PIPE_SHADER_TESS_EVAL);
+
+   case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
+   return sscreen->llvm_has_working_vgpr_indexing ||
+  /* TCS stores outputs directly to memory. */
+  shader == PIPE_SHADER_TESS_CTRL;
 
/* Unsupported boolean features. */
case PIPE_SHADER_CAP_SUBROUTINES:
case PIPE_SHADER_CAP_SUPPORTED_IRS:
case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
return 0;
}
return 0;
 }
@@ -999,20 +1007,25 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
 sscreen->b.info.me_fw_version >= 173) ||
(sscreen->b.chip_class == SI &&
 sscreen->b.info.pfp_fw_version >= 121 &&
 sscreen->b.info.me_fw_version >= 87);
 
sscreen->has_ds_bpermute = sscreen->b.chip_class >= VI;
sscreen->has_msaa_sample_loc_bug = (sscreen->b.family >= CHIP_POLARIS10 &&
   sscreen->b.family <= CHIP_POLARIS12) ||
   sscreen->b.family == CHIP_VEGA10 ||
   sscreen->b.family == CHIP_RAVEN;
+   /* While it would be nice not to have this flag, we are constrained
+* by the reality that LLVM 5.0 doesn't have working VGPR indexing
+* on GFX9.
+*/
+   sscreen->llvm_has_working_vgpr_indexing = sscreen->b.chip_class <= VI;
 
sscre