Re: [Mesa-dev] [PATCH 3/3] radeon/llvm: Use alloca instructions for larger arrays

2016-05-26 Thread Marek Olšák
For the series:

Reviewed-by: Marek Olšák 

Marek

On Wed, May 25, 2016 at 2:35 PM, Tom Stellard  wrote:
> We were storing arrays in vectors, which was leading to some really bad
> spill code for large arrays.  allocas instructions are a better fit for
> arrays and LLVM optimizations are more geared toward dealing with
> allocas instead of vectors.
>
> For arrays that have 16 or less 32-bit elements, we will continue to use
> vectors, because this will force LLVM to store them in registers and
> use indirect registers, which is usually faster for small arrays.
>
> In the future we should use allocas for all arrays and teach LLVM
> how to store allocas in registers.
>
> This fixes the piglit test:
>
> spec/glsl-1.50/execution/geometry/max-input-component
> ---
>  src/gallium/drivers/radeon/radeon_llvm.h   |   7 +-
>  .../drivers/radeon/radeon_setup_tgsi_llvm.c| 169 
> ++---
>  2 files changed, 151 insertions(+), 25 deletions(-)
>
> diff --git a/src/gallium/drivers/radeon/radeon_llvm.h 
> b/src/gallium/drivers/radeon/radeon_llvm.h
> index 3e11b36..5b524b6 100644
> --- a/src/gallium/drivers/radeon/radeon_llvm.h
> +++ b/src/gallium/drivers/radeon/radeon_llvm.h
> @@ -50,6 +50,11 @@ struct radeon_llvm_loop {
> LLVMBasicBlockRef endloop_block;
>  };
>
> +struct radeon_llvm_array {
> +   struct tgsi_declaration_range range;
> +   LLVMValueRef alloca;
> +};
> +
>  struct radeon_llvm_context {
> struct lp_build_tgsi_soa_context soa;
>
> @@ -96,7 +101,7 @@ struct radeon_llvm_context {
> unsigned loop_depth;
> unsigned loop_depth_max;
>
> -   struct tgsi_declaration_range *arrays;
> +   struct radeon_llvm_array *arrays;
>
> LLVMValueRef main_fn;
> LLVMTypeRef return_type;
> diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c 
> b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
> index 93bc307..cb35390 100644
> --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
> +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
> @@ -83,11 +83,25 @@ static LLVMValueRef emit_swizzle(
>
>  static struct tgsi_declaration_range
>  get_array_range(struct lp_build_tgsi_context *bld_base,
> -   unsigned File, const struct tgsi_ind_register *reg)
> +   unsigned File, unsigned reg_index,
> +   const struct tgsi_ind_register *reg)
>  {
> struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
>
> -   if (File != TGSI_FILE_TEMPORARY || reg->ArrayID == 0 ||
> +   if (!reg) {
> +   unsigned i;
> +   unsigned num_arrays = 
> bld_base->info->array_max[TGSI_FILE_TEMPORARY];
> +   for (i = 0; i < num_arrays; i++) {
> +   const struct tgsi_declaration_range *range =
> +   >arrays[i].range;
> +
> +   if (reg_index >= range->First && reg_index <= 
> range->Last) {
> +   return ctx->arrays[i].range;
> +   }
> +   }
> +   }
> +
> +   if (File != TGSI_FILE_TEMPORARY || !reg || reg->ArrayID == 0 ||
> reg->ArrayID > bld_base->info->array_max[TGSI_FILE_TEMPORARY]) {
> struct tgsi_declaration_range range;
> range.First = 0;
> @@ -95,7 +109,32 @@ get_array_range(struct lp_build_tgsi_context *bld_base,
> return range;
> }
>
> -   return ctx->arrays[reg->ArrayID - 1];
> +   return ctx->arrays[reg->ArrayID - 1].range;
> +}
> +
> +static LLVMValueRef get_alloca_for_array(
> +   struct lp_build_tgsi_context *bld_base,
> +   unsigned file,
> +   unsigned index) {
> +
> +   unsigned i;
> +   unsigned num_arrays;
> +   struct radeon_llvm_context *ctx = radeon_llvm_context(bld_base);
> +
> +   if (file != TGSI_FILE_TEMPORARY) {
> +   return NULL;
> +   }
> +
> +   num_arrays = bld_base->info->array_max[TGSI_FILE_TEMPORARY];
> +   for (i = 0; i < num_arrays; i++) {
> +   const struct tgsi_declaration_range *range =
> +   >arrays[i].range;
> +
> +   if (index >= range->First && index <= range->Last) {
> +   return ctx->arrays[i].alloca;
> +   }
> +   }
> +   return NULL;
>  }
>
>  static LLVMValueRef
> @@ -106,6 +145,9 @@ emit_array_index(
>  {
> struct gallivm_state * gallivm = bld->bld_base.base.gallivm;
>
> +   if (!reg) {
> +   return lp_build_const_int32(gallivm, offset);
> +   }
> LLVMValueRef addr = LLVMBuildLoad(gallivm->builder, 
> bld->addr[reg->Index][reg->Swizzle], "");
> return LLVMBuildAdd(gallivm->builder, addr, 
> lp_build_const_int32(gallivm, offset), "");
>  }
> @@ -154,7 +196,7 @@ emit_array_fetch(
> tmp_reg.Register.Index = i + range.First;
> 

[Mesa-dev] [PATCH 3/3] radeon/llvm: Use alloca instructions for larger arrays

2016-05-25 Thread Tom Stellard
We were storing arrays in vectors, which was leading to some really bad
spill code for large arrays.  allocas instructions are a better fit for
arrays and LLVM optimizations are more geared toward dealing with
allocas instead of vectors.

For arrays that have 16 or less 32-bit elements, we will continue to use
vectors, because this will force LLVM to store them in registers and
use indirect registers, which is usually faster for small arrays.

In the future we should use allocas for all arrays and teach LLVM
how to store allocas in registers.

This fixes the piglit test:

spec/glsl-1.50/execution/geometry/max-input-component
---
 src/gallium/drivers/radeon/radeon_llvm.h   |   7 +-
 .../drivers/radeon/radeon_setup_tgsi_llvm.c| 169 ++---
 2 files changed, 151 insertions(+), 25 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_llvm.h 
b/src/gallium/drivers/radeon/radeon_llvm.h
index 3e11b36..5b524b6 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -50,6 +50,11 @@ struct radeon_llvm_loop {
LLVMBasicBlockRef endloop_block;
 };
 
+struct radeon_llvm_array {
+   struct tgsi_declaration_range range;
+   LLVMValueRef alloca;
+};
+
 struct radeon_llvm_context {
struct lp_build_tgsi_soa_context soa;
 
@@ -96,7 +101,7 @@ struct radeon_llvm_context {
unsigned loop_depth;
unsigned loop_depth_max;
 
-   struct tgsi_declaration_range *arrays;
+   struct radeon_llvm_array *arrays;
 
LLVMValueRef main_fn;
LLVMTypeRef return_type;
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c 
b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index 93bc307..cb35390 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -83,11 +83,25 @@ static LLVMValueRef emit_swizzle(
 
 static struct tgsi_declaration_range
 get_array_range(struct lp_build_tgsi_context *bld_base,
-   unsigned File, const struct tgsi_ind_register *reg)
+   unsigned File, unsigned reg_index,
+   const struct tgsi_ind_register *reg)
 {
struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
 
-   if (File != TGSI_FILE_TEMPORARY || reg->ArrayID == 0 ||
+   if (!reg) {
+   unsigned i;
+   unsigned num_arrays = 
bld_base->info->array_max[TGSI_FILE_TEMPORARY];
+   for (i = 0; i < num_arrays; i++) {
+   const struct tgsi_declaration_range *range =
+   >arrays[i].range;
+
+   if (reg_index >= range->First && reg_index <= 
range->Last) {
+   return ctx->arrays[i].range;
+   }
+   }
+   }
+
+   if (File != TGSI_FILE_TEMPORARY || !reg || reg->ArrayID == 0 ||
reg->ArrayID > bld_base->info->array_max[TGSI_FILE_TEMPORARY]) {
struct tgsi_declaration_range range;
range.First = 0;
@@ -95,7 +109,32 @@ get_array_range(struct lp_build_tgsi_context *bld_base,
return range;
}
 
-   return ctx->arrays[reg->ArrayID - 1];
+   return ctx->arrays[reg->ArrayID - 1].range;
+}
+
+static LLVMValueRef get_alloca_for_array(
+   struct lp_build_tgsi_context *bld_base,
+   unsigned file,
+   unsigned index) {
+
+   unsigned i;
+   unsigned num_arrays;
+   struct radeon_llvm_context *ctx = radeon_llvm_context(bld_base);
+
+   if (file != TGSI_FILE_TEMPORARY) {
+   return NULL;
+   }
+
+   num_arrays = bld_base->info->array_max[TGSI_FILE_TEMPORARY];
+   for (i = 0; i < num_arrays; i++) {
+   const struct tgsi_declaration_range *range =
+   >arrays[i].range;
+
+   if (index >= range->First && index <= range->Last) {
+   return ctx->arrays[i].alloca;
+   }
+   }
+   return NULL;
 }
 
 static LLVMValueRef
@@ -106,6 +145,9 @@ emit_array_index(
 {
struct gallivm_state * gallivm = bld->bld_base.base.gallivm;
 
+   if (!reg) {
+   return lp_build_const_int32(gallivm, offset);
+   }
LLVMValueRef addr = LLVMBuildLoad(gallivm->builder, 
bld->addr[reg->Index][reg->Swizzle], "");
return LLVMBuildAdd(gallivm->builder, addr, 
lp_build_const_int32(gallivm, offset), "");
 }
@@ -154,7 +196,7 @@ emit_array_fetch(
tmp_reg.Register.Index = i + range.First;
LLVMValueRef temp = radeon_llvm_emit_fetch(bld_base, _reg, 
type, swizzle);
result = LLVMBuildInsertElement(builder, result, temp,
-   lp_build_const_int32(gallivm, i), "");
+   lp_build_const_int32(gallivm, i), "array_vector");
}
return result;
 }
@@ -169,13 +211,35 @@ load_value_from_array(