For the series: Reviewed-by: Marek Olšák <marek.ol...@amd.com>
Marek On Wed, May 25, 2016 at 2:35 PM, Tom Stellard <thomas.stell...@amd.com> wrote: > We were storing arrays in vectors, which was leading to some really bad > spill code for large arrays. allocas instructions are a better fit for > arrays and LLVM optimizations are more geared toward dealing with > allocas instead of vectors. > > For arrays that have 16 or less 32-bit elements, we will continue to use > vectors, because this will force LLVM to store them in registers and > use indirect registers, which is usually faster for small arrays. > > In the future we should use allocas for all arrays and teach LLVM > how to store allocas in registers. > > This fixes the piglit test: > > spec/glsl-1.50/execution/geometry/max-input-component > --- > src/gallium/drivers/radeon/radeon_llvm.h | 7 +- > .../drivers/radeon/radeon_setup_tgsi_llvm.c | 169 > ++++++++++++++++++--- > 2 files changed, 151 insertions(+), 25 deletions(-) > > diff --git a/src/gallium/drivers/radeon/radeon_llvm.h > b/src/gallium/drivers/radeon/radeon_llvm.h > index 3e11b36..5b524b6 100644 > --- a/src/gallium/drivers/radeon/radeon_llvm.h > +++ b/src/gallium/drivers/radeon/radeon_llvm.h > @@ -50,6 +50,11 @@ struct radeon_llvm_loop { > LLVMBasicBlockRef endloop_block; > }; > > +struct radeon_llvm_array { > + struct tgsi_declaration_range range; > + LLVMValueRef alloca; > +}; > + > struct radeon_llvm_context { > struct lp_build_tgsi_soa_context soa; > > @@ -96,7 +101,7 @@ struct radeon_llvm_context { > unsigned loop_depth; > unsigned loop_depth_max; > > - struct tgsi_declaration_range *arrays; > + struct radeon_llvm_array *arrays; > > LLVMValueRef main_fn; > LLVMTypeRef return_type; > diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c > b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c > index 93bc307..cb35390 100644 > --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c > +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c > @@ -83,11 +83,25 @@ static LLVMValueRef emit_swizzle( > > static struct tgsi_declaration_range > get_array_range(struct lp_build_tgsi_context *bld_base, > - unsigned File, const struct tgsi_ind_register *reg) > + unsigned File, unsigned reg_index, > + const struct tgsi_ind_register *reg) > { > struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); > > - if (File != TGSI_FILE_TEMPORARY || reg->ArrayID == 0 || > + if (!reg) { > + unsigned i; > + unsigned num_arrays = > bld_base->info->array_max[TGSI_FILE_TEMPORARY]; > + for (i = 0; i < num_arrays; i++) { > + const struct tgsi_declaration_range *range = > + &ctx->arrays[i].range; > + > + if (reg_index >= range->First && reg_index <= > range->Last) { > + return ctx->arrays[i].range; > + } > + } > + } > + > + if (File != TGSI_FILE_TEMPORARY || !reg || reg->ArrayID == 0 || > reg->ArrayID > bld_base->info->array_max[TGSI_FILE_TEMPORARY]) { > struct tgsi_declaration_range range; > range.First = 0; > @@ -95,7 +109,32 @@ get_array_range(struct lp_build_tgsi_context *bld_base, > return range; > } > > - return ctx->arrays[reg->ArrayID - 1]; > + return ctx->arrays[reg->ArrayID - 1].range; > +} > + > +static LLVMValueRef get_alloca_for_array( > + struct lp_build_tgsi_context *bld_base, > + unsigned file, > + unsigned index) { > + > + unsigned i; > + unsigned num_arrays; > + struct radeon_llvm_context *ctx = radeon_llvm_context(bld_base); > + > + if (file != TGSI_FILE_TEMPORARY) { > + return NULL; > + } > + > + num_arrays = bld_base->info->array_max[TGSI_FILE_TEMPORARY]; > + for (i = 0; i < num_arrays; i++) { > + const struct tgsi_declaration_range *range = > + &ctx->arrays[i].range; > + > + if (index >= range->First && index <= range->Last) { > + return ctx->arrays[i].alloca; > + } > + } > + return NULL; > } > > static LLVMValueRef > @@ -106,6 +145,9 @@ emit_array_index( > { > struct gallivm_state * gallivm = bld->bld_base.base.gallivm; > > + if (!reg) { > + return lp_build_const_int32(gallivm, offset); > + } > LLVMValueRef addr = LLVMBuildLoad(gallivm->builder, > bld->addr[reg->Index][reg->Swizzle], ""); > return LLVMBuildAdd(gallivm->builder, addr, > lp_build_const_int32(gallivm, offset), ""); > } > @@ -154,7 +196,7 @@ emit_array_fetch( > tmp_reg.Register.Index = i + range.First; > LLVMValueRef temp = radeon_llvm_emit_fetch(bld_base, > &tmp_reg, type, swizzle); > result = LLVMBuildInsertElement(builder, result, temp, > - lp_build_const_int32(gallivm, i), ""); > + lp_build_const_int32(gallivm, i), "array_vector"); > } > return result; > } > @@ -169,13 +211,35 @@ load_value_from_array( > const struct tgsi_ind_register *reg_indirect) > { > struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); > - LLVMBuilderRef builder = bld_base->base.gallivm->builder; > - struct tgsi_declaration_range range = get_array_range(bld_base, file, > reg_indirect); > + struct gallivm_state *gallivm = bld_base->base.gallivm; > + LLVMBuilderRef builder = gallivm->builder; > + struct tgsi_declaration_range range = get_array_range(bld_base, file, > reg_index, reg_indirect); > + LLVMValueRef index = emit_array_index(bld, reg_indirect, reg_index - > range.First); > + LLVMValueRef array = get_alloca_for_array(bld_base, file, reg_index); > + LLVMValueRef ptr, val, indices[2]; > + > + if (!array) { > + /* Handle the case where the array is stored as a vector. */ > + return LLVMBuildExtractElement(builder, > + emit_array_fetch(bld_base, file, type, range, > swizzle), > + index, ""); > + } > > - return LLVMBuildExtractElement(builder, > - emit_array_fetch(bld_base, file, type, range, > swizzle), > - emit_array_index(bld, reg_indirect, reg_index - > range.First), ""); > + index = LLVMBuildMul(builder, index, lp_build_const_int32(gallivm, > TGSI_NUM_CHANNELS), ""); > + index = LLVMBuildAdd(builder, index, lp_build_const_int32(gallivm, > swizzle), ""); > + indices[0] = bld_base->uint_bld.zero; > + indices[1] = index; > + ptr = LLVMBuildGEP(builder, array, indices, 2, ""); > + val = LLVMBuildLoad(builder, ptr, ""); > + if (type == TGSI_TYPE_DOUBLE) { > + LLVMValueRef ptr_hi, val_hi; > + indices[0] = lp_build_const_int32(gallivm, 1); > + ptr_hi = LLVMBuildGEP(builder, ptr, indices, 1, ""); > + val_hi = LLVMBuildLoad(builder, ptr_hi, ""); > + val = radeon_llvm_emit_fetch_double(bld_base, val, val_hi); > > + } > + return val; > } > > static LLVMValueRef store_value_to_array( > @@ -187,13 +251,26 @@ static LLVMValueRef store_value_to_array( > const struct tgsi_ind_register *reg_indirect) > { > struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); > - LLVMBuilderRef builder = bld_base->base.gallivm->builder; > - struct tgsi_declaration_range range = get_array_range(bld_base, file, > reg_indirect); > - > - return LLVMBuildInsertElement(builder, > + struct gallivm_state *gallivm = bld_base->base.gallivm; > + LLVMBuilderRef builder = gallivm->builder; > + struct tgsi_declaration_range range = get_array_range(bld_base, file, > reg_index, reg_indirect); > + LLVMValueRef index = emit_array_index(bld, reg_indirect, reg_index - > range.First); > + LLVMValueRef array = get_alloca_for_array(bld_base, file, reg_index); > + > + if (array) { > + LLVMValueRef indices[2]; > + index = LLVMBuildMul(builder, index, > lp_build_const_int32(gallivm, TGSI_NUM_CHANNELS), ""); > + index = LLVMBuildAdd(builder, index, > lp_build_const_int32(gallivm, chan_index), ""); > + indices[0] = bld_base->uint_bld.zero; > + indices[1] = index; > + LLVMValueRef pointer = LLVMBuildGEP(builder, array, indices, > 2, ""); > + LLVMBuildStore(builder, value, pointer); > + return NULL; > + } else { > + return LLVMBuildInsertElement(builder, > emit_array_fetch(bld_base, file, > TGSI_TYPE_FLOAT, range, chan_index), > - value, emit_array_index(bld, reg_indirect, > reg_index - range.First), ""); > - return NULL; > + value, index, ""); > + } > } > > LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base, > @@ -217,8 +294,9 @@ LLVMValueRef radeon_llvm_emit_fetch(struct > lp_build_tgsi_context *bld_base, > } > > if (reg->Register.Indirect) { > - return load_value_from_array(bld_base, reg->Register.File, > type, > + LLVMValueRef load = load_value_from_array(bld_base, > reg->Register.File, type, > swizzle, reg->Register.Index, ®->Indirect); > + return bitcast(bld_base, type, load); > } > > switch(reg->Register.File) { > @@ -257,6 +335,11 @@ LLVMValueRef radeon_llvm_emit_fetch(struct > lp_build_tgsi_context *bld_base, > LLVMBuildLoad(builder, ptr, > ""), > LLVMBuildLoad(builder, ptr2, > "")); > } > + LLVMValueRef array = get_alloca_for_array(bld_base, > reg->Register.File, reg->Register.Index); > + if (array) { > + return bitcast(bld_base, type, > load_value_from_array(bld_base, reg->Register.File, type, > + swizzle, reg->Register.Index, NULL)); > + } > result = LLVMBuildLoad(builder, ptr, ""); > break; > > @@ -309,6 +392,7 @@ static void emit_declaration( > const struct tgsi_full_declaration *decl) > { > struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); > + LLVMBuilderRef builder = bld_base->base.gallivm->builder; > unsigned first, last, i, idx; > switch(decl->Declaration.File) { > case TGSI_FILE_ADDRESS: > @@ -326,13 +410,36 @@ static void emit_declaration( > } > > case TGSI_FILE_TEMPORARY: > + { > + unsigned decl_size; > + first = decl->Range.First; > + last = decl->Range.Last; > + decl_size = 4 * ((last - first) + 1); > if (decl->Declaration.Array) { > + unsigned id = decl->Array.ArrayID - 1; > if (!ctx->arrays) { > int size = > bld_base->info->array_max[TGSI_FILE_TEMPORARY]; > - ctx->arrays = MALLOC(sizeof(ctx->arrays[0]) * > size); > + ctx->arrays = CALLOC(size, > sizeof(ctx->arrays[0])); > + for (i = 0; i < size; ++i) { > + assert(!ctx->arrays[i].alloca);} > } > > - ctx->arrays[decl->Array.ArrayID - 1] = decl->Range; > + ctx->arrays[id].range = decl->Range; > + > + /* If the array is more than 16 elements (each element > + * is 32-bits), then store it in a vector. Storing > the > + * array in a vector will causes the compiler to store > + * the array in registers and access it using indirect > + * addressing. 16 is number of vector elements that > + * LLVM will store in a register. > + * FIXME: We shouldn't need to do this. LLVM should > be > + * smart enough to promote allocas int registers when > + * profitable. > + */ > + if (decl_size > 16) { > + ctx->arrays[id].alloca = > LLVMBuildAlloca(builder, > + > LLVMArrayType(bld_base->base.vec_type, decl_size),"array"); > + } > } > first = decl->Range.First; > last = decl->Range.Last; > @@ -349,7 +456,7 @@ static void emit_declaration( > } > } > break; > - > + } > case TGSI_FILE_INPUT: > { > unsigned idx; > @@ -459,11 +566,16 @@ void radeon_llvm_emit_store( > > if (reg->Register.Indirect) { > struct tgsi_declaration_range range = > get_array_range(bld_base, > - reg->Register.File, ®->Indirect); > + reg->Register.File, reg->Register.Index, > ®->Indirect); > > unsigned i, size = range.Last - range.First + 1; > - LLVMValueRef array = store_value_to_array(bld_base, > value, reg->Register.File, chan_index, > - > reg->Register.Index, ®->Indirect); > + unsigned file = reg->Register.File; > + unsigned reg_index = reg->Register.Index; > + LLVMValueRef array = store_value_to_array(bld_base, > value, file, chan_index, > + reg_index, > ®->Indirect); > + if (get_alloca_for_array(bld_base, file, reg_index)) { > + continue; > + } > for (i = 0; i < size; ++i) { > switch(reg->Register.File) { > case TGSI_FILE_OUTPUT: > @@ -477,7 +589,7 @@ void radeon_llvm_emit_store( > break; > > default: > - return; > + continue; > } > value = LLVMBuildExtractElement(builder, > array, > lp_build_const_int32(gallivm, i), ""); > @@ -493,14 +605,23 @@ void radeon_llvm_emit_store( > break; > > case TGSI_FILE_TEMPORARY: > + { > + LLVMValueRef array; > if (reg->Register.Index >= ctx->temps_count) > continue; > + array = get_alloca_for_array(bld_base, > reg->Register.File, reg->Register.Index); > + > + if (array) { > + store_value_to_array(bld_base, value, > reg->Register.File, chan_index, reg->Register.Index, > + NULL); > + continue; > + } > temp_ptr = ctx->temps[ TGSI_NUM_CHANNELS * > reg->Register.Index + chan_index]; > if (dtype == TGSI_TYPE_DOUBLE) > temp_ptr2 = ctx->temps[ > TGSI_NUM_CHANNELS * reg->Register.Index + chan_index + 1]; > > break; > - > + } > default: > return; > } > -- > 2.0.4 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev