gallium/radeon: reduce alloca of temporaries based on usagemask

Nicolai Hähnle Tue, 09 Aug 2016 03:37:45 -0700

From: Nicolai Hähnle <nicolai.haeh...@amd.com>

---
 src/gallium/drivers/radeon/radeon_llvm.h           |  1 +
 .../drivers/radeon/radeon_setup_tgsi_llvm.c        | 47 +++++++++++++++++-----
 2 files changed, 38 insertions(+), 10 deletions(-)


diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
index 13f3336..4c946b5 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -45,20 +45,21 @@ struct radeon_llvm_branch {
        unsigned has_else;
 };
 
 struct radeon_llvm_loop {
        LLVMBasicBlockRef loop_block;
        LLVMBasicBlockRef endloop_block;
 };
 
 struct radeon_llvm_array {
        struct tgsi_declaration_range range;
+       ubyte usagemask;
        LLVMValueRef alloca;
 };
 
 struct radeon_llvm_context {
        struct lp_build_tgsi_soa_context soa;
 
        /*=== Front end configuration ===*/
 
        /* Instructions that are not described by any of the TGSI opcodes. */
 
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index d8ab5b0..73e4ce2 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -191,22 +191,30 @@ get_pointer_into_array(struct radeon_llvm_context *ctx,
        LLVMValueRef index;
 
        if (file != TGSI_FILE_TEMPORARY)
                return NULL;
 
        array = get_temp_array(&ctx->soa.bld_base, reg_index, reg_indirect);
        if (!array || !array->alloca)
                return NULL;
 
        index = emit_array_index(&ctx->soa, reg_indirect, reg_index - 
array->range.First);
-       index = LLVMBuildMul(builder, index, lp_build_const_int32(gallivm, 
TGSI_NUM_CHANNELS), "");
-       index = LLVMBuildAdd(builder, index, lp_build_const_int32(gallivm, 
swizzle), "");
+       index = LLVMBuildMul(
+               builder, index,
+               lp_build_const_int32(gallivm, util_bitcount(array->usagemask)),
+               "");
+       index = LLVMBuildAdd(
+               builder, index,
+               lp_build_const_int32(
+                       gallivm,
+                       util_bitcount(array->usagemask & ((1 << swizzle) - 1))),
+               "");
        idxs[0] = ctx->soa.bld_base.uint_bld.zero;
        idxs[1] = index;
        return LLVMBuildGEP(builder, array->alloca, idxs, 2, "");
 }
 
 LLVMValueRef
 radeon_llvm_emit_fetch_64bit(struct lp_build_tgsi_context *bld_base,
                             enum tgsi_opcode_type type,
                             LLVMValueRef ptr,
                             LLVMValueRef ptr2)
@@ -466,54 +474,61 @@ static void emit_declaration(struct lp_build_tgsi_context *bld_base,
                        }
                }
                break;
        }
 
        case TGSI_FILE_TEMPORARY:
        {
                char name[16] = "";
                LLVMValueRef array_alloca = NULL;
                unsigned decl_size;
+               unsigned usagemask = decl->Declaration.UsageMask;
                first = decl->Range.First;
                last = decl->Range.Last;
                decl_size = 4 * ((last - first) + 1);
+
                if (decl->Declaration.Array) {
                        unsigned id = decl->Array.ArrayID - 1;
+                       unsigned array_size;
+
                        if (!ctx->arrays) {
                                int size = 
bld_base->info->array_max[TGSI_FILE_TEMPORARY];
                                ctx->arrays = CALLOC(size, 
sizeof(ctx->arrays[0]));
                        }
 
                        ctx->arrays[id].range = decl->Range;
+                       ctx->arrays[id].usagemask = usagemask;
+                       array_size = ((last - first) + 1) * 
util_bitcount(usagemask);
 
                        /* If the array has more than 16 elements, store it
                         * in memory using an alloca that spans the entire
                         * array.
                         *
                         * Otherwise, store each array element individually.
                         * We will then generate vectors (per-channel, up to
-                        * <4 x float>) for indirect addressing.
+                        * <16 x float> if the usagemask is a single bit) for
+                        * indirect addressing.
                         *
                         * Note that 16 is the number of vector elements that
                         * LLVM will store in a register, so theoretically an
                         * array with up to 4 * 16 = 64 elements could be
                         * handled this way, but whether that's a good idea
                         * depends on VGPR register pressure elsewhere.
                         *
                         * FIXME: We shouldn't need to have the non-alloca
                         * code path for arrays. LLVM should be smart enough to
                         * promote allocas into registers when profitable.
                         */
-                       if (decl_size > 16) {
+                       if (array_size > 16) {
                                array_alloca = LLVMBuildAlloca(builder,
                                        LLVMArrayType(bld_base->base.vec_type,
-                                                     decl_size), "array");
+                                                     array_size), "array");
                                ctx->arrays[id].alloca = array_alloca;
                        }
                }
 
                if (!ctx->temps_count) {
                        ctx->temps_count = 
bld_base->info->file_max[TGSI_FILE_TEMPORARY] + 1;
                        ctx->temps = MALLOC(TGSI_NUM_CHANNELS * 
ctx->temps_count * sizeof(LLVMValueRef));
                }
                if (!array_alloca) {
                        for (i = 0; i < decl_size; ++i) {
@@ -524,28 +539,40 @@ static void emit_declaration(struct lp_build_tgsi_context *bld_base,
                                ctx->temps[first * TGSI_NUM_CHANNELS + i] =
                                        
si_build_alloca_undef(bld_base->base.gallivm,
                                                              
bld_base->base.vec_type,
                                                              name);
                        }
                } else {
                        LLVMValueRef idxs[2] = {
                                bld_base->uint_bld.zero,
                                NULL
                        };
+                       LLVMValueRef undef = NULL;
+                       unsigned j = 0;
+
+                       if (usagemask != TGSI_WRITEMASK_XYZW)
+                               undef = 
LLVMGetUndef(LLVMPointerType(bld_base->base.vec_type, 0));
+
                        for (i = 0; i < decl_size; ++i) {
+                               LLVMValueRef ptr;
+                               if (usagemask & (1 << (i % 4))) {
 #ifdef DEBUG
-                               snprintf(name, sizeof(name), "TEMP%d.%c",
-                                        first + i / 4, "xyzw"[i % 4]);
+                                       snprintf(name, sizeof(name), 
"TEMP%d.%c",
+                                                first + i / 4, "xyzw"[i % 4]);
 #endif
-                               idxs[1] = 
lp_build_const_int32(bld_base->base.gallivm, i);
-                               ctx->temps[first * TGSI_NUM_CHANNELS + i] =
-                                       LLVMBuildGEP(builder, array_alloca, 
idxs, 2, name);
+                                       idxs[1] = 
lp_build_const_int32(bld_base->base.gallivm, j);
+                                       ptr = LLVMBuildGEP(builder, 
array_alloca, idxs, 2, name);
+                                       j++;
+                               } else {
+                                       ptr = undef;
+                               }
+                               ctx->temps[first * TGSI_NUM_CHANNELS + i] = ptr;
                        }
                }
                break;
        }
        case TGSI_FILE_INPUT:
        {
                unsigned idx;
                for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) {
                        if (ctx->load_input)
                                ctx->load_input(ctx, idx, decl);
-- 
2.7.4

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 17/19] gallium/radeon: reduce alloca of temporaries based on usagemask

Reply via email to