From: Marek Olšák <marek.ol...@amd.com>

We know the divisors when we upload them, so instead we can precompute
and upload division factors derived from each divisor.

This fast division consists of add, mul_hi, and two shifts,
and we have to load 4 dwords instead of 1.

This probably won't affect any apps.
---
 src/gallium/drivers/radeonsi/si_shader.c | 46 +++++++++++++++-----------------
 src/gallium/drivers/radeonsi/si_state.c  | 42 ++++++++++++++++++++++++-----
 src/gallium/drivers/radeonsi/si_state.h  |  2 +-
 3 files changed, 57 insertions(+), 33 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index 36f58e2..90cb059 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -421,34 +421,20 @@ static LLVMValueRef get_tcs_in_vertex_dw_stride(struct 
si_shader_context *ctx)
                        return LLVMConstInt(ctx->i32, stride, 0);
                }
                return si_unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
 
        default:
                assert(0);
                return NULL;
        }
 }
 
-static LLVMValueRef get_instance_index_for_fetch(
-       struct si_shader_context *ctx,
-       unsigned param_start_instance, LLVMValueRef divisor)
-{
-       LLVMValueRef result = ctx->abi.instance_id;
-
-       /* The division must be done before START_INSTANCE is added. */
-       if (divisor != ctx->i32_1)
-               result = LLVMBuildUDiv(ctx->ac.builder, result, divisor, "");
-
-       return LLVMBuildAdd(ctx->ac.builder, result,
-                           LLVMGetParam(ctx->main_fn, param_start_instance), 
"");
-}
-
 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
  * to float. */
 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
                                            LLVMValueRef vec4,
                                            unsigned double_index)
 {
        LLVMBuilderRef builder = ctx->ac.builder;
        LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->ac.context);
        LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
                                              LLVMVectorType(f64, 2), "");
@@ -7294,34 +7280,44 @@ static void si_build_vs_prolog_function(struct 
si_shader_context *ctx,
                        ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
        }
 
        for (i = 0; i <= key->vs_prolog.last_input; i++) {
                bool divisor_is_one =
                        key->vs_prolog.states.instance_divisor_is_one & (1u << 
i);
                bool divisor_is_fetched =
                        key->vs_prolog.states.instance_divisor_is_fetched & (1u 
<< i);
                LLVMValueRef index;
 
-               if (divisor_is_one || divisor_is_fetched) {
-                       LLVMValueRef divisor = ctx->i32_1;
+               if (divisor_is_one) {
+                       index = ctx->abi.instance_id;
+               } else if (divisor_is_fetched) {
+                       LLVMValueRef udiv_factors[4];
 
-                       if (divisor_is_fetched) {
-                               divisor = buffer_load_const(ctx, 
instance_divisor_constbuf,
-                                                           
LLVMConstInt(ctx->i32, i * 4, 0));
-                               divisor = ac_to_integer(&ctx->ac, divisor);
+                       for (unsigned j = 0; j < 4; j++) {
+                               udiv_factors[j] =
+                                       buffer_load_const(ctx, 
instance_divisor_constbuf,
+                                                         
LLVMConstInt(ctx->i32, i*16 + j*4, 0));
+                               udiv_factors[j] = ac_to_integer(&ctx->ac, 
udiv_factors[j]);
                        }
+                       /* The faster NUW version doesn't work when InstanceID 
== UINT_MAX.
+                        * Such InstanceID might not be achievable in a 
reasonable time though.
+                        */
+                       index = ac_build_fast_udiv_nuw(&ctx->ac, 
ctx->abi.instance_id,
+                                                      udiv_factors[0], 
udiv_factors[1],
+                                                      udiv_factors[2], 
udiv_factors[3]);
+               }
 
-                       /* InstanceID / Divisor + StartInstance */
-                       index = get_instance_index_for_fetch(ctx,
-                                                            user_sgpr_base +
-                                                            
SI_SGPR_START_INSTANCE,
-                                                            divisor);
+               if (divisor_is_one || divisor_is_fetched) {
+                       /* Add StartInstance. */
+                       index = LLVMBuildAdd(ctx->ac.builder, index,
+                                            LLVMGetParam(ctx->main_fn, 
user_sgpr_base +
+                                                         
SI_SGPR_START_INSTANCE), "");
                } else {
                        /* VertexID + BaseVertex */
                        index = LLVMBuildAdd(ctx->ac.builder,
                                             ctx->abi.vertex_id,
                                             LLVMGetParam(func, user_sgpr_base +
                                                                
SI_SGPR_BASE_VERTEX), "");
                }
 
                index = ac_to_float(&ctx->ac, index);
                ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index,
diff --git a/src/gallium/drivers/radeonsi/si_state.c 
b/src/gallium/drivers/radeonsi/si_state.c
index bc1417a..aa57b3f 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -25,20 +25,21 @@
 #include "si_build_pm4.h"
 #include "gfx9d.h"
 #include "si_query.h"
 
 #include "util/u_dual_blend.h"
 #include "util/u_format.h"
 #include "util/u_format_s3tc.h"
 #include "util/u_memory.h"
 #include "util/u_resource.h"
 #include "util/u_upload_mgr.h"
+#include "util/fast_idiv_by_const.h"
 
 static unsigned si_map_swizzle(unsigned swizzle)
 {
        switch (swizzle) {
        case PIPE_SWIZZLE_Y:
                return V_008F0C_SQ_SEL_Y;
        case PIPE_SWIZZLE_Z:
                return V_008F0C_SQ_SEL_Z;
        case PIPE_SWIZZLE_W:
                return V_008F0C_SQ_SEL_W;
@@ -4348,20 +4349,26 @@ static void si_delete_sampler_state(struct pipe_context 
*ctx, void *state)
  * Vertex elements & buffers
  */
 
 static void *si_create_vertex_elements(struct pipe_context *ctx,
                                       unsigned count,
                                       const struct pipe_vertex_element 
*elements)
 {
        struct si_screen *sscreen = (struct si_screen*)ctx->screen;
        struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements);
        bool used[SI_NUM_VERTEX_BUFFERS] = {};
+       struct util_fast_udiv_info divisor_factors[SI_MAX_ATTRIBS] = {};
+       STATIC_ASSERT(sizeof(struct util_fast_udiv_info) == 16);
+       STATIC_ASSERT(sizeof(divisor_factors[0].multiplier) == 4);
+       STATIC_ASSERT(sizeof(divisor_factors[0].pre_shift) == 4);
+       STATIC_ASSERT(sizeof(divisor_factors[0].post_shift) == 4);
+       STATIC_ASSERT(sizeof(divisor_factors[0].increment) == 4);
        int i;
 
        assert(count <= SI_MAX_ATTRIBS);
        if (!v)
                return NULL;
 
        v->count = count;
        v->desc_list_byte_size = align(count * 16, SI_CPDMA_ALIGNMENT);
 
        for (i = 0; i < count; ++i) {
@@ -4370,28 +4377,31 @@ static void *si_create_vertex_elements(struct 
pipe_context *ctx,
                unsigned data_format, num_format;
                int first_non_void;
                unsigned vbo_index = elements[i].vertex_buffer_index;
                unsigned char swizzle[4];
 
                if (vbo_index >= SI_NUM_VERTEX_BUFFERS) {
                        FREE(v);
                        return NULL;
                }
 
-               if (elements[i].instance_divisor) {
+               unsigned instance_divisor = elements[i].instance_divisor;
+               if (instance_divisor) {
                        v->uses_instance_divisors = true;
-                       v->instance_divisors[i] = elements[i].instance_divisor;
 
-                       if (v->instance_divisors[i] == 1)
+                       if (instance_divisor == 1) {
                                v->instance_divisor_is_one |= 1u << i;
-                       else
+                       } else {
                                v->instance_divisor_is_fetched |= 1u << i;
+                               divisor_factors[i] =
+                                       
util_compute_fast_udiv_info(instance_divisor, 32);
+                       }
                }
 
                if (!used[vbo_index]) {
                        v->first_vb_use_mask |= 1 << i;
                        used[vbo_index] = true;
                }
 
                desc = util_format_description(elements[i].src_format);
                first_non_void = 
util_format_get_first_non_void_channel(elements[i].src_format);
                data_format = si_translate_buffer_dataformat(ctx->screen, desc, 
first_non_void);
@@ -4487,20 +4497,36 @@ static void *si_create_vertex_elements(struct 
pipe_context *ctx,
                        }
                }
 
                v->rsrc_word3[i] = 
S_008F0C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
                                   
S_008F0C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
                                   
S_008F0C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
                                   
S_008F0C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
                                   S_008F0C_NUM_FORMAT(num_format) |
                                   S_008F0C_DATA_FORMAT(data_format);
        }
+
+       if (v->instance_divisor_is_fetched) {
+               unsigned num_divisors = 
util_last_bit(v->instance_divisor_is_fetched);
+
+               v->instance_divisor_factor_buffer =
+                       (struct r600_resource*)
+                       pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
+                                          num_divisors * 
sizeof(divisor_factors[0]));
+               if (!v->instance_divisor_factor_buffer) {
+                       FREE(v);
+                       return NULL;
+               }
+               void *map = 
sscreen->ws->buffer_map(v->instance_divisor_factor_buffer->buf,
+                                                   NULL, PIPE_TRANSFER_WRITE);
+               memcpy(map , divisor_factors, num_divisors * 
sizeof(divisor_factors[0]));
+       }
        return v;
 }
 
 static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
 {
        struct si_context *sctx = (struct si_context *)ctx;
        struct si_vertex_elements *old = sctx->vertex_elements;
        struct si_vertex_elements *v = (struct si_vertex_elements*)state;
 
        sctx->vertex_elements = v;
@@ -4510,34 +4536,36 @@ static void si_bind_vertex_elements(struct pipe_context 
*ctx, void *state)
            (!old ||
             old->count != v->count ||
             old->uses_instance_divisors != v->uses_instance_divisors ||
             v->uses_instance_divisors || /* we don't check which divisors 
changed */
             memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * 
v->count)))
                sctx->do_update_shaders = true;
 
        if (v && v->instance_divisor_is_fetched) {
                struct pipe_constant_buffer cb;
 
-               cb.buffer = NULL;
-               cb.user_buffer = v->instance_divisors;
+               cb.buffer = &v->instance_divisor_factor_buffer->b.b;
+               cb.user_buffer = NULL;
                cb.buffer_offset = 0;
-               cb.buffer_size = sizeof(uint32_t) * v->count;
+               cb.buffer_size = 0xffffffff;
                si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb);
        }
 }
 
 static void si_delete_vertex_element(struct pipe_context *ctx, void *state)
 {
        struct si_context *sctx = (struct si_context *)ctx;
+       struct si_vertex_elements *v = (struct si_vertex_elements*)state;
 
        if (sctx->vertex_elements == state)
                sctx->vertex_elements = NULL;
+       r600_resource_reference(&v->instance_divisor_factor_buffer, NULL);
        FREE(state);
 }
 
 static void si_set_vertex_buffers(struct pipe_context *ctx,
                                  unsigned start_slot, unsigned count,
                                  const struct pipe_vertex_buffer *buffers)
 {
        struct si_context *sctx = (struct si_context *)ctx;
        struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot;
        int i;
diff --git a/src/gallium/drivers/radeonsi/si_state.h 
b/src/gallium/drivers/radeonsi/si_state.h
index 89bb5b6..d9c3e70 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -131,21 +131,21 @@ struct si_state_dsa {
 
 };
 
 struct si_stencil_ref {
        struct pipe_stencil_ref         state;
        struct si_dsa_stencil_ref_part  dsa_part;
 };
 
 struct si_vertex_elements
 {
-       uint32_t                        instance_divisors[SI_MAX_ATTRIBS];
+       struct r600_resource            *instance_divisor_factor_buffer;
        uint32_t                        rsrc_word3[SI_MAX_ATTRIBS];
        uint16_t                        src_offset[SI_MAX_ATTRIBS];
        uint8_t                         fix_fetch[SI_MAX_ATTRIBS];
        uint8_t                         format_size[SI_MAX_ATTRIBS];
        uint8_t                         vertex_buffer_index[SI_MAX_ATTRIBS];
 
        uint8_t                         count;
        bool                            uses_instance_divisors;
 
        uint16_t                        first_vb_use_mask;
-- 
2.7.4

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to