The previous implementation relied on the std140 alignment rules to
avoid handling misalignment in the case where we are loading more than
2 double components from a vector, which requires to emit a second load
message.

This alternative implementation deals with misalignment and is more
flexible going forward. All credit for this goes to Curro, since he
not only suggested this but also wrote the implementation in the
mailing list.
---

Curro, maybe I should make you the author and me the reviewer then? :)

 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 59 +++++++-------------------------
 1 file changed, 13 insertions(+), 46 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index ebcc92a..35a6aed 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -3583,9 +3583,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, 
nir_intrinsic_instr *instr
                                nir->info.num_ubos - 1);
       }
 
-      /* Number of 32-bit slots in the type */
-      unsigned type_slots = MAX2(1, type_sz(dest.type) / 4);
-
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
       if (const_offset == NULL) {
          fs_reg base_offset = retype(get_nir_src(instr->src[1]),
@@ -3603,55 +3600,25 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, 
nir_intrinsic_instr *instr
           * we let CSE deal with duplicate loads. Here we see a vector access
           * and we have to split it if necessary.
           */
-         fs_reg packed_consts = vgrf(glsl_type::float_type);
-         packed_consts.type = dest.type;
+         const unsigned type_size = type_sz(dest.type);
+         const fs_reg packed_consts = bld.vgrf(BRW_REGISTER_TYPE_F);
+         for (unsigned c = 0; c < instr->num_components;) {
+            const unsigned base = const_offset->u32[0] + c * type_size;
 
-         unsigned const_offset_aligned = const_offset->u32[0] & ~15;
+            /* Number of usable components in the next 16B-aligned load */
+            const unsigned count = MIN2(instr->num_components - c,
+                                        (16 - base % 16) / type_size);
 
-         /* A vec4 only contains half of a dvec4, if we need more than 2
-          * components of a dvec4 we will have to issue another load for
-          * components z and w.
-          */
-         int num_components;
-         if (type_slots == 1)
-            num_components = instr->num_components;
-         else
-            num_components = MIN2(2, instr->num_components);
-
-         /* The computation of num_components doesn't take into account
-          * misalignment, which should be okay according to std140 vector
-          * alignment rules.
-          */
-         assert(const_offset->u32[0] % 16 +
-                type_sz(dest.type) * num_components <= 16);
-
-         int remaining_components = instr->num_components;
-         while (remaining_components > 0) {
-            /* Read the vec4 from a 16-byte aligned offset */
-            struct brw_reg const_offset_reg = brw_imm_ud(const_offset_aligned);
             bld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
-                     retype(packed_consts, BRW_REGISTER_TYPE_F),
-                     surf_index, const_offset_reg);
-
-            const fs_reg consts = byte_offset(packed_consts, 
(const_offset->u32[0] % 16));
-            unsigned dest_offset = instr->num_components - 
remaining_components;
+                     packed_consts, surf_index, brw_imm_ud(base & ~15));
 
-            /* XXX: This doesn't update the sub-16B offset across iterations of
-             * the loop, which should work for std140 vector alignment rules.
-             */
-            assert(dest_offset == 0 || const_offset->u32[0] % 16 == 0);
+            const fs_reg consts =
+               retype(byte_offset(packed_consts, base & 15), dest.type);
 
-            for (int i = 0; i < num_components; i++)
-               bld.MOV(offset(dest, bld, i + dest_offset), component(consts, 
i));
+            for (unsigned d = 0; d < count; d++)
+               bld.MOV(offset(dest, bld, c + d), component(consts, d));
 
-            /* If this is a large enough 64-bit load, we will need to emit
-             * another message
-             */
-            remaining_components -= num_components;
-            assert(remaining_components == 0 ||
-                   (remaining_components <= 2 && type_slots == 2));
-            num_components = remaining_components;
-            const_offset_aligned += 16;
+            c += count;
          }
       }
       break;
-- 
1.9.1

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to