Module: Mesa
Branch: main
Commit: de51a21e26aaef21c9429142bb50662e69cf4be0
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=de51a21e26aaef21c9429142bb50662e69cf4be0

Author: Rhys Perry <[email protected]>
Date:   Mon Nov 27 14:09:25 2023 +0000

aco: implement 16-bit derivatives

These are used by radeonsi.

Signed-off-by: Rhys Perry <[email protected]>
Reviewed-by: Georg Lehmann <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26445>

---

 src/amd/compiler/aco_instruction_selection.cpp | 39 ++++++++++++++++++++------
 src/amd/compiler/aco_interface.cpp             |  8 +++++-
 2 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index ddceeb3bbd6..e18b205f222 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -3866,11 +3866,10 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
          /* Source is the same in all lanes, so the derivative is zero.
           * This also avoids emitting invalid IR.
           */
-         bld.copy(Definition(dst), Operand::zero());
+         bld.copy(Definition(dst), Operand::zero(dst.bytes()));
          break;
       }
 
-      Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
       uint16_t dpp_ctrl1, dpp_ctrl2;
       if (instr->op == nir_op_fddx_fine) {
          dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
@@ -3886,14 +3885,38 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
             dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
       }
 
-      Temp tmp;
-      if (ctx->program->gfx_level >= GFX8) {
+      if (dst.regClass() == v1 && instr->def.bit_size == 16) {
+         assert(instr->def.num_components == 2);
+
+         Temp src = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[0]));
+
+         /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
+         unsigned opsel_lo = instr->src[0].swizzle[0] & 1;
+         unsigned opsel_hi = instr->src[0].swizzle[1] & 1;
+         opsel_lo |= opsel_lo << 1;
+         opsel_hi |= opsel_hi << 1;
+
          Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, 
dpp_ctrl1);
-         bld.vop2_dpp(aco_opcode::v_sub_f32, Definition(dst), src, tl, 
dpp_ctrl2);
+         Temp tr = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, 
dpp_ctrl2);
+
+         VALU_instruction& sub =
+            bld.vop3p(aco_opcode::v_pk_add_f16, Definition(dst), tr, tl, 
opsel_lo, opsel_hi)
+               .instr->valu();
+         sub.neg_lo[1] = true;
+         sub.neg_hi[1] = true;
       } else {
-         Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 
15) | dpp_ctrl1);
-         Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 
15) | dpp_ctrl2);
-         bld.vop2(aco_opcode::v_sub_f32, Definition(dst), tr, tl);
+         Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
+
+         if (ctx->program->gfx_level >= GFX8) {
+            aco_opcode sub =
+               instr->def.bit_size == 16 ? aco_opcode::v_sub_f16 : 
aco_opcode::v_sub_f32;
+            Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, 
dpp_ctrl1);
+            bld.vop2_dpp(sub, Definition(dst), src, tl, dpp_ctrl2);
+         } else {
+            Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 
<< 15) | dpp_ctrl1);
+            Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 
<< 15) | dpp_ctrl2);
+            bld.vop2(aco_opcode::v_sub_f32, Definition(dst), tr, tl);
+         }
       }
       set_wqm(ctx, true);
       break;
diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
index 7ef3fcc7714..6ae3405b3e1 100644
--- a/src/amd/compiler/aco_interface.cpp
+++ b/src/amd/compiler/aco_interface.cpp
@@ -471,7 +471,13 @@ aco_nir_op_supports_packed_math_16bit(const nir_alu_instr* alu)
    case nir_op_imin:
    case nir_op_imax:
    case nir_op_umin:
-   case nir_op_umax: return true;
+   case nir_op_umax:
+   case nir_op_fddx:
+   case nir_op_fddy:
+   case nir_op_fddx_fine:
+   case nir_op_fddy_fine:
+   case nir_op_fddx_coarse:
+   case nir_op_fddy_coarse: return true;
    case nir_op_ishl: /* TODO: in NIR, these have 32bit shift operands */
    case nir_op_ishr: /* while Radeon needs 16bit operands when vectorized */
    case nir_op_ushr:

Reply via email to