Re: [Mesa-dev] [PATCH 4/4] radeonsi: Implement ddx/ddy on VI using ds_bpermute

2016-04-20 Thread Marek Olšák
Patches 1-3:
Reviewed-by: Marek Olšák 

Patch 4:
Acked-by: Marek Olšák 

Marek


On Tue, Apr 19, 2016 at 7:52 PM, Tom Stellard wrote:
> The ds_bpermute instruction allows threads to transfer data directly
> to or from the vgprs of other threads.  These instructions use the lds
> hardware to transfer data, but do not read or write lds memory.
>
> DDX BEFORE:                        |  DDX AFTER:
>                                    |
> v_mbcnt_lo_u32_b32_e64 v2, -1, 0   |  v_mbcnt_lo_u32_b32_e64 v2, -1, 0
> v_mbcnt_hi_u32_b32_e64 v2, -1, v2  |  v_mbcnt_hi_u32_b32_e64 v2, -1, v2
> v_lshlrev_b32_e32 v4, 2, v2        |  v_and_b32_e32 v2, 60, v2
> v_and_b32_e32 v2, 60, v2           |  v_lshlrev_b32_e32 v2, 2, v2
> v_lshlrev_b32_e32 v3, 2, v2        |  ds_bpermute_b32 v3, v2, v0
> s_mov_b32 m0, -1                   |  ds_bpermute_b32 v0, v2, v0 offset:4
> ds_write_b32 v4, v0                |  s_waitcnt lgkmcnt(0)
> s_waitcnt lgkmcnt(0)               |
> v_or_b32_e32 v0, 1, v2             |
> v_lshlrev_b32_e32 v0, 2, v0        |
> ds_read_b32 v1, v3                 |
> ds_read_b32 v0, v0                 |
> s_waitcnt lgkmcnt(0)               |
>                                    |
> LDS: 1 blocks                      |  LDS: 0 blocks
> ---
>  src/gallium/drivers/radeonsi/si_shader.c | 42 
> +++-
>  1 file changed, 30 insertions(+), 12 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
> b/src/gallium/drivers/radeonsi/si_shader.c
> index 2a747f9..d3e445b 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -4162,6 +4162,7 @@ static void si_llvm_emit_ddxy(
> LLVMValueRef indices[2];
> LLVMValueRef store_ptr, load_ptr0, load_ptr1;
> LLVMValueRef tl, trbl, result[4];
> +   LLVMValueRef tl_tid, trbl_tid;
> unsigned swizzle[4];
> unsigned c;
> int idx;
> @@ -4179,20 +4180,24 @@ static void si_llvm_emit_ddxy(
> else
> mask = TID_MASK_TOP_LEFT;
>
> -   indices[1] = LLVMBuildAnd(gallivm->builder, indices[1],
> - lp_build_const_int32(gallivm, mask), "");
> +   tl_tid = LLVMBuildAnd(gallivm->builder, indices[1],
> +   lp_build_const_int32(gallivm, mask), "");
> +   indices[1] = tl_tid;
> load_ptr0 = LLVMBuildGEP(gallivm->builder, ctx->lds,
>  indices, 2, "");
>
> /* for DDX we want to next X pixel, DDY next Y pixel. */
> idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 
> 1 : 2;
> -   indices[1] = LLVMBuildAdd(gallivm->builder, indices[1],
> +   trbl_tid = LLVMBuildAdd(gallivm->builder, indices[1],
>   lp_build_const_int32(gallivm, idx), "");
> +   indices[1] = trbl_tid;
> load_ptr1 = LLVMBuildGEP(gallivm->builder, ctx->lds,
>  indices, 2, "");
>
> for (c = 0; c < 4; ++c) {
> unsigned i;
> +   LLVMValueRef val;
> +   LLVMValueRef args[2];
>
> swizzle[c] = 
> tgsi_util_get_full_src_register_swizzle(&inst->Src[0], c);
> for (i = 0; i < c; ++i) {
> @@ -4204,18 +4209,31 @@ static void si_llvm_emit_ddxy(
> if (i != c)
> continue;
>
> -   LLVMBuildStore(gallivm->builder,
> -  LLVMBuildBitCast(gallivm->builder,
> -   lp_build_emit_fetch(bld_base, 
> inst, 0, c),
> -   ctx->i32, ""),
> -  store_ptr);
> +   val = LLVMBuildBitCast(gallivm->builder,
> +   lp_build_emit_fetch(bld_base, inst, 0, c),
> +   ctx->i32, "");
>
> -   tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
> -   tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
> +   if ((HAVE_LLVM >= 0x0309) && ctx->screen->b.family >= 
> CHIP_TONGA) {
>
> -   trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
> -   trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, "");
> +   args[0] = LLVMBuildMul(gallivm->builder, tl_tid,
> +lp_build_const_int32(gallivm, 4), 
> "");
> +   args[1] = val;
> +   tl = lp_build_intrinsic(gallivm->builder,
> +   "llvm.amdgcn.ds.bpermute", ctx->i32,
> +   args, 2, LLVMReadNoneAttribute);
>
> +   args[0] = LLVMBuildMul(gallivm->builder, trbl_tid,
> +lp_build_const_int32(gallivm, 4), 
> "");
> +   trbl = lp_build_intrinsic(gallivm->builder,
> + 

[Mesa-dev] [PATCH 4/4] radeonsi: Implement ddx/ddy on VI using ds_bpermute

2016-04-19 Thread Tom Stellard
The ds_bpermute instruction allows threads to transfer data directly
to or from the vgprs of other threads.  These instructions use the lds
hardware to transfer data, but do not read or write lds memory.

DDX BEFORE:                        |  DDX AFTER:
                                   |
v_mbcnt_lo_u32_b32_e64 v2, -1, 0   |  v_mbcnt_lo_u32_b32_e64 v2, -1, 0
v_mbcnt_hi_u32_b32_e64 v2, -1, v2  |  v_mbcnt_hi_u32_b32_e64 v2, -1, v2
v_lshlrev_b32_e32 v4, 2, v2        |  v_and_b32_e32 v2, 60, v2
v_and_b32_e32 v2, 60, v2           |  v_lshlrev_b32_e32 v2, 2, v2
v_lshlrev_b32_e32 v3, 2, v2        |  ds_bpermute_b32 v3, v2, v0
s_mov_b32 m0, -1                   |  ds_bpermute_b32 v0, v2, v0 offset:4
ds_write_b32 v4, v0                |  s_waitcnt lgkmcnt(0)
s_waitcnt lgkmcnt(0)               |
v_or_b32_e32 v0, 1, v2             |
v_lshlrev_b32_e32 v0, 2, v0        |
ds_read_b32 v1, v3                 |
ds_read_b32 v0, v0                 |
s_waitcnt lgkmcnt(0)               |
                                   |
LDS: 1 blocks                      |  LDS: 0 blocks
---
 src/gallium/drivers/radeonsi/si_shader.c | 42 +++-
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index 2a747f9..d3e445b 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -4162,6 +4162,7 @@ static void si_llvm_emit_ddxy(
LLVMValueRef indices[2];
LLVMValueRef store_ptr, load_ptr0, load_ptr1;
LLVMValueRef tl, trbl, result[4];
+   LLVMValueRef tl_tid, trbl_tid;
unsigned swizzle[4];
unsigned c;
int idx;
@@ -4179,20 +4180,24 @@ static void si_llvm_emit_ddxy(
else
mask = TID_MASK_TOP_LEFT;
 
-   indices[1] = LLVMBuildAnd(gallivm->builder, indices[1],
- lp_build_const_int32(gallivm, mask), "");
+   tl_tid = LLVMBuildAnd(gallivm->builder, indices[1],
+   lp_build_const_int32(gallivm, mask), "");
+   indices[1] = tl_tid;
load_ptr0 = LLVMBuildGEP(gallivm->builder, ctx->lds,
 indices, 2, "");
 
/* for DDX we want to next X pixel, DDY next Y pixel. */
idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 
: 2;
-   indices[1] = LLVMBuildAdd(gallivm->builder, indices[1],
+   trbl_tid = LLVMBuildAdd(gallivm->builder, indices[1],
  lp_build_const_int32(gallivm, idx), "");
+   indices[1] = trbl_tid;
load_ptr1 = LLVMBuildGEP(gallivm->builder, ctx->lds,
 indices, 2, "");
 
for (c = 0; c < 4; ++c) {
unsigned i;
+   LLVMValueRef val;
+   LLVMValueRef args[2];
 
swizzle[c] = 
tgsi_util_get_full_src_register_swizzle(&inst->Src[0], c);
for (i = 0; i < c; ++i) {
@@ -4204,18 +4209,31 @@ static void si_llvm_emit_ddxy(
if (i != c)
continue;
 
-   LLVMBuildStore(gallivm->builder,
-  LLVMBuildBitCast(gallivm->builder,
-   lp_build_emit_fetch(bld_base, 
inst, 0, c),
-   ctx->i32, ""),
-  store_ptr);
+   val = LLVMBuildBitCast(gallivm->builder,
+   lp_build_emit_fetch(bld_base, inst, 0, c),
+   ctx->i32, "");
 
-   tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
-   tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
+   if ((HAVE_LLVM >= 0x0309) && ctx->screen->b.family >= 
CHIP_TONGA) {
 
-   trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
-   trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, "");
+   args[0] = LLVMBuildMul(gallivm->builder, tl_tid,
+lp_build_const_int32(gallivm, 4), "");
+   args[1] = val;
+   tl = lp_build_intrinsic(gallivm->builder,
+   "llvm.amdgcn.ds.bpermute", ctx->i32,
+   args, 2, LLVMReadNoneAttribute);
 
+   args[0] = LLVMBuildMul(gallivm->builder, trbl_tid,
+lp_build_const_int32(gallivm, 4), "");
+   trbl = lp_build_intrinsic(gallivm->builder,
+   "llvm.amdgcn.ds.bpermute", ctx->i32,
+   args, 2, LLVMReadNoneAttribute);
+   } else {
+   LLVMBuildStore(gallivm->builder, val, store_ptr);
+   tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
+