That's a nice optimization, because RDSV is a costly operation (on Maxwell it requires a barrier dependency). Thanks!

Reviewed-by: Samuel Pitoiset <samuel.pitoi...@gmail.com>

On 01/26/2017 04:20 AM, Ilia Mirkin wrote:
Many, many compute shaders define only a 1- or 2-dimensional block, but
still use system values that take all three dimensions into account
(such as gl_LocalInvocationIndex). In the special case where a
dimension is exactly 1, we know that the thread ID along that axis will
always be 0, so return a constant zero and let constant folding clean
things up.

Signed-off-by: Ilia Mirkin <imir...@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir.cpp           |  6 +++++-
 src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h      |  2 +-
 src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 10 ++++++++--
 src/gallium/drivers/nouveau/codegen/nv50_ir_target.h      |  4 +++-
 4 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
index 186c9fd..b67a1dd 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
@@ -1179,7 +1179,11 @@ nv50_ir_init_prog_info(struct nv50_ir_prog_info *info)
       info->prop.gp.instanceCount = 1;
       info->prop.gp.maxVertices = 1;
    }
-   info->prop.cp.numThreads = 1;
+   if (info->type == PIPE_SHADER_COMPUTE) {
+      info->prop.cp.numThreads[0] =
+      info->prop.cp.numThreads[1] =
+      info->prop.cp.numThreads[2] = 1;
+   }
    info->io.pointSize = 0xff;
    info->io.instanceId = 0xff;
    info->io.vertexId = 0xff;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index 65d0904..e7d840d 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -152,7 +152,7 @@ struct nv50_ir_prog_info
          uint32_t inputOffset; /* base address for user args */
          uint32_t sharedOffset; /* reserved space in s[] */
          uint32_t gridInfoBase;  /* base address for NTID,NCTAID */
-         uint32_t numThreads; /* max number of threads */
+         uint16_t numThreads[3]; /* max number of threads */
       } cp;
    } prop;

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 6320e52..51f8b29 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1047,7 +1047,6 @@ bool Source::scanSource()
    }

    info->io.viewportId = -1;
-   info->prop.cp.numThreads = 1;

    info->immd.data = (uint32_t *)MALLOC(scan.immediate_count * 16);
    info->immd.type = (ubyte *)MALLOC(scan.immediate_count * sizeof(ubyte));
@@ -1150,9 +1149,13 @@ void Source::scanProperty(const struct tgsi_full_property *prop)
          info->prop.tp.outputPrim = PIPE_PRIM_TRIANGLES; /* anything but 
points */
       break;
    case TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH:
+      info->prop.cp.numThreads[0] = prop->u[0].Data;
+      break;
    case TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT:
+      info->prop.cp.numThreads[1] = prop->u[0].Data;
+      break;
    case TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH:
-      info->prop.cp.numThreads *= prop->u[0].Data;
+      info->prop.cp.numThreads[2] = prop->u[0].Data;
       break;
    case TGSI_PROPERTY_NUM_CLIPDIST_ENABLED:
       info->io.clipDistances = prop->u[0].Data;
@@ -1941,6 +1944,9 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
       return ld->getDef(0);
    case TGSI_FILE_SYSTEM_VALUE:
       assert(!ptr);
+      if (info->sv[idx].sn == TGSI_SEMANTIC_THREAD_ID &&
+          info->prop.cp.numThreads[swz] == 1)
+         return zero;
       ld = mkOp1(OP_RDSV, TYPE_U32, getSSA(), srcToSym(src, c));
       ld->perPatch = info->sv[idx].patch;
       return ld->getDef(0);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
index eaf50cc..e9d1057 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
@@ -174,7 +174,9 @@ public:
    virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const = 
0;

    virtual void parseDriverInfo(const struct nv50_ir_prog_info *info) {
-      threads = info->prop.cp.numThreads;
+      threads = info->prop.cp.numThreads[0] *
+         info->prop.cp.numThreads[1] *
+         info->prop.cp.numThreads[2];
       if (threads == 0)
          threads = info->target >= NVISA_GK104_CHIPSET ? 1024 : 512;
    }

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to