Re: [Mesa-dev] [PATCH] radv/gfx10: add Wave32 support for compute shaders

2019-07-30 Thread Bas Nieuwenhuizen
r-b

On Tue, Jul 30, 2019 at 6:29 PM Samuel Pitoiset
 wrote:
>
> It can be enabled with RADV_PERFTEST=cswave32.
>
> Signed-off-by: Samuel Pitoiset 
> ---
>  src/amd/vulkan/radv_debug.h   |  1 +
>  src/amd/vulkan/radv_device.c  | 12 +++-
>  src/amd/vulkan/radv_nir_to_llvm.c | 14 +-
>  src/amd/vulkan/radv_pipeline.c|  3 ++-
>  src/amd/vulkan/radv_private.h |  3 +++
>  src/amd/vulkan/radv_shader.c  | 25 ++---
>  src/amd/vulkan/radv_shader.h  |  1 +
>  7 files changed, 53 insertions(+), 6 deletions(-)
>
> diff --git a/src/amd/vulkan/radv_debug.h b/src/amd/vulkan/radv_debug.h
> index 723fabda57f..6414e882676 100644
> --- a/src/amd/vulkan/radv_debug.h
> +++ b/src/amd/vulkan/radv_debug.h
> @@ -64,6 +64,7 @@ enum {
> RADV_PERFTEST_BO_LIST=  0x20,
> RADV_PERFTEST_SHADER_BALLOT  =  0x40,
> RADV_PERFTEST_TC_COMPAT_CMASK = 0x80,
> +   RADV_PERFTEST_CS_WAVE_32 = 0x100,
>  };
>
>  bool
> diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
> index 65e3ccf91ad..29be192443a 100644
> --- a/src/amd/vulkan/radv_device.c
> +++ b/src/amd/vulkan/radv_device.c
> @@ -383,6 +383,14 @@ radv_physical_device_init(struct radv_physical_device 
> *device,
>
> device->use_shader_ballot = device->instance->perftest_flags & 
> RADV_PERFTEST_SHADER_BALLOT;
>
> +   /* Determine the number of threads per wave for all stages. */
> +   device->cs_wave_size = 64;
> +
> +   if (device->rad_info.chip_class >= GFX10) {
> +   if (device->instance->perftest_flags & 
> RADV_PERFTEST_CS_WAVE_32)
> +   device->cs_wave_size = 32;
> +   }
> +
> radv_physical_device_init_mem_types(device);
> radv_fill_device_extension_table(device, 
> &device->supported_extensions);
>
> @@ -494,6 +502,7 @@ static const struct debug_control radv_perftest_options[] 
> = {
> {"bolist", RADV_PERFTEST_BO_LIST},
> {"shader_ballot", RADV_PERFTEST_SHADER_BALLOT},
> {"tccompatcmask", RADV_PERFTEST_TC_COMPAT_CMASK},
> +   {"cswave32", RADV_PERFTEST_CS_WAVE_32},
> {NULL, 0}
>  };
>
> @@ -1930,7 +1939,8 @@ VkResult radv_CreateDevice(
> device->scratch_waves = MAX2(32 * 
> physical_device->rad_info.num_good_compute_units,
>  max_threads_per_block / 64);
>
> -   device->dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1);
> +   device->dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1) |
> +
> S_00B800_CS_W32_EN(device->physical_device->cs_wave_size == 32);
>
> if (device->physical_device->rad_info.chip_class >= GFX7) {
> /* If the KMD allows it (there is a KMD hw register for it),
> diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
> b/src/amd/vulkan/radv_nir_to_llvm.c
> index 020c6d17771..feaab8f6370 100644
> --- a/src/amd/vulkan/radv_nir_to_llvm.c
> +++ b/src/amd/vulkan/radv_nir_to_llvm.c
> @@ -4317,6 +4317,15 @@ static void declare_esgs_ring(struct 
> radv_shader_context *ctx)
> LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
>  }
>
> +static uint8_t
> +radv_nir_shader_wave_size(struct nir_shader *const *shaders, int 
> shader_count,
> + const struct radv_nir_compiler_options *options)
> +{
> +   if (shaders[0]->info.stage == MESA_SHADER_COMPUTE)
> +   return options->cs_wave_size;
> +   return 64;
> +}
> +
>  static
>  LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
> struct nir_shader *const *shaders,
> @@ -4333,8 +4342,11 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct 
> ac_llvm_compiler *ac_llvm,
> options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH :
>AC_FLOAT_MODE_DEFAULT;
>
> +   uint8_t wave_size = radv_nir_shader_wave_size(shaders,
> + shader_count, options);
> +
> ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class,
> -options->family, float_mode, 64);
> +options->family, float_mode, wave_size);
> ctx.context = ctx.ac.context;
>
> radv_nir_shader_info_init(&shader_info->info);
> diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
> index 583b600dfdd..6b8b7bbe25a 100644
> --- a/src/amd/vulkan/radv_pipeline.c
> +++ b/src/amd/vulkan/radv_pipeline.c
> @@ -4648,7 +4648,8 @@ radv_compute_generate_pm4(struct radv_pipeline 
> *pipeline)
> threads_per_threadgroup = compute_shader->info.cs.block_size[0] *
>   compute_shader->info.cs.block_size[1] *
>   compute_shader->info.cs.block_size[2];
> -   waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup, 64);
> +   waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup,
> +  

[Mesa-dev] [PATCH] radv/gfx10: add Wave32 support for compute shaders

2019-07-30 Thread Samuel Pitoiset
It can be enabled with RADV_PERFTEST=cswave32.

Signed-off-by: Samuel Pitoiset 
---
 src/amd/vulkan/radv_debug.h   |  1 +
 src/amd/vulkan/radv_device.c  | 12 +++-
 src/amd/vulkan/radv_nir_to_llvm.c | 14 +-
 src/amd/vulkan/radv_pipeline.c|  3 ++-
 src/amd/vulkan/radv_private.h |  3 +++
 src/amd/vulkan/radv_shader.c  | 25 ++---
 src/amd/vulkan/radv_shader.h  |  1 +
 7 files changed, 53 insertions(+), 6 deletions(-)

diff --git a/src/amd/vulkan/radv_debug.h b/src/amd/vulkan/radv_debug.h
index 723fabda57f..6414e882676 100644
--- a/src/amd/vulkan/radv_debug.h
+++ b/src/amd/vulkan/radv_debug.h
@@ -64,6 +64,7 @@ enum {
RADV_PERFTEST_BO_LIST=  0x20,
RADV_PERFTEST_SHADER_BALLOT  =  0x40,
RADV_PERFTEST_TC_COMPAT_CMASK = 0x80,
+   RADV_PERFTEST_CS_WAVE_32 = 0x100,
 };
 
 bool
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 65e3ccf91ad..29be192443a 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -383,6 +383,14 @@ radv_physical_device_init(struct radv_physical_device 
*device,
 
device->use_shader_ballot = device->instance->perftest_flags & 
RADV_PERFTEST_SHADER_BALLOT;
 
+   /* Determine the number of threads per wave for all stages. */
+   device->cs_wave_size = 64;
+
+   if (device->rad_info.chip_class >= GFX10) {
+   if (device->instance->perftest_flags & RADV_PERFTEST_CS_WAVE_32)
+   device->cs_wave_size = 32;
+   }
+
radv_physical_device_init_mem_types(device);
radv_fill_device_extension_table(device, &device->supported_extensions);
 
@@ -494,6 +502,7 @@ static const struct debug_control radv_perftest_options[] = 
{
{"bolist", RADV_PERFTEST_BO_LIST},
{"shader_ballot", RADV_PERFTEST_SHADER_BALLOT},
{"tccompatcmask", RADV_PERFTEST_TC_COMPAT_CMASK},
+   {"cswave32", RADV_PERFTEST_CS_WAVE_32},
{NULL, 0}
 };
 
@@ -1930,7 +1939,8 @@ VkResult radv_CreateDevice(
device->scratch_waves = MAX2(32 * 
physical_device->rad_info.num_good_compute_units,
 max_threads_per_block / 64);
 
-   device->dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1);
+   device->dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1) |
+
S_00B800_CS_W32_EN(device->physical_device->cs_wave_size == 32);
 
if (device->physical_device->rad_info.chip_class >= GFX7) {
/* If the KMD allows it (there is a KMD hw register for it),
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
b/src/amd/vulkan/radv_nir_to_llvm.c
index 020c6d17771..feaab8f6370 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -4317,6 +4317,15 @@ static void declare_esgs_ring(struct radv_shader_context 
*ctx)
LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
 }
 
+static uint8_t
+radv_nir_shader_wave_size(struct nir_shader *const *shaders, int shader_count,
+ const struct radv_nir_compiler_options *options)
+{
+   if (shaders[0]->info.stage == MESA_SHADER_COMPUTE)
+   return options->cs_wave_size;
+   return 64;
+}
+
 static
 LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
struct nir_shader *const *shaders,
@@ -4333,8 +4342,11 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct 
ac_llvm_compiler *ac_llvm,
options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH :
   AC_FLOAT_MODE_DEFAULT;
 
+   uint8_t wave_size = radv_nir_shader_wave_size(shaders,
+ shader_count, options);
+
ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class,
-options->family, float_mode, 64);
+options->family, float_mode, wave_size);
ctx.context = ctx.ac.context;
 
radv_nir_shader_info_init(&shader_info->info);
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 583b600dfdd..6b8b7bbe25a 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -4648,7 +4648,8 @@ radv_compute_generate_pm4(struct radv_pipeline *pipeline)
threads_per_threadgroup = compute_shader->info.cs.block_size[0] *
  compute_shader->info.cs.block_size[1] *
  compute_shader->info.cs.block_size[2];
-   waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup, 64);
+   waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup,
+
device->physical_device->cs_wave_size);
 
if (device->physical_device->rad_info.chip_class >= GFX10 &&
waves_per_threadgroup == 1)
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 466f0288399..559cb3b336d