Module: Mesa
Branch: main
Commit: b92c40d40ad195039893edea36af3b85a5a3c4cd
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=b92c40d40ad195039893edea36af3b85a5a3c4cd

Author: Karmjit Mahil <[email protected]>
Date:   Fri Jun 24 16:34:31 2022 +0100

pvr: Add IDF/WDF program for compute pipeline barrier.

Signed-off-by: Karmjit Mahil <[email protected]>
Reviewed-by: Rajnesh Kanwal <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17683>

---

 src/imagination/csbgen/rogue_texstate.xml     |   6 +
 src/imagination/include/hwdef/rogue_hw_defs.h |   3 +
 src/imagination/vulkan/pvr_cmd_buffer.c       |   2 +-
 src/imagination/vulkan/pvr_device.c           | 296 +++++++++++++++++++++++++-
 src/imagination/vulkan/pvr_formats.c          |   2 +
 src/imagination/vulkan/pvr_hardcode.c         |  17 ++
 src/imagination/vulkan/pvr_hardcode.h         |   6 +
 src/imagination/vulkan/pvr_private.h          |  14 ++
 src/imagination/vulkan/pvr_tex_state.c        |   1 +
 9 files changed, 345 insertions(+), 2 deletions(-)

diff --git a/src/imagination/csbgen/rogue_texstate.xml b/src/imagination/csbgen/rogue_texstate.xml
index 8f2fc0f0fa3..537146de332 100644
--- a/src/imagination/csbgen/rogue_texstate.xml
+++ b/src/imagination/csbgen/rogue_texstate.xml
@@ -336,4 +336,10 @@ SOFTWARE.
                <field name="dadjust" start="0" end="12" type="DADJUST"/>
        </struct>
 
+       <struct name="SAMPLER_WORD1" length="2">
+               <field name="cemedge_dontfilter" start="63" end="63" type="bool"/>
+               <field name="texaddr_plane3" start="24" end="61" shift="2" type="address"/>
+               <field name="texaddr_plane2_hi" start="0" end="23" shift="16" type="address"/>
+       </struct>
+
 </csbgen>
diff --git a/src/imagination/include/hwdef/rogue_hw_defs.h b/src/imagination/include/hwdef/rogue_hw_defs.h
index 1ee1eb5ee34..e66f50537a0 100644
--- a/src/imagination/include/hwdef/rogue_hw_defs.h
+++ b/src/imagination/include/hwdef/rogue_hw_defs.h
@@ -90,6 +90,9 @@
 /* Number of TEXSTATE_IMAGE_WORD values that need setting up. */
 #define ROGUE_NUM_TEXSTATE_IMAGE_WORDS 2U
 
+/* Number of TEXSTATE_SAMPLER state words that need setting up. */
+#define ROGUE_NUM_TEXSTATE_SAMPLER_WORDS 2U
+
 #define ROGUE_MAX_RENDER_TARGETS 2048U
 
 /* 12 dwords reserved for shared register management. The first dword is the
diff --git a/src/imagination/vulkan/pvr_cmd_buffer.c b/src/imagination/vulkan/pvr_cmd_buffer.c
index 9e7d02d7882..8cc6e1db876 100644
--- a/src/imagination/vulkan/pvr_cmd_buffer.c
+++ b/src/imagination/vulkan/pvr_cmd_buffer.c
@@ -1094,7 +1094,7 @@ pvr_sub_cmd_compute_job_init(const struct pvr_physical_device *pdevice,
       value.border_colour_table_address = PVR_DEV_ADDR_INVALID;
    }
 
-   sub_cmd->num_shared_regs = MAX2(PVR_IDF_WDF_IN_REGISTER_CONST_COUNT,
+   sub_cmd->num_shared_regs = MAX2(cmd_buffer->device->idfwdf_state.usc_shareds,
                                    cmd_buffer->state.max_shared_regs);
 
    cmd_buffer->state.max_shared_regs = 0U;
diff --git a/src/imagination/vulkan/pvr_device.c b/src/imagination/vulkan/pvr_device.c
index 16407fff347..f74fd092000 100644
--- a/src/imagination/vulkan/pvr_device.c
+++ b/src/imagination/vulkan/pvr_device.c
@@ -39,16 +39,19 @@
 #include <xf86drm.h>
 
 #include "hwdef/rogue_hw_utils.h"
+#include "pipe/p_defines.h"
 #include "pvr_bo.h"
 #include "pvr_csb.h"
 #include "pvr_csb_enum_helpers.h"
 #include "pvr_debug.h"
 #include "pvr_device_info.h"
+#include "pvr_hardcode.h"
 #include "pvr_job_render.h"
 #include "pvr_limits.h"
 #include "pvr_nop_usc.h"
 #include "pvr_pds.h"
 #include "pvr_private.h"
+#include "pvr_tex_state.h"
 #include "pvr_types.h"
 #include "pvr_winsys.h"
 #include "rogue/rogue_compiler.h"
@@ -1177,6 +1180,289 @@ static VkResult pvr_device_init_compute_fence_program(struct pvr_device *device)
    return result;
 }
 
+/**
+ * Generates one variant of the IDF/WDF PDS program and uploads it.
+ *
+ * The data segment is generated at the start of a temporary host staging
+ * buffer and the code segment directly after it; both are then handed to
+ * pvr_gpu_upload_pds(). The staging buffer is always freed before returning,
+ * whether or not the upload succeeded.
+ *
+ * \param[in] device      Device used for the host allocation and the upload.
+ * \param[in,out] program Program description. Its code_size and data_size
+ *                        are recomputed by the PDS_GENERATE_SIZES pass.
+ * \param[out] upload_out Filled in by pvr_gpu_upload_pds().
+ * \return VK_SUCCESS, VK_ERROR_OUT_OF_HOST_MEMORY if the staging buffer could
+ *         not be allocated, or the error returned by pvr_gpu_upload_pds().
+ */
+static VkResult pvr_pds_idfwdf_program_generate_and_upload(
+   struct pvr_device *device,
+   struct pvr_pds_vertex_shader_sa_program *const program,
+   struct pvr_pds_upload *const upload_out)
+{
+   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
+   size_t staging_buffer_size;
+   uint32_t *staging_buffer;
+   VkResult result;
+
+   pvr_pds_vertex_shader_sa(program, NULL, PDS_GENERATE_SIZES, dev_info);
+
+   staging_buffer_size =
+      (program->code_size + program->data_size) * sizeof(*staging_buffer);
+
+   staging_buffer = vk_alloc(&device->vk.alloc,
+                             staging_buffer_size,
+                             8,
+                             VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+   if (!staging_buffer)
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   /* FIXME: Add support for PDS_GENERATE_CODEDATA_SEGMENTS? */
+   pvr_pds_vertex_shader_sa(program,
+                            staging_buffer,
+                            PDS_GENERATE_DATA_SEGMENT,
+                            dev_info);
+   pvr_pds_vertex_shader_sa(program,
+                            &staging_buffer[program->data_size],
+                            PDS_GENERATE_CODE_SEGMENT,
+                            dev_info);
+
+   /* FIXME: Figure out the define for alignment of 16. */
+   result = pvr_gpu_upload_pds(device,
+                               &staging_buffer[0],
+                               program->data_size,
+                               16,
+                               &staging_buffer[program->data_size],
+                               program->code_size,
+                               16,
+                               16,
+                               upload_out);
+
+   vk_free(&device->vk.alloc, staging_buffer);
+
+   return result;
+}
+
+/**
+ * Creates and uploads the PDS program(s) that kick the IDF/WDF USC program.
+ *
+ * The program DMAs \p shareds shared registers from \p shareds_buffer_addr
+ * into the USC's Common Store and kicks the USC program at \p usc_addr,
+ * passing \p temps to pvr_pds_setup_doutu().
+ *
+ * When PVR_NEED_SW_COMPUTE_PDS_BARRIER() is true for the device, a variant
+ * with clear_pds_barrier set is uploaded to \p sw_compute_barrier_upload_out
+ * first; otherwise that upload's pvr_bo is set to NULL. The variant with
+ * clear_pds_barrier unset is always uploaded to \p upload_out.
+ *
+ * Each variant uses its own staging buffer. An earlier version resized one
+ * buffer with vk_realloc() and assigned the result over the only pointer to
+ * it, which leaked the buffer when the reallocation failed.
+ *
+ * \return VK_SUCCESS on success. On failure no buffer object uploaded by this
+ *         function is left allocated and neither output may be used.
+ */
+static VkResult pvr_pds_idfwdf_programs_create_and_upload(
+   struct pvr_device *device,
+   pvr_dev_addr_t usc_addr,
+   uint32_t shareds,
+   uint32_t temps,
+   pvr_dev_addr_t shareds_buffer_addr,
+   struct pvr_pds_upload *const upload_out,
+   struct pvr_pds_upload *const sw_compute_barrier_upload_out)
+{
+   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
+   struct pvr_pds_vertex_shader_sa_program program = {
+      .kick_usc = true,
+      .clear_pds_barrier = PVR_NEED_SW_COMPUTE_PDS_BARRIER(dev_info),
+   };
+   VkResult result;
+
+   /* We'll need to DMA the shareds into the USC's Common Store. */
+   program.num_dma_kicks = pvr_pds_encode_dma_burst(program.dma_control,
+                                                    program.dma_address,
+                                                    0,
+                                                    shareds,
+                                                    shareds_buffer_addr.addr,
+                                                    dev_info);
+
+   /* Fill in the USC task control used to kick the USC program. */
+   pvr_pds_setup_doutu(&program.usc_task_control,
+                       usc_addr.addr,
+                       temps,
+                       PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
+                       false);
+
+   if (PVR_NEED_SW_COMPUTE_PDS_BARRIER(dev_info)) {
+      /* program.clear_pds_barrier is still set here, so this is the
+       * SW_COMPUTE_PDS_BARRIER variant.
+       */
+      result =
+         pvr_pds_idfwdf_program_generate_and_upload(device,
+                                                    &program,
+                                                    sw_compute_barrier_upload_out);
+      if (result != VK_SUCCESS)
+         return result;
+
+      program.clear_pds_barrier = false;
+   } else {
+      *sw_compute_barrier_upload_out = (struct pvr_pds_upload){
+         .pvr_bo = NULL,
+      };
+   }
+
+   result =
+      pvr_pds_idfwdf_program_generate_and_upload(device, &program, upload_out);
+   if (result != VK_SUCCESS) {
+      /* pvr_bo is NULL here when no barrier variant was uploaded. */
+      pvr_bo_free(device, sw_compute_barrier_upload_out->pvr_bo);
+
+      return result;
+   }
+
+   return VK_SUCCESS;
+}
+
+/**
+ * Creates the device state used by the IDF/WDF compute barrier program.
+ *
+ * In order, this uploads the USC program, allocates the buffer the program
+ * stores to and the CPU mapped buffer holding the initial shared register
+ * values, packs the image and sampler state words, fills and unmaps the
+ * shareds buffer, and finally creates the PDS program(s).
+ *
+ * \return VK_SUCCESS on success. On failure every buffer object allocated
+ *         here is freed again and the failing call's result is returned.
+ */
+static VkResult pvr_device_init_compute_idfwdf_state(struct pvr_device *device)
+{
+   uint64_t sampler_words[ROGUE_NUM_TEXSTATE_SAMPLER_WORDS];
+   uint64_t image_words[ROGUE_NUM_TEXSTATE_IMAGE_WORDS];
+   const struct rogue_shader_binary *usc_binary;
+   struct pvr_texture_state_info tex_info;
+   uint32_t usc_shareds;
+   uint32_t usc_temps;
+   uint64_t store_addr;
+   uint32_t *shareds;
+   uint32_t next;
+   VkResult result;
+
+   pvr_hard_code_get_idfwdf_program(&device->pdevice->dev_info,
+                                    &usc_binary,
+                                    &usc_shareds,
+                                    &usc_temps);
+
+   device->idfwdf_state.usc_shareds = usc_shareds;
+
+   /* FIXME: Figure out the define for alignment of 16. */
+   result = pvr_gpu_upload_usc(device,
+                               usc_binary->data,
+                               usc_binary->size,
+                               16,
+                               &device->idfwdf_state.usc);
+   if (result != VK_SUCCESS)
+      return result;
+
+   /* TODO: Get the store buffer size from the compiler? */
+   /* TODO: How was the size derived here? */
+   /* NOTE(review): 4 * sizeof(float) * 4 * 2 equals the size of the 4x2
+    * R32G32B32A32_SFLOAT texture described below; presumably that is the
+    * derivation. Confirm.
+    */
+   result = pvr_bo_alloc(device,
+                         device->heaps.general_heap,
+                         4 * sizeof(float) * 4 * 2,
+                         4,
+                         0,
+                         &device->idfwdf_state.store_bo);
+   if (result != VK_SUCCESS)
+      goto err_free_usc_program;
+
+   result = pvr_bo_alloc(device,
+                         device->heaps.general_heap,
+                         usc_shareds * ROGUE_REG_SIZE_BYTES,
+                         ROGUE_REG_SIZE_BYTES,
+                         PVR_BO_ALLOC_FLAG_CPU_MAPPED,
+                         &device->idfwdf_state.shareds_bo);
+   if (result != VK_SUCCESS)
+      goto err_free_store_buffer;
+
+   store_addr = device->idfwdf_state.store_bo->vma->dev_addr.addr;
+
+   /* Image state: a linear texture placed over the store buffer. */
+
+   tex_info = (struct pvr_texture_state_info){
+      .format = VK_FORMAT_R32G32B32A32_SFLOAT,
+      .mem_layout = PVR_MEMLAYOUT_LINEAR,
+      .flags = PVR_TEXFLAGS_INDEX_LOOKUP,
+      /* TODO: Is this correct? Is it 2D, 3D, or 2D_ARRAY? */
+      .type = VK_IMAGE_VIEW_TYPE_2D,
+      .extent = { .width = 4, .height = 2, .depth = 0 },
+      .mip_levels = 1,
+      .sample_count = 1,
+      .stride = 4,
+      .swizzle = { PIPE_SWIZZLE_X,
+                   PIPE_SWIZZLE_Y,
+                   PIPE_SWIZZLE_Z,
+                   PIPE_SWIZZLE_W },
+      .addr = device->idfwdf_state.store_bo->vma->dev_addr,
+   };
+
+   result = pvr_pack_tex_state(device, &tex_info, image_words);
+   if (result != VK_SUCCESS)
+      goto err_free_shareds_buffer;
+
+   /* Sampler state: point filtering, clamp to edge. Word 1 keeps the values
+    * the pack macro gives it when no field is set.
+    */
+
+   pvr_csb_pack (&sampler_words[0], TEXSTATE_SAMPLER, sampler) {
+      sampler.dadjust = PVRX(TEXSTATE_DADJUST_ZERO_UINT);
+      sampler.magfilter = PVRX(TEXSTATE_FILTER_POINT);
+      sampler.addrmode_u = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE);
+      sampler.addrmode_v = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE);
+   }
+
+   /* clang-format off */
+   pvr_csb_pack (&sampler_words[1], TEXSTATE_SAMPLER_WORD1, sampler_word1) {}
+   /* clang-format on */
+
+   STATIC_ASSERT(1 + 1 == ROGUE_NUM_TEXSTATE_SAMPLER_WORDS);
+
+   /* Fill the shareds buffer through its CPU mapping. */
+
+   shareds = (uint32_t *)device->idfwdf_state.shareds_bo->bo->map;
+
+   /* TODO: Should we use compiler info to setup the shareds data instead of
+    * assuming there's always 12 and this is how they should be setup?
+    */
+
+   /* NOTE(review): the store address is written high dword first, whereas
+    * the state words below are written low dword first. Presumably this is
+    * what the USC program expects; confirm.
+    */
+   shareds[0] = (uint32_t)(store_addr >> 32U);
+   shareds[1] = (uint32_t)store_addr;
+
+   /* Pad the shareds as the texture/sample state words are 128 bit aligned. */
+   shareds[2] = 0U;
+   shareds[3] = 0U;
+
+   /* Dwords 4..7 hold the image words and 8..11 the sampler words, each
+    * 64-bit word split into its low dword followed by its high dword.
+    */
+   next = 4U;
+
+   for (uint32_t i = 0U; i < ROGUE_NUM_TEXSTATE_IMAGE_WORDS; i++) {
+      shareds[next++] = (uint32_t)image_words[i];
+      shareds[next++] = (uint32_t)(image_words[i] >> 32U);
+   }
+
+   for (uint32_t i = 0U; i < ROGUE_NUM_TEXSTATE_SAMPLER_WORDS; i++) {
+      shareds[next++] = (uint32_t)sampler_words[i];
+      shareds[next++] = (uint32_t)(sampler_words[i] >> 32U);
+   }
+
+   assert(11 + 1 == usc_shareds);
+
+   pvr_bo_cpu_unmap(device, device->idfwdf_state.shareds_bo);
+
+   /* Generate and upload PDS programs. */
+   result = pvr_pds_idfwdf_programs_create_and_upload(
+      device,
+      device->idfwdf_state.usc->vma->dev_addr,
+      usc_shareds,
+      usc_temps,
+      device->idfwdf_state.shareds_bo->vma->dev_addr,
+      &device->idfwdf_state.pds,
+      &device->idfwdf_state.sw_compute_barrier_pds);
+   if (result != VK_SUCCESS)
+      goto err_free_shareds_buffer;
+
+   return VK_SUCCESS;
+
+err_free_shareds_buffer:
+   pvr_bo_free(device, device->idfwdf_state.shareds_bo);
+
+err_free_store_buffer:
+   pvr_bo_free(device, device->idfwdf_state.store_bo);
+
+err_free_usc_program:
+   pvr_bo_free(device, device->idfwdf_state.usc);
+
+   return result;
+}
+
+/**
+ * Frees every buffer object held in device->idfwdf_state.
+ *
+ * The PDS uploads are freed first, then the shareds and store buffers and
+ * the USC program last.
+ */
+static void pvr_device_finish_compute_idfwdf_state(struct pvr_device *device)
+{
+   struct pvr_bo *const bos[] = {
+      device->idfwdf_state.pds.pvr_bo,
+      device->idfwdf_state.sw_compute_barrier_pds.pvr_bo,
+      device->idfwdf_state.shareds_bo,
+      device->idfwdf_state.store_bo,
+      device->idfwdf_state.usc,
+   };
+
+   for (size_t i = 0; i < sizeof(bos) / sizeof(bos[0]); i++)
+      pvr_bo_free(device, bos[i]);
+}
+
 /* FIXME: We should be calculating the size when we upload the code in
  * pvr_srv_setup_static_pixel_event_program().
  */
@@ -1358,10 +1644,14 @@ VkResult pvr_CreateDevice(VkPhysicalDevice physicalDevice,
    if (result != VK_SUCCESS)
       goto err_pvr_free_nop_program;
 
-   result = pvr_queues_create(device, pCreateInfo);
+   result = pvr_device_init_compute_idfwdf_state(device);
    if (result != VK_SUCCESS)
       goto err_pvr_free_compute_fence;
 
+   result = pvr_queues_create(device, pCreateInfo);
+   if (result != VK_SUCCESS)
+      goto err_pvr_finish_compute_idfwdf;
+
    pvr_device_init_default_sampler_state(device);
 
    if (pCreateInfo->pEnabledFeatures)
@@ -1384,6 +1674,9 @@ VkResult pvr_CreateDevice(VkPhysicalDevice physicalDevice,
 
    return VK_SUCCESS;
 
+err_pvr_finish_compute_idfwdf:
+   pvr_device_finish_compute_idfwdf_state(device);
+
 err_pvr_free_compute_fence:
    pvr_bo_free(device, device->pds_compute_fence_program.pvr_bo);
 
@@ -1418,6 +1711,7 @@ void pvr_DestroyDevice(VkDevice _device,
    PVR_FROM_HANDLE(pvr_device, device, _device);
 
    pvr_queues_destroy(device);
+   pvr_device_finish_compute_idfwdf_state(device);
    pvr_bo_free(device, device->pds_compute_fence_program.pvr_bo);
    pvr_bo_free(device, device->nop_program.pds.pvr_bo);
    pvr_bo_free(device, device->nop_program.usc);
diff --git a/src/imagination/vulkan/pvr_formats.c b/src/imagination/vulkan/pvr_formats.c
index c7ae23116cb..c84985e508d 100644
--- a/src/imagination/vulkan/pvr_formats.c
+++ b/src/imagination/vulkan/pvr_formats.c
@@ -55,6 +55,8 @@ static const struct pvr_format pvr_format_table[] = {
    FORMAT(R32_UINT, U32, U32),
    /* VK_FORMAT_R32G32B32A32_UINT = 107. */
    FORMAT(R32G32B32A32_UINT, U32U32U32U32, U32U32U32U32),
+   /* VK_FORMAT_R32G32B32A32_SFLOAT = 109. */
+   FORMAT(R32G32B32A32_SFLOAT, F32F32F32F32, F32F32F32F32),
    /* VK_FORMAT_D32_SFLOAT = 126. */
    FORMAT(D32_SFLOAT, F32, F32),
 };
diff --git a/src/imagination/vulkan/pvr_hardcode.c b/src/imagination/vulkan/pvr_hardcode.c
index d9f651d0bd5..5296b7db259 100644
--- a/src/imagination/vulkan/pvr_hardcode.c
+++ b/src/imagination/vulkan/pvr_hardcode.c
@@ -332,3 +332,20 @@ void pvr_hard_code_graphics_get_build_info(
       unreachable("Unsupported stage.");
    }
 }
+
+/**
+ * Returns the hard coded IDF/WDF USC program and its register requirements.
+ *
+ * No real program is available yet: an error is logged and an 8 byte,
+ * all-zero placeholder binary with static storage duration is returned,
+ * together with 12 shared registers and 4 temporaries. \p dev_info is
+ * currently not read.
+ */
+void pvr_hard_code_get_idfwdf_program(
+   const struct pvr_device_info *const dev_info,
+   const struct rogue_shader_binary **const program_out,
+   uint32_t *usc_shareds_out,
+   uint32_t *usc_temps_out)
+{
+   static const struct rogue_shader_binary empty_program = {
+      .data = { 0, 0, 0, 0, 0, 0, 0, 0 },
+      .size = 8U,
+   };
+
+   mesa_loge("No hard coded idfwdf program. Returning empty program.");
+
+   *usc_temps_out = 4U;
+   *usc_shareds_out = 12U;
+   *program_out = &empty_program;
+}
diff --git a/src/imagination/vulkan/pvr_hardcode.h b/src/imagination/vulkan/pvr_hardcode.h
index 0661426dc26..ea0fecb130b 100644
--- a/src/imagination/vulkan/pvr_hardcode.h
+++ b/src/imagination/vulkan/pvr_hardcode.h
@@ -119,4 +119,10 @@ void pvr_hard_code_graphics_get_build_info(
    struct rogue_build_data *const build_data,
    struct pvr_explicit_constant_usage *const explicit_const_usage);
 
+/* Returns the hard coded IDF/WDF USC program through program_out, and the
+ * number of shared and temporary registers it needs through usc_shareds_out
+ * and usc_temps_out. All three output pointers are written unconditionally.
+ */
+void pvr_hard_code_get_idfwdf_program(
+   const struct pvr_device_info *const dev_info,
+   const struct rogue_shader_binary **const program_out,
+   uint32_t *usc_shareds_out,
+   uint32_t *usc_temps_out);
+
 #endif /* PVR_HARDCODE_SHADERS_H */
diff --git a/src/imagination/vulkan/pvr_private.h b/src/imagination/vulkan/pvr_private.h
index c514e697d26..466fa3d7b88 100644
--- a/src/imagination/vulkan/pvr_private.h
+++ b/src/imagination/vulkan/pvr_private.h
@@ -279,6 +279,20 @@ struct pvr_device {
       struct pvr_bo *usc;
    } nop_program;
 
+   /* Issue Data Fence, Wait for Data Fence state. */
+   struct {
+      uint32_t usc_shareds;
+      struct pvr_bo *usc;
+
+      /* Buffer in which the IDF/WDF program performs store ops. */
+      struct pvr_bo *store_bo;
+      /* Contains the initialization values for the shared registers. */
+      struct pvr_bo *shareds_bo;
+
+      struct pvr_pds_upload pds;
+      struct pvr_pds_upload sw_compute_barrier_pds;
+   } idfwdf_state;
+
    VkPhysicalDeviceFeatures features;
 };
 
diff --git a/src/imagination/vulkan/pvr_tex_state.c b/src/imagination/vulkan/pvr_tex_state.c
index 2df77f564d7..f37a06349db 100644
--- a/src/imagination/vulkan/pvr_tex_state.c
+++ b/src/imagination/vulkan/pvr_tex_state.c
@@ -25,6 +25,7 @@
 #include <vulkan/vulkan.h>
 
 #include "hwdef/rogue_hw_defs.h"
+#include "pipe/p_defines.h"
 #include "pvr_csb.h"
 #include "pvr_device_info.h"
 #include "pvr_formats.h"

Reply via email to