On 08/26/2013 03:12 PM, Paul Berry wrote:
Previously, we would always use the same push constant allocation
regardless of what shader programs were being run: the available push
constant space was split into 2 equal size partitions, one for the
vertex shader, and one for the fragment shader.

Now that we are adding geometry shader support, we need to do
something smarter.  This patch adjusts things so that when a geometry
shader is in use, we split the available push constant space into 3
nearly-equal size partitions instead of 2.

Since the push constant allocation is now affected by GL state, it can
no longer be set up by brw_upload_initial_gpu_state(); instead it must
be set up by a state atom.
---
  src/mesa/drivers/dri/i965/brw_context.h      |   3 +-
  src/mesa/drivers/dri/i965/brw_defines.h      |   1 +
  src/mesa/drivers/dri/i965/brw_state.h        |   4 +-
  src/mesa/drivers/dri/i965/brw_state_upload.c |   5 +-
  src/mesa/drivers/dri/i965/gen7_blorp.cpp     |   6 ++
  src/mesa/drivers/dri/i965/gen7_urb.c         | 101 +++++++++++++++++++++++----
  6 files changed, 98 insertions(+), 22 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
b/src/mesa/drivers/dri/i965/brw_context.h
index 77f2a6b..95f9bb2 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1508,7 +1508,8 @@ gen6_get_sample_position(struct gl_context *ctx,

  /* gen7_urb.c */
  void
-gen7_allocate_push_constants(struct brw_context *brw);
+gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
+                              unsigned gs_size, unsigned fs_size);

  void
  gen7_emit_urb_state(struct brw_context *brw,
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
b/src/mesa/drivers/dri/i965/brw_defines.h
index 832ff55..8d9a824 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1284,6 +1284,7 @@ enum brw_message_target {
  # define GEN7_URB_STARTING_ADDRESS_SHIFT                25

  #define _3DSTATE_PUSH_CONSTANT_ALLOC_VS         0x7912 /* GEN7+ */
+#define _3DSTATE_PUSH_CONSTANT_ALLOC_GS         0x7915 /* GEN7+ */
  #define _3DSTATE_PUSH_CONSTANT_ALLOC_PS         0x7916 /* GEN7+ */
  # define GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT         16

diff --git a/src/mesa/drivers/dri/i965/brw_state.h 
b/src/mesa/drivers/dri/i965/brw_state.h
index 85f82fe..4814639 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -112,6 +112,7 @@ extern const struct brw_tracked_state 
gen7_cc_viewport_state_pointer;
  extern const struct brw_tracked_state gen7_clip_state;
  extern const struct brw_tracked_state gen7_disable_stages;
  extern const struct brw_tracked_state gen7_ps_state;
+extern const struct brw_tracked_state gen7_push_constant_space;
  extern const struct brw_tracked_state gen7_sbe_state;
  extern const struct brw_tracked_state gen7_sf_clip_viewport;
  extern const struct brw_tracked_state gen7_sf_state;
@@ -220,9 +221,6 @@ uint32_t
  get_attr_override(const struct brw_vue_map *vue_map, int 
urb_entry_read_offset,
                    int fs_attr, bool two_side_color, uint32_t 
*max_source_attr);

-/* gen7_urb.c */
-void gen7_allocate_push_constants(struct brw_context *brw);
-
  #ifdef __cplusplus
  }
  #endif
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c 
b/src/mesa/drivers/dri/i965/brw_state_upload.c
index b883002..9638c69 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -188,6 +188,7 @@ static const struct brw_tracked_state *gen7_atoms[] =
     &gen7_cc_viewport_state_pointer, /* must do after brw_cc_vp */
     &gen7_sf_clip_viewport,

+   &gen7_push_constant_space,
     &gen7_urb,
     &gen6_blend_state,             /* must do before cc unit */
     &gen6_color_calc_state,        /* must do before cc unit */
@@ -251,10 +252,6 @@ brw_upload_initial_gpu_state(struct brw_context *brw)
        return;

     brw_upload_invariant_state(brw);
-
-   if (brw->gen >= 7) {
-      gen7_allocate_push_constants(brw);
-   }
  }

  void brw_init_state( struct brw_context *brw )
diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp 
b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
index 6c798b1..9df3d92 100644
--- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
@@ -51,6 +51,12 @@ static void
  gen7_blorp_emit_urb_config(struct brw_context *brw,
                             const brw_blorp_params *params)
  {
+   unsigned urb_size = (brw->is_haswell && brw->gt == 3) ? 32 : 16;
+   gen7_emit_push_constant_state(brw,
+                                 urb_size / 2 /* vs_size */,
+                                 0 /* gs_size */,
+                                 urb_size / 2 /* fs_size */);
+
     /* The minimum valid number of VS entries is 32. See 3DSTATE_URB_VS, Dword
      * 1.15:0 "VS Number of URB Entries".
      */
diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c 
b/src/mesa/drivers/dri/i965/gen7_urb.c
index 2d10cc12..4dc8f6e 100644
--- a/src/mesa/drivers/dri/i965/gen7_urb.c
+++ b/src/mesa/drivers/dri/i965/gen7_urb.c
@@ -30,12 +30,12 @@
  /**
   * The following diagram shows how we partition the URB:
   *
- *      8kB         8kB              Rest of the URB space
- *   ____-____   ____-____   _________________-_________________
- *  /         \ /         \ /                                   \
+ *        16kB or 32kB               Rest of the URB space
+ *   __________-__________   _________________-_________________
+ *  /                     \ /                                   \
   * +-------------------------------------------------------------+
- * | VS Push   | FS Push   | VS                                  |
- * | Constants | Constants | Handles                             |
+ * |     VS/FS/GS Push     |              VS/GS URB              |
+ * |       Constants       |               Entries               |
   * +-------------------------------------------------------------+
   *
   * Notably, push constants must be stored at the beginning of the URB
@@ -43,8 +43,12 @@
   * GT1/GT2 have a maximum constant buffer size of 16kB, while Haswell GT3
   * doubles this (32kB).
   *
- * Currently we split the constant buffer space evenly between VS and FS.
- * This is probably not ideal, but simple.
+ * Ivybridge and Haswell GT1/GT2 allow push constants to be located (and
+ * sized) in increments of 1kB.  Haswell GT3 requires them to be located and
+ * sized in increments of 2kB.
+ *
+ * Currently we split the constant buffer space evenly among whatever stages
+ * are active.  This is probably not ideal, but simple.
   *
   * Ivybridge GT1 and Haswell GT1 have 128kB of URB space.
   * Ivybridge GT2 and Haswell GT2 have 256kB of URB space.
@@ -53,22 +57,91 @@
   * See "Volume 2a: 3D Pipeline," section 1.8, "Volume 1b: Configurations",
   * and the documentation for 3DSTATE_PUSH_CONSTANT_ALLOC_xS.
   */
-void
+static void
  gen7_allocate_push_constants(struct brw_context *brw)
  {
-   unsigned size = 8;
-   if (brw->is_haswell && brw->gt == 3)
-      size = 16;
+   unsigned avail_size = 16;
+   unsigned multiplier = (brw->is_haswell && brw->gt == 3) ? 2 : 1;
+
+   /* BRW_NEW_GEOMETRY_PROGRAM */
+   bool gs_present = brw->geometry_program;
+
+   unsigned vs_size, gs_size;
+   if (gs_present) {
+      vs_size = avail_size / 3;
+      avail_size -= vs_size;
+      gs_size = avail_size / 2;
+      avail_size -= gs_size;
+   } else {
+      vs_size = avail_size / 2;
+      avail_size -= vs_size;
+      gs_size = 0;
+   }
+   unsigned fs_size = avail_size;
+
+   gen7_emit_push_constant_state(brw, multiplier * vs_size,
+                                 multiplier * gs_size, multiplier * fs_size);
+}
+
+void
+gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
+                              unsigned gs_size, unsigned fs_size)
+{
+   unsigned offset = 0;

-   BEGIN_BATCH(4);
+   BEGIN_BATCH(6);
     OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_VS << 16 | (2 - 2));
-   OUT_BATCH(size);
+   OUT_BATCH(vs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
+   offset += vs_size;
+
+   OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_GS << 16 | (2 - 2));
+   OUT_BATCH(gs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
+   offset += gs_size;

     OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_PS << 16 | (2 - 2));
-   OUT_BATCH(size | size << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
+   OUT_BATCH(offset | fs_size << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
     ADVANCE_BATCH();
+
+   /* From p292 of the Ivy Bridge PRM (11.2.4 3DSTATE_PUSH_CONSTANT_ALLOC_PS):
+    *
+    *     A PIPE_CONTOL command with the CS Stall bit set must be programmed
+    *     in the ring after this instruction.
+    *
+    * No such restriction exists for Haswell.
+    */
+   if (!brw->is_haswell) {
+      BEGIN_BATCH(4);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2));
+      /* From p61 of the Ivy Bridge PRM (1.10.4 PIPE_CONTROL Command: DW1[20]
+       * CS Stall):
+       *
+       *     One of the following must also be set:
+       *     - Render Target Cache Flush Enable ([12] of DW1)
+       *     - Depth Cache Flush Enable ([0] of DW1)
+       *     - Stall at Pixel Scoreboard ([1] of DW1)
+       *     - Depth Stall ([13] of DW1)
+       *     - Post-Sync Operation ([13] of DW1)
+       *
+       * We choose to do a Post-Sync Operation (Write Immediate Data), since
+       * it seems like it will incur the least additional performance penalty.
+       */
+      OUT_BATCH(PIPE_CONTROL_CS_STALL | PIPE_CONTROL_WRITE_IMMEDIATE);
+      OUT_RELOC(brw->batch.workaround_bo,
+                I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, 0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }

This is really a bug fix, since it's implementing a hardware workaround on a packet we've always been emitting. I'd really like to see it in a separate patch, for easier bisectability.

Interestingly, this text doesn't appear in the latest documentation, and I don't see it in the workaround database. It is definitely there in the PRM, though. I'm not sure what to make of it.

  }

+const struct brw_tracked_state gen7_push_constant_space = {
+   .dirty = {
+      .mesa = 0,
+      .brw = BRW_NEW_CONTEXT | BRW_NEW_GEOMETRY_PROGRAM,

It's worth noting that this will cause a full pipeline stall whenever the current geometry shader changes (since 3DSTATE_PUSH_CONSTANT_ALLOC_* are non-pipelined). We could avoid this when switching between two different geometry shaders. But I suspect it's premature to care about that. Not sure it'll matter much anyway.

+      .cache = 0,
+   },
+   .emit = gen7_allocate_push_constants,
+};
+
  static void
  gen7_upload_urb(struct brw_context *brw)
  {


Other than pulling the PIPE_CONTROL into a separate patch, this looks fine to me.
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to