On Fri, Jul 14, 2017 at 2:42 AM, Nanley Chery <nanleych...@gmail.com> wrote:
> On Mon, Jul 10, 2017 at 10:14:21AM -0700, Jason Ekstrand wrote: > > On Wed, Jun 28, 2017 at 2:14 PM, Nanley Chery <nanleych...@gmail.com> > wrote: > > > > > v2: Expound on comment for the pipe controls (Jason Ekstrand). > > > > > > Signed-off-by: Nanley Chery <nanley.g.ch...@intel.com> > > > --- > > > src/intel/vulkan/anv_blorp.c | 4 +- > > > src/intel/vulkan/genX_cmd_buffer.c | 183 > ++++++++++++++++++++++++++++++ > > > +++---- > > > 2 files changed, 167 insertions(+), 20 deletions(-) > > > > > > diff --git a/src/intel/vulkan/anv_blorp.c > b/src/intel/vulkan/anv_blorp.c > > > index 459d57ec57..84b01e8792 100644 > > > --- a/src/intel/vulkan/anv_blorp.c > > > +++ b/src/intel/vulkan/anv_blorp.c > > > @@ -1451,7 +1451,9 @@ anv_image_ccs_clear(struct anv_cmd_buffer > > > *cmd_buffer, > > > > > > struct blorp_surf surf; > > > get_blorp_surf_for_anv_image(image, VK_IMAGE_ASPECT_COLOR_BIT, > > > - image->aux_usage, &surf); > > > + image->aux_usage == > ISL_AUX_USAGE_CCS_E ? > > > + ISL_AUX_USAGE_CCS_E : > ISL_AUX_USAGE_CCS_D, > > > + &surf); > > > > > > /* From the Sky Lake PRM Vol. 7, "Render Target Fast Clear": > > > * > > > diff --git a/src/intel/vulkan/genX_cmd_buffer.c > > > b/src/intel/vulkan/genX_cmd_buffer.c > > > index decf0b28d6..1a9b841c7c 100644 > > > --- a/src/intel/vulkan/genX_cmd_buffer.c > > > +++ b/src/intel/vulkan/genX_cmd_buffer.c > > > @@ -524,6 +524,17 @@ genX(copy_fast_clear_dwords)(struct > anv_cmd_buffer > > > *cmd_buffer, > > > } > > > } > > > > > > +/** > > > + * @brief Transitions a color buffer from one layout to another. > > > + * > > > + * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 > spec > > > for > > > + * more information. > > > + * > > > + * @param level_count VK_REMAINING_MIP_LEVELS isn't supported. > > > + * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For > 3D > > > images, > > > + * this represents the maximum layers to > transition at > > > each > > > + * specified miplevel. > > > + */ > > > static void > > > transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, > > > const struct anv_image *image, > > > @@ -532,13 +543,27 @@ transition_color_buffer(struct anv_cmd_buffer > > > *cmd_buffer, > > > VkImageLayout initial_layout, > > > VkImageLayout final_layout) > > > { > > > - assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); > > > - > > > - if (image->aux_surface.isl.size == 0) > > > - return; > > > - > > > - if (initial_layout != VK_IMAGE_LAYOUT_UNDEFINED && > > > - initial_layout != VK_IMAGE_LAYOUT_PREINITIALIZED) > > > + /* Validate the inputs. */ > > > + assert(cmd_buffer); > > > + assert(image && image->aspects == VK_IMAGE_ASPECT_COLOR_BIT); > > > + /* These values aren't supported for simplicity's sake. */ > > > + assert(level_count != VK_REMAINING_MIP_LEVELS && > > > + layer_count != VK_REMAINING_ARRAY_LAYERS); > > > + /* Ensure the subresource range is valid. */ > > > + uint64_t last_level_num = base_level + level_count; > > > + const uint32_t max_depth = anv_minify(image->extent.depth, > > > base_level); > > > + const uint32_t image_layers = MAX2(image->array_size, max_depth); > > > + assert(base_layer + layer_count <= image_layers); > > > + assert(last_level_num <= image->levels); > > > + /* The spec disallows these final layouts. */ > > > + assert(final_layout != VK_IMAGE_LAYOUT_UNDEFINED && > > > + final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED); > > > + > > > + /* No work is necessary if the layout stays the same or if this > > > subresource > > > + * range lacks auxiliary data. > > > + */ > > > + if (initial_layout == final_layout || > > > + base_layer >= anv_image_aux_layers(image, base_level)) > > > return; > > > > > > /* A transition of a 3D subresource works on all slices at a time. > */ > > > @@ -549,22 +574,142 @@ transition_color_buffer(struct anv_cmd_buffer > > > *cmd_buffer, > > > > > > /* We're interested in the subresource range subset that has aux > data. > > > */ > > > level_count = MIN2(level_count, anv_image_aux_levels(image)); > > > + layer_count = MIN2(layer_count, anv_image_aux_layers(image, > > > base_level)); > > > > > > > Is this correct? I think we want MIN2(layer_count, > anv_image_aux_layers() > > - base_layer), don't we? This would also mean there's a bug in the > current > > level_count. > > > > > > I see nothing wrong with it. The current equation limits the > user-specified the layer_count to the actual number of layers which can > be transitioned. What's the meaning behind subtracting base_layer? > Because anv_image_aux_layers is the number of layers relative to 0 not relative to base_layer. The layer_count parameter, on the other hand, is relative to base_layer. Put differently, the maximum layer they will touch is base_layer + layer_count - 1 where the max layer in the image is anv_image_aux_layers() - 1. So, if we're going to clamp layer_count, we need to clamp it to anv_imagae_aux_layers() - base_layer. > > > + last_level_num = base_level + level_count; > > > + > > > + /* Record whether or not the layout is undefined. Pre-initialized > > > images > > > + * with auxiliary buffers have a non-linear layout and are thus > > > undefined. > > > + */ > > > + assert(image->tiling == VK_IMAGE_TILING_OPTIMAL); > > > + const bool undef_layout = initial_layout == > VK_IMAGE_LAYOUT_UNDEFINED > > > || > > > + initial_layout == VK_IMAGE_LAYOUT_ > > > PREINITIALIZED; > > > > > > - /* We're transitioning from an undefined layout. We must ensure > that > > > the > > > - * clear values buffer is filled with valid data. > > > + /* Do preparatory work before the resolve operation or return > early if > > > no > > > + * resolve is actually needed. > > > */ > > > - for (unsigned l = 0; l < level_count; l++) > > > - init_fast_clear_state_entry(cmd_buffer, image, base_level + l); > > > - > > > - if (image->aux_usage == ISL_AUX_USAGE_CCS_E) { > > > - /* We're transitioning from an undefined layout so it doesn't > really > > > - * matter what data ends up in the color buffer. We do, > however, > > > need to > > > - * ensure that the CCS has valid data in it. One easy way to do > > > that is > > > - * to fast-clear the specified range. > > > + if (undef_layout) { > > > + /* A subresource in the undefined layout may have been aliased > and > > > + * populated with any arrangement of bits. Therefore, we must > > > initialize > > > + * the related aux buffer and clear buffer entry with desirable > > > values. > > > + * > > > + * Initialize the relevant clear buffer entries. > > > */ > > > - anv_image_ccs_clear(cmd_buffer, image, base_level, level_count, > > > - base_layer, layer_count); > > > + for (unsigned level = base_level; level < last_level_num; > level++) > > > + init_fast_clear_state_entry(cmd_buffer, image, level); > > > + > > > + /* MCS buffers have been initialized for correct behaviour. */ > > > > > > > I don't think this is correct. I can definitely see MCS going off the > > rails in a similar way to CCS if things aren't initialized correctly. > > Unfortunately, MSAA test coverage is defintiely sub-par right now. I > think > > I'd feel more comfortable if we fast-cleared them too. Fortunately, > > though, we always leave basic MCS on for MSAA surfaces so no resolve will > > ever be needed. > > > > > > That's true. 2x and 8x MCS surfaces require that some of the upper bits > be zero, so they be should be fast-cleared as well. I'll make a separate > patch for fast-clearing MCS and CC it to stable. > > > > + if (image->samples > 1) > > > + return; > > > + > > > + /* Initialize the CCS buffers to enable correct rendering. This > > > operation > > > + * requires up to two steps: one to rid the CCS buffer of data > that > > > may > > > + * cause GPU hangs, and another to ensure that writes done > without > > > CCS > > > + * will be visible to reads done with CCS. > > > + * > > > + * On SKL+, it becomes possible to have a CCS buffer with > invalid > > > data. > > > + * One easy way to avoid this state is to fast-clear the > specified > > > range. > > > + */ > > > + if (GEN_GEN >= 9) { > > > + anv_image_ccs_clear(cmd_buffer, image, base_level, > level_count, > > > + base_layer, layer_count); > > > + } > > > + /* At this point, some elements of the CCS buffer may have the > > > fast-clear > > > + * bit-arrangement. As the user writes to a subresource, we > need to > > > have > > > + * the associated CCS elements enter the ambiguated state. This > > > enables > > > + * reads (implicit or explicit) to reflect the user-written data > > > instead > > > + * of the clear color. The only time such elements will not > change > > > their > > > + * state as described above, is in a final layout that doesn't > have > > > CCS > > > + * enabled. In this case, we must force the associated CCS > buffers > > > of the > > > + * specified range to enter the ambiguated state in advance. > > > + */ > > > + if (image->aux_usage != ISL_AUX_USAGE_CCS_E && > > > + final_layout != VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) { > > > + /* The CCS_D buffer may not be enabled in the final layout. > > > Continue > > > + * executing this function to perform a resolve. > > > + */ > > > + anv_perf_warn("Performing an additional resolve for CCS_D > > > layout " > > > + "transition. Consider always leaving it on or > " > > > + "performing an ambiguation pass."); > > > + } else { > > > + /* Writes in the final layout will be aware of the CCS > buffer. In > > > + * addition, the clear buffer entries and the CCS buffers > have > > > been > > > + * populated with values that will result in correct > rendering. > > > + */ > > > + return; > > > > > > > Ugh... This all works but it makes me an even bigger fan of that > ambiguate > > pass... Maybe we should consider reviving it once this all lands. > > > > > > I can see why. It would simplify this chunk of code a lot. > > > > + } > > > + } else if (initial_layout != VK_IMAGE_LAYOUT_COLOR_ > ATTACHMENT_OPTIMAL) > > > { > > > + /* Resolves are only necessary if the subresource may contain > blocks > > > + * fast-cleared to values unsupported in other layouts. This > only > > > occurs > > > + * if the initial layout is COLOR_ATTACHMENT_OPTIMAL. > > > + */ > > > + return; > > > + } else if (image->samples > 1) { > > > + /* MCS buffers don't need resolving. */ > > > + return; > > > } > > > + > > > + /* Perform a resolve to synchronize data between the main and aux > > > buffer. > > > + * Before we begin, we must satisfy the cache flushing requirement > > > specified > > > + * in the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)": > > > + * > > > + * Any transition from any value in {Clear, Render, Resolve} to > a > > > + * different value in {Clear, Render, Resolve} requires end of > pipe > > > + * synchronization. > > > + * > > > + * We perform a flush of the write cache before and after the > clear and > > > + * resolve operations to meet this requirement. > > > + * > > > + * Unlike other drawing, it seems that fast clear operations are > not > > > + * properly synchronized. The first PIPE_CONTROL here likely > ensures > > > that > > > + * the contents of the previous render or clear hit the render > target > > > before > > > + * we resolve and the second likely ensures that the resolve is > > > complete > > > + * before we do any more rendering or clearing. > > > > > > > I'm really not a fan of how much this has been weasel-worded... > > > > <rant> > > One of the big problems with the PRMs is that they don't actually > describe > > the hardware. They describe how a theoretical driver can/should program > > the hardware to get correct rendering results. What they don't tell you > is > > *why* programming the hardware that way is needed. They get especially > bad > > about this around synchronization and image layout. They're also > > frequently missing important details and are sometimes flat-out wrong. > > > > The statement "fast clear ops are not properly synchronized with other > > drawing" describes, in some sense, what's going on in the hardware. You > > have two rendering operations in-flight and, due to some hardware detail > I > > don't fully understand, they can't safely be in the pipeline at the same > > time. It's not a full description I'll grant, but it at least describes, > > in general terms, the problem the workaround is solving. The language in > > the PRM, on the other hand, describes some engineer's mental model of > that > > synchronization problem and a way of working around it. > > > > We write drivers for hardware, not for PRMs. I don't think we need to > > weasel-word our comments to make it look like all we have is hair-brained > > theories while whoever wrote the PRM note has the actual knowledge. If > > we're particularly unsure about something then that should be expressed > in > > the comment. However, I'm pretty convinced that what's going on here is > a > > synchronization problem. Our understanding of fast-clear operations may > > improve in the future and, if/when it does, we'll update the comments. > > </rant> > > > > > > Thanks for sharing that point of view. I hadn't really thought that much > about how the PRMs were created. I agree that I can at least remove the > word "seems" from the comment while still being accurate. > > > > + */ > > > + cmd_buffer->state.pending_pipe_bits |= > > > + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT; > > > + > > > + for (uint32_t level = base_level; level < last_level_num; level++) > { > > > + > > > + /* The number of layers changes at each 3D miplevel. */ > > > + if (image->type == VK_IMAGE_TYPE_3D) { > > > + layer_count = MIN2(layer_count, anv_image_aux_layers(image, > > > level)); > > > + } > > > + > > > + /* Create a surface state with the right clear color and > perform the > > > + * resolve. > > > + */ > > > + struct anv_state surface_state = > > > + anv_cmd_buffer_alloc_surface_state(cmd_buffer); > > > + isl_surf_fill_state(&cmd_buffer->device->isl_dev, > > > surface_state.map, > > > + .surf = &image->color_surface.isl, > > > + .view = &(struct isl_view) { > > > + .usage = ISL_SURF_USAGE_RENDER_TARGET_ > BIT, > > > + .format = image->color_surface.isl. > format, > > > + .swizzle = ISL_SWIZZLE_IDENTITY, > > > + .base_level = level, > > > + .levels = 1, > > > + .base_array_layer = base_layer, > > > + .array_len = layer_count, > > > + }, > > > + .aux_surf = &image->aux_surface.isl, > > > + .aux_usage = image->aux_usage == > > > ISL_AUX_USAGE_NONE ? > > > + ISL_AUX_USAGE_CCS_D : > > > image->aux_usage, > > > + .mocs = cmd_buffer->device->default_mocs); > > > + add_image_relocs(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT, > > > + image->aux_usage == ISL_AUX_USAGE_CCS_E ? > > > + ISL_AUX_USAGE_CCS_E : ISL_AUX_USAGE_CCS_D, > > > + surface_state); > > > + anv_state_flush(cmd_buffer->device, surface_state); > > > + genX(copy_fast_clear_dwords)(cmd_buffer, surface_state, image, > > > level, > > > + false /* copy to ss */); > > > + anv_ccs_resolve(cmd_buffer, surface_state, image, level, > > > layer_count, > > > + image->aux_usage == ISL_AUX_USAGE_CCS_E ? > > > + BLORP_FAST_CLEAR_OP_RESOLVE_PARTIAL : > > > + BLORP_FAST_CLEAR_OP_RESOLVE_FULL); > > > + } > > > + > > > + cmd_buffer->state.pending_pipe_bits |= > > > + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT; > > > } > > > > > > /** > > > -- > > > 2.13.1 > > > > > > _______________________________________________ > > > mesa-dev mailing list > > > mesa-dev@lists.freedesktop.org > > > https://lists.freedesktop.org/mailman/listinfo/mesa-dev > > > >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev