Neil wrote:

> It might be worth making a simpler hard-coded implementation of
> quicksort because calling qsort is probably not very sensible for
> such a small array and the function call overhead for each
> comparison is probably quite a bit.
Ok, here is a v2 of the patch which has a simple custom quick sort
implementation based on the Wikipedia description. It seems a lot faster
than using qsort, so the table is now like this:

 attributes are │ master   with patch   optimization removed   patchv2
 ───────────────┼─────────────────────────────────────────────────────
 in order       │    820          560                    325       760
 out of order   │    325          540                    325       760

- Neil

------- >8 --------------- (use git am --scissors to automatically chop here)

When submitting the vertex buffers the i965 driver will try to recognise
when multiple attributes are using the same buffer so that it can submit
a single relocation for it and set a different offset in the attribute.
Previously, however, if the application happens to have the attributes
in a struct with an order that doesn't match the order they are listed
in the gl_vert_attrib enum, then the loop would end up processing the
attributes with a greater offset first and the optimisation wouldn't be
used.

To make the optimisation more likely to be used, this patch makes it
always process the elements in increasing order of offset. This is done
by copying the element pointers into a separate array and sorting it
with a simple quick sort implementation. This only affects the order
that the elements are processed and doesn't change the order that they
are submitted to the hardware.
---
 src/mesa/drivers/dri/i965/brw_draw_upload.c | 102 +++++++++++++++++++++++++++-
 1 file changed, 99 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index 7bf9163..8480043 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -396,6 +396,89 @@ copy_array_to_vbo_array(struct brw_context *brw,
    buffer->stride = dst_stride;
 }
 
+/* Orders two elements by the client-memory pointer of their arrays so that
+ * elements sharing a buffer are visited in increasing offset order.
+ * Returns true when input_a should sort after input_b.
+ */
+static bool
+compare_array_ptr(const struct brw_vertex_element *input_a,
+                  const struct brw_vertex_element *input_b)
+{
+   const GLubyte *ptr_a = input_a->glarray->Ptr;
+   const GLubyte *ptr_b = input_b->glarray->Ptr;
+
+   if (ptr_a < ptr_b)
+      return false;
+   else if (ptr_a > ptr_b)
+      return true;
+   else {
+      /* Resort to comparing the element pointers so that it never returns
+       * equality because apparently that can be bad for quick sort */
+      if (input_a < input_b)
+         return false;
+      else
+         return true;
+   }
+}
+
+/* Lomuto-style partition step: places the pivot (middle element) into its
+ * final sorted position and returns that index.
+ */
+static int
+partition_elements(struct brw_vertex_element **elements,
+                   int n_elements)
+{
+   int pivot = n_elements / 2;
+   struct brw_vertex_element *pivot_value = elements[pivot];
+   struct brw_vertex_element *tmp;
+   int i, store_index;
+
+   /* Move the pivot out of the way to the end of the array */
+   elements[pivot] = elements[n_elements - 1];
+
+   store_index = 0;
+
+   for (i = 0; i < n_elements - 1; i++) {
+      if (!compare_array_ptr(elements[i], pivot_value)) {
+         tmp = elements[i];
+         elements[i] = elements[store_index];
+         elements[store_index] = tmp;
+         store_index++;
+      }
+   }
+
+   elements[n_elements - 1] = elements[store_index];
+   elements[store_index] = pivot_value;
+
+   return store_index;
+}
+
+/* Pending subrange [start, end) for the explicit quick-sort stack */
+struct sort_stack {
+   int16_t start, end;
+};
+
+/* Iterative quick sort over the element pointers; n_elements is bounded by
+ * VERT_ATTRIB_MAX so the explicit stack cannot overflow (asserted below).
+ */
+static void
+sort_elements(struct brw_vertex_element **elements,
+              int n_elements)
+{
+   struct sort_stack stack[VERT_ATTRIB_MAX];
+   int stack_size = 1;
+   int start, end, pivot_point;
+
+   stack[0].start = 0;
+   stack[0].end = n_elements;
+
+   while (stack_size > 0) {
+      stack_size--;
+      start = stack[stack_size].start;
+      end = stack[stack_size].end;
+
+      if (end - start >= 2) {
+         pivot_point = partition_elements(elements + start,
+                                          end - start) + start;
+         assert(stack_size + 2 <= ARRAY_SIZE(stack));
+         stack[stack_size].start = pivot_point + 1;
+         stack[stack_size].end = end;
+         stack_size++;
+         stack[stack_size].start = start;
+         stack[stack_size].end = pivot_point;
+         stack_size++;
+      }
+   }
+}
+
 void
 brw_prepare_vertices(struct brw_context *brw)
 {
@@ -409,6 +492,7 @@ brw_prepare_vertices(struct brw_context *brw)
    int delta, i, j;
 
    struct brw_vertex_element *upload[VERT_ATTRIB_MAX];
+   struct brw_vertex_element *sorted[VERT_ATTRIB_MAX];
    GLuint nr_uploads = 0;
 
    /* _NEW_POLYGON
@@ -442,8 +526,20 @@ brw_prepare_vertices(struct brw_context *brw)
    if (brw->vb.nr_buffers)
       return;
 
+   /* In the loop below if it finds an element that is using the same buffer
+    * as a previous element then it will reuse the same buffer relocation.
+    * However it will only work if the offset for the previous element is less
+    * than the offset for the new element and the difference is less than the
+    * stride. In order to increase the chances of hitting this optimisation
+    * the elements will be processed in increasing order of offset by first
+    * sorting the pointers.
+    */
+   memcpy(sorted, brw->vb.enabled,
+          sizeof(sorted[0]) * brw->vb.nr_enabled);
+   sort_elements(sorted, brw->vb.nr_enabled);
+
    for (i = j = 0; i < brw->vb.nr_enabled; i++) {
-      struct brw_vertex_element *input = brw->vb.enabled[i];
+      struct brw_vertex_element *input = sorted[i];
       const struct gl_client_array *glarray = input->glarray;
 
       if (_mesa_is_bufferobj(glarray->BufferObj)) {
@@ -456,13 +552,13 @@ brw_prepare_vertices(struct brw_context *brw)
           * relocations.
*/ for (k = 0; k < i; k++) { - const struct gl_client_array *other = brw->vb.enabled[k]->glarray; + const struct gl_client_array *other = sorted[k]->glarray; if (glarray->BufferObj == other->BufferObj && glarray->StrideB == other->StrideB && glarray->InstanceDivisor == other->InstanceDivisor && (uintptr_t)(glarray->Ptr - other->Ptr) < glarray->StrideB) { - input->buffer = brw->vb.enabled[k]->buffer; + input->buffer = sorted[k]->buffer; input->offset = glarray->Ptr - other->Ptr; break; } -- 1.9.3 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev