On Mon, Dec 5, 2016 at 11:11 AM, Robert Bragg <rob...@sixbynine.org> wrote: > > > On Sun, Nov 27, 2016 at 7:23 PM, Ilia Mirkin <imir...@alum.mit.edu> wrote: >> >> The strategy is to just keep n anv_query_pool_slot entries per query >> instead of one. The available bit is only valid in the last one. >> >> Signed-off-by: Ilia Mirkin <imir...@alum.mit.edu> >> --- >> >> I think this is in a pretty good state now. I've tested both the direct >> and >> buffer paths with a hacked up cube application, and I'm seeing >> non-ridiculous >> values for the various counters, although I haven't 100% verified them for >> accuracy. >> >> This also implements the hsw/bdw workaround for dividing frag invocations >> by 4, >> copied from hsw_queryobj. I tested this on SKL and it seem to divide the >> values >> as expected. >> >> The cube patch I've been testing with is at >> http://paste.debian.net/899374/ >> You can flip between copying to a buffer and explicit retrieval by >> commenting >> out the relevant function calls. >> >> src/intel/vulkan/anv_device.c | 2 +- >> src/intel/vulkan/anv_private.h | 4 + >> src/intel/vulkan/anv_query.c | 99 ++++++++++---- >> src/intel/vulkan/genX_cmd_buffer.c | 260 >> ++++++++++++++++++++++++++++++++----- >> 4 files changed, 308 insertions(+), 57 deletions(-) >> >> >> diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c >> index 99eb73c..7ad1970 100644 >> --- a/src/intel/vulkan/anv_device.c >> +++ b/src/intel/vulkan/anv_device.c >> @@ -427,7 +427,7 @@ void anv_GetPhysicalDeviceFeatures( >> .textureCompressionASTC_LDR = pdevice->info.gen >= 9, >> /* FINISHME CHV */ >> .textureCompressionBC = true, >> .occlusionQueryPrecise = true, >> - .pipelineStatisticsQuery = false, >> + .pipelineStatisticsQuery = true, >> .fragmentStoresAndAtomics = true, >> .shaderTessellationAndGeometryPointSize = true, >> .shaderImageGatherExtended = false, >> diff --git a/src/intel/vulkan/anv_private.h >> b/src/intel/vulkan/anv_private.h >> index 2fc543d..7271609 100644 >> --- a/src/intel/vulkan/anv_private.h >> +++ b/src/intel/vulkan/anv_private.h >> @@ -1763,6 +1763,8 @@ struct anv_render_pass { >> struct anv_subpass subpasses[0]; >> }; >> >> +#define ANV_PIPELINE_STATISTICS_COUNT 11 >> + >> struct anv_query_pool_slot { >> uint64_t begin; >> uint64_t end; >> @@ -1772,6 +1774,8 @@ struct anv_query_pool_slot { >> struct anv_query_pool { >> VkQueryType type; >> uint32_t slots; >> + uint32_t pipeline_statistics; >> + uint32_t slot_stride; >> struct anv_bo bo; >> }; >> >> diff --git a/src/intel/vulkan/anv_query.c b/src/intel/vulkan/anv_query.c >> index 293257b..dc00859 100644 >> --- a/src/intel/vulkan/anv_query.c >> +++ b/src/intel/vulkan/anv_query.c >> @@ -38,8 +38,10 @@ VkResult anv_CreateQueryPool( >> ANV_FROM_HANDLE(anv_device, device, _device); >> struct anv_query_pool *pool; >> VkResult result; >> - uint32_t slot_size; >> - uint64_t size; >> + uint32_t slot_size = sizeof(struct anv_query_pool_slot); >> + uint32_t slot_stride = 1; >> + uint64_t size = pCreateInfo->queryCount * slot_size; >> + uint32_t pipeline_statistics = 0; >> >> assert(pCreateInfo->sType == >> VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO); >> >> @@ -48,12 +50,16 @@ VkResult anv_CreateQueryPool( >> case VK_QUERY_TYPE_TIMESTAMP: >> break; >> case VK_QUERY_TYPE_PIPELINE_STATISTICS: >> - return VK_ERROR_INCOMPATIBLE_DRIVER; >> + pipeline_statistics = pCreateInfo->pipelineStatistics & >> + ((1 << ANV_PIPELINE_STATISTICS_COUNT) - 1); >> + slot_stride = _mesa_bitcount(pipeline_statistics); >> + size *= slot_stride; >> + break; >> default: >> assert(!"Invalid query type"); >> + return VK_ERROR_INCOMPATIBLE_DRIVER; >> } >> >> - slot_size = sizeof(struct anv_query_pool_slot); >> pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8, >> VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); >> if (pool == NULL) >> @@ -61,8 +67,9 @@ VkResult anv_CreateQueryPool( >> >> pool->type = pCreateInfo->queryType; >> pool->slots = pCreateInfo->queryCount; >> + pool->pipeline_statistics = pipeline_statistics; >> + pool->slot_stride = slot_stride; >> >> - size = pCreateInfo->queryCount * slot_size; >> result = anv_bo_init_new(&pool->bo, device, size); >> if (result != VK_SUCCESS) >> goto fail; >> @@ -95,6 +102,27 @@ void anv_DestroyQueryPool( >> vk_free2(&device->alloc, pAllocator, pool); >> } >> >> +static void * >> +store_query_result(void *pData, VkQueryResultFlags flags, >> + uint64_t result, uint64_t available) >> +{ >> + if (flags & VK_QUERY_RESULT_64_BIT) { >> + uint64_t *dst = pData; >> + *dst++ = result; >> + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) >> + *dst++ = available; >> + return dst; >> + } else { >> + uint32_t *dst = pData; >> + if (result > UINT32_MAX) >> + result = UINT32_MAX; >> + *dst++ = result; >> + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) >> + *dst++ = available; >> + return dst; >> + } >> +} >> + >> VkResult anv_GetQueryPoolResults( >> VkDevice _device, >> VkQueryPool queryPool, >> @@ -112,6 +140,7 @@ VkResult anv_GetQueryPoolResults( >> int ret; >> >> assert(pool->type == VK_QUERY_TYPE_OCCLUSION || >> + pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS || >> pool->type == VK_QUERY_TYPE_TIMESTAMP); >> >> if (pData == NULL) >> @@ -129,14 +158,42 @@ VkResult anv_GetQueryPoolResults( >> void *data_end = pData + dataSize; >> struct anv_query_pool_slot *slot = pool->bo.map; >> >> - for (uint32_t i = 0; i < queryCount; i++) { >> + for (uint32_t i = 0; i < queryCount && pData < data_end; >> + i++, pData += stride) { >> + if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { >> + VkQueryResultFlags f = flags & >> ~VK_QUERY_RESULT_WITH_AVAILABILITY_BIT; >> + void *pos = pData; >> + uint32_t pipeline_statistics = pool->pipeline_statistics; >> + struct anv_query_pool_slot *base = >> + &slot[(firstQuery + i) * pool->slot_stride]; >> + >> + while (pipeline_statistics) { >> + uint32_t stat = u_bit_scan(&pipeline_statistics); >> + uint64_t result = base->end - base->begin; >> + >> + /* WaDividePSInvocationCountBy4:HSW,BDW */ >> + if ((device->info.gen == 8 || device->info.is_haswell) && >> + (1 << stat) == >> VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) >> + result >>= 2; >> + >> + pos = store_query_result(pos, f, result, 0); >> + base++; >> + } >> + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { >> + base--; >> + if (flags & VK_QUERY_RESULT_64_BIT) >> + *(uint64_t *)pos = base->available; >> + else >> + *(uint32_t *)pos = base->available; >> + } >> + continue; >> + } >> + >> switch (pool->type) { >> case VK_QUERY_TYPE_OCCLUSION: { >> result = slot[firstQuery + i].end - slot[firstQuery + i].begin; >> break; >> } >> - case VK_QUERY_TYPE_PIPELINE_STATISTICS: >> - unreachable("pipeline stats not supported"); >> case VK_QUERY_TYPE_TIMESTAMP: { >> result = slot[firstQuery + i].begin; >> break; >> @@ -145,23 +202,7 @@ VkResult anv_GetQueryPoolResults( >> unreachable("invalid pool type"); >> } >> >> - if (flags & VK_QUERY_RESULT_64_BIT) { >> - uint64_t *dst = pData; >> - dst[0] = result; >> - if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) >> - dst[1] = slot[firstQuery + i].available; >> - } else { >> - uint32_t *dst = pData; >> - if (result > UINT32_MAX) >> - result = UINT32_MAX; >> - dst[0] = result; >> - if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) >> - dst[1] = slot[firstQuery + i].available; >> - } >> - >> - pData += stride; >> - if (pData >= data_end) >> - break; >> + store_query_result(pData, flags, result, slot[firstQuery + >> i].available); >> } >> >> return VK_SUCCESS; >> @@ -183,6 +224,14 @@ void anv_CmdResetQueryPool( >> slot[firstQuery + i].available = 0; >> break; >> } >> + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { >> + struct anv_query_pool_slot *slot = pool->bo.map; >> + >> + slot = &slot[(firstQuery + i) * pool->slot_stride]; >> + for (uint32_t j = 0; j < pool->slot_stride; j++) >> + slot[j].available = 0; >> + break; >> + } >> default: >> assert(!"Invalid query type"); >> } >> diff --git a/src/intel/vulkan/genX_cmd_buffer.c >> b/src/intel/vulkan/genX_cmd_buffer.c >> index a965cd6..1369ac2 100644 >> --- a/src/intel/vulkan/genX_cmd_buffer.c >> +++ b/src/intel/vulkan/genX_cmd_buffer.c >> @@ -2272,6 +2272,50 @@ emit_query_availability(struct anv_cmd_buffer >> *cmd_buffer, >> } >> } >> >> +#define IA_VERTICES_COUNT 0x2310 >> +#define IA_PRIMITIVES_COUNT 0x2318 >> +#define VS_INVOCATION_COUNT 0x2320 >> +#define HS_INVOCATION_COUNT 0x2300 >> +#define DS_INVOCATION_COUNT 0x2308 >> +#define GS_INVOCATION_COUNT 0x2328 >> +#define GS_PRIMITIVES_COUNT 0x2330 >> +#define CL_INVOCATION_COUNT 0x2338 >> +#define CL_PRIMITIVES_COUNT 0x2340 >> +#define PS_INVOCATION_COUNT 0x2348 >> +#define CS_INVOCATION_COUNT 0x2290 >> + >> +static const uint32_t PIPELINE_STAT_TO_REG[] = { >> + IA_VERTICES_COUNT, >> + IA_PRIMITIVES_COUNT, >> + VS_INVOCATION_COUNT, >> + GS_INVOCATION_COUNT, >> + GS_PRIMITIVES_COUNT, >> + CL_INVOCATION_COUNT, >> + CL_PRIMITIVES_COUNT, >> + PS_INVOCATION_COUNT, >> + HS_INVOCATION_COUNT, >> + DS_INVOCATION_COUNT, >> + CS_INVOCATION_COUNT >> +}; >> + >> +static void >> +emit_pipeline_stat(struct anv_cmd_buffer *cmd_buffer, uint32_t stat, >> + struct anv_bo *bo, uint32_t offset) { >> + STATIC_ASSERT(ARRAY_SIZE(PIPELINE_STAT_TO_REG) == >> + ANV_PIPELINE_STATISTICS_COUNT); >> + >> + uint32_t reg = PIPELINE_STAT_TO_REG[stat]; >> + >> + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) { >> + lrm.RegisterAddress = reg, >> + lrm.MemoryAddress = (struct anv_address) { bo, offset }; >> + } >> + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) { >> + lrm.RegisterAddress = reg + 4, >> + lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 }; >> + } >> +} >> + >> void genX(CmdBeginQuery)( >> VkCommandBuffer commandBuffer, >> VkQueryPool queryPool, >> @@ -2301,7 +2345,25 @@ void genX(CmdBeginQuery)( >> query * sizeof(struct anv_query_pool_slot)); >> break; >> >> - case VK_QUERY_TYPE_PIPELINE_STATISTICS: >> + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { >> + uint32_t pipeline_statistics = pool->pipeline_statistics; >> + uint32_t slot_offset = query * pool->slot_stride * >> + sizeof(struct anv_query_pool_slot); >> + >> + /* TODO: This might only be necessary for certain stats */ >> + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { >> + pc.CommandStreamerStallEnable = true; >> + pc.StallAtPixelScoreboard = true; >> + } > > > I couldn't find a reference for needing this for workarounds, though I saw > the workaround Ben implemented for Gen9 + GT4 using a CS stall when querying > timestamps, which is handled separately.
Well, experimentally, I need it for the clipper and frag shader invocation counts to show up - otherwise they're always 0. This is on SKL GT2 with the vk-loader cube patched up as described. Perhaps the timing works out s.t. the counters all update outside of the begin/end "critical section" otherwise, so even though they're moving, the difference remains 0. > > Depending on how strictly we consider that the queries should only measure > the commands they bracket then I think some stalling will be necessary to > serialize the work associated with a query and defer reading the end state > until after the relevant stages have completed their work. > > We aren't very precise about this in GL currently, but in Begin maybe we > should stall until everything >= the statistic-stage is idle and in End > stall until everything <= the statistic-stage is idle before reading (where > 'statistic-stage' here is the pipeline stage associated with the pipeline > statistic being queried (or respectively the min/max stage for a set)). > > For reference in my implementation of INTEL_performance_query facing this > same question, I'm currently just stalling before and after queries: > > https://github.com/rib/mesa/blob/wip/rib/oa-next/src/mesa/drivers/dri/i965/brw_performance_query.c#L994 > https://github.com/rib/mesa/blob/wip/rib/oa-next/src/mesa/drivers/dri/i965/brw_performance_query.c#L1136 So that's essentially what I'm doing here, I think. (And what the GL driver does.) brw_emit_mi_flush actually emits a lot more flushes than this, but I poked around and the main thing is to have a command streamer stall, which on earlier gens requires at least one other bit set. [Please note that I have no real familiarity with the hardware - I'm mostly just looking at what the GL driver does and copying it here with minor adjustments.] > >> + >> + while (pipeline_statistics) { >> + uint32_t stat = u_bit_scan(&pipeline_statistics); >> + >> + emit_pipeline_stat(cmd_buffer, stat, &pool->bo, slot_offset); >> + slot_offset += sizeof(struct anv_query_pool_slot); >> + } >> + break; >> + } >> default: >> unreachable(""); >> } >> @@ -2314,17 +2376,35 @@ void genX(CmdEndQuery)( >> { >> ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); >> ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); >> + uint32_t slot_offset = query * pool->slot_stride * >> + sizeof(struct anv_query_pool_slot); >> >> switch (pool->type) { >> case VK_QUERY_TYPE_OCCLUSION: >> - emit_ps_depth_count(cmd_buffer, &pool->bo, >> - query * sizeof(struct anv_query_pool_slot) + >> 8); >> + emit_ps_depth_count(cmd_buffer, &pool->bo, slot_offset + 8); >> + emit_query_availability(cmd_buffer, &pool->bo, slot_offset + 16); >> + break; >> + >> + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { >> + uint32_t pipeline_statistics = pool->pipeline_statistics; >> + /* TODO: This might only be necessary for certain stats */ >> + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { >> + pc.CommandStreamerStallEnable = true; >> + pc.StallAtPixelScoreboard = true; >> + } >> + >> + while (pipeline_statistics) { >> + uint32_t stat = u_bit_scan(&pipeline_statistics); >> >> - emit_query_availability(cmd_buffer, &pool->bo, >> - query * sizeof(struct anv_query_pool_slot) >> + 16); >> + emit_pipeline_stat(cmd_buffer, stat, &pool->bo, slot_offset + >> 8); >> + slot_offset += sizeof(struct anv_query_pool_slot); >> + } >> + >> + slot_offset -= sizeof(struct anv_query_pool_slot); >> + emit_query_availability(cmd_buffer, &pool->bo, slot_offset + 16); >> break; >> + } >> >> - case VK_QUERY_TYPE_PIPELINE_STATISTICS: >> default: >> unreachable(""); >> } >> @@ -2421,6 +2501,31 @@ emit_load_alu_reg_u64(struct anv_batch *batch, >> uint32_t reg, >> } >> >> static void >> +emit_load_alu_reg_imm32(struct anv_batch *batch, uint32_t reg, uint32_t >> imm) >> +{ >> + anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) { >> + lri.RegisterOffset = reg; >> + lri.DataDWord = imm; >> + } >> +} >> + >> +static void >> +emit_load_alu_reg_imm64(struct anv_batch *batch, uint32_t reg, uint64_t >> imm) >> +{ >> + emit_load_alu_reg_imm32(batch, reg, (uint32_t)imm); >> + emit_load_alu_reg_imm32(batch, reg + 4, (uint32_t)(imm >> 32)); >> +} >> + >> +static void >> +emit_load_alu_reg_reg32(struct anv_batch *batch, uint32_t src, uint32_t >> dst) >> +{ >> + anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) { >> + lrr.SourceRegisterAddress = src; >> + lrr.DestinationRegisterAddress = dst; >> + } >> +} >> + >> +static uint32_t >> store_query_result(struct anv_batch *batch, uint32_t reg, >> struct anv_bo *bo, uint32_t offset, VkQueryResultFlags >> flags) >> { >> @@ -2434,9 +2539,88 @@ store_query_result(struct anv_batch *batch, >> uint32_t reg, >> srm.RegisterAddress = reg + 4; >> srm.MemoryAddress = (struct anv_address) { bo, offset + 4 }; >> } >> + >> + return offset + 8; >> + } >> + >> + return offset + 4; >> +} >> + >> +/* >> + * GPR0 = GPR0 & ((1ull << n) - 1); >> + */ >> +static void >> +keep_gpr0_lower_n_bits(struct anv_batch *batch, uint32_t n) >> +{ >> + assert(n < 64); >> + emit_load_alu_reg_imm64(batch, CS_GPR(1), (1ull << n) - 1); >> + >> + uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH)); >> + dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0); >> + dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R1); >> + dw[3] = alu(OPCODE_AND, 0, 0); >> + dw[4] = alu(OPCODE_STORE, OPERAND_R0, OPERAND_ACCU); >> +} >> + >> +/* >> + * GPR0 = GPR0 << 30; >> + */ >> +static void >> +shl_gpr0_by_30_bits(struct anv_batch *batch) >> +{ >> + /* First we mask 34 bits of GPR0 to prevent overflow */ >> + keep_gpr0_lower_n_bits(batch, 34); >> + >> + const uint32_t outer_count = 5; >> + const uint32_t inner_count = 6; >> + STATIC_ASSERT(outer_count * inner_count == 30); >> + const uint32_t cmd_len = 1 + inner_count * 4; >> + >> + /* We'll emit 5 commands, each shifting GPR0 left by 6 bits, for a >> total of >> + * 30 left shifts. >> + */ >> + for (int o = 0; o < outer_count; o++) { >> + /* Submit one MI_MATH to shift left by 6 bits */ >> + uint32_t *dw = anv_batch_emitn(batch, cmd_len, GENX(MI_MATH)); >> + dw++; >> + for (int i = 0; i < inner_count; i++, dw += 4) { >> + dw[0] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0); >> + dw[1] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0); >> + dw[2] = alu(OPCODE_ADD, 0, 0); >> + dw[3] = alu(OPCODE_STORE, OPERAND_R0, OPERAND_ACCU); >> + } >> } >> } >> >> +/* >> + * GPR0 = GPR0 >> 2; >> + * >> + * Note that the upper 30 bits of GPR are lost! >> + */ >> +static void >> +shr_gpr0_by_2_bits(struct anv_batch *batch) >> +{ >> + shl_gpr0_by_30_bits(batch); >> + emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0)); >> + emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0); >> +} >> + >> +static void >> +compute_query_result(struct anv_batch *batch, struct anv_bo *bo, >> + uint32_t dst_reg, uint32_t offset) >> +{ >> + emit_load_alu_reg_u64(batch, CS_GPR(0), bo, offset); >> + emit_load_alu_reg_u64(batch, CS_GPR(1), bo, offset + 8); >> + >> + /* FIXME: We need to clamp the result for 32 bit. */ >> + >> + uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH)); >> + dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1); >> + dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0); >> + dw[3] = alu(OPCODE_SUB, 0, 0); >> + dw[4] = alu(OPCODE_STORE, dst_reg, OPERAND_ACCU); >> +} >> + >> void genX(CmdCopyQueryPoolResults)( >> VkCommandBuffer commandBuffer, >> VkQueryPool queryPool, >> @@ -2459,50 +2643,64 @@ void genX(CmdCopyQueryPoolResults)( >> } >> } >> >> - dst_offset = buffer->offset + destOffset; >> for (uint32_t i = 0; i < queryCount; i++) { >> - >> + dst_offset = buffer->offset + destOffset + destStride * i; >> slot_offset = (firstQuery + i) * sizeof(struct >> anv_query_pool_slot); >> switch (pool->type) { >> case VK_QUERY_TYPE_OCCLUSION: >> - emit_load_alu_reg_u64(&cmd_buffer->batch, >> - CS_GPR(0), &pool->bo, slot_offset); >> - emit_load_alu_reg_u64(&cmd_buffer->batch, >> - CS_GPR(1), &pool->bo, slot_offset + 8); >> - >> - /* FIXME: We need to clamp the result for 32 bit. */ >> - >> - uint32_t *dw = anv_batch_emitn(&cmd_buffer->batch, 5, >> GENX(MI_MATH)); >> - dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1); >> - dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0); >> - dw[3] = alu(OPCODE_SUB, 0, 0); >> - dw[4] = alu(OPCODE_STORE, OPERAND_R2, OPERAND_ACCU); >> + compute_query_result(&cmd_buffer->batch, &pool->bo, OPERAND_R2, >> + slot_offset); >> + dst_offset = store_query_result( >> + &cmd_buffer->batch, >> + CS_GPR(2), buffer->bo, dst_offset, flags); >> break; >> >> case VK_QUERY_TYPE_TIMESTAMP: >> emit_load_alu_reg_u64(&cmd_buffer->batch, >> CS_GPR(2), &pool->bo, slot_offset); >> + dst_offset = store_query_result( >> + &cmd_buffer->batch, >> + CS_GPR(2), buffer->bo, dst_offset, flags); >> break; >> >> + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { >> + uint32_t pipeline_statistics = pool->pipeline_statistics; >> + >> + slot_offset *= pool->slot_stride; >> + while (pipeline_statistics) { >> + uint32_t stat = u_bit_scan(&pipeline_statistics); >> + >> + compute_query_result(&cmd_buffer->batch, &pool->bo, >> OPERAND_R0, >> + slot_offset); >> + >> + /* WaDividePSInvocationCountBy4:HSW,BDW */ >> + if ((cmd_buffer->device->info.gen == 8 || >> + cmd_buffer->device->info.is_haswell) && >> + (1 << stat) == >> VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) { >> + shr_gpr0_by_2_bits(&cmd_buffer->batch); >> + } >> + dst_offset = store_query_result( >> + &cmd_buffer->batch, >> + CS_GPR(0), buffer->bo, dst_offset, flags); >> + slot_offset += sizeof(struct anv_query_pool_slot); >> + } >> + >> + /* Get the slot offset to where it's supposed to be for the >> + * availability bit. >> + */ >> + slot_offset -= sizeof(struct anv_query_pool_slot); >> + break; >> + } >> default: >> unreachable("unhandled query type"); >> } >> >> - store_query_result(&cmd_buffer->batch, >> - CS_GPR(2), buffer->bo, dst_offset, flags); >> - >> if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { >> emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0), >> &pool->bo, slot_offset + 16); >> - if (flags & VK_QUERY_RESULT_64_BIT) >> - store_query_result(&cmd_buffer->batch, >> - CS_GPR(0), buffer->bo, dst_offset + 8, >> flags); >> - else >> - store_query_result(&cmd_buffer->batch, >> - CS_GPR(0), buffer->bo, dst_offset + 4, >> flags); >> + store_query_result(&cmd_buffer->batch, >> + CS_GPR(0), buffer->bo, dst_offset, flags); >> } >> - >> - dst_offset += destStride; >> } >> } > > > I was a bit surprised to see us needing to go the same route as GL query > objects with using the command streamer's very limited ALU for fixing up > metrics on the gpu instead of doing that on the CPU if possible. > > My understanding had been that multiple HW vendors pushed back on the design > of these query apis to ensure they could deal with counter scaling and > normalization on the CPU without needing to support direct feedback into the > gpu. Reading the spec though it does unfortunately look like > vkCmdCopyQueryPoolResults implies scaling on the gpu :-/ Yeah, that looks identical to ARB_query_buffer_object functionality, with very minor differences. > > One thorny problem we have today at least with GL is that we don't scale > Skylake timestamps properly when using the ALU considering the extra > annoyance of scaling by a floating point factor of 83.3333. In the future > this scale factor might ideally not even be a constant if we can take > advantage of training done in the kernel to determine a more accurate > frequency (ref: > https://lists.freedesktop.org/archives/intel-gfx/2016-June/097272.html) Also > for reference here, there's this patch that looks at fixing the timestamp > scaling done on the cpu for SKL: > https://patchwork.freedesktop.org/patch/118855/ I understand you aren't > dealing with timestamp scaling here, but it seems like an a example of where > it's getting a bit out-of-hand using the ALU for this type of purpose. I haven't checked, but I hope that the timestamp units can be in whatever you want. So you can hopefully get away without scaling. > > Can we maybe at least consider the complexity trade-off from using a shader > instead of ALU - or consider if there's a legitimate way we're missing of > using the cpu here (though I suppose not). Go ahead :) I was just copying the GL approach. Happy to rip out the MI_MATH stuff and make the feature IVB + SKL+ only for now if you all agree that the MI_MATH stuff is bad. IMHO the complexity of the MI_MATH logic is quite small for the / 4 case, but I don't feel like I'm particularly part of the decision committee. Cheers, -ilia _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev