ping.
On Thu, Dec 22, 2016 at 11:14 AM, Ilia Mirkin <imir...@alum.mit.edu> wrote: > Ping? Any further comments/feedback/reviews? > > > On Dec 5, 2016 11:22 AM, "Ilia Mirkin" <imir...@alum.mit.edu> wrote: > > On Mon, Dec 5, 2016 at 11:11 AM, Robert Bragg <rob...@sixbynine.org> wrote: >> >> >> On Sun, Nov 27, 2016 at 7:23 PM, Ilia Mirkin <imir...@alum.mit.edu> wrote: >>> >>> The strategy is to just keep n anv_query_pool_slot entries per query >>> instead of one. The available bit is only valid in the last one. >>> >>> Signed-off-by: Ilia Mirkin <imir...@alum.mit.edu> >>> --- >>> >>> I think this is in a pretty good state now. I've tested both the direct >>> and >>> buffer paths with a hacked up cube application, and I'm seeing >>> non-ridiculous >>> values for the various counters, although I haven't 100% verified them >>> for >>> accuracy. >>> >>> This also implements the hsw/bdw workaround for dividing frag invocations >>> by 4, >>> copied from hsw_queryobj. I tested this on SKL and it seem to divide the >>> values >>> as expected. >>> >>> The cube patch I've been testing with is at >>> http://paste.debian.net/899374/ >>> You can flip between copying to a buffer and explicit retrieval by >>> commenting >>> out the relevant function calls. >>> >>> src/intel/vulkan/anv_device.c | 2 +- >>> src/intel/vulkan/anv_private.h | 4 + >>> src/intel/vulkan/anv_query.c | 99 ++++++++++---- >>> src/intel/vulkan/genX_cmd_buffer.c | 260 >>> ++++++++++++++++++++++++++++++++----- >>> 4 files changed, 308 insertions(+), 57 deletions(-) >>> >>> >>> diff --git a/src/intel/vulkan/anv_device.c >>> b/src/intel/vulkan/anv_device.c >>> index 99eb73c..7ad1970 100644 >>> --- a/src/intel/vulkan/anv_device.c >>> +++ b/src/intel/vulkan/anv_device.c >>> @@ -427,7 +427,7 @@ void anv_GetPhysicalDeviceFeatures( >>> .textureCompressionASTC_LDR = pdevice->info.gen >= >>> 9, >>> /* FINISHME CHV */ >>> .textureCompressionBC = true, >>> .occlusionQueryPrecise = true, >>> - .pipelineStatisticsQuery = false, >>> + .pipelineStatisticsQuery = true, >>> .fragmentStoresAndAtomics = true, >>> .shaderTessellationAndGeometryPointSize = true, >>> .shaderImageGatherExtended = false, >>> diff --git a/src/intel/vulkan/anv_private.h >>> b/src/intel/vulkan/anv_private.h >>> index 2fc543d..7271609 100644 >>> --- a/src/intel/vulkan/anv_private.h >>> +++ b/src/intel/vulkan/anv_private.h >>> @@ -1763,6 +1763,8 @@ struct anv_render_pass { >>> struct anv_subpass subpasses[0]; >>> }; >>> >>> +#define ANV_PIPELINE_STATISTICS_COUNT 11 >>> + >>> struct anv_query_pool_slot { >>> uint64_t begin; >>> uint64_t end; >>> @@ -1772,6 +1774,8 @@ struct anv_query_pool_slot { >>> struct anv_query_pool { >>> VkQueryType type; >>> uint32_t slots; >>> + uint32_t pipeline_statistics; >>> + uint32_t slot_stride; >>> struct anv_bo bo; >>> }; >>> >>> diff --git a/src/intel/vulkan/anv_query.c b/src/intel/vulkan/anv_query.c >>> index 293257b..dc00859 100644 >>> --- a/src/intel/vulkan/anv_query.c >>> +++ b/src/intel/vulkan/anv_query.c >>> @@ -38,8 +38,10 @@ VkResult anv_CreateQueryPool( >>> ANV_FROM_HANDLE(anv_device, device, _device); >>> struct anv_query_pool *pool; >>> VkResult result; >>> - uint32_t slot_size; >>> - uint64_t size; >>> + uint32_t slot_size = sizeof(struct anv_query_pool_slot); >>> + uint32_t slot_stride = 1; >>> + uint64_t size = pCreateInfo->queryCount * slot_size; >>> + uint32_t pipeline_statistics = 0; >>> >>> assert(pCreateInfo->sType == >>> VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO); >>> >>> @@ -48,12 +50,16 @@ VkResult anv_CreateQueryPool( >>> case VK_QUERY_TYPE_TIMESTAMP: >>> break; >>> case VK_QUERY_TYPE_PIPELINE_STATISTICS: >>> - return VK_ERROR_INCOMPATIBLE_DRIVER; >>> + pipeline_statistics = pCreateInfo->pipelineStatistics & >>> + ((1 << ANV_PIPELINE_STATISTICS_COUNT) - 1); >>> + slot_stride = _mesa_bitcount(pipeline_statistics); >>> + size *= slot_stride; >>> + break; >>> default: >>> assert(!"Invalid query type"); >>> + return VK_ERROR_INCOMPATIBLE_DRIVER; >>> } >>> >>> - slot_size = sizeof(struct anv_query_pool_slot); >>> pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8, >>> VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); >>> if (pool == NULL) >>> @@ -61,8 +67,9 @@ VkResult anv_CreateQueryPool( >>> >>> pool->type = pCreateInfo->queryType; >>> pool->slots = pCreateInfo->queryCount; >>> + pool->pipeline_statistics = pipeline_statistics; >>> + pool->slot_stride = slot_stride; >>> >>> - size = pCreateInfo->queryCount * slot_size; >>> result = anv_bo_init_new(&pool->bo, device, size); >>> if (result != VK_SUCCESS) >>> goto fail; >>> @@ -95,6 +102,27 @@ void anv_DestroyQueryPool( >>> vk_free2(&device->alloc, pAllocator, pool); >>> } >>> >>> +static void * >>> +store_query_result(void *pData, VkQueryResultFlags flags, >>> + uint64_t result, uint64_t available) >>> +{ >>> + if (flags & VK_QUERY_RESULT_64_BIT) { >>> + uint64_t *dst = pData; >>> + *dst++ = result; >>> + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) >>> + *dst++ = available; >>> + return dst; >>> + } else { >>> + uint32_t *dst = pData; >>> + if (result > UINT32_MAX) >>> + result = UINT32_MAX; >>> + *dst++ = result; >>> + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) >>> + *dst++ = available; >>> + return dst; >>> + } >>> +} >>> + >>> VkResult anv_GetQueryPoolResults( >>> VkDevice _device, >>> VkQueryPool queryPool, >>> @@ -112,6 +140,7 @@ VkResult anv_GetQueryPoolResults( >>> int ret; >>> >>> assert(pool->type == VK_QUERY_TYPE_OCCLUSION || >>> + pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS || >>> pool->type == VK_QUERY_TYPE_TIMESTAMP); >>> >>> if (pData == NULL) >>> @@ -129,14 +158,42 @@ VkResult anv_GetQueryPoolResults( >>> void *data_end = pData + dataSize; >>> struct anv_query_pool_slot *slot = pool->bo.map; >>> >>> - for (uint32_t i = 0; i < queryCount; i++) { >>> + for (uint32_t i = 0; i < queryCount && pData < data_end; >>> + i++, pData += stride) { >>> + if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { >>> + VkQueryResultFlags f = flags & >>> ~VK_QUERY_RESULT_WITH_AVAILABILITY_BIT; >>> + void *pos = pData; >>> + uint32_t pipeline_statistics = pool->pipeline_statistics; >>> + struct anv_query_pool_slot *base = >>> + &slot[(firstQuery + i) * pool->slot_stride]; >>> + >>> + while (pipeline_statistics) { >>> + uint32_t stat = u_bit_scan(&pipeline_statistics); >>> + uint64_t result = base->end - base->begin; >>> + >>> + /* WaDividePSInvocationCountBy4:HSW,BDW */ >>> + if ((device->info.gen == 8 || device->info.is_haswell) && >>> + (1 << stat) == >>> VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) >>> + result >>= 2; >>> + >>> + pos = store_query_result(pos, f, result, 0); >>> + base++; >>> + } >>> + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { >>> + base--; >>> + if (flags & VK_QUERY_RESULT_64_BIT) >>> + *(uint64_t *)pos = base->available; >>> + else >>> + *(uint32_t *)pos = base->available; >>> + } >>> + continue; >>> + } >>> + >>> switch (pool->type) { >>> case VK_QUERY_TYPE_OCCLUSION: { >>> result = slot[firstQuery + i].end - slot[firstQuery + i].begin; >>> break; >>> } >>> - case VK_QUERY_TYPE_PIPELINE_STATISTICS: >>> - unreachable("pipeline stats not supported"); >>> case VK_QUERY_TYPE_TIMESTAMP: { >>> result = slot[firstQuery + i].begin; >>> break; >>> @@ -145,23 +202,7 @@ VkResult anv_GetQueryPoolResults( >>> unreachable("invalid pool type"); >>> } >>> >>> - if (flags & VK_QUERY_RESULT_64_BIT) { >>> - uint64_t *dst = pData; >>> - dst[0] = result; >>> - if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) >>> - dst[1] = slot[firstQuery + i].available; >>> - } else { >>> - uint32_t *dst = pData; >>> - if (result > UINT32_MAX) >>> - result = UINT32_MAX; >>> - dst[0] = result; >>> - if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) >>> - dst[1] = slot[firstQuery + i].available; >>> - } >>> - >>> - pData += stride; >>> - if (pData >= data_end) >>> - break; >>> + store_query_result(pData, flags, result, slot[firstQuery + >>> i].available); >>> } >>> >>> return VK_SUCCESS; >>> @@ -183,6 +224,14 @@ void anv_CmdResetQueryPool( >>> slot[firstQuery + i].available = 0; >>> break; >>> } >>> + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { >>> + struct anv_query_pool_slot *slot = pool->bo.map; >>> + >>> + slot = &slot[(firstQuery + i) * pool->slot_stride]; >>> + for (uint32_t j = 0; j < pool->slot_stride; j++) >>> + slot[j].available = 0; >>> + break; >>> + } >>> default: >>> assert(!"Invalid query type"); >>> } >>> diff --git a/src/intel/vulkan/genX_cmd_buffer.c >>> b/src/intel/vulkan/genX_cmd_buffer.c >>> index a965cd6..1369ac2 100644 >>> --- a/src/intel/vulkan/genX_cmd_buffer.c >>> +++ b/src/intel/vulkan/genX_cmd_buffer.c >>> @@ -2272,6 +2272,50 @@ emit_query_availability(struct anv_cmd_buffer >>> *cmd_buffer, >>> } >>> } >>> >>> +#define IA_VERTICES_COUNT 0x2310 >>> +#define IA_PRIMITIVES_COUNT 0x2318 >>> +#define VS_INVOCATION_COUNT 0x2320 >>> +#define HS_INVOCATION_COUNT 0x2300 >>> +#define DS_INVOCATION_COUNT 0x2308 >>> +#define GS_INVOCATION_COUNT 0x2328 >>> +#define GS_PRIMITIVES_COUNT 0x2330 >>> +#define CL_INVOCATION_COUNT 0x2338 >>> +#define CL_PRIMITIVES_COUNT 0x2340 >>> +#define PS_INVOCATION_COUNT 0x2348 >>> +#define CS_INVOCATION_COUNT 0x2290 >>> + >>> +static const uint32_t PIPELINE_STAT_TO_REG[] = { >>> + IA_VERTICES_COUNT, >>> + IA_PRIMITIVES_COUNT, >>> + VS_INVOCATION_COUNT, >>> + GS_INVOCATION_COUNT, >>> + GS_PRIMITIVES_COUNT, >>> + CL_INVOCATION_COUNT, >>> + CL_PRIMITIVES_COUNT, >>> + PS_INVOCATION_COUNT, >>> + HS_INVOCATION_COUNT, >>> + DS_INVOCATION_COUNT, >>> + CS_INVOCATION_COUNT >>> +}; >>> + >>> +static void >>> +emit_pipeline_stat(struct anv_cmd_buffer *cmd_buffer, uint32_t stat, >>> + struct anv_bo *bo, uint32_t offset) { >>> + STATIC_ASSERT(ARRAY_SIZE(PIPELINE_STAT_TO_REG) == >>> + ANV_PIPELINE_STATISTICS_COUNT); >>> + >>> + uint32_t reg = PIPELINE_STAT_TO_REG[stat]; >>> + >>> + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) >>> { >>> + lrm.RegisterAddress = reg, >>> + lrm.MemoryAddress = (struct anv_address) { bo, offset }; >>> + } >>> + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) >>> { >>> + lrm.RegisterAddress = reg + 4, >>> + lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 }; >>> + } >>> +} >>> + >>> void genX(CmdBeginQuery)( >>> VkCommandBuffer commandBuffer, >>> VkQueryPool queryPool, >>> @@ -2301,7 +2345,25 @@ void genX(CmdBeginQuery)( >>> query * sizeof(struct anv_query_pool_slot)); >>> break; >>> >>> - case VK_QUERY_TYPE_PIPELINE_STATISTICS: >>> + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { >>> + uint32_t pipeline_statistics = pool->pipeline_statistics; >>> + uint32_t slot_offset = query * pool->slot_stride * >>> + sizeof(struct anv_query_pool_slot); >>> + >>> + /* TODO: This might only be necessary for certain stats */ >>> + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { >>> + pc.CommandStreamerStallEnable = true; >>> + pc.StallAtPixelScoreboard = true; >>> + } >> >> >> I couldn't find a reference for needing this for workarounds, though I saw >> the workaround Ben implemented for Gen9 + GT4 using a CS stall when >> querying >> timestamps, which is handled separately. > > Well, experimentally, I need it for the clipper and frag shader > invocation counts to show up - otherwise they're always 0. This is on > SKL GT2 with the vk-loader cube patched up as described. Perhaps the > timing works out s.t. the counters all update outside of the begin/end > "critical section" otherwise, so even though they're moving, the > difference remains 0. > >> >> Depending on how strictly we consider that the queries should only measure >> the commands they bracket then I think some stalling will be necessary to >> serialize the work associated with a query and defer reading the end state >> until after the relevant stages have completed their work. >> >> We aren't very precise about this in GL currently, but in Begin maybe we >> should stall until everything >= the statistic-stage is idle and in End >> stall until everything <= the statistic-stage is idle before reading >> (where >> 'statistic-stage' here is the pipeline stage associated with the pipeline >> statistic being queried (or respectively the min/max stage for a set)). >> >> For reference in my implementation of INTEL_performance_query facing this >> same question, I'm currently just stalling before and after queries: >> >> >> https://github.com/rib/mesa/blob/wip/rib/oa-next/src/mesa/drivers/dri/i965/brw_performance_query.c#L994 >> >> https://github.com/rib/mesa/blob/wip/rib/oa-next/src/mesa/drivers/dri/i965/brw_performance_query.c#L1136 > > So that's essentially what I'm doing here, I think. (And what the GL > driver does.) brw_emit_mi_flush actually emits a lot more flushes than > this, but I poked around and the main thing is to have a command > streamer stall, which on earlier gens requires at least one other bit > set. > > [Please note that I have no real familiarity with the hardware - I'm > mostly just looking at what the GL driver does and copying it here > with minor adjustments.] > >> >>> + >>> + while (pipeline_statistics) { >>> + uint32_t stat = u_bit_scan(&pipeline_statistics); >>> + >>> + emit_pipeline_stat(cmd_buffer, stat, &pool->bo, slot_offset); >>> + slot_offset += sizeof(struct anv_query_pool_slot); >>> + } >>> + break; >>> + } >>> default: >>> unreachable(""); >>> } >>> @@ -2314,17 +2376,35 @@ void genX(CmdEndQuery)( >>> { >>> ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); >>> ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); >>> + uint32_t slot_offset = query * pool->slot_stride * >>> + sizeof(struct anv_query_pool_slot); >>> >>> switch (pool->type) { >>> case VK_QUERY_TYPE_OCCLUSION: >>> - emit_ps_depth_count(cmd_buffer, &pool->bo, >>> - query * sizeof(struct anv_query_pool_slot) + >>> 8); >>> + emit_ps_depth_count(cmd_buffer, &pool->bo, slot_offset + 8); >>> + emit_query_availability(cmd_buffer, &pool->bo, slot_offset + 16); >>> + break; >>> + >>> + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { >>> + uint32_t pipeline_statistics = pool->pipeline_statistics; >>> + /* TODO: This might only be necessary for certain stats */ >>> + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { >>> + pc.CommandStreamerStallEnable = true; >>> + pc.StallAtPixelScoreboard = true; >>> + } >>> + >>> + while (pipeline_statistics) { >>> + uint32_t stat = u_bit_scan(&pipeline_statistics); >>> >>> - emit_query_availability(cmd_buffer, &pool->bo, >>> - query * sizeof(struct anv_query_pool_slot) >>> + 16); >>> + emit_pipeline_stat(cmd_buffer, stat, &pool->bo, slot_offset + >>> 8); >>> + slot_offset += sizeof(struct anv_query_pool_slot); >>> + } >>> + >>> + slot_offset -= sizeof(struct anv_query_pool_slot); >>> + emit_query_availability(cmd_buffer, &pool->bo, slot_offset + 16); >>> break; >>> + } >>> >>> - case VK_QUERY_TYPE_PIPELINE_STATISTICS: >>> default: >>> unreachable(""); >>> } >>> @@ -2421,6 +2501,31 @@ emit_load_alu_reg_u64(struct anv_batch *batch, >>> uint32_t reg, >>> } >>> >>> static void >>> +emit_load_alu_reg_imm32(struct anv_batch *batch, uint32_t reg, uint32_t >>> imm) >>> +{ >>> + anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) { >>> + lri.RegisterOffset = reg; >>> + lri.DataDWord = imm; >>> + } >>> +} >>> + >>> +static void >>> +emit_load_alu_reg_imm64(struct anv_batch *batch, uint32_t reg, uint64_t >>> imm) >>> +{ >>> + emit_load_alu_reg_imm32(batch, reg, (uint32_t)imm); >>> + emit_load_alu_reg_imm32(batch, reg + 4, (uint32_t)(imm >> 32)); >>> +} >>> + >>> +static void >>> +emit_load_alu_reg_reg32(struct anv_batch *batch, uint32_t src, uint32_t >>> dst) >>> +{ >>> + anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) { >>> + lrr.SourceRegisterAddress = src; >>> + lrr.DestinationRegisterAddress = dst; >>> + } >>> +} >>> + >>> +static uint32_t >>> store_query_result(struct anv_batch *batch, uint32_t reg, >>> struct anv_bo *bo, uint32_t offset, >>> VkQueryResultFlags >>> flags) >>> { >>> @@ -2434,9 +2539,88 @@ store_query_result(struct anv_batch *batch, >>> uint32_t reg, >>> srm.RegisterAddress = reg + 4; >>> srm.MemoryAddress = (struct anv_address) { bo, offset + 4 }; >>> } >>> + >>> + return offset + 8; >>> + } >>> + >>> + return offset + 4; >>> +} >>> + >>> +/* >>> + * GPR0 = GPR0 & ((1ull << n) - 1); >>> + */ >>> +static void >>> +keep_gpr0_lower_n_bits(struct anv_batch *batch, uint32_t n) >>> +{ >>> + assert(n < 64); >>> + emit_load_alu_reg_imm64(batch, CS_GPR(1), (1ull << n) - 1); >>> + >>> + uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH)); >>> + dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0); >>> + dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R1); >>> + dw[3] = alu(OPCODE_AND, 0, 0); >>> + dw[4] = alu(OPCODE_STORE, OPERAND_R0, OPERAND_ACCU); >>> +} >>> + >>> +/* >>> + * GPR0 = GPR0 << 30; >>> + */ >>> +static void >>> +shl_gpr0_by_30_bits(struct anv_batch *batch) >>> +{ >>> + /* First we mask 34 bits of GPR0 to prevent overflow */ >>> + keep_gpr0_lower_n_bits(batch, 34); >>> + >>> + const uint32_t outer_count = 5; >>> + const uint32_t inner_count = 6; >>> + STATIC_ASSERT(outer_count * inner_count == 30); >>> + const uint32_t cmd_len = 1 + inner_count * 4; >>> + >>> + /* We'll emit 5 commands, each shifting GPR0 left by 6 bits, for a >>> total of >>> + * 30 left shifts. >>> + */ >>> + for (int o = 0; o < outer_count; o++) { >>> + /* Submit one MI_MATH to shift left by 6 bits */ >>> + uint32_t *dw = anv_batch_emitn(batch, cmd_len, GENX(MI_MATH)); >>> + dw++; >>> + for (int i = 0; i < inner_count; i++, dw += 4) { >>> + dw[0] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0); >>> + dw[1] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0); >>> + dw[2] = alu(OPCODE_ADD, 0, 0); >>> + dw[3] = alu(OPCODE_STORE, OPERAND_R0, OPERAND_ACCU); >>> + } >>> } >>> } >>> >>> +/* >>> + * GPR0 = GPR0 >> 2; >>> + * >>> + * Note that the upper 30 bits of GPR are lost! >>> + */ >>> +static void >>> +shr_gpr0_by_2_bits(struct anv_batch *batch) >>> +{ >>> + shl_gpr0_by_30_bits(batch); >>> + emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0)); >>> + emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0); >>> +} >>> + >>> +static void >>> +compute_query_result(struct anv_batch *batch, struct anv_bo *bo, >>> + uint32_t dst_reg, uint32_t offset) >>> +{ >>> + emit_load_alu_reg_u64(batch, CS_GPR(0), bo, offset); >>> + emit_load_alu_reg_u64(batch, CS_GPR(1), bo, offset + 8); >>> + >>> + /* FIXME: We need to clamp the result for 32 bit. */ >>> + >>> + uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH)); >>> + dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1); >>> + dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0); >>> + dw[3] = alu(OPCODE_SUB, 0, 0); >>> + dw[4] = alu(OPCODE_STORE, dst_reg, OPERAND_ACCU); >>> +} >>> + >>> void genX(CmdCopyQueryPoolResults)( >>> VkCommandBuffer commandBuffer, >>> VkQueryPool queryPool, >>> @@ -2459,50 +2643,64 @@ void genX(CmdCopyQueryPoolResults)( >>> } >>> } >>> >>> - dst_offset = buffer->offset + destOffset; >>> for (uint32_t i = 0; i < queryCount; i++) { >>> - >>> + dst_offset = buffer->offset + destOffset + destStride * i; >>> slot_offset = (firstQuery + i) * sizeof(struct >>> anv_query_pool_slot); >>> switch (pool->type) { >>> case VK_QUERY_TYPE_OCCLUSION: >>> - emit_load_alu_reg_u64(&cmd_buffer->batch, >>> - CS_GPR(0), &pool->bo, slot_offset); >>> - emit_load_alu_reg_u64(&cmd_buffer->batch, >>> - CS_GPR(1), &pool->bo, slot_offset + 8); >>> - >>> - /* FIXME: We need to clamp the result for 32 bit. */ >>> - >>> - uint32_t *dw = anv_batch_emitn(&cmd_buffer->batch, 5, >>> GENX(MI_MATH)); >>> - dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1); >>> - dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0); >>> - dw[3] = alu(OPCODE_SUB, 0, 0); >>> - dw[4] = alu(OPCODE_STORE, OPERAND_R2, OPERAND_ACCU); >>> + compute_query_result(&cmd_buffer->batch, &pool->bo, OPERAND_R2, >>> + slot_offset); >>> + dst_offset = store_query_result( >>> + &cmd_buffer->batch, >>> + CS_GPR(2), buffer->bo, dst_offset, flags); >>> break; >>> >>> case VK_QUERY_TYPE_TIMESTAMP: >>> emit_load_alu_reg_u64(&cmd_buffer->batch, >>> CS_GPR(2), &pool->bo, slot_offset); >>> + dst_offset = store_query_result( >>> + &cmd_buffer->batch, >>> + CS_GPR(2), buffer->bo, dst_offset, flags); >>> break; >>> >>> + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { >>> + uint32_t pipeline_statistics = pool->pipeline_statistics; >>> + >>> + slot_offset *= pool->slot_stride; >>> + while (pipeline_statistics) { >>> + uint32_t stat = u_bit_scan(&pipeline_statistics); >>> + >>> + compute_query_result(&cmd_buffer->batch, &pool->bo, >>> OPERAND_R0, >>> + slot_offset); >>> + >>> + /* WaDividePSInvocationCountBy4:HSW,BDW */ >>> + if ((cmd_buffer->device->info.gen == 8 || >>> + cmd_buffer->device->info.is_haswell) && >>> + (1 << stat) == >>> VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) { >>> + shr_gpr0_by_2_bits(&cmd_buffer->batch); >>> + } >>> + dst_offset = store_query_result( >>> + &cmd_buffer->batch, >>> + CS_GPR(0), buffer->bo, dst_offset, flags); >>> + slot_offset += sizeof(struct anv_query_pool_slot); >>> + } >>> + >>> + /* Get the slot offset to where it's supposed to be for the >>> + * availability bit. >>> + */ >>> + slot_offset -= sizeof(struct anv_query_pool_slot); >>> + break; >>> + } >>> default: >>> unreachable("unhandled query type"); >>> } >>> >>> - store_query_result(&cmd_buffer->batch, >>> - CS_GPR(2), buffer->bo, dst_offset, flags); >>> - >>> if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { >>> emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0), >>> &pool->bo, slot_offset + 16); >>> - if (flags & VK_QUERY_RESULT_64_BIT) >>> - store_query_result(&cmd_buffer->batch, >>> - CS_GPR(0), buffer->bo, dst_offset + 8, >>> flags); >>> - else >>> - store_query_result(&cmd_buffer->batch, >>> - CS_GPR(0), buffer->bo, dst_offset + 4, >>> flags); >>> + store_query_result(&cmd_buffer->batch, >>> + CS_GPR(0), buffer->bo, dst_offset, flags); >>> } >>> - >>> - dst_offset += destStride; >>> } >>> } >> >> >> I was a bit surprised to see us needing to go the same route as GL query >> objects with using the command streamer's very limited ALU for fixing up >> metrics on the gpu instead of doing that on the CPU if possible. >> >> My understanding had been that multiple HW vendors pushed back on the >> design >> of these query apis to ensure they could deal with counter scaling and >> normalization on the CPU without needing to support direct feedback into >> the >> gpu. Reading the spec though it does unfortunately look like >> vkCmdCopyQueryPoolResults implies scaling on the gpu :-/ > > Yeah, that looks identical to ARB_query_buffer_object functionality, > with very minor differences. > >> >> One thorny problem we have today at least with GL is that we don't scale >> Skylake timestamps properly when using the ALU considering the extra >> annoyance of scaling by a floating point factor of 83.3333. In the future >> this scale factor might ideally not even be a constant if we can take >> advantage of training done in the kernel to determine a more accurate >> frequency (ref: >> https://lists.freedesktop.org/archives/intel-gfx/2016-June/097272.html) >> Also >> for reference here, there's this patch that looks at fixing the timestamp >> scaling done on the cpu for SKL: >> https://patchwork.freedesktop.org/patch/118855/ I understand you aren't >> dealing with timestamp scaling here, but it seems like an a example of >> where >> it's getting a bit out-of-hand using the ALU for this type of purpose. > > I haven't checked, but I hope that the timestamp units can be in > whatever you want. So you can hopefully get away without scaling. > >> >> Can we maybe at least consider the complexity trade-off from using a >> shader >> instead of ALU - or consider if there's a legitimate way we're missing of >> using the cpu here (though I suppose not). > > Go ahead :) I was just copying the GL approach. Happy to rip out the > MI_MATH stuff and make the feature IVB + SKL+ only for now if you all > agree that the MI_MATH stuff is bad. IMHO the complexity of the > MI_MATH logic is quite small for the / 4 case, but I don't feel like > I'm particularly part of the decision committee. > > Cheers, > > -ilia > > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev