From c8237d5c312485394389b2520451ef720604ea9a Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 20 Aug 2023 17:53:08 +0200 Subject: Query Cache: Implement host side sample counting. --- src/video_core/host_shaders/CMakeLists.txt | 1 + .../host_shaders/queries_prefix_scan_sum.comp | 124 +++++++++++++++++ src/video_core/renderer_vulkan/vk_compute_pass.cpp | 110 ++++++++++++++- src/video_core/renderer_vulkan/vk_compute_pass.h | 14 ++ src/video_core/renderer_vulkan/vk_query_cache.cpp | 147 ++++++++++++++------- 5 files changed, 348 insertions(+), 48 deletions(-) create mode 100644 src/video_core/host_shaders/queries_prefix_scan_sum.comp diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index fb24b6532..8218ec4c8 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -41,6 +41,7 @@ set(SHADER_FILES pitch_unswizzle.comp present_bicubic.frag present_gaussian.frag + queries_prefix_scan_sum.comp resolve_conditional_render.comp smaa_edge_detection.vert smaa_edge_detection.frag diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum.comp b/src/video_core/host_shaders/queries_prefix_scan_sum.comp new file mode 100644 index 000000000..dce1279fe --- /dev/null +++ b/src/video_core/host_shaders/queries_prefix_scan_sum.comp @@ -0,0 +1,124 @@ +// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel +// SPDX-License-Identifier: MIT + +// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and +// Nicholas Haemel. Modified to suit needs and optimize for subgroup + +#version 460 core + +#ifdef VULKAN + +#extension GL_KHR_shader_subgroup_arithmetic : enable +#define HAS_EXTENDED_TYPES 1 +#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { +#define END_PUSH_CONSTANTS \ + } \ + ; +#define UNIFORM(n) +#define BINDING_INPUT_BUFFER 0 +#define BINDING_OUTPUT_IMAGE 1 + +#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv + +#extension GL_KHR_shader_subgroup_arithmetic : enable +#extension GL_NV_gpu_shader5 : enable +#ifdef GL_NV_gpu_shader5 +#define HAS_EXTENDED_TYPES 1 +#else +#define HAS_EXTENDED_TYPES 0 +#endif +#define BEGIN_PUSH_CONSTANTS +#define END_PUSH_CONSTANTS +#define UNIFORM(n) layout(location = n) uniform +#define BINDING_INPUT_BUFFER 0 +#define BINDING_OUTPUT_IMAGE 0 + +#endif + +BEGIN_PUSH_CONSTANTS +UNIFORM(0) uint max_accumulation_base; +UNIFORM(1) uint accumulation_limit; +END_PUSH_CONSTANTS + +layout(local_size_x = 32) in; + +layout(std430, binding = 0) readonly buffer block1 { + uvec2 input_data[gl_WorkGroupSize.x]; +}; + +layout(std430, binding = 1) writeonly coherent buffer block2 { + uvec2 output_data[gl_WorkGroupSize.x]; +}; + +layout(std430, binding = 2) coherent buffer block3 { + uvec2 accumulated_data; +}; + +shared uvec2 shared_data[gl_WorkGroupSize.x * 2]; + +uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { + uint carry = 0; + uvec2 result; + result.x = uaddCarry(value_1.x, value_2.x, carry); + result.y = value_1.y + value_2.y + carry; + return result; +} + +void main(void) { + uint id = gl_LocalInvocationID.x; + uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0); + uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0); + uint work_size = gl_WorkGroupSize.x; + uint rd_id; + uint wr_id; + uint mask; + uvec2 input_1 = input_data[id * 2]; + uvec2 input_2 = input_data[id * 2 + 1]; + // The number of steps is the log base 2 of the + // work group size, which should be a power of 2 + const uint steps = uint(log2(work_size)) + 1; + uint step = 0; + + // Each invocation is responsible for the content of + // two elements of the output array + shared_data[id * 2] = input_1; + shared_data[id * 2 + 1] = input_2; + // Synchronize to make sure that everyone has initialized + // their elements of shared_data[] with data loaded from + // the input arrays + barrier(); + memoryBarrierShared(); + // For each step... + for (step = 0; step < steps; step++) { + // Calculate the read and write index in the + // shared array + mask = (1 << step) - 1; + rd_id = ((id >> step) << (step + 1)) + mask; + wr_id = rd_id + 1 + (id & mask); + // Accumulate the read data into our element + + shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]); + // Synchronize again to make sure that everyone + // has caught up with us + barrier(); + memoryBarrierShared(); + } + // Add the accumulation + shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1); + shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2); + barrier(); + memoryBarrierShared(); + + // Finally write our data back to the output buffer + output_data[id * 2] = shared_data[id * 2]; + output_data[id * 2 + 1] = shared_data[id * 2 + 1]; + if (id == 0) { + if (max_accumulation_base >= accumulation_limit + 1) { + accumulated_data = shared_data[accumulation_limit]; + return; + } + uvec2 value_1 = shared_data[max_accumulation_base]; + uvec2 value_2 = shared_data[accumulation_limit]; + accumulated_data = AddUint64(value_1, -value_2); + } +} \ No newline at end of file diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 039dc95e1..a1af08cda 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -12,6 +12,7 @@ #include "common/common_types.h" #include "common/div_ceil.h" #include "video_core/host_shaders/astc_decoder_comp_spv.h" +#include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h" #include "video_core/host_shaders/resolve_conditional_render_comp_spv.h" #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" #include "video_core/host_shaders/vulkan_uint8_comp_spv.h" @@ -58,6 +59,30 @@ constexpr std::array INPUT_OUTPUT_DESCRIPTOR_SE }, }}; +constexpr std::array QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS{{ + { + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, + { + .binding = 2, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = nullptr, + }, +}}; + constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ .uniform_buffers = 0, .storage_buffers = 2, @@ -68,6 +93,16 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ .score = 2, }; +constexpr DescriptorBankInfo QUERIES_SCAN_BANK_INFO{ + .uniform_buffers = 0, + .storage_buffers = 3, + .texture_buffers = 0, + .image_buffers = 0, + .textures = 0, + .images = 0, + .score = 3, +}; + constexpr std::array ASTC_DESCRIPTOR_SET_BINDINGS{{ { .binding = ASTC_BINDING_INPUT_BUFFER, @@ -104,6 +139,15 @@ constexpr VkDescriptorUpdateTemplateEntry INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLAT .stride = sizeof(DescriptorUpdateEntry), }; +constexpr VkDescriptorUpdateTemplateEntry QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE{ + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 3, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .offset = 0, + .stride = sizeof(DescriptorUpdateEntry), +}; + constexpr std::array ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{ { @@ -132,6 +176,11 @@ struct AstcPushConstants { u32 block_height; u32 block_height_mask; }; + +struct QueriesPrefixScanPushConstants { + u32 max_accumulation_base; + u32 accumulation_limit; +}; } // Anonymous namespace ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, @@ -313,8 +362,6 @@ ConditionalRenderingResolvePass::ConditionalRenderingResolvePass( void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_buffer, u32 src_offset, bool compare_to_zero) { - scheduler.RequestOutsideRenderPassOperationContext(); - const size_t compare_size = compare_to_zero ? 8 : 24; compute_pass_descriptor_queue.Acquire(); @@ -327,7 +374,7 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_ static constexpr VkMemoryBarrier read_barrier{ .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, .pNext = nullptr, - .srcAccessMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT, .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, }; static constexpr VkMemoryBarrier write_barrier{ @@ -349,6 +396,63 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_ }); } +QueriesPrefixScanPass::QueriesPrefixScanPass( + const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue_) + : ComputePass(device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS, + QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO, + COMPUTE_PUSH_CONSTANT_RANGE, + QUERIES_PREFIX_SCAN_SUM_COMP_SPV), + scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} + +void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, + VkBuffer src_buffer, size_t number_of_sums, + size_t max_accumulation_limit) { + size_t aligned_runs = Common::AlignUp(number_of_sums, 32); + + compute_pass_descriptor_queue.Acquire(); + compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, aligned_runs * sizeof(u64)); + compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, aligned_runs * sizeof(u64)); + compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64)); + const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; + + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([this, descriptor_data, max_accumulation_limit, number_of_sums, + aligned_runs](vk::CommandBuffer cmdbuf) { + static constexpr VkMemoryBarrier read_barrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, + }; + static constexpr VkMemoryBarrier write_barrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT | + VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | + VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT | + VK_ACCESS_UNIFORM_READ_BIT | + VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT, + }; + const QueriesPrefixScanPushConstants uniforms{ + .max_accumulation_base = static_cast(max_accumulation_limit), + .accumulation_limit = static_cast(number_of_sums - 1), + }; + const VkDescriptorSet set = descriptor_allocator.Commit(); + device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); + + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier); + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); + cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms); + cmdbuf.Dispatch(static_cast(aligned_runs / 32U), 1, 1); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, write_barrier); + }); +} + ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, StagingBufferPool& staging_buffer_pool_, diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index c62f30d30..e6ff86e9a 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -95,6 +95,20 @@ private: ComputePassDescriptorQueue& compute_pass_descriptor_queue; }; +class QueriesPrefixScanPass final : public ComputePass { +public: + explicit QueriesPrefixScanPass(const Device& device_, Scheduler& scheduler_, + DescriptorPool& descriptor_pool_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue_); + + void Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer, + size_t number_of_sums, size_t max_accumulation_limit); + +private: + Scheduler& scheduler; + ComputePassDescriptorQueue& compute_pass_descriptor_queue; +}; + class ASTCDecoderPass final : public ComputePass { public: explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 2147776f8..ded190ae0 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -11,6 +11,7 @@ #include #include +#include "common/bit_util.h" #include "common/common_types.h" #include "core/memory.h" #include "video_core/engines/draw_manager.h" @@ -112,14 +113,34 @@ class SamplesStreamer : public BaseStreamer { public: explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_, VideoCore::RasterizerInterface* rasterizer_, const Device& device_, - Scheduler& scheduler_, const MemoryAllocator& memory_allocator_) + Scheduler& scheduler_, const MemoryAllocator& memory_allocator_, + ComputePassDescriptorQueue& compute_pass_descriptor_queue, + DescriptorPool& descriptor_pool) : BaseStreamer(id_), runtime{runtime_}, rasterizer{rasterizer_}, device{device_}, scheduler{scheduler_}, memory_allocator{memory_allocator_} { - BuildResolveBuffer(); current_bank = nullptr; current_query = nullptr; ammend_value = 0; acumulation_value = 0; + queries_prefix_scan_pass = std::make_unique( + device, scheduler, descriptor_pool, compute_pass_descriptor_queue); + + const VkBufferCreateInfo buffer_ci = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = 8, + .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }; + accumulation_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) { + cmdbuf.FillBuffer(buffer, 0, 8, 0); + }); } ~SamplesStreamer() = default; @@ -159,6 +180,8 @@ public: acumulation_value = 0; }); rasterizer->SyncOperation(std::move(func)); + accumulation_since_last_sync = false; + last_accumulation_checkpoint = std::min(last_accumulation_checkpoint, num_slots_used); } void CloseCounter() override { @@ -175,7 +198,8 @@ public: } for (size_t i = 0; i < sync_values_stash.size(); i++) { - runtime.template SyncValues(sync_values_stash[i], *resolve_buffers[i]); + runtime.template SyncValues(sync_values_stash[i], + *buffers[resolve_buffers[i]]); } sync_values_stash.clear(); @@ -189,36 +213,21 @@ public: sync_values_stash.clear(); sync_values_stash.emplace_back(); std::vector* sync_values = &sync_values_stash.back(); - sync_values->reserve(resolve_slots * SamplesQueryBank::BANK_SIZE); + sync_values->reserve(num_slots_used); std::unordered_map> offsets; - size_t this_bank_slot = std::numeric_limits::max(); - size_t resolve_slots_remaining = resolve_slots; - size_t resolve_buffer_index = 0; + resolve_buffers.clear(); + size_t resolve_buffer_index = ObtainBuffer(num_slots_used); + resolve_buffers.push_back(resolve_buffer_index); + size_t base_offset = 0; + ApplyBanksWideOp(pending_sync, [&](SamplesQueryBank* bank, size_t start, size_t amount) { size_t bank_id = bank->GetIndex(); - if (this_bank_slot != bank_id) { - this_bank_slot = bank_id; - if (resolve_slots_remaining == 0) { - resolve_buffer_index++; - if (resolve_buffer_index >= resolve_buffers.size()) { - BuildResolveBuffer(); - } - resolve_slots_remaining = resolve_slots; - sync_values_stash.emplace_back(); - sync_values = &sync_values_stash.back(); - sync_values->reserve(resolve_slots * SamplesQueryBank::BANK_SIZE); - } - resolve_slots_remaining--; - } - auto& resolve_buffer = resolve_buffers[resolve_buffer_index]; - const size_t base_offset = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE * - (resolve_slots - resolve_slots_remaining - 1); + auto& resolve_buffer = buffers[resolve_buffer_index]; VkQueryPool query_pool = bank->GetInnerPool(); scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([start, amount, base_offset, query_pool, buffer = *resolve_buffer](vk::CommandBuffer cmdbuf) { - size_t final_offset = base_offset + start * SamplesQueryBank::QUERY_SIZE; const VkBufferMemoryBarrier copy_query_pool_barrier{ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, .pNext = nullptr, @@ -227,39 +236,60 @@ public: .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .buffer = buffer, - .offset = final_offset, + .offset = base_offset, .size = amount * SamplesQueryBank::QUERY_SIZE, }; cmdbuf.CopyQueryPoolResults( query_pool, static_cast(start), static_cast(amount), buffer, - static_cast(final_offset), SamplesQueryBank::QUERY_SIZE, + static_cast(base_offset), SamplesQueryBank::QUERY_SIZE, VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT); cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, copy_query_pool_barrier); }); - offsets[bank_id] = {sync_values_stash.size() - 1, base_offset}; + offsets[bank_id] = {start, base_offset}; + base_offset += amount * SamplesQueryBank::QUERY_SIZE; }); // Convert queries + bool has_multi_queries = false; for (auto q : pending_sync) { auto* query = GetQuery(q); + size_t sync_value_slot = 0; if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) { continue; } if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) { continue; } - if (query->size_slots > 1) { - // This is problematic. - // UNIMPLEMENTED(); + if (accumulation_since_last_sync || query->size_slots > 1) { + if (!has_multi_queries) { + has_multi_queries = true; + sync_values_stash.emplace_back(); + } + sync_value_slot = 1; } query->flags |= VideoCommon::QueryFlagBits::IsHostSynced; auto loc_data = offsets[query->start_bank_id]; - sync_values_stash[loc_data.first].emplace_back(HostSyncValues{ + sync_values_stash[sync_value_slot].emplace_back(HostSyncValues{ .address = query->guest_address, .size = SamplesQueryBank::QUERY_SIZE, - .offset = loc_data.second + query->start_slot * SamplesQueryBank::QUERY_SIZE, + .offset = + loc_data.second + (query->start_slot - loc_data.first + query->size_slots - 1) * + SamplesQueryBank::QUERY_SIZE, + }); + } + + if (has_multi_queries) { + size_t intermediary_buffer_index = ObtainBuffer(num_slots_used); + resolve_buffers.push_back(intermediary_buffer_index); + queries_prefix_scan_pass->Run(*accumulation_buffer, *buffers[intermediary_buffer_index], + *buffers[resolve_buffer_index], num_slots_used, + std::min(last_accumulation_checkpoint, num_slots_used)); + } else { + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) { + cmdbuf.FillBuffer(buffer, 0, 8, 0); }); } @@ -267,6 +297,9 @@ public: std::function func([this] { ammend_value = acumulation_value; }); rasterizer->SyncOperation(std::move(func)); AbandonCurrentQuery(); + num_slots_used = 0; + last_accumulation_checkpoint = std::numeric_limits::max(); + accumulation_since_last_sync = has_multi_queries; pending_sync.clear(); } @@ -400,6 +433,7 @@ private: void ReserveHostQuery() { size_t new_slot = ReserveBankSlot(); current_bank->AddReference(1); + num_slots_used++; if (current_query) { size_t bank_id = current_query->start_bank_id; size_t banks_set = current_query->size_banks - 1; @@ -470,32 +504,50 @@ private: }); } - void BuildResolveBuffer() { + template + size_t ObtainBuffer(size_t num_needed) { + const size_t log_2 = std::max(6U, Common::Log2Ceil64(num_needed)); + if constexpr (is_resolve) { + if (resolve_table[log_2] != 0) { + return resolve_table[log_2] - 1; + } + } else { + if (intermediary_table[log_2] != 0) { + return intermediary_table[log_2] - 1; + } + } const VkBufferCreateInfo buffer_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, - .size = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE * resolve_slots, + .size = SamplesQueryBank::QUERY_SIZE * (1ULL << log_2), .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, }; - resolve_buffers.emplace_back( - memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal)); + buffers.emplace_back(memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal)); + if constexpr (is_resolve) { + resolve_table[log_2] = buffers.size(); + } else { + intermediary_table[log_2] = buffers.size(); + } + return buffers.size() - 1; } - static constexpr size_t resolve_slots = 8; - QueryCacheRuntime& runtime; VideoCore::RasterizerInterface* rasterizer; const Device& device; Scheduler& scheduler; const MemoryAllocator& memory_allocator; VideoCommon::BankPool bank_pool; - std::deque resolve_buffers; + std::deque buffers; + std::array resolve_table{}; + std::array intermediary_table{}; + vk::Buffer accumulation_buffer; std::deque> sync_values_stash; + std::vector resolve_buffers; // syncing queue std::vector pending_sync; @@ -510,10 +562,14 @@ private: SamplesQueryBank* current_bank; VkQueryPool current_query_pool; size_t current_query_id; + size_t num_slots_used{}; + size_t last_accumulation_checkpoint{}; + bool accumulation_since_last_sync{}; VideoCommon::HostQueryBase* current_query; bool has_started{}; - bool current_unset{}; std::mutex flush_guard; + + std::unique_ptr queries_prefix_scan_pass; }; // Transform feedback queries @@ -1090,7 +1146,8 @@ struct QueryCacheRuntimeImpl { memory_allocator{memory_allocator_}, scheduler{scheduler_}, staging_pool{staging_pool_}, guest_streamer(0, runtime), sample_streamer(static_cast(QueryType::ZPassPixelCount64), runtime, rasterizer, - device, scheduler, memory_allocator), + device, scheduler, memory_allocator, compute_pass_descriptor_queue, + descriptor_pool), tfb_streamer(static_cast(QueryType::StreamingByteCount), runtime, device, scheduler, memory_allocator, staging_pool), primitives_succeeded_streamer( @@ -1319,10 +1376,10 @@ bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::Looku return true; } } - if (!is_in_bc[0] && !is_in_bc[1]) { + /*if (!is_in_bc[0] && !is_in_bc[1]) { // Both queries are in query cache, it's best to just flush. - return false; - } + return true; + }*/ HostConditionalRenderingCompareBCImpl(object_1.address, equal_check); return true; } -- cgit v1.2.3