From f2d3212de97ebed710bc03792343fae45b3203f3 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow
Date: Sat, 22 Apr 2023 13:36:18 +0200
Subject: Buffer Cache rework: Setup async downloads.

---
 src/video_core/buffer_cache/buffer_cache.h      | 229 ++++++++++--------------
 src/video_core/buffer_cache/buffer_cache_base.h |  65 ++++++-
 2 files changed, 154 insertions(+), 140 deletions(-)

diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index a0701ce4e..43fe5b080 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -11,6 +11,8 @@
 namespace VideoCommon {
 
+using Core::Memory::YUZU_PAGESIZE;
+
 template <class P>
 BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_, Core::Memory::Memory& cpu_memory_, Runtime& runtime_)
@@ -87,9 +89,11 @@ void BufferCache<P>::TickFrame() {
 template <class P>
 void BufferCache<P>::WriteMemory(VAddr cpu_addr, u64 size) {
     memory_tracker.MarkRegionAsCpuModified(cpu_addr, size);
-    const IntervalType subtract_interval{cpu_addr, cpu_addr + size};
-    ClearDownload(subtract_interval);
-    common_ranges.subtract(subtract_interval);
+    if (memory_tracker.IsRegionGpuModified(cpu_addr, size)) {
+        const IntervalType subtract_interval{cpu_addr, cpu_addr + size};
+        ClearDownload(subtract_interval);
+        common_ranges.subtract(subtract_interval);
+    }
 }
 
 template <class P>
@@ -102,17 +106,33 @@ void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) {
 template <class P>
 void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) {
+    WaitOnAsyncFlushes(cpu_addr, size);
     ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) {
         DownloadBufferMemory(buffer, cpu_addr, size);
     });
 }
 
+template <class P>
+void BufferCache<P>::WaitOnAsyncFlushes(VAddr cpu_addr, u64 size) {
+    bool must_wait = false;
+    ForEachInOverlapCounter(async_downloads, cpu_addr, size,
+                            [&](VAddr, VAddr, int) { must_wait = true; });
+    bool must_release = false;
+    ForEachInRangeSet(pending_ranges, cpu_addr, size, [&](VAddr, VAddr) { must_release = true; });
+    if (must_release) {
+        std::function<void()> tmp([]() {});
+        rasterizer.SignalFence(std::move(tmp));
+    }
+    if (must_wait || must_release) {
+        rasterizer.ReleaseFences();
+    }
+}
+
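For illustration only: the async_downloads member that WaitOnAsyncFlushes() consults here through ForEachInOverlapCounter() is no longer a deque of interval sets but a reference-counted interval map (the OverlapCounter alias added in buffer_cache_base.h further down). A minimal sketch of that bookkeeping, assuming a plain boost::icl::split_interval_map<VAddr, int> in place of the patch's OverlapCounter alias and made-up addresses:

    #include <cstdint>
    #include <iostream>
    #include <utility>
    #include <boost/icl/split_interval_map.hpp>

    using VAddr = std::uint64_t;
    using Counter = boost::icl::split_interval_map<VAddr, int>;

    int main() {
        Counter async_downloads;
        const auto range = [](VAddr lo, VAddr hi) {
            return boost::icl::interval<VAddr>::right_open(lo, hi);
        };
        // Two in-flight downloads that partially overlap: counts stack per sub-range.
        async_downloads += std::make_pair(range(0x1000, 0x3000), 1);
        async_downloads += std::make_pair(range(0x2000, 0x4000), 1);
        // One download completes: its sub-ranges are decremented, zero counts vanish.
        async_downloads -= std::make_pair(range(0x1000, 0x3000), 1);
        for (const auto& [interval, count] : async_downloads) {
            // Prints "2000-3000: 1" and "3000-4000: 1"; a split map keeps the borders.
            std::cout << std::hex << interval.lower() << '-' << interval.upper() << ": "
                      << std::dec << count << '\n';
        }
    }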
 template <class P>
 void BufferCache<P>::ClearDownload(IntervalType subtract_interval) {
+    async_downloads -= std::make_pair(subtract_interval, std::numeric_limits<int>::max());
     uncommitted_ranges.subtract(subtract_interval);
-    for (auto& interval_set : async_downloads) {
-        interval_set.subtract(subtract_interval);
-    }
+    pending_ranges.subtract(subtract_interval);
     for (auto& interval_set : committed_ranges) {
         interval_set.subtract(subtract_interval);
     }
@@ -132,6 +152,7 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
     }
     const IntervalType subtract_interval{*cpu_dest_address, *cpu_dest_address + amount};
+    WaitOnAsyncFlushes(*cpu_src_address, static_cast<u64>(amount));
     ClearDownload(subtract_interval);
 
     BufferId buffer_a;
@@ -162,6 +183,7 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
         tmp_intervals.push_back(add_interval);
         if (is_high_accuracy) {
             uncommitted_ranges.add(add_interval);
+            pending_ranges.add(add_interval);
         }
     };
     ForEachInRangeSet(common_ranges, *cpu_src_address, amount, mirror);
@@ -413,18 +435,15 @@ template <class P>
 void BufferCache<P>::FlushCachedWrites() {
     cached_write_buffer_ids.clear();
     memory_tracker.FlushCachedWrites();
-    /*for (auto& interval : cached_ranges) {
-        VAddr cpu_addr = interval.lower();
-        const std::size_t size = interval.upper() - interval.lower();
-        memory_tracker.FlushCachedWrites(cpu_addr, size);
-        // common_ranges.subtract(interval);
-    }*/
+    for (auto& interval : cached_ranges) {
+        ClearDownload(interval);
+    }
     cached_ranges.clear();
 }
 
 template <class P>
 bool BufferCache<P>::HasUncommittedFlushes() const noexcept {
-    return !uncommitted_ranges.empty() || !committed_ranges.empty() || !pending_queries.empty();
+    return !uncommitted_ranges.empty() || !committed_ranges.empty();
 }
 
 template <class P>
@@ -437,8 +456,11 @@ void BufferCache<P>::AccumulateFlushes() {
 template <class P>
 bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept {
-    return (!async_buffers.empty() && async_buffers.front().has_value()) ||
-           (!query_async_buffers.empty() && query_async_buffers.front().has_value());
+    if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
+        return (!async_buffers.empty() && async_buffers.front().has_value());
+    } else {
+        return false;
+    }
 }
 
 template <class P>
@@ -446,11 +468,14 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
     AccumulateFlushes();
     if (committed_ranges.empty()) {
-        async_buffers.emplace_back(std::optional<Async_Buffer>{});
+        if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
+            async_buffers.emplace_back(std::optional<Async_Buffer>{});
+        }
         return;
     }
     MICROPROFILE_SCOPE(GPU_DownloadMemory);
+    pending_ranges.clear();
     auto it = committed_ranges.begin();
     while (it != committed_ranges.end()) {
         auto& current_intervals = *it;
@@ -491,7 +516,7 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
                 buffer_id,
             });
             // Align up to avoid cache conflicts
-            constexpr u64 align = 8ULL;
+            constexpr u64 align = 64ULL;
             constexpr u64 mask = ~(align - 1ULL);
             total_size_bytes += (new_size + align - 1) & mask;
             largest_copy = std::max(largest_copy, new_size);
@@ -504,7 +529,9 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
     }
     committed_ranges.clear();
     if (downloads.empty()) {
-        async_buffers.emplace_back(std::optional<Async_Buffer>{});
+        if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
+            async_buffers.emplace_back(std::optional<Async_Buffer>{});
+        }
         return;
     }
     if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
@@ -520,99 +547,54 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
             second_copy.src_offset = static_cast<u64>(buffer.CpuAddr()) + copy.src_offset;
             VAddr orig_cpu_addr = static_cast<VAddr>(second_copy.src_offset);
             const IntervalType base_interval{orig_cpu_addr, orig_cpu_addr + copy.size};
-            new_async_range.add(base_interval);
+            async_downloads += std::make_pair(base_interval, 1);
             runtime.CopyBuffer(download_staging.buffer, buffer, copies, false);
             normalized_copies.push_back(second_copy);
         }
-        async_downloads.emplace_back(std::move(new_async_range));
+        runtime.PostCopyBarrier();
         pending_downloads.emplace_back(std::move(normalized_copies));
         async_buffers.emplace_back(download_staging);
     } else {
-        const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
-        for (const auto& [copy, buffer_id] : downloads) {
-            Buffer& buffer = slot_buffers[buffer_id];
-            buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
-            const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
-            cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size);
-        }
-    }
-}
-
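For illustration: the loop above records every staging copy twice. The copy handed to runtime.CopyBuffer() addresses the source Buffer object, while the "normalized" copy has its src_offset rebased to the guest CPU address, so PopAsyncBuffers() can later write the data back through cpu_memory without looking the buffer up again. A rough sketch of that rebasing, with the BufferCopy fields taken from the patch and the buffer base address invented:

    #include <cstdint>

    using u64 = std::uint64_t;
    using VAddr = std::uint64_t;

    struct BufferCopy {
        u64 src_offset; // offset into the GPU buffer, or a guest VAddr once normalized
        u64 dst_offset; // offset into the staging buffer
        u64 size;
    };

    // Hypothetical helper: rebase a copy so src_offset becomes a guest address.
    BufferCopy Normalize(BufferCopy copy, VAddr buffer_cpu_addr) {
        copy.src_offset += static_cast<u64>(buffer_cpu_addr);
        return copy;
    }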
-template <class P>
-void BufferCache<P>::CommitAsyncQueries() {
-    if (pending_queries.empty()) {
-        query_async_buffers.emplace_back(std::optional<Async_Buffer>{});
-        return;
-    }
-
-    MICROPROFILE_SCOPE(GPU_DownloadMemory);
-    boost::container::small_vector<std::pair<BufferCopy, BufferId>, 8> downloads;
-    u64 total_size_bytes = 0;
-    u64 largest_copy = 0;
-    do {
-        has_deleted_buffers = false;
-        downloads.clear();
-        total_size_bytes = 0;
-        largest_copy = 0;
-        for (const auto& query_info : pending_queries) {
-            const std::size_t size = query_info.second;
-            const VAddr cpu_addr = query_info.first;
-            const BufferId buffer_id = FindBuffer(cpu_addr, static_cast<u32>(size));
-            Buffer& buffer = slot_buffers[buffer_id];
-            if (has_deleted_buffers) {
-                break;
+        if constexpr (USE_MEMORY_MAPS) {
+            auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
+            runtime.PreCopyBarrier();
+            for (auto& [copy, buffer_id] : downloads) {
+                // Have in mind the staging buffer offset for the copy
+                copy.dst_offset += download_staging.offset;
+                const std::array copies{copy};
+                runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies, false);
+            }
+            runtime.PostCopyBarrier();
+            runtime.Finish();
+            for (const auto& [copy, buffer_id] : downloads) {
+                const Buffer& buffer = slot_buffers[buffer_id];
+                const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
+                // Undo the modified offset
+                const u64 dst_offset = copy.dst_offset - download_staging.offset;
+                const u8* read_mapped_memory = download_staging.mapped_span.data() + dst_offset;
+                cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size);
+            }
+        } else {
+            const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
+            for (const auto& [copy, buffer_id] : downloads) {
+                Buffer& buffer = slot_buffers[buffer_id];
+                buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
+                const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
+                cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size);
             }
-            downloads.push_back({
-                BufferCopy{
-                    .src_offset = buffer.Offset(cpu_addr),
-                    .dst_offset = total_size_bytes,
-                    .size = size,
-                },
-                buffer_id,
-            });
-            constexpr u64 align = 8ULL;
-            constexpr u64 mask = ~(align - 1ULL);
-            total_size_bytes += (size + align - 1) & mask;
-            largest_copy = std::max(largest_copy, size);
-        }
-    } while (has_deleted_buffers);
-    pending_queries.clear();
-    if (downloads.empty()) {
-        query_async_buffers.push_back(std::optional<Async_Buffer>{});
-        return;
-    }
-    if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
-        auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true);
-        boost::container::small_vector normalized_copies;
-        runtime.PreCopyBarrier();
-        for (auto& [copy, buffer_id] : downloads) {
-            // Have in mind the staging buffer offset for the copy
-            copy.dst_offset += download_staging.offset;
-            const std::array copies{copy};
-            const Buffer& buffer = slot_buffers[buffer_id];
-            BufferCopy second_copy{copy};
-            second_copy.src_offset = static_cast<u64>(buffer.CpuAddr()) + second_copy.src_offset;
-            runtime.CopyBuffer(download_staging.buffer, buffer, copies, false);
-            normalized_copies.push_back(second_copy);
         }
-        committed_queries.emplace_back(std::move(normalized_copies));
-        query_async_buffers.emplace_back(download_staging);
-    } else {
-        query_async_buffers.push_back(std::optional<Async_Buffer>{});
     }
 }
 
 template <class P>
 void BufferCache<P>::CommitAsyncFlushes() {
     CommitAsyncFlushesHigh();
-    CommitAsyncQueries();
 }
 
 template <class P>
 void BufferCache<P>::PopAsyncFlushes() {
     MICROPROFILE_SCOPE(GPU_DownloadMemory);
     PopAsyncBuffers();
-    PopAsyncQueries();
 }
 
 template <class P>
@@ -627,59 +609,34 @@ void BufferCache<P>::PopAsyncBuffers() {
     if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
         auto& downloads = pending_downloads.front();
         auto& async_buffer = async_buffers.front();
-        auto& async_range = async_downloads.front();
         u8* base = async_buffer->mapped_span.data();
         const size_t base_offset = async_buffer->offset;
         for (const auto& copy : downloads) {
            const VAddr cpu_addr = static_cast<VAddr>(copy.src_offset);
            const u64 dst_offset = copy.dst_offset - base_offset;
            const u8* read_mapped_memory = base + dst_offset;
-            ForEachInRangeSet(async_range, cpu_addr, copy.size, [&](VAddr start, VAddr end) {
-                const size_t diff = start - cpu_addr;
-                const size_t new_size = end - start;
-                cpu_memory.WriteBlockUnsafe(start, &read_mapped_memory[diff], new_size);
-                const IntervalType base_interval{start, end};
-                common_ranges.subtract(base_interval);
-            });
+            ForEachInOverlapCounter(
+                async_downloads, cpu_addr, copy.size, [&](VAddr start, VAddr end, int count) {
+                    cpu_memory.WriteBlockUnsafe(start, &read_mapped_memory[start - cpu_addr],
+                                                end - start);
+                    if (count == 1) {
+                        const IntervalType base_interval{start, end};
+                        common_ranges.subtract(base_interval);
+                    }
+                });
+            async_downloads -= std::make_pair(IntervalType(cpu_addr, cpu_addr + copy.size), 1);
         }
         runtime.FreeDeferredStagingBuffer(*async_buffer);
         async_buffers.pop_front();
         pending_downloads.pop_front();
-        async_downloads.pop_front();
-    }
-}
-
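For illustration: when a finished download is popped, each copy walks the counter map over its CPU range; sub-ranges whose count is still 1 are covered only by this download, so they leave common_ranges (they are no longer considered GPU-modified), and the whole range is then decremented. A condensed sketch of that rule, again assuming a plain split_interval_map<VAddr, int> and an interval_set for the dirty ranges:

    #include <cstdint>
    #include <utility>
    #include <boost/icl/interval_set.hpp>
    #include <boost/icl/split_interval_map.hpp>

    using VAddr = std::uint64_t;
    using Counter = boost::icl::split_interval_map<VAddr, int>;
    using Ranges = boost::icl::interval_set<VAddr>;

    // Finish one download that covered [lo, hi) and update the dirty set accordingly.
    void FinishDownload(Counter& async_downloads, Ranges& common_ranges, VAddr lo, VAddr hi) {
        const auto done = boost::icl::interval<VAddr>::right_open(lo, hi);
        for (const auto& [interval, count] : async_downloads) {
            if (count == 1 && boost::icl::intersects(interval, done)) {
                // Last pending download over this sub-range: it is clean again.
                common_ranges.subtract(interval & done);
            }
        }
        async_downloads -= std::make_pair(done, 1);
    }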
-template <class P>
-void BufferCache<P>::PopAsyncQueries() {
-    if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) {
-        if (query_async_buffers.empty()) {
-            return;
-        }
-        if (!query_async_buffers.front().has_value()) {
-            query_async_buffers.pop_front();
-            return;
-        }
-        auto& downloads = committed_queries.front();
-        auto& async_buffer = query_async_buffers.front();
-        flushed_queries.clear();
-        u8* base = async_buffer->mapped_span.data();
-        const size_t base_offset = async_buffer->offset;
-        for (const auto& copy : downloads) {
-            const size_t dst_offset = copy.dst_offset - base_offset;
-            const u8* read_mapped_memory = base + dst_offset;
-            u64 new_value{};
-            std::memcpy(&new_value, read_mapped_memory, copy.size);
-            flushed_queries.push_back(new_value);
-        }
-        runtime.FreeDeferredStagingBuffer(*async_buffer);
-        committed_queries.pop_front();
-        query_async_buffers.pop_front();
-    }
     }
 }
 
 template <class P>
 bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
-    return memory_tracker.IsRegionGpuModified(addr, size);
+    bool is_dirty = false;
+    ForEachInRangeSet(common_ranges, addr, size, [&](VAddr, VAddr) { is_dirty = true; });
+    return is_dirty;
 }
 
 template <class P>
@@ -1232,16 +1189,18 @@ void BufferCache<P>::UpdateComputeTextureBuffers() {
 }
 
 template <class P>
-void BufferCache<P>::MarkWrittenBuffer(BufferId, VAddr cpu_addr, u32 size) {
+void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size) {
     memory_tracker.MarkRegionAsGpuModified(cpu_addr, size);
+    if (memory_tracker.IsRegionCpuModified(cpu_addr, size)) {
+        SynchronizeBuffer(slot_buffers[buffer_id], cpu_addr, size);
+    }
+
     const IntervalType base_interval{cpu_addr, cpu_addr + size};
     common_ranges.add(base_interval);
-    for (auto& interval_set : async_downloads) {
-        interval_set.subtract(base_interval);
-    }
     if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) {
         uncommitted_ranges.add(base_interval);
+        pending_ranges.add(base_interval);
     }
 }
 
@@ -1530,7 +1489,9 @@ bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size,
     if (!is_dirty) {
         return false;
     }
-    if (!IsRegionGpuModified(dest_address, copy_size)) {
+    VAddr aligned_start = Common::AlignDown(dest_address, YUZU_PAGESIZE);
+    VAddr aligned_end = Common::AlignUp(dest_address + copy_size, YUZU_PAGESIZE);
+    if (!IsRegionGpuModified(aligned_start, aligned_end - aligned_start)) {
         return false;
     }
 
diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h
index 4b3677da3..6f29cba25 100644
--- a/src/video_core/buffer_cache/buffer_cache_base.h
+++ b/src/video_core/buffer_cache/buffer_cache_base.h
@@ -17,6 +17,7 @@
 #include
 #undef BOOST_NO_MT
 #include
+#include
 #include
 #include
 
@@ -44,8 +45,7 @@
 namespace boost {
 template
-class fast_pool_allocator;
+class fast_pool_allocator;
 }
 
 namespace VideoCommon {
 
@@ -123,6 +123,31 @@ class BufferCache : public VideoCommon::ChannelSetupCaches
     using IntervalType = typename IntervalSet::interval_type;
 
+    template <class Type>
+    struct counter_add_functor : public boost::icl::identity_based_inplace_combine<Type> {
+        // types
+        typedef counter_add_functor<Type> type;
+        typedef boost::icl::identity_based_inplace_combine<Type> base_type;
+
+        // public member functions
+        void operator()(Type& current, const Type& added) const {
+            current += added;
+            if (current < base_type::identity_element()) {
+                current = base_type::identity_element();
+            }
+        }
+
+        // public static functions
+        static void version(Type&){};
+    };
+
+    using OverlapCombine = ICL_COMBINE_INSTANCE(counter_add_functor, int);
+    using OverlapSection = ICL_SECTION_INSTANCE(boost::icl::inter_section, int);
+    using OverlapCounter =
+        boost::icl::split_interval_map;
+
     struct Empty {};
 
     struct OverlapResult {
@@ -219,12 +244,9 @@ public:
     /// Commit asynchronous downloads
     void CommitAsyncFlushes();
     void CommitAsyncFlushesHigh();
-    void CommitAsyncQueries();
 
     /// Pop asynchronous downloads
     void PopAsyncFlushes();
-
-    void PopAsyncQueries();
     void PopAsyncBuffers();
 
     bool DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount);
@@ -302,6 +324,34 @@ private:
         }
     }
 
+    template <typename Func>
+    void ForEachInOverlapCounter(OverlapCounter& current_range, VAddr cpu_addr, u64 size,
+                                 Func&& func) {
+        const VAddr start_address = cpu_addr;
+        const VAddr end_address = start_address + size;
+        const IntervalType search_interval{start_address, end_address};
+        auto it = current_range.lower_bound(search_interval);
+        if (it == current_range.end()) {
+            return;
+        }
+        auto end_it = current_range.upper_bound(search_interval);
+        for (; it != end_it; it++) {
+            auto& inter = it->first;
+            VAddr inter_addr_end = inter.upper();
+            VAddr inter_addr = inter.lower();
+            if (inter_addr_end > end_address) {
+                inter_addr_end = end_address;
+            }
+            if (inter_addr < start_address) {
+                inter_addr = start_address;
+            }
+            if (it->second <= 0) {
+                __debugbreak();
+            }
+            func(inter_addr, inter_addr_end, it->second);
+        }
+    }
+
     static bool IsRangeGranular(VAddr cpu_addr, size_t size) {
         return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) ==
                ((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK);
@@ -309,6 +359,8 @@ private:
 
     void RunGarbageCollector();
 
+    void WaitOnAsyncFlushes(VAddr cpu_addr, u64 size);
+
     void BindHostIndexBuffer();
 
     void BindHostVertexBuffers();
@@ -474,10 +526,11 @@ private:
     IntervalSet uncommitted_ranges;
     IntervalSet common_ranges;
     IntervalSet cached_ranges;
+    IntervalSet pending_ranges;
     std::deque<IntervalSet> committed_ranges;
 
     // Async Buffers
-    std::deque<IntervalSet> async_downloads;
+    OverlapCounter async_downloads;
     std::deque<std::optional<Async_Buffer>> async_buffers;
     std::deque> pending_downloads;
     std::optional<Async_Buffer> current_buffer;
--
cgit v1.2.3
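A last illustrative note on the counter_add_functor introduced in buffer_cache_base.h: unlike the default ICL combiner it never lets a count drop below the identity element, which is presumably what lets ClearDownload() wipe a range by subtracting std::numeric_limits<int>::max() without leaving negative counts behind. A direct sketch of the functor's behaviour outside of any interval map (body copied from the patch, slightly condensed):

    #include <limits>
    #include <boost/icl/functors.hpp>

    template <class Type>
    struct counter_add_functor : public boost::icl::identity_based_inplace_combine<Type> {
        using base_type = boost::icl::identity_based_inplace_combine<Type>;
        void operator()(Type& current, const Type& added) const {
            current += added;
            if (current < base_type::identity_element()) {
                current = base_type::identity_element(); // clamp instead of going negative
            }
        }
    };

    int main() {
        counter_add_functor<int> combine;
        int count = 2;
        combine(count, 1);                                // count == 3
        combine(count, -std::numeric_limits<int>::max()); // clamped back to 0
        return count;                                     // 0
    }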