diff options
Diffstat (limited to 'src/video_core')
63 files changed, 3149 insertions, 1653 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index e904573d7..a0009a36f 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -11,8 +11,11 @@ endif() add_library(video_core STATIC buffer_cache/buffer_base.h + buffer_cache/buffer_cache_base.h buffer_cache/buffer_cache.cpp buffer_cache/buffer_cache.h + buffer_cache/memory_tracker_base.h + buffer_cache/word_manager.h cache_types.h cdma_pusher.cpp cdma_pusher.h @@ -104,6 +107,7 @@ add_library(video_core STATIC renderer_null/renderer_null.h renderer_opengl/blit_image.cpp renderer_opengl/blit_image.h + renderer_opengl/gl_buffer_cache_base.cpp renderer_opengl/gl_buffer_cache.cpp renderer_opengl/gl_buffer_cache.h renderer_opengl/gl_compute_pipeline.cpp @@ -154,6 +158,7 @@ add_library(video_core STATIC renderer_vulkan/renderer_vulkan.cpp renderer_vulkan/vk_blit_screen.cpp renderer_vulkan/vk_blit_screen.h + renderer_vulkan/vk_buffer_cache_base.cpp renderer_vulkan/vk_buffer_cache.cpp renderer_vulkan/vk_buffer_cache.h renderer_vulkan/vk_command_pool.cpp @@ -174,6 +179,8 @@ add_library(video_core STATIC renderer_vulkan/vk_master_semaphore.h renderer_vulkan/vk_pipeline_cache.cpp renderer_vulkan/vk_pipeline_cache.h + renderer_vulkan/vk_present_manager.cpp + renderer_vulkan/vk_present_manager.h renderer_vulkan/vk_query_cache.cpp renderer_vulkan/vk_query_cache.h renderer_vulkan/vk_rasterizer.cpp diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h index 1b4d63616..9cbd95c4b 100644 --- a/src/video_core/buffer_cache/buffer_base.h +++ b/src/video_core/buffer_cache/buffer_base.h @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later #pragma once @@ -11,9 +11,7 @@ #include "common/alignment.h" #include "common/common_funcs.h" #include "common/common_types.h" -#include "common/div_ceil.h" -#include "common/settings.h" -#include "core/memory.h" +#include "video_core/buffer_cache/word_manager.h" namespace VideoCommon { @@ -36,116 +34,12 @@ struct NullBufferParams {}; */ template <class RasterizerInterface> class BufferBase { - static constexpr u64 PAGES_PER_WORD = 64; - static constexpr u64 BYTES_PER_PAGE = Core::Memory::YUZU_PAGESIZE; - static constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; - - /// Vector tracking modified pages tightly packed with small vector optimization - union WordsArray { - /// Returns the pointer to the words state - [[nodiscard]] const u64* Pointer(bool is_short) const noexcept { - return is_short ? &stack : heap; - } - - /// Returns the pointer to the words state - [[nodiscard]] u64* Pointer(bool is_short) noexcept { - return is_short ? &stack : heap; - } - - u64 stack = 0; ///< Small buffers storage - u64* heap; ///< Not-small buffers pointer to the storage - }; - - struct Words { - explicit Words() = default; - explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} { - if (IsShort()) { - cpu.stack = ~u64{0}; - gpu.stack = 0; - cached_cpu.stack = 0; - untracked.stack = ~u64{0}; - } else { - // Share allocation between CPU and GPU pages and set their default values - const size_t num_words = NumWords(); - u64* const alloc = new u64[num_words * 4]; - cpu.heap = alloc; - gpu.heap = alloc + num_words; - cached_cpu.heap = alloc + num_words * 2; - untracked.heap = alloc + num_words * 3; - std::fill_n(cpu.heap, num_words, ~u64{0}); - std::fill_n(gpu.heap, num_words, 0); - std::fill_n(cached_cpu.heap, num_words, 0); - std::fill_n(untracked.heap, num_words, ~u64{0}); - } - // Clean up tailing bits - const u64 last_word_size = size_bytes % BYTES_PER_WORD; - const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE); - const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD; - const u64 last_word = (~u64{0} << shift) >> shift; - cpu.Pointer(IsShort())[NumWords() - 1] = last_word; - untracked.Pointer(IsShort())[NumWords() - 1] = last_word; - } - - ~Words() { - Release(); - } - - Words& operator=(Words&& rhs) noexcept { - Release(); - size_bytes = rhs.size_bytes; - cpu = rhs.cpu; - gpu = rhs.gpu; - cached_cpu = rhs.cached_cpu; - untracked = rhs.untracked; - rhs.cpu.heap = nullptr; - return *this; - } - - Words(Words&& rhs) noexcept - : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu}, - cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} { - rhs.cpu.heap = nullptr; - } - - Words& operator=(const Words&) = delete; - Words(const Words&) = delete; - - /// Returns true when the buffer fits in the small vector optimization - [[nodiscard]] bool IsShort() const noexcept { - return size_bytes <= BYTES_PER_WORD; - } - - /// Returns the number of words of the buffer - [[nodiscard]] size_t NumWords() const noexcept { - return Common::DivCeil(size_bytes, BYTES_PER_WORD); - } - - /// Release buffer resources - void Release() { - if (!IsShort()) { - // CPU written words is the base for the heap allocation - delete[] cpu.heap; - } - } - - u64 size_bytes = 0; - WordsArray cpu; - WordsArray gpu; - WordsArray cached_cpu; - WordsArray untracked; - }; - - enum class Type { - CPU, - GPU, - CachedCPU, - Untracked, - }; - public: - explicit BufferBase(RasterizerInterface& rasterizer_, VAddr cpu_addr_, u64 size_bytes) - : rasterizer{&rasterizer_}, cpu_addr{Common::AlignDown(cpu_addr_, BYTES_PER_PAGE)}, - words(Common::AlignUp(size_bytes + (cpu_addr_ - cpu_addr), BYTES_PER_PAGE)) {} + static constexpr u64 BASE_PAGE_BITS = 16; + static constexpr u64 BASE_PAGE_SIZE = 1ULL << BASE_PAGE_BITS; + + explicit BufferBase(RasterizerInterface& rasterizer_, VAddr cpu_addr_, u64 size_bytes_) + : cpu_addr{cpu_addr_}, size_bytes{size_bytes_} {} explicit BufferBase(NullBufferParams) {} @@ -155,100 +49,6 @@ public: BufferBase& operator=(BufferBase&&) = default; BufferBase(BufferBase&&) = default; - /// Returns the inclusive CPU modified range in a begin end pair - [[nodiscard]] std::pair<u64, u64> ModifiedCpuRegion(VAddr query_cpu_addr, - u64 query_size) const noexcept { - const u64 offset = query_cpu_addr - cpu_addr; - return ModifiedRegion<Type::CPU>(offset, query_size); - } - - /// Returns the inclusive GPU modified range in a begin end pair - [[nodiscard]] std::pair<u64, u64> ModifiedGpuRegion(VAddr query_cpu_addr, - u64 query_size) const noexcept { - const u64 offset = query_cpu_addr - cpu_addr; - return ModifiedRegion<Type::GPU>(offset, query_size); - } - - /// Returns true if a region has been modified from the CPU - [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { - const u64 offset = query_cpu_addr - cpu_addr; - return IsRegionModified<Type::CPU>(offset, query_size); - } - - /// Returns true if a region has been modified from the GPU - [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { - const u64 offset = query_cpu_addr - cpu_addr; - return IsRegionModified<Type::GPU>(offset, query_size); - } - - /// Mark region as CPU modified, notifying the rasterizer about this change - void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { - ChangeRegionState<Type::CPU, true>(dirty_cpu_addr, size); - } - - /// Unmark region as CPU modified, notifying the rasterizer about this change - void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { - ChangeRegionState<Type::CPU, false>(dirty_cpu_addr, size); - } - - /// Mark region as modified from the host GPU - void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { - ChangeRegionState<Type::GPU, true>(dirty_cpu_addr, size); - } - - /// Unmark region as modified from the host GPU - void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { - ChangeRegionState<Type::GPU, false>(dirty_cpu_addr, size); - } - - /// Mark region as modified from the CPU - /// but don't mark it as modified until FlusHCachedWrites is called. - void CachedCpuWrite(VAddr dirty_cpu_addr, u64 size) { - flags |= BufferFlagBits::CachedWrites; - ChangeRegionState<Type::CachedCPU, true>(dirty_cpu_addr, size); - } - - /// Flushes cached CPU writes, and notify the rasterizer about the deltas - void FlushCachedWrites() noexcept { - flags &= ~BufferFlagBits::CachedWrites; - const u64 num_words = NumWords(); - u64* const cached_words = Array<Type::CachedCPU>(); - u64* const untracked_words = Array<Type::Untracked>(); - u64* const cpu_words = Array<Type::CPU>(); - for (u64 word_index = 0; word_index < num_words; ++word_index) { - const u64 cached_bits = cached_words[word_index]; - NotifyRasterizer<false>(word_index, untracked_words[word_index], cached_bits); - untracked_words[word_index] |= cached_bits; - cpu_words[word_index] |= cached_bits; - if (!Settings::values.use_pessimistic_flushes) { - cached_words[word_index] = 0; - } - } - } - - /// Call 'func' for each CPU modified range and unmark those pages as CPU modified - template <typename Func> - void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) { - ForEachModifiedRange<Type::CPU>(query_cpu_range, size, true, func); - } - - /// Call 'func' for each GPU modified range and unmark those pages as GPU modified - template <typename Func> - void ForEachDownloadRange(VAddr query_cpu_range, u64 size, bool clear, Func&& func) { - ForEachModifiedRange<Type::GPU>(query_cpu_range, size, clear, func); - } - - template <typename Func> - void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 size, Func&& func) { - ForEachModifiedRange<Type::GPU>(query_cpu_range, size, true, func); - } - - /// Call 'func' for each GPU modified range and unmark those pages as GPU modified - template <typename Func> - void ForEachDownloadRange(Func&& func) { - ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), true, func); - } - /// Mark buffer as picked void Pick() noexcept { flags |= BufferFlagBits::Picked; @@ -295,11 +95,6 @@ public: return static_cast<u32>(other_cpu_addr - cpu_addr); } - /// Returns the size in bytes of the buffer - [[nodiscard]] u64 SizeBytes() const noexcept { - return words.size_bytes; - } - size_t getLRUID() const noexcept { return lru_id; } @@ -308,305 +103,16 @@ public: lru_id = lru_id_; } -private: - template <Type type> - u64* Array() noexcept { - if constexpr (type == Type::CPU) { - return words.cpu.Pointer(IsShort()); - } else if constexpr (type == Type::GPU) { - return words.gpu.Pointer(IsShort()); - } else if constexpr (type == Type::CachedCPU) { - return words.cached_cpu.Pointer(IsShort()); - } else if constexpr (type == Type::Untracked) { - return words.untracked.Pointer(IsShort()); - } - } - - template <Type type> - const u64* Array() const noexcept { - if constexpr (type == Type::CPU) { - return words.cpu.Pointer(IsShort()); - } else if constexpr (type == Type::GPU) { - return words.gpu.Pointer(IsShort()); - } else if constexpr (type == Type::CachedCPU) { - return words.cached_cpu.Pointer(IsShort()); - } else if constexpr (type == Type::Untracked) { - return words.untracked.Pointer(IsShort()); - } - } - - /** - * Change the state of a range of pages - * - * @param dirty_addr Base address to mark or unmark as modified - * @param size Size in bytes to mark or unmark as modified - */ - template <Type type, bool enable> - void ChangeRegionState(u64 dirty_addr, s64 size) noexcept(type == Type::GPU) { - const s64 difference = dirty_addr - cpu_addr; - const u64 offset = std::max<s64>(difference, 0); - size += std::min<s64>(difference, 0); - if (offset >= SizeBytes() || size < 0) { - return; - } - u64* const untracked_words = Array<Type::Untracked>(); - u64* const state_words = Array<type>(); - const u64 offset_end = std::min(offset + size, SizeBytes()); - const u64 begin_page_index = offset / BYTES_PER_PAGE; - const u64 begin_word_index = begin_page_index / PAGES_PER_WORD; - const u64 end_page_index = Common::DivCeil(offset_end, BYTES_PER_PAGE); - const u64 end_word_index = Common::DivCeil(end_page_index, PAGES_PER_WORD); - u64 page_index = begin_page_index % PAGES_PER_WORD; - u64 word_index = begin_word_index; - while (word_index < end_word_index) { - const u64 next_word_first_page = (word_index + 1) * PAGES_PER_WORD; - const u64 left_offset = - std::min(next_word_first_page - end_page_index, PAGES_PER_WORD) % PAGES_PER_WORD; - const u64 right_offset = page_index; - u64 bits = ~u64{0}; - bits = (bits >> right_offset) << right_offset; - bits = (bits << left_offset) >> left_offset; - if constexpr (type == Type::CPU || type == Type::CachedCPU) { - NotifyRasterizer<!enable>(word_index, untracked_words[word_index], bits); - } - if constexpr (enable) { - state_words[word_index] |= bits; - if constexpr (type == Type::CPU || type == Type::CachedCPU) { - untracked_words[word_index] |= bits; - } - } else { - state_words[word_index] &= ~bits; - if constexpr (type == Type::CPU || type == Type::CachedCPU) { - untracked_words[word_index] &= ~bits; - } - } - page_index = 0; - ++word_index; - } - } - - /** - * Notify rasterizer about changes in the CPU tracking state of a word in the buffer - * - * @param word_index Index to the word to notify to the rasterizer - * @param current_bits Current state of the word - * @param new_bits New state of the word - * - * @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages - */ - template <bool add_to_rasterizer> - void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const { - u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits; - VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; - while (changed_bits != 0) { - const int empty_bits = std::countr_zero(changed_bits); - addr += empty_bits * BYTES_PER_PAGE; - changed_bits >>= empty_bits; - - const u32 continuous_bits = std::countr_one(changed_bits); - const u64 size = continuous_bits * BYTES_PER_PAGE; - const VAddr begin_addr = addr; - addr += size; - changed_bits = continuous_bits < PAGES_PER_WORD ? (changed_bits >> continuous_bits) : 0; - rasterizer->UpdatePagesCachedCount(begin_addr, size, add_to_rasterizer ? 1 : -1); - } - } - - /** - * Loop over each page in the given range, turn off those bits and notify the rasterizer if - * needed. Call the given function on each turned off range. - * - * @param query_cpu_range Base CPU address to loop over - * @param size Size in bytes of the CPU range to loop over - * @param func Function to call for each turned off region - */ - template <Type type, typename Func> - void ForEachModifiedRange(VAddr query_cpu_range, s64 size, bool clear, Func&& func) { - static_assert(type != Type::Untracked); - - const s64 difference = query_cpu_range - cpu_addr; - const u64 query_begin = std::max<s64>(difference, 0); - size += std::min<s64>(difference, 0); - if (query_begin >= SizeBytes() || size < 0) { - return; - } - u64* const untracked_words = Array<Type::Untracked>(); - u64* const state_words = Array<type>(); - const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes()); - u64* const words_begin = state_words + query_begin / BYTES_PER_WORD; - u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD); - - const auto modified = [](u64 word) { return word != 0; }; - const auto first_modified_word = std::find_if(words_begin, words_end, modified); - if (first_modified_word == words_end) { - // Exit early when the buffer is not modified - return; - } - const auto last_modified_word = std::find_if_not(first_modified_word, words_end, modified); - - const u64 word_index_begin = std::distance(state_words, first_modified_word); - const u64 word_index_end = std::distance(state_words, last_modified_word); - - const unsigned local_page_begin = std::countr_zero(*first_modified_word); - const unsigned local_page_end = - static_cast<unsigned>(PAGES_PER_WORD) - std::countl_zero(last_modified_word[-1]); - const u64 word_page_begin = word_index_begin * PAGES_PER_WORD; - const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD; - const u64 query_page_begin = query_begin / BYTES_PER_PAGE; - const u64 query_page_end = Common::DivCeil(query_end, BYTES_PER_PAGE); - const u64 page_index_begin = std::max(word_page_begin + local_page_begin, query_page_begin); - const u64 page_index_end = std::min(word_page_end + local_page_end, query_page_end); - const u64 first_word_page_begin = page_index_begin % PAGES_PER_WORD; - const u64 last_word_page_end = (page_index_end - 1) % PAGES_PER_WORD + 1; - - u64 page_begin = first_word_page_begin; - u64 current_base = 0; - u64 current_size = 0; - bool on_going = false; - for (u64 word_index = word_index_begin; word_index < word_index_end; ++word_index) { - const bool is_last_word = word_index + 1 == word_index_end; - const u64 page_end = is_last_word ? last_word_page_end : PAGES_PER_WORD; - const u64 right_offset = page_begin; - const u64 left_offset = PAGES_PER_WORD - page_end; - u64 bits = ~u64{0}; - bits = (bits >> right_offset) << right_offset; - bits = (bits << left_offset) >> left_offset; - - const u64 current_word = state_words[word_index] & bits; - if (clear) { - state_words[word_index] &= ~bits; - } - - if constexpr (type == Type::CPU) { - const u64 current_bits = untracked_words[word_index] & bits; - untracked_words[word_index] &= ~bits; - NotifyRasterizer<true>(word_index, current_bits, ~u64{0}); - } - // Exclude CPU modified pages when visiting GPU pages - const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0); - u64 page = page_begin; - page_begin = 0; - - while (page < page_end) { - const int empty_bits = std::countr_zero(word >> page); - if (on_going && empty_bits != 0) { - InvokeModifiedRange(func, current_size, current_base); - current_size = 0; - on_going = false; - } - if (empty_bits == PAGES_PER_WORD) { - break; - } - page += empty_bits; - - const int continuous_bits = std::countr_one(word >> page); - if (!on_going && continuous_bits != 0) { - current_base = word_index * PAGES_PER_WORD + page; - on_going = true; - } - current_size += continuous_bits; - page += continuous_bits; - } - } - if (on_going && current_size > 0) { - InvokeModifiedRange(func, current_size, current_base); - } - } - - template <typename Func> - void InvokeModifiedRange(Func&& func, u64 current_size, u64 current_base) { - const u64 current_size_bytes = current_size * BYTES_PER_PAGE; - const u64 offset_begin = current_base * BYTES_PER_PAGE; - const u64 offset_end = std::min(offset_begin + current_size_bytes, SizeBytes()); - func(offset_begin, offset_end - offset_begin); + size_t SizeBytes() const { + return size_bytes; } - /** - * Returns true when a region has been modified - * - * @param offset Offset in bytes from the start of the buffer - * @param size Size in bytes of the region to query for modifications - */ - template <Type type> - [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { - static_assert(type != Type::Untracked); - - const u64* const untracked_words = Array<Type::Untracked>(); - const u64* const state_words = Array<type>(); - const u64 num_query_words = size / BYTES_PER_WORD + 1; - const u64 word_begin = offset / BYTES_PER_WORD; - const u64 word_end = std::min<u64>(word_begin + num_query_words, NumWords()); - const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); - u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD; - for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) { - const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; - const u64 word = state_words[word_index] & ~off_word; - if (word == 0) { - continue; - } - const u64 page_end = std::min((word_index + 1) * PAGES_PER_WORD, page_limit); - const u64 local_page_end = page_end % PAGES_PER_WORD; - const u64 page_end_shift = (PAGES_PER_WORD - local_page_end) % PAGES_PER_WORD; - if (((word >> page_index) << page_index) << page_end_shift != 0) { - return true; - } - } - return false; - } - - /** - * Returns a begin end pair with the inclusive modified region - * - * @param offset Offset in bytes from the start of the buffer - * @param size Size in bytes of the region to query for modifications - */ - template <Type type> - [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept { - static_assert(type != Type::Untracked); - - const u64* const untracked_words = Array<Type::Untracked>(); - const u64* const state_words = Array<type>(); - const u64 num_query_words = size / BYTES_PER_WORD + 1; - const u64 word_begin = offset / BYTES_PER_WORD; - const u64 word_end = std::min<u64>(word_begin + num_query_words, NumWords()); - const u64 page_base = offset / BYTES_PER_PAGE; - const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); - u64 begin = std::numeric_limits<u64>::max(); - u64 end = 0; - for (u64 word_index = word_begin; word_index < word_end; ++word_index) { - const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; - const u64 word = state_words[word_index] & ~off_word; - if (word == 0) { - continue; - } - const u64 local_page_begin = std::countr_zero(word); - const u64 local_page_end = PAGES_PER_WORD - std::countl_zero(word); - const u64 page_index = word_index * PAGES_PER_WORD; - const u64 page_begin = std::max(page_index + local_page_begin, page_base); - const u64 page_end = std::min(page_index + local_page_end, page_limit); - begin = std::min(begin, page_begin); - end = std::max(end, page_end); - } - static constexpr std::pair<u64, u64> EMPTY{0, 0}; - return begin < end ? std::make_pair(begin * BYTES_PER_PAGE, end * BYTES_PER_PAGE) : EMPTY; - } - - /// Returns the number of words of the buffer - [[nodiscard]] size_t NumWords() const noexcept { - return words.NumWords(); - } - - /// Returns true when the buffer fits in the small vector optimization - [[nodiscard]] bool IsShort() const noexcept { - return words.IsShort(); - } - - RasterizerInterface* rasterizer = nullptr; +private: VAddr cpu_addr = 0; - Words words; BufferFlagBits flags{}; int stream_score = 0; size_t lru_id = SIZE_MAX; + size_t size_bytes = 0; }; } // namespace VideoCommon diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index a16308b60..40db243d2 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later #include "common/microprofile.h" diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index abdc593df..e534e1e9c 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -1,485 +1,29 @@ -// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later #pragma once #include <algorithm> -#include <array> #include <memory> -#include <mutex> #include <numeric> -#include <span> -#include <vector> - -#include <boost/container/small_vector.hpp> -#include <boost/icl/interval_set.hpp> - -#include "common/common_types.h" -#include "common/div_ceil.h" -#include "common/literals.h" -#include "common/lru_cache.h" -#include "common/microprofile.h" -#include "common/polyfill_ranges.h" -#include "common/scratch_buffer.h" -#include "common/settings.h" -#include "core/memory.h" -#include "video_core/buffer_cache/buffer_base.h" -#include "video_core/control/channel_state_cache.h" -#include "video_core/delayed_destruction_ring.h" -#include "video_core/dirty_flags.h" -#include "video_core/engines/draw_manager.h" -#include "video_core/engines/kepler_compute.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/memory_manager.h" -#include "video_core/rasterizer_interface.h" -#include "video_core/surface.h" -#include "video_core/texture_cache/slot_vector.h" -#include "video_core/texture_cache/types.h" -namespace VideoCommon { - -MICROPROFILE_DECLARE(GPU_PrepareBuffers); -MICROPROFILE_DECLARE(GPU_BindUploadBuffers); -MICROPROFILE_DECLARE(GPU_DownloadMemory); - -using BufferId = SlotId; - -using VideoCore::Surface::PixelFormat; -using namespace Common::Literals; - -constexpr u32 NUM_VERTEX_BUFFERS = 32; -constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4; -constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18; -constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8; -constexpr u32 NUM_STORAGE_BUFFERS = 16; -constexpr u32 NUM_TEXTURE_BUFFERS = 16; -constexpr u32 NUM_STAGES = 5; - -enum class ObtainBufferSynchronize : u32 { - NoSynchronize = 0, - FullSynchronize = 1, - SynchronizeNoDirty = 2, -}; - -enum class ObtainBufferOperation : u32 { - DoNothing = 0, - MarkAsWritten = 1, - DiscardWrite = 2, - MarkQuery = 3, -}; - -using UniformBufferSizes = std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>; -using ComputeUniformBufferSizes = std::array<u32, NUM_COMPUTE_UNIFORM_BUFFERS>; - -template <typename P> -class BufferCache : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> { - - // Page size for caching purposes. - // This is unrelated to the CPU page size and it can be changed as it seems optimal. - static constexpr u32 YUZU_PAGEBITS = 16; - static constexpr u64 YUZU_PAGESIZE = u64{1} << YUZU_PAGEBITS; - - static constexpr bool IS_OPENGL = P::IS_OPENGL; - static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = - P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS; - static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = - P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT; - static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX; - static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX; - static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS; - static constexpr bool SEPARATE_IMAGE_BUFFERS_BINDINGS = P::SEPARATE_IMAGE_BUFFER_BINDINGS; - - static constexpr BufferId NULL_BUFFER_ID{0}; - - static constexpr s64 DEFAULT_EXPECTED_MEMORY = 512_MiB; - static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB; - static constexpr s64 TARGET_THRESHOLD = 4_GiB; - - using Maxwell = Tegra::Engines::Maxwell3D::Regs; - - using Runtime = typename P::Runtime; - using Buffer = typename P::Buffer; - - using IntervalSet = boost::icl::interval_set<VAddr>; - using IntervalType = typename IntervalSet::interval_type; - - struct Empty {}; - - struct OverlapResult { - std::vector<BufferId> ids; - VAddr begin; - VAddr end; - bool has_stream_leap = false; - }; - - struct Binding { - VAddr cpu_addr{}; - u32 size{}; - BufferId buffer_id; - }; - - struct TextureBufferBinding : Binding { - PixelFormat format; - }; - - static constexpr Binding NULL_BINDING{ - .cpu_addr = 0, - .size = 0, - .buffer_id = NULL_BUFFER_ID, - }; - -public: - static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = static_cast<u32>(4_KiB); - - explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, - Core::Memory::Memory& cpu_memory_, Runtime& runtime_); - - void TickFrame(); - - void WriteMemory(VAddr cpu_addr, u64 size); - - void CachedWriteMemory(VAddr cpu_addr, u64 size); - - void DownloadMemory(VAddr cpu_addr, u64 size); - - bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<const u8> inlined_buffer); - - void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size); - - void DisableGraphicsUniformBuffer(size_t stage, u32 index); - - void UpdateGraphicsBuffers(bool is_indexed); - - void UpdateComputeBuffers(); - - void BindHostGeometryBuffers(bool is_indexed); - - void BindHostStageBuffers(size_t stage); - - void BindHostComputeBuffers(); - - void SetUniformBuffersState(const std::array<u32, NUM_STAGES>& mask, - const UniformBufferSizes* sizes); - - void SetComputeUniformBufferState(u32 mask, const ComputeUniformBufferSizes* sizes); - - void UnbindGraphicsStorageBuffers(size_t stage); - - void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, - bool is_written); - - void UnbindGraphicsTextureBuffers(size_t stage); - - void BindGraphicsTextureBuffer(size_t stage, size_t tbo_index, GPUVAddr gpu_addr, u32 size, - PixelFormat format, bool is_written, bool is_image); - - void UnbindComputeStorageBuffers(); - - void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, - bool is_written); - - void UnbindComputeTextureBuffers(); - - void BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_addr, u32 size, PixelFormat format, - bool is_written, bool is_image); - - void FlushCachedWrites(); - - /// Return true when there are uncommitted buffers to be downloaded - [[nodiscard]] bool HasUncommittedFlushes() const noexcept; - - void AccumulateFlushes(); - - /// Return true when the caller should wait for async downloads - [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept; - - /// Commit asynchronous downloads - void CommitAsyncFlushes(); - void CommitAsyncFlushesHigh(); - - /// Pop asynchronous downloads - void PopAsyncFlushes(); - - bool DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount); - - bool DMAClear(GPUVAddr src_address, u64 amount, u32 value); - - [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size, - ObtainBufferSynchronize sync_info, - ObtainBufferOperation post_op); - - /// Return true when a CPU region is modified from the GPU - [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); - - /// Return true when a region is registered on the cache - [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size); - - /// Return true when a CPU region is modified from the CPU - [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size); - - void SetDrawIndirect( - const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect_) { - current_draw_indirect = current_draw_indirect_; - } - - [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectCount(); - - [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectBuffer(); - - std::recursive_mutex mutex; - Runtime& runtime; - -private: - template <typename Func> - static void ForEachEnabledBit(u32 enabled_mask, Func&& func) { - for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) { - const int disabled_bits = std::countr_zero(enabled_mask); - index += disabled_bits; - enabled_mask >>= disabled_bits; - func(index); - } - } - - template <typename Func> - void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) { - const u64 page_end = Common::DivCeil(cpu_addr + size, YUZU_PAGESIZE); - for (u64 page = cpu_addr >> YUZU_PAGEBITS; page < page_end;) { - const BufferId buffer_id = page_table[page]; - if (!buffer_id) { - ++page; - continue; - } - Buffer& buffer = slot_buffers[buffer_id]; - func(buffer_id, buffer); - - const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); - page = Common::DivCeil(end_addr, YUZU_PAGESIZE); - } - } - - template <typename Func> - void ForEachWrittenRange(VAddr cpu_addr, u64 size, Func&& func) { - const VAddr start_address = cpu_addr; - const VAddr end_address = start_address + size; - const VAddr search_base = - static_cast<VAddr>(std::min<s64>(0LL, static_cast<s64>(start_address - size))); - const IntervalType search_interval{search_base, search_base + 1}; - auto it = common_ranges.lower_bound(search_interval); - if (it == common_ranges.end()) { - it = common_ranges.begin(); - } - for (; it != common_ranges.end(); it++) { - VAddr inter_addr_end = it->upper(); - VAddr inter_addr = it->lower(); - if (inter_addr >= end_address) { - break; - } - if (inter_addr_end <= start_address) { - continue; - } - if (inter_addr_end > end_address) { - inter_addr_end = end_address; - } - if (inter_addr < start_address) { - inter_addr = start_address; - } - func(inter_addr, inter_addr_end); - } - } - - static bool IsRangeGranular(VAddr cpu_addr, size_t size) { - return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) == - ((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK); - } - - void RunGarbageCollector(); - - void BindHostIndexBuffer(); - - void BindHostVertexBuffers(); - - void BindHostDrawIndirectBuffers(); - - void BindHostGraphicsUniformBuffers(size_t stage); - - void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind); - - void BindHostGraphicsStorageBuffers(size_t stage); - - void BindHostGraphicsTextureBuffers(size_t stage); - - void BindHostTransformFeedbackBuffers(); - - void BindHostComputeUniformBuffers(); - - void BindHostComputeStorageBuffers(); - - void BindHostComputeTextureBuffers(); - - void DoUpdateGraphicsBuffers(bool is_indexed); - - void DoUpdateComputeBuffers(); - - void UpdateIndexBuffer(); - - void UpdateVertexBuffers(); - - void UpdateVertexBuffer(u32 index); - - void UpdateDrawIndirect(); - - void UpdateUniformBuffers(size_t stage); - - void UpdateStorageBuffers(size_t stage); - - void UpdateTextureBuffers(size_t stage); - - void UpdateTransformFeedbackBuffers(); - - void UpdateTransformFeedbackBuffer(u32 index); - - void UpdateComputeUniformBuffers(); - - void UpdateComputeStorageBuffers(); - - void UpdateComputeTextureBuffers(); - - void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size); - - [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size); - - [[nodiscard]] OverlapResult ResolveOverlaps(VAddr cpu_addr, u32 wanted_size); - - void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score); - - [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size); - - void Register(BufferId buffer_id); - - void Unregister(BufferId buffer_id); - - template <bool insert> - void ChangeRegister(BufferId buffer_id); - - void TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept; - - bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); - - bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); - - void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, - std::span<BufferCopy> copies); - - void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, - std::span<const BufferCopy> copies); - - void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies); - - void DownloadBufferMemory(Buffer& buffer_id); - - void DownloadBufferMemory(Buffer& buffer_id, VAddr cpu_addr, u64 size); - - void DeleteBuffer(BufferId buffer_id); - - void NotifyBufferDeletion(); - - [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, u32 cbuf_index, - bool is_written = false) const; - - [[nodiscard]] TextureBufferBinding GetTextureBufferBinding(GPUVAddr gpu_addr, u32 size, - PixelFormat format); - - [[nodiscard]] std::span<const u8> ImmediateBufferWithData(VAddr cpu_addr, size_t size); - - [[nodiscard]] std::span<u8> ImmediateBuffer(size_t wanted_capacity); - - [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept; - - void ClearDownload(IntervalType subtract_interval); - - VideoCore::RasterizerInterface& rasterizer; - Core::Memory::Memory& cpu_memory; - - SlotVector<Buffer> slot_buffers; - DelayedDestructionRing<Buffer, 8> delayed_destruction_ring; - - const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect{}; - - u32 last_index_count = 0; - - Binding index_buffer; - std::array<Binding, NUM_VERTEX_BUFFERS> vertex_buffers; - std::array<std::array<Binding, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES> uniform_buffers; - std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers; - std::array<std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS>, NUM_STAGES> texture_buffers; - std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers; - Binding count_buffer_binding; - Binding indirect_buffer_binding; - - std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers; - std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers; - std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS> compute_texture_buffers; - - std::array<u32, NUM_STAGES> enabled_uniform_buffer_masks{}; - u32 enabled_compute_uniform_buffer_mask = 0; - - const UniformBufferSizes* uniform_buffer_sizes{}; - const ComputeUniformBufferSizes* compute_uniform_buffer_sizes{}; - - std::array<u32, NUM_STAGES> enabled_storage_buffers{}; - std::array<u32, NUM_STAGES> written_storage_buffers{}; - u32 enabled_compute_storage_buffers = 0; - u32 written_compute_storage_buffers = 0; - - std::array<u32, NUM_STAGES> enabled_texture_buffers{}; - std::array<u32, NUM_STAGES> written_texture_buffers{}; - std::array<u32, NUM_STAGES> image_texture_buffers{}; - u32 enabled_compute_texture_buffers = 0; - u32 written_compute_texture_buffers = 0; - u32 image_compute_texture_buffers = 0; - - std::array<u32, 16> uniform_cache_hits{}; - std::array<u32, 16> uniform_cache_shots{}; - - u32 uniform_buffer_skip_cache_size = DEFAULT_SKIP_CACHE_SIZE; - - bool has_deleted_buffers = false; +#include "video_core/buffer_cache/buffer_cache_base.h" - std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, std::array<u32, NUM_STAGES>, Empty> - dirty_uniform_buffers{}; - std::conditional_t<IS_OPENGL, std::array<u32, NUM_STAGES>, Empty> fast_bound_uniform_buffers{}; - std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, - std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>, Empty> - uniform_buffer_binding_sizes{}; - - std::vector<BufferId> cached_write_buffer_ids; - - IntervalSet uncommitted_ranges; - IntervalSet common_ranges; - std::deque<IntervalSet> committed_ranges; - - Common::ScratchBuffer<u8> immediate_buffer_alloc; - - struct LRUItemParams { - using ObjectType = BufferId; - using TickType = u64; - }; - Common::LeastRecentlyUsedCache<LRUItemParams> lru_cache; - u64 frame_tick = 0; - u64 total_used_memory = 0; - u64 minimum_memory = 0; - u64 critical_memory = 0; +namespace VideoCommon { - std::array<BufferId, ((1ULL << 39) >> YUZU_PAGEBITS)> page_table; -}; +using Core::Memory::YUZU_PAGESIZE; template <class P> BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_, Core::Memory::Memory& cpu_memory_, Runtime& runtime_) - : runtime{runtime_}, rasterizer{rasterizer_}, cpu_memory{cpu_memory_} { + : runtime{runtime_}, rasterizer{rasterizer_}, cpu_memory{cpu_memory_}, memory_tracker{ + rasterizer} { // Ensure the first slot is used for the null buffer void(slot_buffers.insert(runtime, NullBufferParams{})); common_ranges.clear(); + inline_buffer_id = NULL_BUFFER_ID; + + active_async_buffers = !Settings::IsGPULevelHigh(); if (!runtime.CanReportMemoryUsage()) { minimum_memory = DEFAULT_EXPECTED_MEMORY; @@ -531,6 +75,8 @@ void BufferCache<P>::TickFrame() { uniform_cache_hits[0] = 0; uniform_cache_shots[0] = 0; + active_async_buffers = !Settings::IsGPULevelHigh(); + const bool skip_preferred = hits * 256 < shots * 251; uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0; @@ -543,35 +89,62 @@ void BufferCache<P>::TickFrame() { } ++frame_tick; delayed_destruction_ring.Tick(); + + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + for (auto& buffer : async_buffers_death_ring) { + runtime.FreeDeferredStagingBuffer(buffer); + } + async_buffers_death_ring.clear(); + } } template <class P> void BufferCache<P>::WriteMemory(VAddr cpu_addr, u64 size) { - ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { - buffer.MarkRegionAsCpuModified(cpu_addr, size); - }); + memory_tracker.MarkRegionAsCpuModified(cpu_addr, size); + if (memory_tracker.IsRegionGpuModified(cpu_addr, size)) { + const IntervalType subtract_interval{cpu_addr, cpu_addr + size}; + ClearDownload(subtract_interval); + common_ranges.subtract(subtract_interval); + } } template <class P> void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) { - ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { - if (!buffer.HasCachedWrites()) { - cached_write_buffer_ids.push_back(buffer_id); - } - buffer.CachedCpuWrite(cpu_addr, size); - }); + memory_tracker.CachedCpuWrite(cpu_addr, size); + const IntervalType add_interval{Common::AlignDown(cpu_addr, YUZU_PAGESIZE), + Common::AlignUp(cpu_addr + size, YUZU_PAGESIZE)}; + cached_ranges.add(add_interval); } template <class P> void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) { + WaitOnAsyncFlushes(cpu_addr, size); ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { DownloadBufferMemory(buffer, cpu_addr, size); }); } template <class P> +void BufferCache<P>::WaitOnAsyncFlushes(VAddr cpu_addr, u64 size) { + bool must_wait = false; + ForEachInOverlapCounter(async_downloads, cpu_addr, size, + [&](VAddr, VAddr, int) { must_wait = true; }); + bool must_release = false; + ForEachInRangeSet(pending_ranges, cpu_addr, size, [&](VAddr, VAddr) { must_release = true; }); + if (must_release) { + std::function<void()> tmp([]() {}); + rasterizer.SignalFence(std::move(tmp)); + } + if (must_wait || must_release) { + rasterizer.ReleaseFences(); + } +} + +template <class P> void BufferCache<P>::ClearDownload(IntervalType subtract_interval) { + RemoveEachInOverlapCounter(async_downloads, subtract_interval, -1024); uncommitted_ranges.subtract(subtract_interval); + pending_ranges.subtract(subtract_interval); for (auto& interval_set : committed_ranges) { interval_set.subtract(subtract_interval); } @@ -591,6 +164,7 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am } const IntervalType subtract_interval{*cpu_dest_address, *cpu_dest_address + amount}; + WaitOnAsyncFlushes(*cpu_src_address, static_cast<u32>(amount)); ClearDownload(subtract_interval); BufferId buffer_a; @@ -616,10 +190,11 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am const VAddr diff = base_address - *cpu_src_address; const VAddr new_base_address = *cpu_dest_address + diff; const IntervalType add_interval{new_base_address, new_base_address + size}; - uncommitted_ranges.add(add_interval); tmp_intervals.push_back(add_interval); + uncommitted_ranges.add(add_interval); + pending_ranges.add(add_interval); }; - ForEachWrittenRange(*cpu_src_address, amount, mirror); + ForEachInRangeSet(common_ranges, *cpu_src_address, amount, mirror); // This subtraction in this order is important for overlapping copies. common_ranges.subtract(subtract_interval); const bool has_new_downloads = tmp_intervals.size() != 0; @@ -628,7 +203,7 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am } runtime.CopyBuffer(dest_buffer, src_buffer, copies); if (has_new_downloads) { - dest_buffer.MarkRegionAsGpuModified(*cpu_dest_address, amount); + memory_tracker.MarkRegionAsGpuModified(*cpu_dest_address, amount); } std::vector<u8> tmp_buffer(amount); cpu_memory.ReadBlockUnsafe(*cpu_src_address, tmp_buffer.data(), amount); @@ -866,10 +441,9 @@ void BufferCache<P>::BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_add template <class P> void BufferCache<P>::FlushCachedWrites() { - for (const BufferId buffer_id : cached_write_buffer_ids) { - slot_buffers[buffer_id].FlushCachedWrites(); - } cached_write_buffer_ids.clear(); + memory_tracker.FlushCachedWrites(); + cached_ranges.clear(); } template <class P> @@ -879,10 +453,6 @@ bool BufferCache<P>::HasUncommittedFlushes() const noexcept { template <class P> void BufferCache<P>::AccumulateFlushes() { - if (Settings::values.gpu_accuracy.GetValue() != Settings::GPUAccuracy::High) { - uncommitted_ranges.clear(); - return; - } if (uncommitted_ranges.empty()) { return; } @@ -891,7 +461,11 @@ void BufferCache<P>::AccumulateFlushes() { template <class P> bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept { - return false; + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + return (!async_buffers.empty() && async_buffers.front().has_value()); + } else { + return false; + } } template <class P> @@ -899,12 +473,16 @@ void BufferCache<P>::CommitAsyncFlushesHigh() { AccumulateFlushes(); if (committed_ranges.empty()) { + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + if (active_async_buffers) { + async_buffers.emplace_back(std::optional<Async_Buffer>{}); + } + } return; } MICROPROFILE_SCOPE(GPU_DownloadMemory); - const bool is_accuracy_normal = - Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::Normal; + pending_ranges.clear(); auto it = committed_ranges.begin(); while (it != committed_ranges.end()) { auto& current_intervals = *it; @@ -926,11 +504,12 @@ void BufferCache<P>::CommitAsyncFlushesHigh() { const std::size_t size = interval.upper() - interval.lower(); const VAddr cpu_addr = interval.lower(); ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { - buffer.ForEachDownloadRangeAndClear( - cpu_addr, size, [&](u64 range_offset, u64 range_size) { - if (is_accuracy_normal) { - return; - } + const VAddr buffer_start = buffer.CpuAddr(); + const VAddr buffer_end = buffer_start + buffer.SizeBytes(); + const VAddr new_start = std::max(buffer_start, cpu_addr); + const VAddr new_end = std::min(buffer_end, cpu_addr + size); + memory_tracker.ForEachDownloadRange( + new_start, new_end - new_start, false, [&](u64 cpu_addr_out, u64 range_size) { const VAddr buffer_addr = buffer.CpuAddr(); const auto add_download = [&](VAddr start, VAddr end) { const u64 new_offset = start - buffer_addr; @@ -944,92 +523,142 @@ void BufferCache<P>::CommitAsyncFlushesHigh() { buffer_id, }); // Align up to avoid cache conflicts - constexpr u64 align = 8ULL; + constexpr u64 align = 64ULL; constexpr u64 mask = ~(align - 1ULL); total_size_bytes += (new_size + align - 1) & mask; largest_copy = std::max(largest_copy, new_size); }; - const VAddr start_address = buffer_addr + range_offset; - const VAddr end_address = start_address + range_size; - ForEachWrittenRange(start_address, range_size, add_download); - const IntervalType subtract_interval{start_address, end_address}; - common_ranges.subtract(subtract_interval); + ForEachInRangeSet(common_ranges, cpu_addr_out, range_size, add_download); }); }); } } committed_ranges.clear(); if (downloads.empty()) { + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + if (active_async_buffers) { + async_buffers.emplace_back(std::optional<Async_Buffer>{}); + } + } return; } - if constexpr (USE_MEMORY_MAPS) { - auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); - runtime.PreCopyBarrier(); - for (auto& [copy, buffer_id] : downloads) { - // Have in mind the staging buffer offset for the copy - copy.dst_offset += download_staging.offset; - const std::array copies{copy}; - runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies, false); - } - runtime.PostCopyBarrier(); - runtime.Finish(); - for (const auto& [copy, buffer_id] : downloads) { - const Buffer& buffer = slot_buffers[buffer_id]; - const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; - // Undo the modified offset - const u64 dst_offset = copy.dst_offset - download_staging.offset; - const u8* read_mapped_memory = download_staging.mapped_span.data() + dst_offset; - cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size); + if (active_async_buffers) { + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true); + boost::container::small_vector<BufferCopy, 4> normalized_copies; + IntervalSet new_async_range{}; + runtime.PreCopyBarrier(); + for (auto& [copy, buffer_id] : downloads) { + copy.dst_offset += download_staging.offset; + const std::array copies{copy}; + BufferCopy second_copy{copy}; + Buffer& buffer = slot_buffers[buffer_id]; + second_copy.src_offset = static_cast<size_t>(buffer.CpuAddr()) + copy.src_offset; + VAddr orig_cpu_addr = static_cast<VAddr>(second_copy.src_offset); + const IntervalType base_interval{orig_cpu_addr, orig_cpu_addr + copy.size}; + async_downloads += std::make_pair(base_interval, 1); + runtime.CopyBuffer(download_staging.buffer, buffer, copies, false); + normalized_copies.push_back(second_copy); + } + runtime.PostCopyBarrier(); + pending_downloads.emplace_back(std::move(normalized_copies)); + async_buffers.emplace_back(download_staging); + } else { + committed_ranges.clear(); + uncommitted_ranges.clear(); } } else { - const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); - for (const auto& [copy, buffer_id] : downloads) { - Buffer& buffer = slot_buffers[buffer_id]; - buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); - const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; - cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); + if constexpr (USE_MEMORY_MAPS) { + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); + runtime.PreCopyBarrier(); + for (auto& [copy, buffer_id] : downloads) { + // Have in mind the staging buffer offset for the copy + copy.dst_offset += download_staging.offset; + const std::array copies{copy}; + runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies, false); + } + runtime.PostCopyBarrier(); + runtime.Finish(); + for (const auto& [copy, buffer_id] : downloads) { + const Buffer& buffer = slot_buffers[buffer_id]; + const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; + // Undo the modified offset + const u64 dst_offset = copy.dst_offset - download_staging.offset; + const u8* read_mapped_memory = download_staging.mapped_span.data() + dst_offset; + cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size); + } + } else { + const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); + for (const auto& [copy, buffer_id] : downloads) { + Buffer& buffer = slot_buffers[buffer_id]; + buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); + const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; + cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); + } } } } template <class P> void BufferCache<P>::CommitAsyncFlushes() { - if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) { - CommitAsyncFlushesHigh(); - } else { - uncommitted_ranges.clear(); - committed_ranges.clear(); - } + CommitAsyncFlushesHigh(); } template <class P> -void BufferCache<P>::PopAsyncFlushes() {} +void BufferCache<P>::PopAsyncFlushes() { + MICROPROFILE_SCOPE(GPU_DownloadMemory); + PopAsyncBuffers(); +} template <class P> -bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { - const u64 page_end = Common::DivCeil(addr + size, YUZU_PAGESIZE); - for (u64 page = addr >> YUZU_PAGEBITS; page < page_end;) { - const BufferId image_id = page_table[page]; - if (!image_id) { - ++page; - continue; - } - Buffer& buffer = slot_buffers[image_id]; - if (buffer.IsRegionGpuModified(addr, size)) { - return true; +void BufferCache<P>::PopAsyncBuffers() { + if (async_buffers.empty()) { + return; + } + if (!async_buffers.front().has_value()) { + async_buffers.pop_front(); + return; + } + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + auto& downloads = pending_downloads.front(); + auto& async_buffer = async_buffers.front(); + u8* base = async_buffer->mapped_span.data(); + const size_t base_offset = async_buffer->offset; + for (const auto& copy : downloads) { + const VAddr cpu_addr = static_cast<VAddr>(copy.src_offset); + const u64 dst_offset = copy.dst_offset - base_offset; + const u8* read_mapped_memory = base + dst_offset; + ForEachInOverlapCounter( + async_downloads, cpu_addr, copy.size, [&](VAddr start, VAddr end, int count) { + cpu_memory.WriteBlockUnsafe(start, &read_mapped_memory[start - cpu_addr], + end - start); + if (count == 1) { + const IntervalType base_interval{start, end}; + common_ranges.subtract(base_interval); + } + }); + const IntervalType subtract_interval{cpu_addr, cpu_addr + copy.size}; + RemoveEachInOverlapCounter(async_downloads, subtract_interval, -1); } - const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); - page = Common::DivCeil(end_addr, YUZU_PAGESIZE); + async_buffers_death_ring.emplace_back(*async_buffer); + async_buffers.pop_front(); + pending_downloads.pop_front(); } - return false; +} + +template <class P> +bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { + bool is_dirty = false; + ForEachInRangeSet(common_ranges, addr, size, [&](VAddr, VAddr) { is_dirty = true; }); + return is_dirty; } template <class P> bool BufferCache<P>::IsRegionRegistered(VAddr addr, size_t size) { const VAddr end_addr = addr + size; - const u64 page_end = Common::DivCeil(end_addr, YUZU_PAGESIZE); - for (u64 page = addr >> YUZU_PAGEBITS; page < page_end;) { + const u64 page_end = Common::DivCeil(end_addr, CACHING_PAGESIZE); + for (u64 page = addr >> CACHING_PAGEBITS; page < page_end;) { const BufferId buffer_id = page_table[page]; if (!buffer_id) { ++page; @@ -1041,28 +670,14 @@ bool BufferCache<P>::IsRegionRegistered(VAddr addr, size_t size) { if (buf_start_addr < end_addr && addr < buf_end_addr) { return true; } - page = Common::DivCeil(end_addr, YUZU_PAGESIZE); + page = Common::DivCeil(end_addr, CACHING_PAGESIZE); } return false; } template <class P> bool BufferCache<P>::IsRegionCpuModified(VAddr addr, size_t size) { - const u64 page_end = Common::DivCeil(addr + size, YUZU_PAGESIZE); - for (u64 page = addr >> YUZU_PAGEBITS; page < page_end;) { - const BufferId image_id = page_table[page]; - if (!image_id) { - ++page; - continue; - } - Buffer& buffer = slot_buffers[image_id]; - if (buffer.IsRegionCpuModified(addr, size)) { - return true; - } - const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); - page = Common::DivCeil(end_addr, YUZU_PAGESIZE); - } - return false; + return memory_tracker.IsRegionCpuModified(addr, size); } template <class P> @@ -1072,7 +687,7 @@ void BufferCache<P>::BindHostIndexBuffer() { const u32 offset = buffer.Offset(index_buffer.cpu_addr); const u32 size = index_buffer.size; const auto& draw_state = maxwell3d->draw_manager->GetDrawState(); - if (!draw_state.inline_index_draw_indexes.empty()) { + if (!draw_state.inline_index_draw_indexes.empty()) [[unlikely]] { if constexpr (USE_MEMORY_MAPS) { auto upload_staging = runtime.UploadStagingBuffer(size); std::array<BufferCopy, 1> copies{ @@ -1155,7 +770,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 TouchBuffer(buffer, binding.buffer_id); const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID && size <= uniform_buffer_skip_cache_size && - !buffer.IsRegionGpuModified(cpu_addr, size); + !memory_tracker.IsRegionGpuModified(cpu_addr, size); if (use_fast_buffer) { if constexpr (IS_OPENGL) { if (runtime.HasFastBufferSubData()) { @@ -1378,27 +993,36 @@ void BufferCache<P>::UpdateIndexBuffer() { // We have to check for the dirty flags and index count // The index count is currently changed without updating the dirty flags const auto& draw_state = maxwell3d->draw_manager->GetDrawState(); - const auto& index_array = draw_state.index_buffer; + const auto& index_buffer_ref = draw_state.index_buffer; auto& flags = maxwell3d->dirty.flags; if (!flags[Dirty::IndexBuffer]) { return; } flags[Dirty::IndexBuffer] = false; - last_index_count = index_array.count; - if (!draw_state.inline_index_draw_indexes.empty()) { + if (!draw_state.inline_index_draw_indexes.empty()) [[unlikely]] { auto inline_index_size = static_cast<u32>(draw_state.inline_index_draw_indexes.size()); + u32 buffer_size = Common::AlignUp(inline_index_size, CACHING_PAGESIZE); + if (inline_buffer_id == NULL_BUFFER_ID) [[unlikely]] { + inline_buffer_id = CreateBuffer(0, buffer_size); + } + if (slot_buffers[inline_buffer_id].SizeBytes() < buffer_size) [[unlikely]] { + slot_buffers.erase(inline_buffer_id); + inline_buffer_id = CreateBuffer(0, buffer_size); + } index_buffer = Binding{ .cpu_addr = 0, .size = inline_index_size, - .buffer_id = CreateBuffer(0, inline_index_size), + .buffer_id = inline_buffer_id, }; return; } - const GPUVAddr gpu_addr_begin = index_array.StartAddress(); - const GPUVAddr gpu_addr_end = index_array.EndAddress(); + + const GPUVAddr gpu_addr_begin = index_buffer_ref.StartAddress(); + const GPUVAddr gpu_addr_end = index_buffer_ref.EndAddress(); const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr_begin); const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin); - const u32 draw_size = (index_array.count + index_array.first) * index_array.FormatSizeInBytes(); + const u32 draw_size = + (index_buffer_ref.count + index_buffer_ref.first) * index_buffer_ref.FormatSizeInBytes(); const u32 size = std::min(address_size, draw_size); if (size == 0 || !cpu_addr) { index_buffer = NULL_BINDING; @@ -1434,17 +1058,15 @@ void BufferCache<P>::UpdateVertexBuffer(u32 index) { const GPUVAddr gpu_addr_begin = array.Address(); const GPUVAddr gpu_addr_end = limit.Address() + 1; const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr_begin); - u32 address_size = static_cast<u32>( - std::min(gpu_addr_end - gpu_addr_begin, static_cast<u64>(std::numeric_limits<u32>::max()))); - if (array.enable == 0 || address_size == 0 || !cpu_addr) { + const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin); + u32 size = address_size; // TODO: Analyze stride and number of vertices + if (array.enable == 0 || size == 0 || !cpu_addr) { vertex_buffers[index] = NULL_BINDING; return; } if (!gpu_memory->IsWithinGPUAddressRange(gpu_addr_end)) { - address_size = - static_cast<u32>(gpu_memory->MaxContinuousRange(gpu_addr_begin, address_size)); + size = static_cast<u32>(gpu_memory->MaxContinuousRange(gpu_addr_begin, size)); } - const u32 size = address_size; // TODO: Analyze stride and number of vertices vertex_buffers[index] = Binding{ .cpu_addr = *cpu_addr, .size = size, @@ -1591,17 +1213,16 @@ void BufferCache<P>::UpdateComputeTextureBuffers() { template <class P> void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size) { - Buffer& buffer = slot_buffers[buffer_id]; - buffer.MarkRegionAsGpuModified(cpu_addr, size); + memory_tracker.MarkRegionAsGpuModified(cpu_addr, size); + + if (memory_tracker.IsRegionCpuModified(cpu_addr, size)) { + SynchronizeBuffer(slot_buffers[buffer_id], cpu_addr, size); + } const IntervalType base_interval{cpu_addr, cpu_addr + size}; common_ranges.add(base_interval); - - const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); - if (!is_async) { - return; - } uncommitted_ranges.add(base_interval); + pending_ranges.add(base_interval); } template <class P> @@ -1609,7 +1230,7 @@ BufferId BufferCache<P>::FindBuffer(VAddr cpu_addr, u32 size) { if (cpu_addr == 0) { return NULL_BUFFER_ID; } - const u64 page = cpu_addr >> YUZU_PAGEBITS; + const u64 page = cpu_addr >> CACHING_PAGEBITS; const BufferId buffer_id = page_table[page]; if (!buffer_id) { return CreateBuffer(cpu_addr, size); @@ -1638,9 +1259,9 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu .has_stream_leap = has_stream_leap, }; } - for (; cpu_addr >> YUZU_PAGEBITS < Common::DivCeil(end, YUZU_PAGESIZE); - cpu_addr += YUZU_PAGESIZE) { - const BufferId overlap_id = page_table[cpu_addr >> YUZU_PAGEBITS]; + for (; cpu_addr >> CACHING_PAGEBITS < Common::DivCeil(end, CACHING_PAGESIZE); + cpu_addr += CACHING_PAGESIZE) { + const BufferId overlap_id = page_table[cpu_addr >> CACHING_PAGEBITS]; if (!overlap_id) { continue; } @@ -1666,11 +1287,11 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu // as a stream buffer. Increase the size to skip constantly recreating buffers. has_stream_leap = true; if (expands_right) { - begin -= YUZU_PAGESIZE * 256; + begin -= CACHING_PAGESIZE * 256; cpu_addr = begin; } if (expands_left) { - end += YUZU_PAGESIZE * 256; + end += CACHING_PAGESIZE * 256; } } } @@ -1690,25 +1311,22 @@ void BufferCache<P>::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, if (accumulate_stream_score) { new_buffer.IncreaseStreamScore(overlap.StreamScore() + 1); } - std::vector<BufferCopy> copies; + boost::container::small_vector<BufferCopy, 1> copies; const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr(); - overlap.ForEachDownloadRange([&](u64 begin, u64 range_size) { - copies.push_back(BufferCopy{ - .src_offset = begin, - .dst_offset = dst_base_offset + begin, - .size = range_size, - }); - new_buffer.UnmarkRegionAsCpuModified(begin, range_size); - new_buffer.MarkRegionAsGpuModified(begin, range_size); + copies.push_back(BufferCopy{ + .src_offset = 0, + .dst_offset = dst_base_offset, + .size = overlap.SizeBytes(), }); - if (!copies.empty()) { - runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies); - } - DeleteBuffer(overlap_id); + runtime.CopyBuffer(new_buffer, overlap, copies); + DeleteBuffer(overlap_id, true); } template <class P> BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { + VAddr cpu_addr_end = Common::AlignUp(cpu_addr + wanted_size, CACHING_PAGESIZE); + cpu_addr = Common::AlignDown(cpu_addr, CACHING_PAGESIZE); + wanted_size = static_cast<u32>(cpu_addr_end - cpu_addr); const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size); const u32 size = static_cast<u32>(overlap.end - overlap.begin); const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size); @@ -1718,7 +1336,7 @@ BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap); } Register(new_buffer_id); - TouchBuffer(slot_buffers[new_buffer_id], new_buffer_id); + TouchBuffer(new_buffer, new_buffer_id); return new_buffer_id; } @@ -1746,8 +1364,8 @@ void BufferCache<P>::ChangeRegister(BufferId buffer_id) { } const VAddr cpu_addr_begin = buffer.CpuAddr(); const VAddr cpu_addr_end = cpu_addr_begin + size; - const u64 page_begin = cpu_addr_begin / YUZU_PAGESIZE; - const u64 page_end = Common::DivCeil(cpu_addr_end, YUZU_PAGESIZE); + const u64 page_begin = cpu_addr_begin / CACHING_PAGESIZE; + const u64 page_end = Common::DivCeil(cpu_addr_end, CACHING_PAGESIZE); for (u64 page = page_begin; page != page_end; ++page) { if constexpr (insert) { page_table[page] = buffer_id; @@ -1766,9 +1384,6 @@ void BufferCache<P>::TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept { template <class P> bool BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) { - if (buffer.CpuAddr() == 0) { - return true; - } return SynchronizeBufferImpl(buffer, cpu_addr, size); } @@ -1777,10 +1392,11 @@ bool BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 s boost::container::small_vector<BufferCopy, 4> copies; u64 total_size_bytes = 0; u64 largest_copy = 0; - buffer.ForEachUploadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { + VAddr buffer_start = buffer.CpuAddr(); + memory_tracker.ForEachUploadRange(cpu_addr, size, [&](u64 cpu_addr_out, u64 range_size) { copies.push_back(BufferCopy{ .src_offset = total_size_bytes, - .dst_offset = range_offset, + .dst_offset = cpu_addr_out - buffer_start, .size = range_size, }); total_size_bytes += range_size; @@ -1795,6 +1411,51 @@ bool BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 s } template <class P> +bool BufferCache<P>::SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size) { + boost::container::small_vector<BufferCopy, 4> copies; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + IntervalSet found_sets{}; + auto make_copies = [&] { + for (auto& interval : found_sets) { + const std::size_t sub_size = interval.upper() - interval.lower(); + const VAddr cpu_addr_ = interval.lower(); + copies.push_back(BufferCopy{ + .src_offset = total_size_bytes, + .dst_offset = cpu_addr_ - buffer.CpuAddr(), + .size = sub_size, + }); + total_size_bytes += sub_size; + largest_copy = std::max<u64>(largest_copy, sub_size); + } + const std::span<BufferCopy> copies_span(copies.data(), copies.size()); + UploadMemory(buffer, total_size_bytes, largest_copy, copies_span); + }; + memory_tracker.ForEachUploadRange(cpu_addr, size, [&](u64 cpu_addr_out, u64 range_size) { + const VAddr base_adr = cpu_addr_out; + const VAddr end_adr = base_adr + range_size; + const IntervalType add_interval{base_adr, end_adr}; + found_sets.add(add_interval); + }); + if (found_sets.empty()) { + return true; + } + const IntervalType search_interval{cpu_addr, cpu_addr + size}; + auto it = common_ranges.lower_bound(search_interval); + auto it_end = common_ranges.upper_bound(search_interval); + if (it == common_ranges.end()) { + make_copies(); + return false; + } + while (it != it_end) { + found_sets.subtract(*it); + it++; + } + make_copies(); + return false; +} + +template <class P> void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, std::span<BufferCopy> copies) { if constexpr (USE_MEMORY_MAPS) { @@ -1805,39 +1466,45 @@ void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 larg } template <class P> -void BufferCache<P>::ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, - std::span<const BufferCopy> copies) { - std::span<u8> immediate_buffer; - for (const BufferCopy& copy : copies) { - std::span<const u8> upload_span; - const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; - if (IsRangeGranular(cpu_addr, copy.size)) { - upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size); - } else { - if (immediate_buffer.empty()) { - immediate_buffer = ImmediateBuffer(largest_copy); +void BufferCache<P>::ImmediateUploadMemory([[maybe_unused]] Buffer& buffer, + [[maybe_unused]] u64 largest_copy, + [[maybe_unused]] std::span<const BufferCopy> copies) { + if constexpr (!USE_MEMORY_MAPS) { + std::span<u8> immediate_buffer; + for (const BufferCopy& copy : copies) { + std::span<const u8> upload_span; + const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; + if (IsRangeGranular(cpu_addr, copy.size)) { + upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size); + } else { + if (immediate_buffer.empty()) { + immediate_buffer = ImmediateBuffer(largest_copy); + } + cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); + upload_span = immediate_buffer.subspan(0, copy.size); } - cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); - upload_span = immediate_buffer.subspan(0, copy.size); + buffer.ImmediateUpload(copy.dst_offset, upload_span); } - buffer.ImmediateUpload(copy.dst_offset, upload_span); } } template <class P> -void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, - std::span<BufferCopy> copies) { - auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes); - const std::span<u8> staging_pointer = upload_staging.mapped_span; - for (BufferCopy& copy : copies) { - u8* const src_pointer = staging_pointer.data() + copy.src_offset; - const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; - cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size); +void BufferCache<P>::MappedUploadMemory([[maybe_unused]] Buffer& buffer, + [[maybe_unused]] u64 total_size_bytes, + [[maybe_unused]] std::span<BufferCopy> copies) { + if constexpr (USE_MEMORY_MAPS) { + auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes); + const std::span<u8> staging_pointer = upload_staging.mapped_span; + for (BufferCopy& copy : copies) { + u8* const src_pointer = staging_pointer.data() + copy.src_offset; + const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; + cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size); - // Apply the staging offset - copy.src_offset += upload_staging.offset; + // Apply the staging offset + copy.src_offset += upload_staging.offset; + } + runtime.CopyBuffer(buffer, upload_staging.buffer, copies); } - runtime.CopyBuffer(buffer, upload_staging.buffer, copies); } template <class P> @@ -1847,7 +1514,9 @@ bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size, if (!is_dirty) { return false; } - if (!IsRegionGpuModified(dest_address, copy_size)) { + VAddr aligned_start = Common::AlignDown(dest_address, YUZU_PAGESIZE); + VAddr aligned_end = Common::AlignUp(dest_address + copy_size, YUZU_PAGESIZE); + if (!IsRegionGpuModified(aligned_start, aligned_end - aligned_start)) { return false; } @@ -1886,30 +1555,31 @@ void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si boost::container::small_vector<BufferCopy, 1> copies; u64 total_size_bytes = 0; u64 largest_copy = 0; - buffer.ForEachDownloadRangeAndClear(cpu_addr, size, [&](u64 range_offset, u64 range_size) { - const VAddr buffer_addr = buffer.CpuAddr(); - const auto add_download = [&](VAddr start, VAddr end) { - const u64 new_offset = start - buffer_addr; - const u64 new_size = end - start; - copies.push_back(BufferCopy{ - .src_offset = new_offset, - .dst_offset = total_size_bytes, - .size = new_size, - }); - // Align up to avoid cache conflicts - constexpr u64 align = 256ULL; - constexpr u64 mask = ~(align - 1ULL); - total_size_bytes += (new_size + align - 1) & mask; - largest_copy = std::max(largest_copy, new_size); - }; - - const VAddr start_address = buffer_addr + range_offset; - const VAddr end_address = start_address + range_size; - ForEachWrittenRange(start_address, range_size, add_download); - const IntervalType subtract_interval{start_address, end_address}; - ClearDownload(subtract_interval); - common_ranges.subtract(subtract_interval); - }); + memory_tracker.ForEachDownloadRangeAndClear( + cpu_addr, size, [&](u64 cpu_addr_out, u64 range_size) { + const VAddr buffer_addr = buffer.CpuAddr(); + const auto add_download = [&](VAddr start, VAddr end) { + const u64 new_offset = start - buffer_addr; + const u64 new_size = end - start; + copies.push_back(BufferCopy{ + .src_offset = new_offset, + .dst_offset = total_size_bytes, + .size = new_size, + }); + // Align up to avoid cache conflicts + constexpr u64 align = 64ULL; + constexpr u64 mask = ~(align - 1ULL); + total_size_bytes += (new_size + align - 1) & mask; + largest_copy = std::max(largest_copy, new_size); + }; + + const VAddr start_address = cpu_addr_out; + const VAddr end_address = start_address + range_size; + ForEachInRangeSet(common_ranges, start_address, range_size, add_download); + const IntervalType subtract_interval{start_address, end_address}; + ClearDownload(subtract_interval); + common_ranges.subtract(subtract_interval); + }); if (total_size_bytes == 0) { return; } @@ -1943,7 +1613,7 @@ void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si } template <class P> -void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { +void BufferCache<P>::DeleteBuffer(BufferId buffer_id, bool do_not_mark) { const auto scalar_replace = [buffer_id](Binding& binding) { if (binding.buffer_id == buffer_id) { binding.buffer_id = BufferId{}; @@ -1962,8 +1632,10 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { std::erase(cached_write_buffer_ids, buffer_id); // Mark the whole buffer as CPU written to stop tracking CPU writes - Buffer& buffer = slot_buffers[buffer_id]; - buffer.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes()); + if (!do_not_mark) { + Buffer& buffer = slot_buffers[buffer_id]; + memory_tracker.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes()); + } Unregister(buffer_id); delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id])); @@ -2011,7 +1683,7 @@ typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr s LOG_WARNING(HW_GPU, "Failed to find storage buffer for cbuf index {}", cbuf_index); return NULL_BINDING; } - const VAddr cpu_end = Common::AlignUp(*cpu_addr + size, Core::Memory::YUZU_PAGESIZE); + const VAddr cpu_end = Common::AlignUp(*cpu_addr + size, YUZU_PAGESIZE); const Binding binding{ .cpu_addr = *cpu_addr, .size = is_written ? size : static_cast<u32>(cpu_end - *cpu_addr), diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h new file mode 100644 index 000000000..656baa550 --- /dev/null +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -0,0 +1,580 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include <algorithm> +#include <array> +#include <functional> +#include <memory> +#include <mutex> +#include <numeric> +#include <span> +#include <unordered_map> +#include <vector> + +#include <boost/container/small_vector.hpp> +#define BOOST_NO_MT +#include <boost/pool/detail/mutex.hpp> +#undef BOOST_NO_MT +#include <boost/icl/interval.hpp> +#include <boost/icl/interval_base_set.hpp> +#include <boost/icl/interval_set.hpp> +#include <boost/icl/split_interval_map.hpp> +#include <boost/pool/pool.hpp> +#include <boost/pool/pool_alloc.hpp> +#include <boost/pool/poolfwd.hpp> + +#include "common/common_types.h" +#include "common/div_ceil.h" +#include "common/literals.h" +#include "common/lru_cache.h" +#include "common/microprofile.h" +#include "common/scope_exit.h" +#include "common/settings.h" +#include "core/memory.h" +#include "video_core/buffer_cache/buffer_base.h" +#include "video_core/control/channel_state_cache.h" +#include "video_core/delayed_destruction_ring.h" +#include "video_core/dirty_flags.h" +#include "video_core/engines/draw_manager.h" +#include "video_core/engines/kepler_compute.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/memory_manager.h" +#include "video_core/rasterizer_interface.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/slot_vector.h" +#include "video_core/texture_cache/types.h" + +namespace boost { +template <typename T> +class fast_pool_allocator<T, default_user_allocator_new_delete, details::pool::null_mutex, 4096, 0>; +} + +namespace VideoCommon { + +MICROPROFILE_DECLARE(GPU_PrepareBuffers); +MICROPROFILE_DECLARE(GPU_BindUploadBuffers); +MICROPROFILE_DECLARE(GPU_DownloadMemory); + +using BufferId = SlotId; + +using VideoCore::Surface::PixelFormat; +using namespace Common::Literals; + +constexpr u32 NUM_VERTEX_BUFFERS = 32; +constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4; +constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18; +constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8; +constexpr u32 NUM_STORAGE_BUFFERS = 16; +constexpr u32 NUM_TEXTURE_BUFFERS = 16; +constexpr u32 NUM_STAGES = 5; + +using UniformBufferSizes = std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>; +using ComputeUniformBufferSizes = std::array<u32, NUM_COMPUTE_UNIFORM_BUFFERS>; + +enum class ObtainBufferSynchronize : u32 { + NoSynchronize = 0, + FullSynchronize = 1, + SynchronizeNoDirty = 2, +}; + +enum class ObtainBufferOperation : u32 { + DoNothing = 0, + MarkAsWritten = 1, + DiscardWrite = 2, + MarkQuery = 3, +}; + +template <typename P> +class BufferCache : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> { + // Page size for caching purposes. + // This is unrelated to the CPU page size and it can be changed as it seems optimal. + static constexpr u32 CACHING_PAGEBITS = 16; + static constexpr u64 CACHING_PAGESIZE = u64{1} << CACHING_PAGEBITS; + + static constexpr bool IS_OPENGL = P::IS_OPENGL; + static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = + P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS; + static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = + P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT; + static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX; + static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX; + static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS; + static constexpr bool SEPARATE_IMAGE_BUFFERS_BINDINGS = P::SEPARATE_IMAGE_BUFFER_BINDINGS; + static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = P::IMPLEMENTS_ASYNC_DOWNLOADS; + + static constexpr BufferId NULL_BUFFER_ID{0}; + + static constexpr s64 DEFAULT_EXPECTED_MEMORY = 512_MiB; + static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB; + static constexpr s64 TARGET_THRESHOLD = 4_GiB; + + // Debug Flags. + + static constexpr bool DISABLE_DOWNLOADS = true; + + using Maxwell = Tegra::Engines::Maxwell3D::Regs; + + using Runtime = typename P::Runtime; + using Buffer = typename P::Buffer; + using Async_Buffer = typename P::Async_Buffer; + using MemoryTracker = typename P::MemoryTracker; + + using IntervalCompare = std::less<VAddr>; + using IntervalInstance = boost::icl::interval_type_default<VAddr, std::less>; + using IntervalAllocator = boost::fast_pool_allocator<VAddr>; + using IntervalSet = boost::icl::interval_set<VAddr>; + using IntervalType = typename IntervalSet::interval_type; + + template <typename Type> + struct counter_add_functor : public boost::icl::identity_based_inplace_combine<Type> { + // types + typedef counter_add_functor<Type> type; + typedef boost::icl::identity_based_inplace_combine<Type> base_type; + + // public member functions + void operator()(Type& current, const Type& added) const { + current += added; + if (current < base_type::identity_element()) { + current = base_type::identity_element(); + } + } + + // public static functions + static void version(Type&){}; + }; + + using OverlapCombine = counter_add_functor<int>; + using OverlapSection = boost::icl::inter_section<int>; + using OverlapCounter = boost::icl::split_interval_map<VAddr, int>; + + struct Empty {}; + + struct OverlapResult { + std::vector<BufferId> ids; + VAddr begin; + VAddr end; + bool has_stream_leap = false; + }; + + struct Binding { + VAddr cpu_addr{}; + u32 size{}; + BufferId buffer_id; + }; + + struct TextureBufferBinding : Binding { + PixelFormat format; + }; + + static constexpr Binding NULL_BINDING{ + .cpu_addr = 0, + .size = 0, + .buffer_id = NULL_BUFFER_ID, + }; + +public: + static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = static_cast<u32>(4_KiB); + + explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, + Core::Memory::Memory& cpu_memory_, Runtime& runtime_); + + void TickFrame(); + + void WriteMemory(VAddr cpu_addr, u64 size); + + void CachedWriteMemory(VAddr cpu_addr, u64 size); + + void DownloadMemory(VAddr cpu_addr, u64 size); + + bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<const u8> inlined_buffer); + + void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size); + + void DisableGraphicsUniformBuffer(size_t stage, u32 index); + + void UpdateGraphicsBuffers(bool is_indexed); + + void UpdateComputeBuffers(); + + void BindHostGeometryBuffers(bool is_indexed); + + void BindHostStageBuffers(size_t stage); + + void BindHostComputeBuffers(); + + void SetUniformBuffersState(const std::array<u32, NUM_STAGES>& mask, + const UniformBufferSizes* sizes); + + void SetComputeUniformBufferState(u32 mask, const ComputeUniformBufferSizes* sizes); + + void UnbindGraphicsStorageBuffers(size_t stage); + + void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, + bool is_written); + + void UnbindGraphicsTextureBuffers(size_t stage); + + void BindGraphicsTextureBuffer(size_t stage, size_t tbo_index, GPUVAddr gpu_addr, u32 size, + PixelFormat format, bool is_written, bool is_image); + + void UnbindComputeStorageBuffers(); + + void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, + bool is_written); + + void UnbindComputeTextureBuffers(); + + void BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_addr, u32 size, PixelFormat format, + bool is_written, bool is_image); + + [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size, + ObtainBufferSynchronize sync_info, + ObtainBufferOperation post_op); + void FlushCachedWrites(); + + /// Return true when there are uncommitted buffers to be downloaded + [[nodiscard]] bool HasUncommittedFlushes() const noexcept; + + void AccumulateFlushes(); + + /// Return true when the caller should wait for async downloads + [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept; + + /// Commit asynchronous downloads + void CommitAsyncFlushes(); + void CommitAsyncFlushesHigh(); + + /// Pop asynchronous downloads + void PopAsyncFlushes(); + void PopAsyncBuffers(); + + bool DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount); + + bool DMAClear(GPUVAddr src_address, u64 amount, u32 value); + + /// Return true when a CPU region is modified from the GPU + [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); + + /// Return true when a region is registered on the cache + [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size); + + /// Return true when a CPU region is modified from the CPU + [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size); + + void SetDrawIndirect( + const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect_) { + current_draw_indirect = current_draw_indirect_; + } + + [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectCount(); + + [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectBuffer(); + + std::recursive_mutex mutex; + Runtime& runtime; + +private: + template <typename Func> + static void ForEachEnabledBit(u32 enabled_mask, Func&& func) { + for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) { + const int disabled_bits = std::countr_zero(enabled_mask); + index += disabled_bits; + enabled_mask >>= disabled_bits; + func(index); + } + } + + template <typename Func> + void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) { + const u64 page_end = Common::DivCeil(cpu_addr + size, CACHING_PAGESIZE); + for (u64 page = cpu_addr >> CACHING_PAGEBITS; page < page_end;) { + const BufferId buffer_id = page_table[page]; + if (!buffer_id) { + ++page; + continue; + } + Buffer& buffer = slot_buffers[buffer_id]; + func(buffer_id, buffer); + + const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); + page = Common::DivCeil(end_addr, CACHING_PAGESIZE); + } + } + + template <typename Func> + void ForEachInRangeSet(IntervalSet& current_range, VAddr cpu_addr, u64 size, Func&& func) { + const VAddr start_address = cpu_addr; + const VAddr end_address = start_address + size; + const IntervalType search_interval{start_address, end_address}; + auto it = current_range.lower_bound(search_interval); + if (it == current_range.end()) { + return; + } + auto end_it = current_range.upper_bound(search_interval); + for (; it != end_it; it++) { + VAddr inter_addr_end = it->upper(); + VAddr inter_addr = it->lower(); + if (inter_addr_end > end_address) { + inter_addr_end = end_address; + } + if (inter_addr < start_address) { + inter_addr = start_address; + } + func(inter_addr, inter_addr_end); + } + } + + template <typename Func> + void ForEachInOverlapCounter(OverlapCounter& current_range, VAddr cpu_addr, u64 size, + Func&& func) { + const VAddr start_address = cpu_addr; + const VAddr end_address = start_address + size; + const IntervalType search_interval{start_address, end_address}; + auto it = current_range.lower_bound(search_interval); + if (it == current_range.end()) { + return; + } + auto end_it = current_range.upper_bound(search_interval); + for (; it != end_it; it++) { + auto& inter = it->first; + VAddr inter_addr_end = inter.upper(); + VAddr inter_addr = inter.lower(); + if (inter_addr_end > end_address) { + inter_addr_end = end_address; + } + if (inter_addr < start_address) { + inter_addr = start_address; + } + func(inter_addr, inter_addr_end, it->second); + } + } + + void RemoveEachInOverlapCounter(OverlapCounter& current_range, + const IntervalType search_interval, int subtract_value) { + bool any_removals = false; + current_range.add(std::make_pair(search_interval, subtract_value)); + do { + any_removals = false; + auto it = current_range.lower_bound(search_interval); + if (it == current_range.end()) { + return; + } + auto end_it = current_range.upper_bound(search_interval); + for (; it != end_it; it++) { + if (it->second <= 0) { + any_removals = true; + current_range.erase(it); + break; + } + } + } while (any_removals); + } + + static bool IsRangeGranular(VAddr cpu_addr, size_t size) { + return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) == + ((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK); + } + + void RunGarbageCollector(); + + void WaitOnAsyncFlushes(VAddr cpu_addr, u64 size); + + void BindHostIndexBuffer(); + + void BindHostVertexBuffers(); + + void BindHostDrawIndirectBuffers(); + + void BindHostGraphicsUniformBuffers(size_t stage); + + void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind); + + void BindHostGraphicsStorageBuffers(size_t stage); + + void BindHostGraphicsTextureBuffers(size_t stage); + + void BindHostTransformFeedbackBuffers(); + + void BindHostComputeUniformBuffers(); + + void BindHostComputeStorageBuffers(); + + void BindHostComputeTextureBuffers(); + + void DoUpdateGraphicsBuffers(bool is_indexed); + + void DoUpdateComputeBuffers(); + + void UpdateIndexBuffer(); + + void UpdateVertexBuffers(); + + void UpdateVertexBuffer(u32 index); + + void UpdateDrawIndirect(); + + void UpdateUniformBuffers(size_t stage); + + void UpdateStorageBuffers(size_t stage); + + void UpdateTextureBuffers(size_t stage); + + void UpdateTransformFeedbackBuffers(); + + void UpdateTransformFeedbackBuffer(u32 index); + + void UpdateComputeUniformBuffers(); + + void UpdateComputeStorageBuffers(); + + void UpdateComputeTextureBuffers(); + + void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size); + + [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size); + + [[nodiscard]] OverlapResult ResolveOverlaps(VAddr cpu_addr, u32 wanted_size); + + void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score); + + [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size); + + void Register(BufferId buffer_id); + + void Unregister(BufferId buffer_id); + + template <bool insert> + void ChangeRegister(BufferId buffer_id); + + void TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept; + + bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); + + bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); + + bool SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size); + + void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, + std::span<BufferCopy> copies); + + void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, + std::span<const BufferCopy> copies); + + void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies); + + void DownloadBufferMemory(Buffer& buffer_id); + + void DownloadBufferMemory(Buffer& buffer_id, VAddr cpu_addr, u64 size); + + void DeleteBuffer(BufferId buffer_id, bool do_not_mark = false); + + void NotifyBufferDeletion(); + + [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, u32 cbuf_index, + bool is_written) const; + + [[nodiscard]] TextureBufferBinding GetTextureBufferBinding(GPUVAddr gpu_addr, u32 size, + PixelFormat format); + + [[nodiscard]] std::span<const u8> ImmediateBufferWithData(VAddr cpu_addr, size_t size); + + [[nodiscard]] std::span<u8> ImmediateBuffer(size_t wanted_capacity); + + [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept; + + void ClearDownload(IntervalType subtract_interval); + + VideoCore::RasterizerInterface& rasterizer; + Core::Memory::Memory& cpu_memory; + + SlotVector<Buffer> slot_buffers; + DelayedDestructionRing<Buffer, 8> delayed_destruction_ring; + + const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect{}; + + u32 last_index_count = 0; + + Binding index_buffer; + std::array<Binding, NUM_VERTEX_BUFFERS> vertex_buffers; + std::array<std::array<Binding, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES> uniform_buffers; + std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers; + std::array<std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS>, NUM_STAGES> texture_buffers; + std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers; + Binding count_buffer_binding; + Binding indirect_buffer_binding; + + std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers; + std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers; + std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS> compute_texture_buffers; + + std::array<u32, NUM_STAGES> enabled_uniform_buffer_masks{}; + u32 enabled_compute_uniform_buffer_mask = 0; + + const UniformBufferSizes* uniform_buffer_sizes{}; + const ComputeUniformBufferSizes* compute_uniform_buffer_sizes{}; + + std::array<u32, NUM_STAGES> enabled_storage_buffers{}; + std::array<u32, NUM_STAGES> written_storage_buffers{}; + u32 enabled_compute_storage_buffers = 0; + u32 written_compute_storage_buffers = 0; + + std::array<u32, NUM_STAGES> enabled_texture_buffers{}; + std::array<u32, NUM_STAGES> written_texture_buffers{}; + std::array<u32, NUM_STAGES> image_texture_buffers{}; + u32 enabled_compute_texture_buffers = 0; + u32 written_compute_texture_buffers = 0; + u32 image_compute_texture_buffers = 0; + + std::array<u32, 16> uniform_cache_hits{}; + std::array<u32, 16> uniform_cache_shots{}; + + u32 uniform_buffer_skip_cache_size = DEFAULT_SKIP_CACHE_SIZE; + + bool has_deleted_buffers = false; + + std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, std::array<u32, NUM_STAGES>, Empty> + dirty_uniform_buffers{}; + std::conditional_t<IS_OPENGL, std::array<u32, NUM_STAGES>, Empty> fast_bound_uniform_buffers{}; + std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, + std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>, Empty> + uniform_buffer_binding_sizes{}; + + std::vector<BufferId> cached_write_buffer_ids; + + MemoryTracker memory_tracker; + IntervalSet uncommitted_ranges; + IntervalSet common_ranges; + IntervalSet cached_ranges; + IntervalSet pending_ranges; + std::deque<IntervalSet> committed_ranges; + + // Async Buffers + OverlapCounter async_downloads; + std::deque<std::optional<Async_Buffer>> async_buffers; + std::deque<boost::container::small_vector<BufferCopy, 4>> pending_downloads; + std::optional<Async_Buffer> current_buffer; + + std::deque<Async_Buffer> async_buffers_death_ring; + + size_t immediate_buffer_capacity = 0; + Common::ScratchBuffer<u8> immediate_buffer_alloc; + + struct LRUItemParams { + using ObjectType = BufferId; + using TickType = u64; + }; + Common::LeastRecentlyUsedCache<LRUItemParams> lru_cache; + u64 frame_tick = 0; + u64 total_used_memory = 0; + u64 minimum_memory = 0; + u64 critical_memory = 0; + BufferId inline_buffer_id; + + bool active_async_buffers = false; + + std::array<BufferId, ((1ULL << 39) >> CACHING_PAGEBITS)> page_table; +}; + +} // namespace VideoCommon diff --git a/src/video_core/buffer_cache/memory_tracker_base.h b/src/video_core/buffer_cache/memory_tracker_base.h new file mode 100644 index 000000000..dc4ebfcaa --- /dev/null +++ b/src/video_core/buffer_cache/memory_tracker_base.h @@ -0,0 +1,273 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include <algorithm> +#include <bit> +#include <deque> +#include <limits> +#include <type_traits> +#include <unordered_set> +#include <utility> + +#include "common/alignment.h" +#include "common/common_types.h" +#include "video_core/buffer_cache/word_manager.h" + +namespace VideoCommon { + +template <class RasterizerInterface> +class MemoryTrackerBase { + static constexpr size_t MAX_CPU_PAGE_BITS = 39; + static constexpr size_t HIGHER_PAGE_BITS = 22; + static constexpr size_t HIGHER_PAGE_SIZE = 1ULL << HIGHER_PAGE_BITS; + static constexpr size_t HIGHER_PAGE_MASK = HIGHER_PAGE_SIZE - 1ULL; + static constexpr size_t NUM_HIGH_PAGES = 1ULL << (MAX_CPU_PAGE_BITS - HIGHER_PAGE_BITS); + static constexpr size_t MANAGER_POOL_SIZE = 32; + static constexpr size_t WORDS_STACK_NEEDED = HIGHER_PAGE_SIZE / BYTES_PER_WORD; + using Manager = WordManager<RasterizerInterface, WORDS_STACK_NEEDED>; + +public: + MemoryTrackerBase(RasterizerInterface& rasterizer_) : rasterizer{&rasterizer_} {} + ~MemoryTrackerBase() = default; + + /// Returns the inclusive CPU modified range in a begin end pair + [[nodiscard]] std::pair<u64, u64> ModifiedCpuRegion(VAddr query_cpu_addr, + u64 query_size) noexcept { + return IteratePairs<true>( + query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + return manager->template ModifiedRegion<Type::CPU>(offset, size); + }); + } + + /// Returns the inclusive GPU modified range in a begin end pair + [[nodiscard]] std::pair<u64, u64> ModifiedGpuRegion(VAddr query_cpu_addr, + u64 query_size) noexcept { + return IteratePairs<false>( + query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + return manager->template ModifiedRegion<Type::GPU>(offset, size); + }); + } + + /// Returns true if a region has been modified from the CPU + [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { + return IteratePages<true>( + query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + return manager->template IsRegionModified<Type::CPU>(offset, size); + }); + } + + /// Returns true if a region has been modified from the GPU + [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { + return IteratePages<false>( + query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + return manager->template IsRegionModified<Type::GPU>(offset, size); + }); + } + + /// Mark region as CPU modified, notifying the rasterizer about this change + void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) { + IteratePages<true>(dirty_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState<Type::CPU, true>( + manager->GetCpuAddr() + offset, size); + }); + } + + /// Unmark region as CPU modified, notifying the rasterizer about this change + void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) { + IteratePages<true>(dirty_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState<Type::CPU, false>( + manager->GetCpuAddr() + offset, size); + }); + } + + /// Mark region as modified from the host GPU + void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept { + IteratePages<true>(dirty_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState<Type::GPU, true>( + manager->GetCpuAddr() + offset, size); + }); + } + + /// Unmark region as modified from the host GPU + void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept { + IteratePages<true>(dirty_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState<Type::GPU, false>( + manager->GetCpuAddr() + offset, size); + }); + } + + /// Mark region as modified from the CPU + /// but don't mark it as modified until FlusHCachedWrites is called. + void CachedCpuWrite(VAddr dirty_cpu_addr, u64 query_size) { + IteratePages<true>( + dirty_cpu_addr, query_size, [this](Manager* manager, u64 offset, size_t size) { + const VAddr cpu_address = manager->GetCpuAddr() + offset; + manager->template ChangeRegionState<Type::CachedCPU, true>(cpu_address, size); + cached_pages.insert(static_cast<u32>(cpu_address >> HIGHER_PAGE_BITS)); + }); + } + + /// Flushes cached CPU writes, and notify the rasterizer about the deltas + void FlushCachedWrites(VAddr query_cpu_addr, u64 query_size) noexcept { + IteratePages<false>(query_cpu_addr, query_size, + [](Manager* manager, [[maybe_unused]] u64 offset, + [[maybe_unused]] size_t size) { manager->FlushCachedWrites(); }); + } + + void FlushCachedWrites() noexcept { + for (auto id : cached_pages) { + top_tier[id]->FlushCachedWrites(); + } + cached_pages.clear(); + } + + /// Call 'func' for each CPU modified range and unmark those pages as CPU modified + template <typename Func> + void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, Func&& func) { + IteratePages<true>(query_cpu_range, query_size, + [&func](Manager* manager, u64 offset, size_t size) { + manager->template ForEachModifiedRange<Type::CPU, true>( + manager->GetCpuAddr() + offset, size, func); + }); + } + + /// Call 'func' for each GPU modified range and unmark those pages as GPU modified + template <typename Func> + void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, bool clear, Func&& func) { + IteratePages<false>(query_cpu_range, query_size, + [&func, clear](Manager* manager, u64 offset, size_t size) { + if (clear) { + manager->template ForEachModifiedRange<Type::GPU, true>( + manager->GetCpuAddr() + offset, size, func); + } else { + manager->template ForEachModifiedRange<Type::GPU, false>( + manager->GetCpuAddr() + offset, size, func); + } + }); + } + + template <typename Func> + void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 query_size, Func&& func) { + IteratePages<false>(query_cpu_range, query_size, + [&func](Manager* manager, u64 offset, size_t size) { + manager->template ForEachModifiedRange<Type::GPU, true>( + manager->GetCpuAddr() + offset, size, func); + }); + } + +private: + template <bool create_region_on_fail, typename Func> + bool IteratePages(VAddr cpu_address, size_t size, Func&& func) { + using FuncReturn = typename std::invoke_result<Func, Manager*, u64, size_t>::type; + static constexpr bool BOOL_BREAK = std::is_same_v<FuncReturn, bool>; + std::size_t remaining_size{size}; + std::size_t page_index{cpu_address >> HIGHER_PAGE_BITS}; + u64 page_offset{cpu_address & HIGHER_PAGE_MASK}; + while (remaining_size > 0) { + const std::size_t copy_amount{ + std::min<std::size_t>(HIGHER_PAGE_SIZE - page_offset, remaining_size)}; + auto* manager{top_tier[page_index]}; + if (manager) { + if constexpr (BOOL_BREAK) { + if (func(manager, page_offset, copy_amount)) { + return true; + } + } else { + func(manager, page_offset, copy_amount); + } + } else if constexpr (create_region_on_fail) { + CreateRegion(page_index); + manager = top_tier[page_index]; + if constexpr (BOOL_BREAK) { + if (func(manager, page_offset, copy_amount)) { + return true; + } + } else { + func(manager, page_offset, copy_amount); + } + } + page_index++; + page_offset = 0; + remaining_size -= copy_amount; + } + return false; + } + + template <bool create_region_on_fail, typename Func> + std::pair<u64, u64> IteratePairs(VAddr cpu_address, size_t size, Func&& func) { + std::size_t remaining_size{size}; + std::size_t page_index{cpu_address >> HIGHER_PAGE_BITS}; + u64 page_offset{cpu_address & HIGHER_PAGE_MASK}; + u64 begin = std::numeric_limits<u64>::max(); + u64 end = 0; + while (remaining_size > 0) { + const std::size_t copy_amount{ + std::min<std::size_t>(HIGHER_PAGE_SIZE - page_offset, remaining_size)}; + auto* manager{top_tier[page_index]}; + const auto execute = [&] { + auto [new_begin, new_end] = func(manager, page_offset, copy_amount); + if (new_begin != 0 || new_end != 0) { + const u64 base_address = page_index << HIGHER_PAGE_BITS; + begin = std::min(new_begin + base_address, begin); + end = std::max(new_end + base_address, end); + } + }; + if (manager) { + execute(); + } else if constexpr (create_region_on_fail) { + CreateRegion(page_index); + manager = top_tier[page_index]; + execute(); + } + page_index++; + page_offset = 0; + remaining_size -= copy_amount; + } + if (begin < end) { + return std::make_pair(begin, end); + } else { + return std::make_pair(0ULL, 0ULL); + } + } + + void CreateRegion(std::size_t page_index) { + const VAddr base_cpu_addr = page_index << HIGHER_PAGE_BITS; + top_tier[page_index] = GetNewManager(base_cpu_addr); + } + + Manager* GetNewManager(VAddr base_cpu_addess) { + const auto on_return = [&] { + auto* new_manager = free_managers.front(); + new_manager->SetCpuAddress(base_cpu_addess); + free_managers.pop_front(); + return new_manager; + }; + if (!free_managers.empty()) { + return on_return(); + } + manager_pool.emplace_back(); + auto& last_pool = manager_pool.back(); + for (size_t i = 0; i < MANAGER_POOL_SIZE; i++) { + new (&last_pool[i]) Manager(0, *rasterizer, HIGHER_PAGE_SIZE); + free_managers.push_back(&last_pool[i]); + } + return on_return(); + } + + std::deque<std::array<Manager, MANAGER_POOL_SIZE>> manager_pool; + std::deque<Manager*> free_managers; + + std::array<Manager*, NUM_HIGH_PAGES> top_tier{}; + + std::unordered_set<u32> cached_pages; + + RasterizerInterface* rasterizer = nullptr; +}; + +} // namespace VideoCommon diff --git a/src/video_core/buffer_cache/word_manager.h b/src/video_core/buffer_cache/word_manager.h new file mode 100644 index 000000000..a42455045 --- /dev/null +++ b/src/video_core/buffer_cache/word_manager.h @@ -0,0 +1,462 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include <algorithm> +#include <bit> +#include <limits> +#include <span> +#include <utility> + +#include "common/alignment.h" +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "common/div_ceil.h" +#include "core/memory.h" + +namespace VideoCommon { + +constexpr u64 PAGES_PER_WORD = 64; +constexpr u64 BYTES_PER_PAGE = Core::Memory::YUZU_PAGESIZE; +constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; + +enum class Type { + CPU, + GPU, + CachedCPU, + Untracked, +}; + +/// Vector tracking modified pages tightly packed with small vector optimization +template <size_t stack_words = 1> +struct WordsArray { + /// Returns the pointer to the words state + [[nodiscard]] const u64* Pointer(bool is_short) const noexcept { + return is_short ? stack.data() : heap; + } + + /// Returns the pointer to the words state + [[nodiscard]] u64* Pointer(bool is_short) noexcept { + return is_short ? stack.data() : heap; + } + + std::array<u64, stack_words> stack{}; ///< Small buffers storage + u64* heap; ///< Not-small buffers pointer to the storage +}; + +template <size_t stack_words = 1> +struct Words { + explicit Words() = default; + explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} { + num_words = Common::DivCeil(size_bytes, BYTES_PER_WORD); + if (IsShort()) { + cpu.stack.fill(~u64{0}); + gpu.stack.fill(0); + cached_cpu.stack.fill(0); + untracked.stack.fill(~u64{0}); + } else { + // Share allocation between CPU and GPU pages and set their default values + u64* const alloc = new u64[num_words * 4]; + cpu.heap = alloc; + gpu.heap = alloc + num_words; + cached_cpu.heap = alloc + num_words * 2; + untracked.heap = alloc + num_words * 3; + std::fill_n(cpu.heap, num_words, ~u64{0}); + std::fill_n(gpu.heap, num_words, 0); + std::fill_n(cached_cpu.heap, num_words, 0); + std::fill_n(untracked.heap, num_words, ~u64{0}); + } + // Clean up tailing bits + const u64 last_word_size = size_bytes % BYTES_PER_WORD; + const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE); + const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD; + const u64 last_word = (~u64{0} << shift) >> shift; + cpu.Pointer(IsShort())[NumWords() - 1] = last_word; + untracked.Pointer(IsShort())[NumWords() - 1] = last_word; + } + + ~Words() { + Release(); + } + + Words& operator=(Words&& rhs) noexcept { + Release(); + size_bytes = rhs.size_bytes; + num_words = rhs.num_words; + cpu = rhs.cpu; + gpu = rhs.gpu; + cached_cpu = rhs.cached_cpu; + untracked = rhs.untracked; + rhs.cpu.heap = nullptr; + return *this; + } + + Words(Words&& rhs) noexcept + : size_bytes{rhs.size_bytes}, num_words{rhs.num_words}, cpu{rhs.cpu}, gpu{rhs.gpu}, + cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} { + rhs.cpu.heap = nullptr; + } + + Words& operator=(const Words&) = delete; + Words(const Words&) = delete; + + /// Returns true when the buffer fits in the small vector optimization + [[nodiscard]] bool IsShort() const noexcept { + return num_words <= stack_words; + } + + /// Returns the number of words of the buffer + [[nodiscard]] size_t NumWords() const noexcept { + return num_words; + } + + /// Release buffer resources + void Release() { + if (!IsShort()) { + // CPU written words is the base for the heap allocation + delete[] cpu.heap; + } + } + + template <Type type> + std::span<u64> Span() noexcept { + if constexpr (type == Type::CPU) { + return std::span<u64>(cpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::GPU) { + return std::span<u64>(gpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::CachedCPU) { + return std::span<u64>(cached_cpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::Untracked) { + return std::span<u64>(untracked.Pointer(IsShort()), num_words); + } + } + + template <Type type> + std::span<const u64> Span() const noexcept { + if constexpr (type == Type::CPU) { + return std::span<const u64>(cpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::GPU) { + return std::span<const u64>(gpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::CachedCPU) { + return std::span<const u64>(cached_cpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::Untracked) { + return std::span<const u64>(untracked.Pointer(IsShort()), num_words); + } + } + + u64 size_bytes = 0; + size_t num_words = 0; + WordsArray<stack_words> cpu; + WordsArray<stack_words> gpu; + WordsArray<stack_words> cached_cpu; + WordsArray<stack_words> untracked; +}; + +template <class RasterizerInterface, size_t stack_words = 1> +class WordManager { +public: + explicit WordManager(VAddr cpu_addr_, RasterizerInterface& rasterizer_, u64 size_bytes) + : cpu_addr{cpu_addr_}, rasterizer{&rasterizer_}, words{size_bytes} {} + + explicit WordManager() = default; + + void SetCpuAddress(VAddr new_cpu_addr) { + cpu_addr = new_cpu_addr; + } + + VAddr GetCpuAddr() const { + return cpu_addr; + } + + static u64 ExtractBits(u64 word, size_t page_start, size_t page_end) { + constexpr size_t number_bits = sizeof(u64) * 8; + const size_t limit_page_end = number_bits - std::min(page_end, number_bits); + u64 bits = (word >> page_start) << page_start; + bits = (bits << limit_page_end) >> limit_page_end; + return bits; + } + + static std::pair<size_t, size_t> GetWordPage(VAddr address) { + const size_t converted_address = static_cast<size_t>(address); + const size_t word_number = converted_address / BYTES_PER_WORD; + const size_t amount_pages = converted_address % BYTES_PER_WORD; + return std::make_pair(word_number, amount_pages / BYTES_PER_PAGE); + } + + template <typename Func> + void IterateWords(size_t offset, size_t size, Func&& func) const { + using FuncReturn = std::invoke_result_t<Func, std::size_t, u64>; + static constexpr bool BOOL_BREAK = std::is_same_v<FuncReturn, bool>; + const size_t start = static_cast<size_t>(std::max<s64>(static_cast<s64>(offset), 0LL)); + const size_t end = static_cast<size_t>(std::max<s64>(static_cast<s64>(offset + size), 0LL)); + if (start >= SizeBytes() || end <= start) { + return; + } + auto [start_word, start_page] = GetWordPage(start); + auto [end_word, end_page] = GetWordPage(end + BYTES_PER_PAGE - 1ULL); + const size_t num_words = NumWords(); + start_word = std::min(start_word, num_words); + end_word = std::min(end_word, num_words); + const size_t diff = end_word - start_word; + end_word += (end_page + PAGES_PER_WORD - 1ULL) / PAGES_PER_WORD; + end_word = std::min(end_word, num_words); + end_page += diff * PAGES_PER_WORD; + constexpr u64 base_mask{~0ULL}; + for (size_t word_index = start_word; word_index < end_word; word_index++) { + const u64 mask = ExtractBits(base_mask, start_page, end_page); + start_page = 0; + end_page -= PAGES_PER_WORD; + if constexpr (BOOL_BREAK) { + if (func(word_index, mask)) { + return; + } + } else { + func(word_index, mask); + } + } + } + + template <typename Func> + void IteratePages(u64 mask, Func&& func) const { + size_t offset = 0; + while (mask != 0) { + const size_t empty_bits = std::countr_zero(mask); + offset += empty_bits; + mask = mask >> empty_bits; + + const size_t continuous_bits = std::countr_one(mask); + func(offset, continuous_bits); + mask = continuous_bits < PAGES_PER_WORD ? (mask >> continuous_bits) : 0; + offset += continuous_bits; + } + } + + /** + * Change the state of a range of pages + * + * @param dirty_addr Base address to mark or unmark as modified + * @param size Size in bytes to mark or unmark as modified + */ + template <Type type, bool enable> + void ChangeRegionState(u64 dirty_addr, u64 size) noexcept(type == Type::GPU) { + std::span<u64> state_words = words.template Span<type>(); + [[maybe_unused]] std::span<u64> untracked_words = words.template Span<Type::Untracked>(); + [[maybe_unused]] std::span<u64> cached_words = words.template Span<Type::CachedCPU>(); + IterateWords(dirty_addr - cpu_addr, size, [&](size_t index, u64 mask) { + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + NotifyRasterizer<!enable>(index, untracked_words[index], mask); + } + if constexpr (enable) { + state_words[index] |= mask; + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + untracked_words[index] |= mask; + } + if constexpr (type == Type::CPU) { + cached_words[index] &= ~mask; + } + } else { + if constexpr (type == Type::CPU) { + const u64 word = state_words[index] & mask; + cached_words[index] &= ~word; + } + state_words[index] &= ~mask; + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + untracked_words[index] &= ~mask; + } + } + }); + } + + /** + * Loop over each page in the given range, turn off those bits and notify the rasterizer if + * needed. Call the given function on each turned off range. + * + * @param query_cpu_range Base CPU address to loop over + * @param size Size in bytes of the CPU range to loop over + * @param func Function to call for each turned off region + */ + template <Type type, bool clear, typename Func> + void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) { + static_assert(type != Type::Untracked); + + std::span<u64> state_words = words.template Span<type>(); + [[maybe_unused]] std::span<u64> untracked_words = words.template Span<Type::Untracked>(); + [[maybe_unused]] std::span<u64> cached_words = words.template Span<Type::CachedCPU>(); + const size_t offset = query_cpu_range - cpu_addr; + bool pending = false; + size_t pending_offset{}; + size_t pending_pointer{}; + const auto release = [&]() { + func(cpu_addr + pending_offset * BYTES_PER_PAGE, + (pending_pointer - pending_offset) * BYTES_PER_PAGE); + }; + IterateWords(offset, size, [&](size_t index, u64 mask) { + const u64 word = state_words[index] & mask; + if constexpr (clear) { + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + NotifyRasterizer<true>(index, untracked_words[index], mask); + } + state_words[index] &= ~mask; + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + untracked_words[index] &= ~mask; + } + if constexpr (type == Type::CPU) { + cached_words[index] &= ~word; + } + } + const size_t base_offset = index * PAGES_PER_WORD; + IteratePages(word, [&](size_t pages_offset, size_t pages_size) { + const auto reset = [&]() { + pending_offset = base_offset + pages_offset; + pending_pointer = base_offset + pages_offset + pages_size; + }; + if (!pending) { + reset(); + pending = true; + return; + } + if (pending_pointer == base_offset + pages_offset) { + pending_pointer += pages_size; + return; + } + release(); + reset(); + }); + }); + if (pending) { + release(); + } + } + + /** + * Returns true when a region has been modified + * + * @param offset Offset in bytes from the start of the buffer + * @param size Size in bytes of the region to query for modifications + */ + template <Type type> + [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { + static_assert(type != Type::Untracked); + + const std::span<const u64> state_words = words.template Span<type>(); + bool result = false; + IterateWords(offset, size, [&](size_t index, u64 mask) { + const u64 word = state_words[index] & mask; + if (word != 0) { + result = true; + return true; + } + return false; + }); + return result; + } + + /** + * Returns a begin end pair with the inclusive modified region + * + * @param offset Offset in bytes from the start of the buffer + * @param size Size in bytes of the region to query for modifications + */ + template <Type type> + [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept { + static_assert(type != Type::Untracked); + const std::span<const u64> state_words = words.template Span<type>(); + u64 begin = std::numeric_limits<u64>::max(); + u64 end = 0; + IterateWords(offset, size, [&](size_t index, u64 mask) { + const u64 word = state_words[index] & mask; + if (word == 0) { + return; + } + const u64 local_page_begin = std::countr_zero(word); + const u64 local_page_end = PAGES_PER_WORD - std::countl_zero(word); + const u64 page_index = index * PAGES_PER_WORD; + begin = std::min(begin, page_index + local_page_begin); + end = page_index + local_page_end; + }); + static constexpr std::pair<u64, u64> EMPTY{0, 0}; + return begin < end ? std::make_pair(begin * BYTES_PER_PAGE, end * BYTES_PER_PAGE) : EMPTY; + } + + /// Returns the number of words of the manager + [[nodiscard]] size_t NumWords() const noexcept { + return words.NumWords(); + } + + /// Returns the size in bytes of the manager + [[nodiscard]] u64 SizeBytes() const noexcept { + return words.size_bytes; + } + + /// Returns true when the buffer fits in the small vector optimization + [[nodiscard]] bool IsShort() const noexcept { + return words.IsShort(); + } + + void FlushCachedWrites() noexcept { + const u64 num_words = NumWords(); + u64* const cached_words = Array<Type::CachedCPU>(); + u64* const untracked_words = Array<Type::Untracked>(); + u64* const cpu_words = Array<Type::CPU>(); + for (u64 word_index = 0; word_index < num_words; ++word_index) { + const u64 cached_bits = cached_words[word_index]; + NotifyRasterizer<false>(word_index, untracked_words[word_index], cached_bits); + untracked_words[word_index] |= cached_bits; + cpu_words[word_index] |= cached_bits; + cached_words[word_index] = 0; + } + } + +private: + template <Type type> + u64* Array() noexcept { + if constexpr (type == Type::CPU) { + return words.cpu.Pointer(IsShort()); + } else if constexpr (type == Type::GPU) { + return words.gpu.Pointer(IsShort()); + } else if constexpr (type == Type::CachedCPU) { + return words.cached_cpu.Pointer(IsShort()); + } else if constexpr (type == Type::Untracked) { + return words.untracked.Pointer(IsShort()); + } + } + + template <Type type> + const u64* Array() const noexcept { + if constexpr (type == Type::CPU) { + return words.cpu.Pointer(IsShort()); + } else if constexpr (type == Type::GPU) { + return words.gpu.Pointer(IsShort()); + } else if constexpr (type == Type::CachedCPU) { + return words.cached_cpu.Pointer(IsShort()); + } else if constexpr (type == Type::Untracked) { + return words.untracked.Pointer(IsShort()); + } + } + + /** + * Notify rasterizer about changes in the CPU tracking state of a word in the buffer + * + * @param word_index Index to the word to notify to the rasterizer + * @param current_bits Current state of the word + * @param new_bits New state of the word + * + * @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages + */ + template <bool add_to_rasterizer> + void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const { + u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits; + VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; + IteratePages(changed_bits, [&](size_t offset, size_t size) { + rasterizer->UpdatePagesCachedCount(addr + offset * BYTES_PER_PAGE, + size * BYTES_PER_PAGE, add_to_rasterizer ? 1 : -1); + }); + } + + VAddr cpu_addr = 0; + RasterizerInterface* rasterizer = nullptr; + Words<stack_words> words; +}; + +} // namespace VideoCommon diff --git a/src/video_core/compatible_formats.cpp b/src/video_core/compatible_formats.cpp index 4e75f33ca..ab4f4d407 100644 --- a/src/video_core/compatible_formats.cpp +++ b/src/video_core/compatible_formats.cpp @@ -126,15 +126,14 @@ constexpr std::array VIEW_CLASS_ASTC_8x8_RGBA{ PixelFormat::ASTC_2D_8X8_SRGB, }; -// Missing formats: -// PixelFormat::ASTC_2D_10X5_UNORM -// PixelFormat::ASTC_2D_10X5_SRGB - -// Missing formats: -// PixelFormat::ASTC_2D_10X6_SRGB +constexpr std::array VIEW_CLASS_ASTC_10x5_RGBA{ + PixelFormat::ASTC_2D_10X5_UNORM, + PixelFormat::ASTC_2D_10X5_SRGB, +}; constexpr std::array VIEW_CLASS_ASTC_10x6_RGBA{ PixelFormat::ASTC_2D_10X6_UNORM, + PixelFormat::ASTC_2D_10X6_SRGB, }; constexpr std::array VIEW_CLASS_ASTC_10x8_RGBA{ @@ -147,9 +146,10 @@ constexpr std::array VIEW_CLASS_ASTC_10x10_RGBA{ PixelFormat::ASTC_2D_10X10_SRGB, }; -// Missing formats -// ASTC_2D_12X10_UNORM, -// ASTC_2D_12X10_SRGB, +constexpr std::array VIEW_CLASS_ASTC_12x10_RGBA{ + PixelFormat::ASTC_2D_12X10_UNORM, + PixelFormat::ASTC_2D_12X10_SRGB, +}; constexpr std::array VIEW_CLASS_ASTC_12x12_RGBA{ PixelFormat::ASTC_2D_12X12_UNORM, @@ -229,9 +229,11 @@ constexpr Table MakeViewTable() { EnableRange(view, VIEW_CLASS_ASTC_6x6_RGBA); EnableRange(view, VIEW_CLASS_ASTC_8x5_RGBA); EnableRange(view, VIEW_CLASS_ASTC_8x8_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_10x5_RGBA); EnableRange(view, VIEW_CLASS_ASTC_10x6_RGBA); EnableRange(view, VIEW_CLASS_ASTC_10x8_RGBA); EnableRange(view, VIEW_CLASS_ASTC_10x10_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_12x10_RGBA); EnableRange(view, VIEW_CLASS_ASTC_12x12_RGBA); return view; } diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 614d61db4..2f986097f 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -4,6 +4,7 @@ #include <cstring> #include <optional> #include "common/assert.h" +#include "common/bit_util.h" #include "common/scope_exit.h" #include "common/settings.h" #include "core/core.h" @@ -222,6 +223,9 @@ void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool } void Maxwell3D::RefreshParametersImpl() { + if (!Settings::IsGPULevelHigh()) { + return; + } size_t current_index = 0; for (auto& segment : macro_segments) { if (segment.first == 0) { @@ -259,12 +263,13 @@ u32 Maxwell3D::GetMaxCurrentVertices() { size_t Maxwell3D::EstimateIndexBufferSize() { GPUVAddr start_address = regs.index_buffer.StartAddress(); GPUVAddr end_address = regs.index_buffer.EndAddress(); - static constexpr std::array<size_t, 4> max_sizes = { - std::numeric_limits<u8>::max(), std::numeric_limits<u16>::max(), - std::numeric_limits<u32>::max(), std::numeric_limits<u32>::max()}; + static constexpr std::array<size_t, 3> max_sizes = {std::numeric_limits<u8>::max(), + std::numeric_limits<u16>::max(), + std::numeric_limits<u32>::max()}; const size_t byte_size = regs.index_buffer.FormatSizeInBytes(); + const size_t log2_byte_size = Common::Log2Ceil64(byte_size); return std::min<size_t>( - memory_manager.GetMemoryLayoutSize(start_address, byte_size * max_sizes[byte_size]) / + memory_manager.GetMemoryLayoutSize(start_address, byte_size * max_sizes[log2_byte_size]) / byte_size, static_cast<size_t>(end_address - start_address)); } diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h index c390ac91b..3b2f6aab6 100644 --- a/src/video_core/fence_manager.h +++ b/src/video_core/fence_manager.h @@ -4,13 +4,20 @@ #pragma once #include <algorithm> +#include <condition_variable> #include <cstring> #include <deque> #include <functional> #include <memory> +#include <mutex> +#include <thread> #include <queue> #include "common/common_types.h" +#include "common/microprofile.h" +#include "common/scope_exit.h" +#include "common/settings.h" +#include "common/thread.h" #include "video_core/delayed_destruction_ring.h" #include "video_core/gpu.h" #include "video_core/host1x/host1x.h" @@ -23,15 +30,26 @@ class FenceBase { public: explicit FenceBase(bool is_stubbed_) : is_stubbed{is_stubbed_} {} + bool IsStubbed() const { + return is_stubbed; + } + protected: bool is_stubbed; }; -template <typename TFence, typename TTextureCache, typename TTBufferCache, typename TQueryCache> +template <typename Traits> class FenceManager { + using TFence = typename Traits::FenceType; + using TTextureCache = typename Traits::TextureCacheType; + using TBufferCache = typename Traits::BufferCacheType; + using TQueryCache = typename Traits::QueryCacheType; + static constexpr bool can_async_check = Traits::HAS_ASYNC_CHECK; + public: /// Notify the fence manager about a new frame void TickFrame() { + std::unique_lock lock(ring_guard); delayed_destruction_ring.Tick(); } @@ -46,17 +64,33 @@ public: } void SignalFence(std::function<void()>&& func) { - TryReleasePendingFences(); + rasterizer.InvalidateGPUCache(); + bool delay_fence = Settings::IsGPULevelHigh(); + if constexpr (!can_async_check) { + TryReleasePendingFences<false>(); + } const bool should_flush = ShouldFlush(); CommitAsyncFlushes(); - uncommitted_operations.emplace_back(std::move(func)); - CommitOperations(); TFence new_fence = CreateFence(!should_flush); - fences.push(new_fence); + if constexpr (can_async_check) { + guard.lock(); + } + if (delay_fence) { + uncommitted_operations.emplace_back(std::move(func)); + } + pending_operations.emplace_back(std::move(uncommitted_operations)); QueueFence(new_fence); + if (!delay_fence) { + func(); + } + fences.push(std::move(new_fence)); if (should_flush) { rasterizer.FlushCommands(); } + if constexpr (can_async_check) { + guard.unlock(); + cv.notify_all(); + } } void SignalSyncPoint(u32 value) { @@ -66,29 +100,30 @@ public: } void WaitPendingFences() { - while (!fences.empty()) { - TFence& current_fence = fences.front(); - if (ShouldWait()) { - WaitFence(current_fence); - } - PopAsyncFlushes(); - auto operations = std::move(pending_operations.front()); - pending_operations.pop_front(); - for (auto& operation : operations) { - operation(); - } - PopFence(); + if constexpr (!can_async_check) { + TryReleasePendingFences<true>(); } } protected: explicit FenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, - TTextureCache& texture_cache_, TTBufferCache& buffer_cache_, + TTextureCache& texture_cache_, TBufferCache& buffer_cache_, TQueryCache& query_cache_) : rasterizer{rasterizer_}, gpu{gpu_}, syncpoint_manager{gpu.Host1x().GetSyncpointManager()}, - texture_cache{texture_cache_}, buffer_cache{buffer_cache_}, query_cache{query_cache_} {} + texture_cache{texture_cache_}, buffer_cache{buffer_cache_}, query_cache{query_cache_} { + if constexpr (can_async_check) { + fence_thread = + std::jthread([this](std::stop_token token) { ReleaseThreadFunc(token); }); + } + } - virtual ~FenceManager() = default; + virtual ~FenceManager() { + if constexpr (can_async_check) { + fence_thread.request_stop(); + cv.notify_all(); + fence_thread.join(); + } + } /// Creates a Fence Interface, does not create a backend fence if 'is_stubbed' is /// true @@ -104,15 +139,20 @@ protected: Tegra::GPU& gpu; Tegra::Host1x::SyncpointManager& syncpoint_manager; TTextureCache& texture_cache; - TTBufferCache& buffer_cache; + TBufferCache& buffer_cache; TQueryCache& query_cache; private: + template <bool force_wait> void TryReleasePendingFences() { while (!fences.empty()) { TFence& current_fence = fences.front(); if (ShouldWait() && !IsFenceSignaled(current_fence)) { - return; + if constexpr (force_wait) { + WaitFence(current_fence); + } else { + return; + } } PopAsyncFlushes(); auto operations = std::move(pending_operations.front()); @@ -120,7 +160,49 @@ private: for (auto& operation : operations) { operation(); } - PopFence(); + { + std::unique_lock lock(ring_guard); + delayed_destruction_ring.Push(std::move(current_fence)); + } + fences.pop(); + } + } + + void ReleaseThreadFunc(std::stop_token stop_token) { + std::string name = "GPUFencingThread"; + MicroProfileOnThreadCreate(name.c_str()); + + // Cleanup + SCOPE_EXIT({ MicroProfileOnThreadExit(); }); + + Common::SetCurrentThreadName(name.c_str()); + Common::SetCurrentThreadPriority(Common::ThreadPriority::High); + + TFence current_fence; + std::deque<std::function<void()>> current_operations; + while (!stop_token.stop_requested()) { + { + std::unique_lock lock(guard); + cv.wait(lock, [&] { return stop_token.stop_requested() || !fences.empty(); }); + if (stop_token.stop_requested()) [[unlikely]] { + return; + } + current_fence = std::move(fences.front()); + current_operations = std::move(pending_operations.front()); + fences.pop(); + pending_operations.pop_front(); + } + if (!current_fence->IsStubbed()) { + WaitFence(current_fence); + } + PopAsyncFlushes(); + for (auto& operation : current_operations) { + operation(); + } + { + std::unique_lock lock(ring_guard); + delayed_destruction_ring.Push(std::move(current_fence)); + } } } @@ -154,19 +236,16 @@ private: query_cache.CommitAsyncFlushes(); } - void PopFence() { - delayed_destruction_ring.Push(std::move(fences.front())); - fences.pop(); - } - - void CommitOperations() { - pending_operations.emplace_back(std::move(uncommitted_operations)); - } - std::queue<TFence> fences; std::deque<std::function<void()>> uncommitted_operations; std::deque<std::deque<std::function<void()>>> pending_operations; + std::mutex guard; + std::mutex ring_guard; + std::condition_variable cv; + + std::jthread fence_thread; + DelayedDestructionRing<TFence, 6> delayed_destruction_ring; }; diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 01fb5b546..7b2cde7a7 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -82,6 +82,7 @@ void MemoryManager::SetEntry(size_t position, MemoryManager::EntryType entry) { } PTEKind MemoryManager::GetPageKind(GPUVAddr gpu_addr) const { + std::unique_lock<std::mutex> lock(guard); return kind_map.GetValueAt(gpu_addr); } @@ -160,7 +161,10 @@ GPUVAddr MemoryManager::BigPageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] VAddr } remaining_size -= big_page_size; } - kind_map.Map(gpu_addr, gpu_addr + size, kind); + { + std::unique_lock<std::mutex> lock(guard); + kind_map.Map(gpu_addr, gpu_addr + size, kind); + } return gpu_addr; } @@ -553,6 +557,7 @@ size_t MemoryManager::MaxContinuousRange(GPUVAddr gpu_addr, size_t size) const { } size_t MemoryManager::GetMemoryLayoutSize(GPUVAddr gpu_addr, size_t max_size) const { + std::unique_lock<std::mutex> lock(guard); return kind_map.GetContinuousSizeFrom(gpu_addr); } @@ -745,10 +750,10 @@ void MemoryManager::FlushCaching() { return; } accumulator->Callback([this](GPUVAddr addr, size_t size) { - GetSubmappedRangeImpl<false>(addr, size, page_stash); + GetSubmappedRangeImpl<false>(addr, size, page_stash2); }); - rasterizer->InnerInvalidation(page_stash); - page_stash.clear(); + rasterizer->InnerInvalidation(page_stash2); + page_stash2.clear(); accumulator->Clear(); } diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index fbbe856c4..794535122 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -5,6 +5,7 @@ #include <atomic> #include <map> +#include <mutex> #include <optional> #include <vector> @@ -215,6 +216,9 @@ private: std::vector<u64> big_page_continuous; std::vector<std::pair<VAddr, std::size_t>> page_stash{}; + std::vector<std::pair<VAddr, std::size_t>> page_stash2{}; + + mutable std::mutex guard; static constexpr size_t continuous_bits = 64; diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h index 8906ba6d8..941de95c1 100644 --- a/src/video_core/query_cache.h +++ b/src/video_core/query_cache.h @@ -6,6 +6,7 @@ #include <algorithm> #include <array> #include <cstring> +#include <functional> #include <iterator> #include <list> #include <memory> @@ -17,13 +18,19 @@ #include "common/assert.h" #include "common/settings.h" +#include "core/memory.h" #include "video_core/control/channel_state_cache.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" +#include "video_core/texture_cache/slot_vector.h" namespace VideoCommon { +using AsyncJobId = SlotId; + +static constexpr AsyncJobId NULL_ASYNC_JOB_ID{0}; + template <class QueryCache, class HostCounter> class CounterStreamBase { public: @@ -93,9 +100,13 @@ private: template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter> class QueryCacheBase : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> { public: - explicit QueryCacheBase(VideoCore::RasterizerInterface& rasterizer_) - : rasterizer{rasterizer_}, streams{{CounterStream{static_cast<QueryCache&>(*this), - VideoCore::QueryType::SamplesPassed}}} {} + explicit QueryCacheBase(VideoCore::RasterizerInterface& rasterizer_, + Core::Memory::Memory& cpu_memory_) + : rasterizer{rasterizer_}, + cpu_memory{cpu_memory_}, streams{{CounterStream{static_cast<QueryCache&>(*this), + VideoCore::QueryType::SamplesPassed}}} { + (void)slot_async_jobs.insert(); // Null value + } void InvalidateRegion(VAddr addr, std::size_t size) { std::unique_lock lock{mutex}; @@ -126,10 +137,15 @@ public: query = Register(type, *cpu_addr, host_ptr, timestamp.has_value()); } - query->BindCounter(Stream(type).Current(), timestamp); - if (Settings::values.use_asynchronous_gpu_emulation.GetValue()) { - AsyncFlushQuery(*cpu_addr); + auto result = query->BindCounter(Stream(type).Current(), timestamp); + if (result) { + auto async_job_id = query->GetAsyncJob(); + auto& async_job = slot_async_jobs[async_job_id]; + async_job.collected = true; + async_job.value = *result; + query->SetAsyncJob(NULL_ASYNC_JOB_ID); } + AsyncFlushQuery(query, timestamp, lock); } /// Updates counters from GPU state. Expected to be called once per draw, clear or dispatch. @@ -173,15 +189,18 @@ public: } void CommitAsyncFlushes() { + std::unique_lock lock{mutex}; committed_flushes.push_back(uncommitted_flushes); uncommitted_flushes.reset(); } bool HasUncommittedFlushes() const { + std::unique_lock lock{mutex}; return uncommitted_flushes != nullptr; } bool ShouldWaitAsyncFlushes() const { + std::unique_lock lock{mutex}; if (committed_flushes.empty()) { return false; } @@ -189,6 +208,7 @@ public: } void PopAsyncFlushes() { + std::unique_lock lock{mutex}; if (committed_flushes.empty()) { return; } @@ -197,15 +217,25 @@ public: committed_flushes.pop_front(); return; } - for (VAddr query_address : *flush_list) { - FlushAndRemoveRegion(query_address, 4); + for (AsyncJobId async_job_id : *flush_list) { + AsyncJob& async_job = slot_async_jobs[async_job_id]; + if (!async_job.collected) { + FlushAndRemoveRegion(async_job.query_location, 2, true); + } } committed_flushes.pop_front(); } private: + struct AsyncJob { + bool collected = false; + u64 value = 0; + VAddr query_location = 0; + std::optional<u64> timestamp{}; + }; + /// Flushes a memory range to guest memory and removes it from the cache. - void FlushAndRemoveRegion(VAddr addr, std::size_t size) { + void FlushAndRemoveRegion(VAddr addr, std::size_t size, bool async = false) { const u64 addr_begin = addr; const u64 addr_end = addr_begin + size; const auto in_range = [addr_begin, addr_end](const CachedQuery& query) { @@ -226,7 +256,16 @@ private: continue; } rasterizer.UpdatePagesCachedCount(query.GetCpuAddr(), query.SizeInBytes(), -1); - query.Flush(); + AsyncJobId async_job_id = query.GetAsyncJob(); + auto flush_result = query.Flush(async); + if (async_job_id == NULL_ASYNC_JOB_ID) { + ASSERT_MSG(false, "This should not be reachable at all"); + continue; + } + AsyncJob& async_job = slot_async_jobs[async_job_id]; + async_job.collected = true; + async_job.value = flush_result; + query.SetAsyncJob(NULL_ASYNC_JOB_ID); } std::erase_if(contents, in_range); } @@ -253,26 +292,60 @@ private: return found != std::end(contents) ? &*found : nullptr; } - void AsyncFlushQuery(VAddr addr) { - if (!uncommitted_flushes) { - uncommitted_flushes = std::make_shared<std::vector<VAddr>>(); + void AsyncFlushQuery(CachedQuery* query, std::optional<u64> timestamp, + std::unique_lock<std::recursive_mutex>& lock) { + const AsyncJobId new_async_job_id = slot_async_jobs.insert(); + { + AsyncJob& async_job = slot_async_jobs[new_async_job_id]; + query->SetAsyncJob(new_async_job_id); + async_job.query_location = query->GetCpuAddr(); + async_job.collected = false; + + if (!uncommitted_flushes) { + uncommitted_flushes = std::make_shared<std::vector<AsyncJobId>>(); + } + uncommitted_flushes->push_back(new_async_job_id); } - uncommitted_flushes->push_back(addr); + lock.unlock(); + std::function<void()> operation([this, new_async_job_id, timestamp] { + std::unique_lock local_lock{mutex}; + AsyncJob& async_job = slot_async_jobs[new_async_job_id]; + u64 value = async_job.value; + VAddr address = async_job.query_location; + slot_async_jobs.erase(new_async_job_id); + local_lock.unlock(); + if (timestamp) { + u64 timestamp_value = *timestamp; + cpu_memory.WriteBlockUnsafe(address + sizeof(u64), ×tamp_value, sizeof(u64)); + cpu_memory.WriteBlockUnsafe(address, &value, sizeof(u64)); + rasterizer.InvalidateRegion(address, sizeof(u64) * 2, + VideoCommon::CacheType::NoQueryCache); + } else { + u32 small_value = static_cast<u32>(value); + cpu_memory.WriteBlockUnsafe(address, &small_value, sizeof(u32)); + rasterizer.InvalidateRegion(address, sizeof(u32), + VideoCommon::CacheType::NoQueryCache); + } + }); + rasterizer.SyncOperation(std::move(operation)); } static constexpr std::uintptr_t YUZU_PAGESIZE = 4096; static constexpr unsigned YUZU_PAGEBITS = 12; + SlotVector<AsyncJob> slot_async_jobs; + VideoCore::RasterizerInterface& rasterizer; + Core::Memory::Memory& cpu_memory; - std::recursive_mutex mutex; + mutable std::recursive_mutex mutex; std::unordered_map<u64, std::vector<CachedQuery>> cached_queries; std::array<CounterStream, VideoCore::NumQueryTypes> streams; - std::shared_ptr<std::vector<VAddr>> uncommitted_flushes{}; - std::list<std::shared_ptr<std::vector<VAddr>>> committed_flushes; + std::shared_ptr<std::vector<AsyncJobId>> uncommitted_flushes{}; + std::list<std::shared_ptr<std::vector<AsyncJobId>>> committed_flushes; }; template <class QueryCache, class HostCounter> @@ -291,12 +364,12 @@ public: virtual ~HostCounterBase() = default; /// Returns the current value of the query. - u64 Query() { + u64 Query(bool async = false) { if (result) { return *result; } - u64 value = BlockingQuery() + base_result; + u64 value = BlockingQuery(async) + base_result; if (dependency) { value += dependency->Query(); dependency = nullptr; @@ -317,7 +390,7 @@ public: protected: /// Returns the value of query from the backend API blocking as needed. - virtual u64 BlockingQuery() const = 0; + virtual u64 BlockingQuery(bool async = false) const = 0; private: std::shared_ptr<HostCounter> dependency; ///< Counter to add to this value. @@ -340,26 +413,33 @@ public: CachedQueryBase& operator=(const CachedQueryBase&) = delete; /// Flushes the query to guest memory. - virtual void Flush() { + virtual u64 Flush(bool async = false) { // When counter is nullptr it means that it's just been reset. We are supposed to write a // zero in these cases. - const u64 value = counter ? counter->Query() : 0; + const u64 value = counter ? counter->Query(async) : 0; + if (async) { + return value; + } std::memcpy(host_ptr, &value, sizeof(u64)); if (timestamp) { std::memcpy(host_ptr + TIMESTAMP_OFFSET, &*timestamp, sizeof(u64)); } + return value; } /// Binds a counter to this query. - void BindCounter(std::shared_ptr<HostCounter> counter_, std::optional<u64> timestamp_) { + std::optional<u64> BindCounter(std::shared_ptr<HostCounter> counter_, + std::optional<u64> timestamp_) { + std::optional<u64> result{}; if (counter) { // If there's an old counter set it means the query is being rewritten by the game. // To avoid losing the data forever, flush here. - Flush(); + result = std::make_optional(Flush()); } counter = std::move(counter_); timestamp = timestamp_; + return result; } VAddr GetCpuAddr() const noexcept { @@ -374,6 +454,14 @@ public: return with_timestamp ? LARGE_QUERY_SIZE : SMALL_QUERY_SIZE; } + void SetAsyncJob(AsyncJobId assigned_async_job_) { + assigned_async_job = assigned_async_job_; + } + + AsyncJobId GetAsyncJob() const { + return assigned_async_job; + } + protected: /// Returns true when querying the counter may potentially block. bool WaitPending() const noexcept { @@ -389,6 +477,7 @@ private: u8* host_ptr; ///< Writable host pointer. std::shared_ptr<HostCounter> counter; ///< Host counter to query, owns the dependency tree. std::optional<u64> timestamp; ///< Timestamp to flush to guest memory. + AsyncJobId assigned_async_job; }; } // namespace VideoCommon diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index a8c3f8b67..18d3c3ac0 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -8,6 +8,7 @@ #include "common/common_types.h" #include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/buffer_cache/memory_tracker_base.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_resource_manager.h" @@ -200,6 +201,8 @@ private: struct BufferCacheParams { using Runtime = OpenGL::BufferCacheRuntime; using Buffer = OpenGL::Buffer; + using Async_Buffer = u32; + using MemoryTracker = VideoCommon::MemoryTrackerBase<VideoCore::RasterizerInterface>; static constexpr bool IS_OPENGL = true; static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = true; @@ -208,6 +211,7 @@ struct BufferCacheParams { static constexpr bool NEEDS_BIND_STORAGE_INDEX = true; static constexpr bool USE_MEMORY_MAPS = false; static constexpr bool SEPARATE_IMAGE_BUFFER_BINDINGS = true; + static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = false; }; using BufferCache = VideoCommon::BufferCache<BufferCacheParams>; diff --git a/src/video_core/renderer_opengl/gl_buffer_cache_base.cpp b/src/video_core/renderer_opengl/gl_buffer_cache_base.cpp new file mode 100644 index 000000000..f15ae8e25 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_buffer_cache_base.cpp @@ -0,0 +1,9 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/renderer_opengl/gl_buffer_cache.h" + +namespace VideoCommon { +template class VideoCommon::BufferCache<OpenGL::BufferCacheParams>; +} diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 22ed16ebf..400c21981 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -108,7 +108,8 @@ bool IsASTCSupported() { [[nodiscard]] bool IsDebugToolAttached(std::span<const std::string_view> extensions) { const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); - return nsight || HasExtension(extensions, "GL_EXT_debug_tool"); + return nsight || HasExtension(extensions, "GL_EXT_debug_tool") || + Settings::values.renderer_debug.GetValue(); } } // Anonymous namespace diff --git a/src/video_core/renderer_opengl/gl_fence_manager.h b/src/video_core/renderer_opengl/gl_fence_manager.h index f1446e732..e21b19dcc 100644 --- a/src/video_core/renderer_opengl/gl_fence_manager.h +++ b/src/video_core/renderer_opengl/gl_fence_manager.h @@ -30,7 +30,17 @@ private: }; using Fence = std::shared_ptr<GLInnerFence>; -using GenericFenceManager = VideoCommon::FenceManager<Fence, TextureCache, BufferCache, QueryCache>; + +struct FenceManagerParams { + using FenceType = Fence; + using BufferCacheType = BufferCache; + using TextureCacheType = TextureCache; + using QueryCacheType = QueryCache; + + static constexpr bool HAS_ASYNC_CHECK = false; +}; + +using GenericFenceManager = VideoCommon::FenceManager<FenceManagerParams>; class FenceManagerOpenGL final : public GenericFenceManager { public: diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp index 5070db441..99d7347f5 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.cpp +++ b/src/video_core/renderer_opengl/gl_query_cache.cpp @@ -26,8 +26,8 @@ constexpr GLenum GetTarget(VideoCore::QueryType type) { } // Anonymous namespace -QueryCache::QueryCache(RasterizerOpenGL& rasterizer_) - : QueryCacheBase(rasterizer_), gl_rasterizer{rasterizer_} {} +QueryCache::QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_) + : QueryCacheBase(rasterizer_, cpu_memory_), gl_rasterizer{rasterizer_} {} QueryCache::~QueryCache() = default; @@ -74,7 +74,7 @@ void HostCounter::EndQuery() { glEndQuery(GetTarget(type)); } -u64 HostCounter::BlockingQuery() const { +u64 HostCounter::BlockingQuery([[maybe_unused]] bool async) const { GLint64 value; glGetQueryObjecti64v(query.handle, GL_QUERY_RESULT, &value); return static_cast<u64>(value); @@ -96,7 +96,7 @@ CachedQuery& CachedQuery::operator=(CachedQuery&& rhs) noexcept { return *this; } -void CachedQuery::Flush() { +u64 CachedQuery::Flush([[maybe_unused]] bool async) { // Waiting for a query while another query of the same target is enabled locks Nvidia's driver. // To avoid this disable and re-enable keeping the dependency stream. // But we only have to do this if we have pending waits to be done. @@ -106,11 +106,13 @@ void CachedQuery::Flush() { stream.Update(false); } - VideoCommon::CachedQueryBase<HostCounter>::Flush(); + auto result = VideoCommon::CachedQueryBase<HostCounter>::Flush(); if (slice_counter) { stream.Update(true); } + + return result; } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h index 14ce59990..872513f22 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.h +++ b/src/video_core/renderer_opengl/gl_query_cache.h @@ -28,7 +28,7 @@ using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; class QueryCache final : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { public: - explicit QueryCache(RasterizerOpenGL& rasterizer_); + explicit QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_); ~QueryCache(); OGLQuery AllocateQuery(VideoCore::QueryType type); @@ -51,7 +51,7 @@ public: void EndQuery(); private: - u64 BlockingQuery() const override; + u64 BlockingQuery(bool async = false) const override; QueryCache& cache; const VideoCore::QueryType type; @@ -70,7 +70,7 @@ public: CachedQuery(const CachedQuery&) = delete; CachedQuery& operator=(const CachedQuery&) = delete; - void Flush() override; + u64 Flush(bool async = false) override; private: QueryCache* cache; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 90e35e307..0089b4b27 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -63,7 +63,7 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra buffer_cache(*this, cpu_memory_, buffer_cache_runtime), shader_cache(*this, emu_window_, device, texture_cache, buffer_cache, program_manager, state_tracker, gpu.ShaderNotify()), - query_cache(*this), accelerate_dma(buffer_cache, texture_cache), + query_cache(*this, cpu_memory_), accelerate_dma(buffer_cache, texture_cache), fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache), blit_image(program_manager_) {} @@ -1287,8 +1287,7 @@ bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, } const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height); static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; - const auto post_op = IS_IMAGE_UPLOAD ? VideoCommon::ObtainBufferOperation::DoNothing - : VideoCommon::ObtainBufferOperation::MarkAsWritten; + const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; const auto [buffer, offset] = buffer_cache.ObtainBuffer(buffer_operand.address, buffer_size, sync_info, post_op); @@ -1299,7 +1298,8 @@ bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, if constexpr (IS_IMAGE_UPLOAD) { image->UploadMemory(buffer->Handle(), offset, copy_span); } else { - image->DownloadMemory(buffer->Handle(), offset, copy_span); + texture_cache.DownloadImageIntoBuffer(image, buffer->Handle(), offset, copy_span, + buffer_operand.address, buffer_size); } return true; } diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 0b9c4a904..052456f61 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -803,30 +803,40 @@ void Image::UploadMemory(const ImageBufferMap& map, void Image::DownloadMemory(GLuint buffer_handle, size_t buffer_offset, std::span<const VideoCommon::BufferImageCopy> copies) { + std::array buffer_handles{buffer_handle}; + std::array buffer_offsets{buffer_offset}; + DownloadMemory(buffer_handles, buffer_offsets, copies); +} + +void Image::DownloadMemory(std::span<GLuint> buffer_handles, std::span<size_t> buffer_offsets, + std::span<const VideoCommon::BufferImageCopy> copies) { const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); if (is_rescaled) { ScaleDown(); } glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API - glBindBuffer(GL_PIXEL_PACK_BUFFER, buffer_handle); - glPixelStorei(GL_PACK_ALIGNMENT, 1); + for (size_t i = 0; i < buffer_handles.size(); i++) { + auto& buffer_handle = buffer_handles[i]; + glBindBuffer(GL_PIXEL_PACK_BUFFER, buffer_handle); + glPixelStorei(GL_PACK_ALIGNMENT, 1); - u32 current_row_length = std::numeric_limits<u32>::max(); - u32 current_image_height = std::numeric_limits<u32>::max(); + u32 current_row_length = std::numeric_limits<u32>::max(); + u32 current_image_height = std::numeric_limits<u32>::max(); - for (const VideoCommon::BufferImageCopy& copy : copies) { - if (copy.image_subresource.base_level >= gl_num_levels) { - continue; - } - if (current_row_length != copy.buffer_row_length) { - current_row_length = copy.buffer_row_length; - glPixelStorei(GL_PACK_ROW_LENGTH, current_row_length); - } - if (current_image_height != copy.buffer_image_height) { - current_image_height = copy.buffer_image_height; - glPixelStorei(GL_PACK_IMAGE_HEIGHT, current_image_height); + for (const VideoCommon::BufferImageCopy& copy : copies) { + if (copy.image_subresource.base_level >= gl_num_levels) { + continue; + } + if (current_row_length != copy.buffer_row_length) { + current_row_length = copy.buffer_row_length; + glPixelStorei(GL_PACK_ROW_LENGTH, current_row_length); + } + if (current_image_height != copy.buffer_image_height) { + current_image_height = copy.buffer_image_height; + glPixelStorei(GL_PACK_IMAGE_HEIGHT, current_image_height); + } + CopyImageToBuffer(copy, buffer_offsets[i]); } - CopyImageToBuffer(copy, buffer_offset); } if (is_rescaled) { ScaleUp(true); @@ -851,9 +861,12 @@ GLuint Image::StorageHandle() noexcept { case PixelFormat::ASTC_2D_8X5_SRGB: case PixelFormat::ASTC_2D_5X4_SRGB: case PixelFormat::ASTC_2D_5X5_SRGB: + case PixelFormat::ASTC_2D_10X5_SRGB: + case PixelFormat::ASTC_2D_10X6_SRGB: case PixelFormat::ASTC_2D_10X8_SRGB: case PixelFormat::ASTC_2D_6X6_SRGB: case PixelFormat::ASTC_2D_10X10_SRGB: + case PixelFormat::ASTC_2D_12X10_SRGB: case PixelFormat::ASTC_2D_12X12_SRGB: case PixelFormat::ASTC_2D_8X6_SRGB: case PixelFormat::ASTC_2D_6X5_SRGB: @@ -1113,7 +1126,8 @@ bool Image::ScaleDown(bool ignore) { ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewInfo& info, ImageId image_id_, Image& image, const SlotVector<Image>&) - : VideoCommon::ImageViewBase{info, image.info, image_id_}, views{runtime.null_image_views} { + : VideoCommon::ImageViewBase{info, image.info, image_id_, image.gpu_addr}, + views{runtime.null_image_views} { const Device& device = runtime.device; if (True(image.flags & ImageFlagBits::Converted)) { internal_format = IsPixelFormatSRGB(info.format) ? GL_SRGB8_ALPHA8 : GL_RGBA8; @@ -1204,12 +1218,12 @@ ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewI ImageView::ImageView(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, const VideoCommon::ImageViewInfo& view_info, GPUVAddr gpu_addr_) - : VideoCommon::ImageViewBase{info, view_info}, gpu_addr{gpu_addr_}, + : VideoCommon::ImageViewBase{info, view_info, gpu_addr_}, buffer_size{VideoCommon::CalculateGuestSizeInBytes(info)} {} ImageView::ImageView(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, const VideoCommon::ImageViewInfo& view_info) - : VideoCommon::ImageViewBase{info, view_info} {} + : VideoCommon::ImageViewBase{info, view_info, 0} {} ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::NullImageViewParams& params) : VideoCommon::ImageViewBase{params}, views{runtime.null_image_views} {} @@ -1269,7 +1283,7 @@ GLuint ImageView::MakeView(Shader::TextureType view_type, GLenum view_format) { ApplySwizzle(view.handle, format, casted_swizzle); } if (set_object_label) { - const std::string name = VideoCommon::Name(*this); + const std::string name = VideoCommon::Name(*this, gpu_addr); glObjectLabel(GL_TEXTURE, view.handle, static_cast<GLsizei>(name.size()), name.data()); } return view.handle; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 911e4607a..1190999a8 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -215,6 +215,9 @@ public: void DownloadMemory(GLuint buffer_handle, size_t buffer_offset, std::span<const VideoCommon::BufferImageCopy> copies); + void DownloadMemory(std::span<GLuint> buffer_handle, std::span<size_t> buffer_offset, + std::span<const VideoCommon::BufferImageCopy> copies); + void DownloadMemory(ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies); GLuint StorageHandle() noexcept; @@ -311,7 +314,6 @@ private: std::unique_ptr<StorageViews> storage_views; GLenum internal_format = GL_NONE; GLuint default_handle = 0; - GPUVAddr gpu_addr = 0; u32 buffer_size = 0; GLuint original_texture = 0; int num_samples = 0; @@ -376,6 +378,7 @@ struct TextureCacheParams { using Sampler = OpenGL::Sampler; using Framebuffer = OpenGL::Framebuffer; using AsyncBuffer = u32; + using BufferType = GLuint; }; using TextureCache = VideoCommon::TextureCache<TextureCacheParams>; diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index ef1190e1f..c7dc7e0a1 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -100,10 +100,13 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> FORMAT_TAB {GL_COMPRESSED_RGBA_ASTC_6x6_KHR}, // ASTC_2D_6X6_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR}, // ASTC_2D_6X6_SRGB {GL_COMPRESSED_RGBA_ASTC_10x6_KHR}, // ASTC_2D_10X6_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x6_KHR}, // ASTC_2D_10X6_SRGB {GL_COMPRESSED_RGBA_ASTC_10x5_KHR}, // ASTC_2D_10X5_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x5_KHR}, // ASTC_2D_10X5_SRGB {GL_COMPRESSED_RGBA_ASTC_10x10_KHR}, // ASTC_2D_10X10_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR}, // ASTC_2D_10X10_SRGB + {GL_COMPRESSED_RGBA_ASTC_12x10_KHR}, // ASTC_2D_12X10_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR}, // ASTC_2D_12X10_SRGB {GL_COMPRESSED_RGBA_ASTC_12x12_KHR}, // ASTC_2D_12X12_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR}, // ASTC_2D_12X12_SRGB {GL_COMPRESSED_RGBA_ASTC_8x6_KHR}, // ASTC_2D_8X6_UNORM diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 5dce51be8..8853cf0f7 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -197,10 +197,13 @@ struct FormatTuple { {VK_FORMAT_ASTC_6x6_UNORM_BLOCK}, // ASTC_2D_6X6_UNORM {VK_FORMAT_ASTC_6x6_SRGB_BLOCK}, // ASTC_2D_6X6_SRGB {VK_FORMAT_ASTC_10x6_UNORM_BLOCK}, // ASTC_2D_10X6_UNORM + {VK_FORMAT_ASTC_10x6_SRGB_BLOCK}, // ASTC_2D_10X6_SRGB {VK_FORMAT_ASTC_10x5_UNORM_BLOCK}, // ASTC_2D_10X5_UNORM {VK_FORMAT_ASTC_10x5_SRGB_BLOCK}, // ASTC_2D_10X5_SRGB {VK_FORMAT_ASTC_10x10_UNORM_BLOCK}, // ASTC_2D_10X10_UNORM {VK_FORMAT_ASTC_10x10_SRGB_BLOCK}, // ASTC_2D_10X10_SRGB + {VK_FORMAT_ASTC_12x10_UNORM_BLOCK}, // ASTC_2D_12X10_UNORM + {VK_FORMAT_ASTC_12x10_SRGB_BLOCK}, // ASTC_2D_12X10_SRGB {VK_FORMAT_ASTC_12x12_UNORM_BLOCK}, // ASTC_2D_12X12_UNORM {VK_FORMAT_ASTC_12x12_SRGB_BLOCK}, // ASTC_2D_12X12_SRGB {VK_FORMAT_ASTC_8x6_UNORM_BLOCK}, // ASTC_2D_8X6_UNORM diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 2a8d9e377..8e31eba34 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -88,13 +88,14 @@ RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_, instance(CreateInstance(library, dld, VK_API_VERSION_1_1, render_window.GetWindowInfo().type, Settings::values.renderer_debug.GetValue())), debug_callback(Settings::values.renderer_debug ? CreateDebugCallback(instance) : nullptr), - surface(CreateSurface(instance, render_window)), + surface(CreateSurface(instance, render_window.GetWindowInfo())), device(CreateDevice(instance, dld, *surface)), memory_allocator(device, false), state_tracker(), scheduler(device, state_tracker), swapchain(*surface, device, scheduler, render_window.GetFramebufferLayout().width, render_window.GetFramebufferLayout().height, false), - blit_screen(cpu_memory, render_window, device, memory_allocator, swapchain, scheduler, - screen_info), + present_manager(render_window, device, memory_allocator, scheduler, swapchain), + blit_screen(cpu_memory, render_window, device, memory_allocator, swapchain, present_manager, + scheduler, screen_info), rasterizer(render_window, gpu, cpu_memory, screen_info, device, memory_allocator, state_tracker, scheduler) { if (Settings::values.renderer_force_max_clock.GetValue() && device.ShouldBoostClocks()) { @@ -121,46 +122,19 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { return; } // Update screen info if the framebuffer size has changed. - if (screen_info.width != framebuffer->width || screen_info.height != framebuffer->height) { - screen_info.width = framebuffer->width; - screen_info.height = framebuffer->height; - } + screen_info.width = framebuffer->width; + screen_info.height = framebuffer->height; + const VAddr framebuffer_addr = framebuffer->address + framebuffer->offset; const bool use_accelerated = rasterizer.AccelerateDisplay(*framebuffer, framebuffer_addr, framebuffer->stride); const bool is_srgb = use_accelerated && screen_info.is_srgb; RenderScreenshot(*framebuffer, use_accelerated); - bool has_been_recreated = false; - const auto recreate_swapchain = [&](u32 width, u32 height) { - if (!has_been_recreated) { - has_been_recreated = true; - scheduler.Finish(); - } - swapchain.Create(width, height, is_srgb); - }; - - const Layout::FramebufferLayout layout = render_window.GetFramebufferLayout(); - if (swapchain.NeedsRecreation(is_srgb) || swapchain.GetWidth() != layout.width || - swapchain.GetHeight() != layout.height) { - recreate_swapchain(layout.width, layout.height); - } - bool is_outdated; - do { - swapchain.AcquireNextImage(); - is_outdated = swapchain.IsOutDated(); - if (is_outdated) { - recreate_swapchain(layout.width, layout.height); - } - } while (is_outdated); - if (has_been_recreated) { - blit_screen.Recreate(); - } - const VkSemaphore render_semaphore = blit_screen.DrawToSwapchain(*framebuffer, use_accelerated); - const VkSemaphore present_semaphore = swapchain.CurrentPresentSemaphore(); - scheduler.Flush(render_semaphore, present_semaphore); - scheduler.WaitWorker(); - swapchain.Present(render_semaphore); + Frame* frame = present_manager.GetRenderFrame(); + blit_screen.DrawToSwapchain(frame, *framebuffer, use_accelerated, is_srgb); + scheduler.Flush(*frame->render_ready); + present_manager.Present(frame); gpu.RendererFrameEndNotify(); rasterizer.TickFrame(); @@ -246,8 +220,7 @@ void Vulkan::RendererVulkan::RenderScreenshot(const Tegra::FramebufferConfig& fr }); const VkExtent2D render_area{.width = layout.width, .height = layout.height}; const vk::Framebuffer screenshot_fb = blit_screen.CreateFramebuffer(*dst_view, render_area); - // Since we're not rendering to the screen, ignore the render semaphore. - void(blit_screen.Draw(framebuffer, *screenshot_fb, layout, render_area, use_accelerated)); + blit_screen.Draw(framebuffer, *screenshot_fb, layout, render_area, use_accelerated); const auto buffer_size = static_cast<VkDeviceSize>(layout.width * layout.height * 4); const VkBufferCreateInfo dst_buffer_info{ @@ -270,7 +243,7 @@ void Vulkan::RendererVulkan::RenderScreenshot(const Tegra::FramebufferConfig& fr .pNext = nullptr, .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, - .oldLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, .newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index 009e75e0d..f44367cb2 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -9,6 +9,7 @@ #include "common/dynamic_library.h" #include "video_core/renderer_base.h" #include "video_core/renderer_vulkan/vk_blit_screen.h" +#include "video_core/renderer_vulkan/vk_present_manager.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_state_tracker.h" @@ -76,6 +77,7 @@ private: StateTracker state_tracker; Scheduler scheduler; Swapchain swapchain; + PresentManager present_manager; BlitScreen blit_screen; RasterizerVulkan rasterizer; std::optional<TurboMode> turbo_mode; diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index 2f0cc27e8..1e0fdd3d9 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -122,10 +122,12 @@ struct BlitScreen::BufferData { BlitScreen::BlitScreen(Core::Memory::Memory& cpu_memory_, Core::Frontend::EmuWindow& render_window_, const Device& device_, MemoryAllocator& memory_allocator_, - Swapchain& swapchain_, Scheduler& scheduler_, const ScreenInfo& screen_info_) + Swapchain& swapchain_, PresentManager& present_manager_, + Scheduler& scheduler_, const ScreenInfo& screen_info_) : cpu_memory{cpu_memory_}, render_window{render_window_}, device{device_}, - memory_allocator{memory_allocator_}, swapchain{swapchain_}, scheduler{scheduler_}, - image_count{swapchain.GetImageCount()}, screen_info{screen_info_} { + memory_allocator{memory_allocator_}, swapchain{swapchain_}, present_manager{present_manager_}, + scheduler{scheduler_}, image_count{swapchain.GetImageCount()}, screen_info{screen_info_}, + current_srgb{swapchain.IsSrgb()}, image_view_format{swapchain.GetImageViewFormat()} { resource_ticks.resize(image_count); CreateStaticResources(); @@ -135,25 +137,20 @@ BlitScreen::BlitScreen(Core::Memory::Memory& cpu_memory_, Core::Frontend::EmuWin BlitScreen::~BlitScreen() = default; void BlitScreen::Recreate() { + present_manager.WaitPresent(); + scheduler.Finish(); + device.GetLogical().WaitIdle(); CreateDynamicResources(); } -VkSemaphore BlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, - const VkFramebuffer& host_framebuffer, - const Layout::FramebufferLayout layout, VkExtent2D render_area, - bool use_accelerated) { +void BlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, + const VkFramebuffer& host_framebuffer, const Layout::FramebufferLayout layout, + VkExtent2D render_area, bool use_accelerated) { RefreshResources(framebuffer); // Finish any pending renderpass scheduler.RequestOutsideRenderPassOperationContext(); - if (const auto swapchain_images = swapchain.GetImageCount(); swapchain_images != image_count) { - image_count = swapchain_images; - Recreate(); - } - - const std::size_t image_index = swapchain.GetImageIndex(); - scheduler.Wait(resource_ticks[image_index]); resource_ticks[image_index] = scheduler.CurrentTick(); @@ -169,7 +166,7 @@ VkSemaphore BlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, std::memcpy(mapped_span.data(), &data, sizeof(data)); if (!use_accelerated) { - const u64 image_offset = GetRawImageOffset(framebuffer, image_index); + const u64 image_offset = GetRawImageOffset(framebuffer); const VAddr framebuffer_addr = framebuffer.address + framebuffer.offset; const u8* const host_ptr = cpu_memory.GetPointer(framebuffer_addr); @@ -204,8 +201,8 @@ VkSemaphore BlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, .depth = 1, }, }; - scheduler.Record([this, copy, image_index](vk::CommandBuffer cmdbuf) { - const VkImage image = *raw_images[image_index]; + scheduler.Record([this, copy, index = image_index](vk::CommandBuffer cmdbuf) { + const VkImage image = *raw_images[index]; const VkImageMemoryBarrier base_barrier{ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, .pNext = nullptr, @@ -245,14 +242,15 @@ VkSemaphore BlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, const auto anti_alias_pass = Settings::values.anti_aliasing.GetValue(); if (use_accelerated && anti_alias_pass == Settings::AntiAliasing::Fxaa) { - UpdateAADescriptorSet(image_index, source_image_view, false); + UpdateAADescriptorSet(source_image_view, false); const u32 up_scale = Settings::values.resolution_info.up_scale; const u32 down_shift = Settings::values.resolution_info.down_shift; VkExtent2D size{ .width = (up_scale * framebuffer.width) >> down_shift, .height = (up_scale * framebuffer.height) >> down_shift, }; - scheduler.Record([this, image_index, size, anti_alias_pass](vk::CommandBuffer cmdbuf) { + scheduler.Record([this, index = image_index, size, + anti_alias_pass](vk::CommandBuffer cmdbuf) { const VkImageMemoryBarrier base_barrier{ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, .pNext = nullptr, @@ -326,7 +324,7 @@ VkSemaphore BlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, cmdbuf.BindVertexBuffer(0, *buffer, offsetof(BufferData, vertices)); cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, *aa_pipeline_layout, 0, - aa_descriptor_sets[image_index], {}); + aa_descriptor_sets[index], {}); cmdbuf.Draw(4, 1, 0, 0); cmdbuf.EndRenderPass(); @@ -369,81 +367,99 @@ VkSemaphore BlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, }; VkImageView fsr_image_view = fsr->Draw(scheduler, image_index, source_image_view, fsr_input_size, crop_rect); - UpdateDescriptorSet(image_index, fsr_image_view, true); + UpdateDescriptorSet(fsr_image_view, true); } else { const bool is_nn = Settings::values.scaling_filter.GetValue() == Settings::ScalingFilter::NearestNeighbor; - UpdateDescriptorSet(image_index, source_image_view, is_nn); + UpdateDescriptorSet(source_image_view, is_nn); } - scheduler.Record( - [this, host_framebuffer, image_index, size = render_area](vk::CommandBuffer cmdbuf) { - const f32 bg_red = Settings::values.bg_red.GetValue() / 255.0f; - const f32 bg_green = Settings::values.bg_green.GetValue() / 255.0f; - const f32 bg_blue = Settings::values.bg_blue.GetValue() / 255.0f; - const VkClearValue clear_color{ - .color = {.float32 = {bg_red, bg_green, bg_blue, 1.0f}}, - }; - const VkRenderPassBeginInfo renderpass_bi{ - .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, - .pNext = nullptr, - .renderPass = *renderpass, - .framebuffer = host_framebuffer, - .renderArea = - { - .offset = {0, 0}, - .extent = size, - }, - .clearValueCount = 1, - .pClearValues = &clear_color, - }; - const VkViewport viewport{ - .x = 0.0f, - .y = 0.0f, - .width = static_cast<float>(size.width), - .height = static_cast<float>(size.height), - .minDepth = 0.0f, - .maxDepth = 1.0f, - }; - const VkRect2D scissor{ - .offset = {0, 0}, - .extent = size, - }; - cmdbuf.BeginRenderPass(renderpass_bi, VK_SUBPASS_CONTENTS_INLINE); - auto graphics_pipeline = [this]() { - switch (Settings::values.scaling_filter.GetValue()) { - case Settings::ScalingFilter::NearestNeighbor: - case Settings::ScalingFilter::Bilinear: - return *bilinear_pipeline; - case Settings::ScalingFilter::Bicubic: - return *bicubic_pipeline; - case Settings::ScalingFilter::Gaussian: - return *gaussian_pipeline; - case Settings::ScalingFilter::ScaleForce: - return *scaleforce_pipeline; - default: - return *bilinear_pipeline; - } - }(); - cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, graphics_pipeline); - cmdbuf.SetViewport(0, viewport); - cmdbuf.SetScissor(0, scissor); - - cmdbuf.BindVertexBuffer(0, *buffer, offsetof(BufferData, vertices)); - cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, *pipeline_layout, 0, - descriptor_sets[image_index], {}); - cmdbuf.Draw(4, 1, 0, 0); - cmdbuf.EndRenderPass(); - }); - return *semaphores[image_index]; + scheduler.Record([this, host_framebuffer, index = image_index, + size = render_area](vk::CommandBuffer cmdbuf) { + const f32 bg_red = Settings::values.bg_red.GetValue() / 255.0f; + const f32 bg_green = Settings::values.bg_green.GetValue() / 255.0f; + const f32 bg_blue = Settings::values.bg_blue.GetValue() / 255.0f; + const VkClearValue clear_color{ + .color = {.float32 = {bg_red, bg_green, bg_blue, 1.0f}}, + }; + const VkRenderPassBeginInfo renderpass_bi{ + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .pNext = nullptr, + .renderPass = *renderpass, + .framebuffer = host_framebuffer, + .renderArea = + { + .offset = {0, 0}, + .extent = size, + }, + .clearValueCount = 1, + .pClearValues = &clear_color, + }; + const VkViewport viewport{ + .x = 0.0f, + .y = 0.0f, + .width = static_cast<float>(size.width), + .height = static_cast<float>(size.height), + .minDepth = 0.0f, + .maxDepth = 1.0f, + }; + const VkRect2D scissor{ + .offset = {0, 0}, + .extent = size, + }; + cmdbuf.BeginRenderPass(renderpass_bi, VK_SUBPASS_CONTENTS_INLINE); + auto graphics_pipeline = [this]() { + switch (Settings::values.scaling_filter.GetValue()) { + case Settings::ScalingFilter::NearestNeighbor: + case Settings::ScalingFilter::Bilinear: + return *bilinear_pipeline; + case Settings::ScalingFilter::Bicubic: + return *bicubic_pipeline; + case Settings::ScalingFilter::Gaussian: + return *gaussian_pipeline; + case Settings::ScalingFilter::ScaleForce: + return *scaleforce_pipeline; + default: + return *bilinear_pipeline; + } + }(); + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, graphics_pipeline); + cmdbuf.SetViewport(0, viewport); + cmdbuf.SetScissor(0, scissor); + + cmdbuf.BindVertexBuffer(0, *buffer, offsetof(BufferData, vertices)); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, *pipeline_layout, 0, + descriptor_sets[index], {}); + cmdbuf.Draw(4, 1, 0, 0); + cmdbuf.EndRenderPass(); + }); } -VkSemaphore BlitScreen::DrawToSwapchain(const Tegra::FramebufferConfig& framebuffer, - bool use_accelerated) { - const std::size_t image_index = swapchain.GetImageIndex(); - const VkExtent2D render_area = swapchain.GetSize(); +void BlitScreen::DrawToSwapchain(Frame* frame, const Tegra::FramebufferConfig& framebuffer, + bool use_accelerated, bool is_srgb) { + // Recreate dynamic resources if the the image count or colorspace changed + if (const std::size_t swapchain_images = swapchain.GetImageCount(); + swapchain_images != image_count || current_srgb != is_srgb) { + current_srgb = is_srgb; + image_view_format = current_srgb ? VK_FORMAT_B8G8R8A8_SRGB : VK_FORMAT_B8G8R8A8_UNORM; + image_count = swapchain_images; + Recreate(); + } + + // Recreate the presentation frame if the dimensions of the window changed const Layout::FramebufferLayout layout = render_window.GetFramebufferLayout(); - return Draw(framebuffer, *framebuffers[image_index], layout, render_area, use_accelerated); + if (layout.width != frame->width || layout.height != frame->height || + is_srgb != frame->is_srgb) { + Recreate(); + present_manager.RecreateFrame(frame, layout.width, layout.height, is_srgb, + image_view_format, *renderpass); + } + + const VkExtent2D render_area{frame->width, frame->height}; + Draw(framebuffer, *frame->framebuffer, layout, render_area, use_accelerated); + if (++image_index >= image_count) { + image_index = 0; + } } vk::Framebuffer BlitScreen::CreateFramebuffer(const VkImageView& image_view, VkExtent2D extent) { @@ -471,13 +487,11 @@ void BlitScreen::CreateStaticResources() { } void BlitScreen::CreateDynamicResources() { - CreateSemaphores(); CreateDescriptorPool(); CreateDescriptorSetLayout(); CreateDescriptorSets(); CreatePipelineLayout(); CreateRenderPass(); - CreateFramebuffers(); CreateGraphicsPipeline(); fsr.reset(); smaa.reset(); @@ -525,11 +539,6 @@ void BlitScreen::CreateShaders() { } } -void BlitScreen::CreateSemaphores() { - semaphores.resize(image_count); - std::ranges::generate(semaphores, [this] { return device.GetLogical().CreateSemaphore(); }); -} - void BlitScreen::CreateDescriptorPool() { const std::array<VkDescriptorPoolSize, 2> pool_sizes{{ { @@ -571,10 +580,10 @@ void BlitScreen::CreateDescriptorPool() { } void BlitScreen::CreateRenderPass() { - renderpass = CreateRenderPassImpl(swapchain.GetImageViewFormat()); + renderpass = CreateRenderPassImpl(image_view_format); } -vk::RenderPass BlitScreen::CreateRenderPassImpl(VkFormat format, bool is_present) { +vk::RenderPass BlitScreen::CreateRenderPassImpl(VkFormat format) { const VkAttachmentDescription color_attachment{ .flags = 0, .format = format, @@ -584,7 +593,7 @@ vk::RenderPass BlitScreen::CreateRenderPassImpl(VkFormat format, bool is_present .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE, .stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE, .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, - .finalLayout = is_present ? VK_IMAGE_LAYOUT_PRESENT_SRC_KHR : VK_IMAGE_LAYOUT_GENERAL, + .finalLayout = VK_IMAGE_LAYOUT_GENERAL, }; const VkAttachmentReference color_attachment_ref{ @@ -1052,16 +1061,6 @@ void BlitScreen::CreateSampler() { nn_sampler = device.GetLogical().CreateSampler(ci_nn); } -void BlitScreen::CreateFramebuffers() { - const VkExtent2D size{swapchain.GetSize()}; - framebuffers.resize(image_count); - - for (std::size_t i = 0; i < image_count; ++i) { - const VkImageView image_view{swapchain.GetImageViewIndex(i)}; - framebuffers[i] = CreateFramebuffer(image_view, size, renderpass); - } -} - void BlitScreen::ReleaseRawImages() { for (const u64 tick : resource_ticks) { scheduler.Wait(tick); @@ -1175,7 +1174,7 @@ void BlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) { aa_framebuffer = CreateFramebuffer(*aa_image_view, size, aa_renderpass); return; } - aa_renderpass = CreateRenderPassImpl(GetFormat(framebuffer), false); + aa_renderpass = CreateRenderPassImpl(GetFormat(framebuffer)); aa_framebuffer = CreateFramebuffer(*aa_image_view, size, aa_renderpass); const std::array<VkPipelineShaderStageCreateInfo, 2> fxaa_shader_stages{{ @@ -1319,8 +1318,7 @@ void BlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) { aa_pipeline = device.GetLogical().CreateGraphicsPipeline(fxaa_pipeline_ci); } -void BlitScreen::UpdateAADescriptorSet(std::size_t image_index, VkImageView image_view, - bool nn) const { +void BlitScreen::UpdateAADescriptorSet(VkImageView image_view, bool nn) const { const VkDescriptorImageInfo image_info{ .sampler = nn ? *nn_sampler : *sampler, .imageView = image_view, @@ -1356,8 +1354,7 @@ void BlitScreen::UpdateAADescriptorSet(std::size_t image_index, VkImageView imag device.GetLogical().UpdateDescriptorSets(std::array{sampler_write, sampler_write_2}, {}); } -void BlitScreen::UpdateDescriptorSet(std::size_t image_index, VkImageView image_view, - bool nn) const { +void BlitScreen::UpdateDescriptorSet(VkImageView image_view, bool nn) const { const VkDescriptorBufferInfo buffer_info{ .buffer = *buffer, .offset = offsetof(BufferData, uniform), @@ -1480,8 +1477,7 @@ u64 BlitScreen::CalculateBufferSize(const Tegra::FramebufferConfig& framebuffer) return sizeof(BufferData) + GetSizeInBytes(framebuffer) * image_count; } -u64 BlitScreen::GetRawImageOffset(const Tegra::FramebufferConfig& framebuffer, - std::size_t image_index) const { +u64 BlitScreen::GetRawImageOffset(const Tegra::FramebufferConfig& framebuffer) const { constexpr auto first_image_offset = static_cast<u64>(sizeof(BufferData)); return first_image_offset + GetSizeInBytes(framebuffer) * image_index; } diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h index ebe10b08b..68ec20253 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.h +++ b/src/video_core/renderer_vulkan/vk_blit_screen.h @@ -5,6 +5,7 @@ #include <memory> +#include "core/frontend/framebuffer_layout.h" #include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" @@ -42,6 +43,9 @@ class RasterizerVulkan; class Scheduler; class SMAA; class Swapchain; +class PresentManager; + +struct Frame; struct ScreenInfo { VkImage image{}; @@ -55,18 +59,17 @@ class BlitScreen { public: explicit BlitScreen(Core::Memory::Memory& cpu_memory, Core::Frontend::EmuWindow& render_window, const Device& device, MemoryAllocator& memory_manager, Swapchain& swapchain, - Scheduler& scheduler, const ScreenInfo& screen_info); + PresentManager& present_manager, Scheduler& scheduler, + const ScreenInfo& screen_info); ~BlitScreen(); void Recreate(); - [[nodiscard]] VkSemaphore Draw(const Tegra::FramebufferConfig& framebuffer, - const VkFramebuffer& host_framebuffer, - const Layout::FramebufferLayout layout, VkExtent2D render_area, - bool use_accelerated); + void Draw(const Tegra::FramebufferConfig& framebuffer, const VkFramebuffer& host_framebuffer, + const Layout::FramebufferLayout layout, VkExtent2D render_area, bool use_accelerated); - [[nodiscard]] VkSemaphore DrawToSwapchain(const Tegra::FramebufferConfig& framebuffer, - bool use_accelerated); + void DrawToSwapchain(Frame* frame, const Tegra::FramebufferConfig& framebuffer, + bool use_accelerated, bool is_srgb); [[nodiscard]] vk::Framebuffer CreateFramebuffer(const VkImageView& image_view, VkExtent2D extent); @@ -79,10 +82,9 @@ private: void CreateStaticResources(); void CreateShaders(); - void CreateSemaphores(); void CreateDescriptorPool(); void CreateRenderPass(); - vk::RenderPass CreateRenderPassImpl(VkFormat, bool is_present = true); + vk::RenderPass CreateRenderPassImpl(VkFormat format); void CreateDescriptorSetLayout(); void CreateDescriptorSets(); void CreatePipelineLayout(); @@ -90,15 +92,14 @@ private: void CreateSampler(); void CreateDynamicResources(); - void CreateFramebuffers(); void RefreshResources(const Tegra::FramebufferConfig& framebuffer); void ReleaseRawImages(); void CreateStagingBuffer(const Tegra::FramebufferConfig& framebuffer); void CreateRawImages(const Tegra::FramebufferConfig& framebuffer); - void UpdateDescriptorSet(std::size_t image_index, VkImageView image_view, bool nn) const; - void UpdateAADescriptorSet(std::size_t image_index, VkImageView image_view, bool nn) const; + void UpdateDescriptorSet(VkImageView image_view, bool nn) const; + void UpdateAADescriptorSet(VkImageView image_view, bool nn) const; void SetUniformData(BufferData& data, const Layout::FramebufferLayout layout) const; void SetVertexData(BufferData& data, const Tegra::FramebufferConfig& framebuffer, const Layout::FramebufferLayout layout) const; @@ -107,16 +108,17 @@ private: void CreateFSR(); u64 CalculateBufferSize(const Tegra::FramebufferConfig& framebuffer) const; - u64 GetRawImageOffset(const Tegra::FramebufferConfig& framebuffer, - std::size_t image_index) const; + u64 GetRawImageOffset(const Tegra::FramebufferConfig& framebuffer) const; Core::Memory::Memory& cpu_memory; Core::Frontend::EmuWindow& render_window; const Device& device; MemoryAllocator& memory_allocator; Swapchain& swapchain; + PresentManager& present_manager; Scheduler& scheduler; std::size_t image_count; + std::size_t image_index{}; const ScreenInfo& screen_info; vk::ShaderModule vertex_shader; @@ -135,7 +137,6 @@ private: vk::Pipeline gaussian_pipeline; vk::Pipeline scaleforce_pipeline; vk::RenderPass renderpass; - std::vector<vk::Framebuffer> framebuffers; vk::DescriptorSets descriptor_sets; vk::Sampler nn_sampler; vk::Sampler sampler; @@ -145,7 +146,6 @@ private: std::vector<u64> resource_ticks; - std::vector<vk::Semaphore> semaphores; std::vector<vk::Image> raw_images; std::vector<vk::ImageView> raw_image_views; std::vector<MemoryCommit> raw_buffer_commits; @@ -164,6 +164,8 @@ private: u32 raw_width = 0; u32 raw_height = 0; Service::android::PixelFormat pixel_format{}; + bool current_srgb; + VkFormat image_view_format; std::unique_ptr<FSR> fsr; std::unique_ptr<SMAA> smaa; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 9cbcb3c8f..510602e8e 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -314,8 +314,12 @@ StagingBufferRef BufferCacheRuntime::UploadStagingBuffer(size_t size) { return staging_pool.Request(size, MemoryUsage::Upload); } -StagingBufferRef BufferCacheRuntime::DownloadStagingBuffer(size_t size) { - return staging_pool.Request(size, MemoryUsage::Download); +StagingBufferRef BufferCacheRuntime::DownloadStagingBuffer(size_t size, bool deferred) { + return staging_pool.Request(size, MemoryUsage::Download, deferred); +} + +void BufferCacheRuntime::FreeDeferredStagingBuffer(StagingBufferRef& ref) { + staging_pool.FreeDeferred(ref); } u64 BufferCacheRuntime::GetDeviceLocalMemory() const { diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index 183b33632..879f1ed94 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -3,7 +3,8 @@ #pragma once -#include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/buffer_cache/buffer_cache_base.h" +#include "video_core/buffer_cache/memory_tracker_base.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" @@ -75,7 +76,9 @@ public: [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size); - [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size); + [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size, bool deferred = false); + + void FreeDeferredStagingBuffer(StagingBufferRef& ref); void PreCopyBarrier(); @@ -142,6 +145,8 @@ private: struct BufferCacheParams { using Runtime = Vulkan::BufferCacheRuntime; using Buffer = Vulkan::Buffer; + using Async_Buffer = Vulkan::StagingBufferRef; + using MemoryTracker = VideoCommon::MemoryTrackerBase<VideoCore::RasterizerInterface>; static constexpr bool IS_OPENGL = false; static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = false; @@ -150,6 +155,7 @@ struct BufferCacheParams { static constexpr bool NEEDS_BIND_STORAGE_INDEX = false; static constexpr bool USE_MEMORY_MAPS = true; static constexpr bool SEPARATE_IMAGE_BUFFER_BINDINGS = false; + static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = true; }; using BufferCache = VideoCommon::BufferCache<BufferCacheParams>; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp new file mode 100644 index 000000000..f9e271507 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp @@ -0,0 +1,9 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/renderer_vulkan/vk_buffer_cache.h" + +namespace VideoCommon { +template class VideoCommon::BufferCache<Vulkan::BufferCacheParams>; +} diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.cpp b/src/video_core/renderer_vulkan/vk_fence_manager.cpp index 0214b103a..fad9e3832 100644 --- a/src/video_core/renderer_vulkan/vk_fence_manager.cpp +++ b/src/video_core/renderer_vulkan/vk_fence_manager.cpp @@ -5,6 +5,7 @@ #include "video_core/renderer_vulkan/vk_buffer_cache.h" #include "video_core/renderer_vulkan/vk_fence_manager.h" +#include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/vulkan_common/vulkan_device.h" diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.h b/src/video_core/renderer_vulkan/vk_fence_manager.h index 7fe2afcd9..145359d4e 100644 --- a/src/video_core/renderer_vulkan/vk_fence_manager.h +++ b/src/video_core/renderer_vulkan/vk_fence_manager.h @@ -40,7 +40,16 @@ private: }; using Fence = std::shared_ptr<InnerFence>; -using GenericFenceManager = VideoCommon::FenceManager<Fence, TextureCache, BufferCache, QueryCache>; +struct FenceManagerParams { + using FenceType = Fence; + using BufferCacheType = BufferCache; + using TextureCacheType = TextureCache; + using QueryCacheType = QueryCache; + + static constexpr bool HAS_ASYNC_CHECK = true; +}; + +using GenericFenceManager = VideoCommon::FenceManager<FenceManagerParams>; class FenceManager final : public GenericFenceManager { public: diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 985cc3203..a318d643e 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -696,6 +696,13 @@ std::unique_ptr<ComputePipeline> PipelineCache::CreateComputePipeline( std::unique_ptr<ComputePipeline> PipelineCache::CreateComputePipeline( ShaderPools& pools, const ComputePipelineCacheKey& key, Shader::Environment& env, PipelineStatistics* statistics, bool build_in_parallel) try { + // TODO: Remove this when Intel fixes their shader compiler. + // https://github.com/IGCIT/Intel-GPU-Community-Issue-Tracker-IGCIT/issues/159 + if (device.GetDriverID() == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS) { + LOG_ERROR(Render_Vulkan, "Skipping 0x{:016x}", key.Hash()); + return nullptr; + } + LOG_INFO(Render_Vulkan, "0x{:016x}", key.Hash()); Shader::Maxwell::Flow::CFG cfg{env, pools.flow_block, env.StartAddress()}; diff --git a/src/video_core/renderer_vulkan/vk_present_manager.cpp b/src/video_core/renderer_vulkan/vk_present_manager.cpp new file mode 100644 index 000000000..c49583013 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_present_manager.cpp @@ -0,0 +1,457 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "common/microprofile.h" +#include "common/settings.h" +#include "common/thread.h" +#include "video_core/renderer_vulkan/vk_present_manager.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_swapchain.h" +#include "video_core/vulkan_common/vulkan_device.h" + +namespace Vulkan { + +MICROPROFILE_DEFINE(Vulkan_WaitPresent, "Vulkan", "Wait For Present", MP_RGB(128, 128, 128)); +MICROPROFILE_DEFINE(Vulkan_CopyToSwapchain, "Vulkan", "Copy to swapchain", MP_RGB(192, 255, 192)); + +namespace { + +bool CanBlitToSwapchain(const vk::PhysicalDevice& physical_device, VkFormat format) { + const VkFormatProperties props{physical_device.GetFormatProperties(format)}; + return (props.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_DST_BIT); +} + +[[nodiscard]] VkImageSubresourceLayers MakeImageSubresourceLayers() { + return VkImageSubresourceLayers{ + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1, + }; +} + +[[nodiscard]] VkImageBlit MakeImageBlit(s32 frame_width, s32 frame_height, s32 swapchain_width, + s32 swapchain_height) { + return VkImageBlit{ + .srcSubresource = MakeImageSubresourceLayers(), + .srcOffsets = + { + { + .x = 0, + .y = 0, + .z = 0, + }, + { + .x = frame_width, + .y = frame_height, + .z = 1, + }, + }, + .dstSubresource = MakeImageSubresourceLayers(), + .dstOffsets = + { + { + .x = 0, + .y = 0, + .z = 0, + }, + { + .x = swapchain_width, + .y = swapchain_height, + .z = 1, + }, + }, + }; +} + +[[nodiscard]] VkImageCopy MakeImageCopy(u32 frame_width, u32 frame_height, u32 swapchain_width, + u32 swapchain_height) { + return VkImageCopy{ + .srcSubresource = MakeImageSubresourceLayers(), + .srcOffset = + { + .x = 0, + .y = 0, + .z = 0, + }, + .dstSubresource = MakeImageSubresourceLayers(), + .dstOffset = + { + .x = 0, + .y = 0, + .z = 0, + }, + .extent = + { + .width = std::min(frame_width, swapchain_width), + .height = std::min(frame_height, swapchain_height), + .depth = 1, + }, + }; +} + +} // Anonymous namespace + +PresentManager::PresentManager(Core::Frontend::EmuWindow& render_window_, const Device& device_, + MemoryAllocator& memory_allocator_, Scheduler& scheduler_, + Swapchain& swapchain_) + : render_window{render_window_}, device{device_}, + memory_allocator{memory_allocator_}, scheduler{scheduler_}, swapchain{swapchain_}, + blit_supported{CanBlitToSwapchain(device.GetPhysical(), swapchain.GetImageViewFormat())}, + use_present_thread{Settings::values.async_presentation.GetValue()}, + image_count{swapchain.GetImageCount()} { + + auto& dld = device.GetLogical(); + cmdpool = dld.CreateCommandPool({ + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = + VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, + .queueFamilyIndex = device.GetGraphicsFamily(), + }); + auto cmdbuffers = cmdpool.Allocate(image_count); + + frames.resize(image_count); + for (u32 i = 0; i < frames.size(); i++) { + Frame& frame = frames[i]; + frame.cmdbuf = vk::CommandBuffer{cmdbuffers[i], device.GetDispatchLoader()}; + frame.render_ready = dld.CreateSemaphore({ + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + }); + frame.present_done = dld.CreateFence({ + .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, + .pNext = nullptr, + .flags = VK_FENCE_CREATE_SIGNALED_BIT, + }); + free_queue.push(&frame); + } + + if (use_present_thread) { + present_thread = std::jthread([this](std::stop_token token) { PresentThread(token); }); + } +} + +PresentManager::~PresentManager() = default; + +Frame* PresentManager::GetRenderFrame() { + MICROPROFILE_SCOPE(Vulkan_WaitPresent); + + // Wait for free presentation frames + std::unique_lock lock{free_mutex}; + free_cv.wait(lock, [this] { return !free_queue.empty(); }); + + // Take the frame from the queue + Frame* frame = free_queue.front(); + free_queue.pop(); + + // Wait for the presentation to be finished so all frame resources are free + frame->present_done.Wait(); + frame->present_done.Reset(); + + return frame; +} + +void PresentManager::Present(Frame* frame) { + if (!use_present_thread) { + scheduler.WaitWorker(); + CopyToSwapchain(frame); + free_queue.push(frame); + return; + } + + scheduler.Record([this, frame](vk::CommandBuffer) { + std::unique_lock lock{queue_mutex}; + present_queue.push(frame); + frame_cv.notify_one(); + }); +} + +void PresentManager::RecreateFrame(Frame* frame, u32 width, u32 height, bool is_srgb, + VkFormat image_view_format, VkRenderPass rd) { + auto& dld = device.GetLogical(); + + frame->width = width; + frame->height = height; + frame->is_srgb = is_srgb; + + frame->image = dld.CreateImage({ + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .pNext = nullptr, + .flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT, + .imageType = VK_IMAGE_TYPE_2D, + .format = swapchain.GetImageFormat(), + .extent = + { + .width = width, + .height = height, + .depth = 1, + }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + }); + + frame->image_commit = memory_allocator.Commit(frame->image, MemoryUsage::DeviceLocal); + + frame->image_view = dld.CreateImageView({ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .image = *frame->image, + .viewType = VK_IMAGE_VIEW_TYPE_2D, + .format = image_view_format, + .components = + { + .r = VK_COMPONENT_SWIZZLE_IDENTITY, + .g = VK_COMPONENT_SWIZZLE_IDENTITY, + .b = VK_COMPONENT_SWIZZLE_IDENTITY, + .a = VK_COMPONENT_SWIZZLE_IDENTITY, + }, + .subresourceRange = + { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = 1, + }, + }); + + const VkImageView image_view{*frame->image_view}; + frame->framebuffer = dld.CreateFramebuffer({ + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .renderPass = rd, + .attachmentCount = 1, + .pAttachments = &image_view, + .width = width, + .height = height, + .layers = 1, + }); +} + +void PresentManager::WaitPresent() { + if (!use_present_thread) { + return; + } + + // Wait for the present queue to be empty + { + std::unique_lock queue_lock{queue_mutex}; + frame_cv.wait(queue_lock, [this] { return present_queue.empty(); }); + } + + // The above condition will be satisfied when the last frame is taken from the queue. + // To ensure that frame has been presented as well take hold of the swapchain + // mutex. + std::scoped_lock swapchain_lock{swapchain_mutex}; +} + +void PresentManager::PresentThread(std::stop_token token) { + Common::SetCurrentThreadName("VulkanPresent"); + while (!token.stop_requested()) { + std::unique_lock lock{queue_mutex}; + + // Wait for presentation frames + Common::CondvarWait(frame_cv, lock, token, [this] { return !present_queue.empty(); }); + if (token.stop_requested()) { + return; + } + + // Take the frame and notify anyone waiting + Frame* frame = present_queue.front(); + present_queue.pop(); + frame_cv.notify_one(); + + // By exchanging the lock ownership we take the swapchain lock + // before the queue lock goes out of scope. This way the swapchain + // lock in WaitPresent is guaranteed to occur after here. + std::exchange(lock, std::unique_lock{swapchain_mutex}); + + CopyToSwapchain(frame); + + // Free the frame for reuse + std::scoped_lock fl{free_mutex}; + free_queue.push(frame); + free_cv.notify_one(); + } +} + +void PresentManager::CopyToSwapchain(Frame* frame) { + MICROPROFILE_SCOPE(Vulkan_CopyToSwapchain); + + const auto recreate_swapchain = [&] { + swapchain.Create(frame->width, frame->height, frame->is_srgb); + image_count = swapchain.GetImageCount(); + }; + + // If the size or colorspace of the incoming frames has changed, recreate the swapchain + // to account for that. + const bool srgb_changed = swapchain.NeedsRecreation(frame->is_srgb); + const bool size_changed = + swapchain.GetWidth() != frame->width || swapchain.GetHeight() != frame->height; + if (srgb_changed || size_changed) { + recreate_swapchain(); + } + + while (swapchain.AcquireNextImage()) { + recreate_swapchain(); + } + + const vk::CommandBuffer cmdbuf{frame->cmdbuf}; + cmdbuf.Begin({ + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, + .pNext = nullptr, + .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, + .pInheritanceInfo = nullptr, + }); + + const VkImage image{swapchain.CurrentImage()}; + const VkExtent2D extent = swapchain.GetExtent(); + const std::array pre_barriers{ + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = 0, + .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_UNDEFINED, + .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange{ + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }, + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *frame->image, + .subresourceRange{ + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }, + }; + const std::array post_barriers{ + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT, + .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .newLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange{ + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }, + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + .dstAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *frame->image, + .subresourceRange{ + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }, + }; + + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, {}, + {}, {}, pre_barriers); + + if (blit_supported) { + cmdbuf.BlitImage(*frame->image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, image, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + MakeImageBlit(frame->width, frame->height, extent.width, extent.height), + VK_FILTER_LINEAR); + } else { + cmdbuf.CopyImage(*frame->image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, image, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + MakeImageCopy(frame->width, frame->height, extent.width, extent.height)); + } + + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, {}, + {}, {}, post_barriers); + + cmdbuf.End(); + + const VkSemaphore present_semaphore = swapchain.CurrentPresentSemaphore(); + const VkSemaphore render_semaphore = swapchain.CurrentRenderSemaphore(); + const std::array wait_semaphores = {present_semaphore, *frame->render_ready}; + + static constexpr std::array<VkPipelineStageFlags, 2> wait_stage_masks{ + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + }; + + const VkSubmitInfo submit_info{ + .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, + .pNext = nullptr, + .waitSemaphoreCount = 2U, + .pWaitSemaphores = wait_semaphores.data(), + .pWaitDstStageMask = wait_stage_masks.data(), + .commandBufferCount = 1, + .pCommandBuffers = cmdbuf.address(), + .signalSemaphoreCount = 1U, + .pSignalSemaphores = &render_semaphore, + }; + + // Submit the image copy/blit to the swapchain + { + std::scoped_lock lock{scheduler.submit_mutex}; + switch (const VkResult result = + device.GetGraphicsQueue().Submit(submit_info, *frame->present_done)) { + case VK_SUCCESS: + break; + case VK_ERROR_DEVICE_LOST: + device.ReportLoss(); + [[fallthrough]]; + default: + vk::Check(result); + break; + } + } + + // Present + swapchain.Present(render_semaphore); +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_present_manager.h b/src/video_core/renderer_vulkan/vk_present_manager.h new file mode 100644 index 000000000..420a775e2 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_present_manager.h @@ -0,0 +1,83 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include <condition_variable> +#include <mutex> +#include <queue> + +#include "common/common_types.h" +#include "common/polyfill_thread.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" + +namespace Core::Frontend { +class EmuWindow; +} // namespace Core::Frontend + +namespace Vulkan { + +class Device; +class Scheduler; +class Swapchain; + +struct Frame { + u32 width; + u32 height; + bool is_srgb; + vk::Image image; + vk::ImageView image_view; + vk::Framebuffer framebuffer; + MemoryCommit image_commit; + vk::CommandBuffer cmdbuf; + vk::Semaphore render_ready; + vk::Fence present_done; +}; + +class PresentManager { +public: + PresentManager(Core::Frontend::EmuWindow& render_window, const Device& device, + MemoryAllocator& memory_allocator, Scheduler& scheduler, Swapchain& swapchain); + ~PresentManager(); + + /// Returns the last used presentation frame + Frame* GetRenderFrame(); + + /// Pushes a frame for presentation + void Present(Frame* frame); + + /// Recreates the present frame to match the provided parameters + void RecreateFrame(Frame* frame, u32 width, u32 height, bool is_srgb, + VkFormat image_view_format, VkRenderPass rd); + + /// Waits for the present thread to finish presenting all queued frames. + void WaitPresent(); + +private: + void PresentThread(std::stop_token token); + + void CopyToSwapchain(Frame* frame); + +private: + Core::Frontend::EmuWindow& render_window; + const Device& device; + MemoryAllocator& memory_allocator; + Scheduler& scheduler; + Swapchain& swapchain; + vk::CommandPool cmdpool; + std::vector<Frame> frames; + std::queue<Frame*> present_queue; + std::queue<Frame*> free_queue; + std::condition_variable_any frame_cv; + std::condition_variable free_cv; + std::mutex swapchain_mutex; + std::mutex queue_mutex; + std::mutex free_mutex; + std::jthread present_thread; + bool blit_supported; + bool use_present_thread; + std::size_t image_count; +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 929c8ece6..d67490449 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -66,9 +66,10 @@ void QueryPool::Reserve(std::pair<VkQueryPool, u32> query) { } } -QueryCache::QueryCache(VideoCore::RasterizerInterface& rasterizer_, const Device& device_, +QueryCache::QueryCache(VideoCore::RasterizerInterface& rasterizer_, + Core::Memory::Memory& cpu_memory_, const Device& device_, Scheduler& scheduler_) - : QueryCacheBase{rasterizer_}, device{device_}, scheduler{scheduler_}, + : QueryCacheBase{rasterizer_, cpu_memory_}, device{device_}, scheduler{scheduler_}, query_pools{ QueryPool{device_, scheduler_, QueryType::SamplesPassed}, } {} @@ -98,8 +99,10 @@ HostCounter::HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> depend query{cache_.AllocateQuery(type_)}, tick{cache_.GetScheduler().CurrentTick()} { const vk::Device* logical = &cache.GetDevice().GetLogical(); cache.GetScheduler().Record([logical, query = query](vk::CommandBuffer cmdbuf) { + const bool use_precise = Settings::IsGPULevelHigh(); logical->ResetQueryPool(query.first, query.second, 1); - cmdbuf.BeginQuery(query.first, query.second, VK_QUERY_CONTROL_PRECISE_BIT); + cmdbuf.BeginQuery(query.first, query.second, + use_precise ? VK_QUERY_CONTROL_PRECISE_BIT : 0); }); } @@ -112,8 +115,10 @@ void HostCounter::EndQuery() { [query = query](vk::CommandBuffer cmdbuf) { cmdbuf.EndQuery(query.first, query.second); }); } -u64 HostCounter::BlockingQuery() const { - cache.GetScheduler().Wait(tick); +u64 HostCounter::BlockingQuery(bool async) const { + if (!async) { + cache.GetScheduler().Wait(tick); + } u64 data; const VkResult query_result = cache.GetDevice().GetLogical().GetQueryResults( query.first, query.second, 1, sizeof(data), &data, sizeof(data), diff --git a/src/video_core/renderer_vulkan/vk_query_cache.h b/src/video_core/renderer_vulkan/vk_query_cache.h index 26762ee09..c1b9552eb 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.h +++ b/src/video_core/renderer_vulkan/vk_query_cache.h @@ -52,7 +52,8 @@ private: class QueryCache final : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { public: - explicit QueryCache(VideoCore::RasterizerInterface& rasterizer_, const Device& device_, + explicit QueryCache(VideoCore::RasterizerInterface& rasterizer_, + Core::Memory::Memory& cpu_memory_, const Device& device_, Scheduler& scheduler_); ~QueryCache(); @@ -83,7 +84,7 @@ public: void EndQuery(); private: - u64 BlockingQuery() const override; + u64 BlockingQuery(bool async = false) const override; QueryCache& cache; const VideoCore::QueryType type; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 673ab478e..d1489fc95 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -172,7 +172,8 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra buffer_cache(*this, cpu_memory_, buffer_cache_runtime), pipeline_cache(*this, device, scheduler, descriptor_pool, update_descriptor_queue, render_pass_cache, buffer_cache, texture_cache, gpu.ShaderNotify()), - query_cache{*this, device, scheduler}, accelerate_dma(buffer_cache, texture_cache, scheduler), + query_cache{*this, cpu_memory_, device, scheduler}, + accelerate_dma(buffer_cache, texture_cache, scheduler), fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler), wfi_event(device.GetLogical().CreateEvent()) { scheduler.SetQueryCache(query_cache); @@ -675,7 +676,8 @@ bool RasterizerVulkan::AccelerateConditionalRendering() { const GPUVAddr condition_address{maxwell3d->regs.render_enable.Address()}; Maxwell::ReportSemaphore::Compare cmp; if (gpu_memory->IsMemoryDirty(condition_address, sizeof(cmp), - VideoCommon::CacheType::BufferCache)) { + VideoCommon::CacheType::BufferCache | + VideoCommon::CacheType::QueryCache)) { return true; } return false; @@ -781,8 +783,7 @@ bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, } const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height); static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; - const auto post_op = IS_IMAGE_UPLOAD ? VideoCommon::ObtainBufferOperation::DoNothing - : VideoCommon::ObtainBufferOperation::MarkAsWritten; + const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; const auto [buffer, offset] = buffer_cache.ObtainBuffer(buffer_operand.address, buffer_size, sync_info, post_op); @@ -793,7 +794,8 @@ bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, if constexpr (IS_IMAGE_UPLOAD) { image->UploadMemory(buffer->Handle(), offset, copy_span); } else { - image->DownloadMemory(buffer->Handle(), offset, copy_span); + texture_cache.DownloadImageIntoBuffer(image, buffer->Handle(), offset, copy_span, + buffer_operand.address, buffer_size); } return true; } diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 057e16967..80455ec08 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -46,10 +46,11 @@ Scheduler::Scheduler(const Device& device_, StateTracker& state_tracker_) Scheduler::~Scheduler() = default; -void Scheduler::Flush(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) { +u64 Scheduler::Flush(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) { // When flushing, we only send data to the worker thread; no waiting is necessary. - SubmitExecution(signal_semaphore, wait_semaphore); + const u64 signal_value = SubmitExecution(signal_semaphore, wait_semaphore); AllocateNewContext(); + return signal_value; } void Scheduler::Finish(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) { @@ -205,7 +206,7 @@ void Scheduler::AllocateWorkerCommandBuffer() { }); } -void Scheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) { +u64 Scheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) { EndPendingOperations(); InvalidateState(); @@ -217,6 +218,7 @@ void Scheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_s on_submit(); } + std::scoped_lock lock{submit_mutex}; switch (const VkResult result = master_semaphore->SubmitQueue( cmdbuf, signal_semaphore, wait_semaphore, signal_value)) { case VK_SUCCESS: @@ -231,6 +233,7 @@ void Scheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_s }); chunk->MarkSubmit(); DispatchWork(); + return signal_value; } void Scheduler::AllocateNewContext() { diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 8d75ce987..475c682eb 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -34,7 +34,7 @@ public: ~Scheduler(); /// Sends the current execution context to the GPU. - void Flush(VkSemaphore signal_semaphore = nullptr, VkSemaphore wait_semaphore = nullptr); + u64 Flush(VkSemaphore signal_semaphore = nullptr, VkSemaphore wait_semaphore = nullptr); /// Sends the current execution context to the GPU and waits for it to complete. void Finish(VkSemaphore signal_semaphore = nullptr, VkSemaphore wait_semaphore = nullptr); @@ -106,6 +106,8 @@ public: return *master_semaphore; } + std::mutex submit_mutex; + private: class Command { public: @@ -201,7 +203,7 @@ private: void AllocateWorkerCommandBuffer(); - void SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore); + u64 SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore); void AllocateNewContext(); diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp index 85fdce6e5..1e80ce463 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.cpp +++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp @@ -14,6 +14,7 @@ #include "video_core/renderer_vulkan/vk_swapchain.h" #include "video_core/vulkan_common/vulkan_device.h" #include "video_core/vulkan_common/vulkan_wrapper.h" +#include "vulkan/vulkan_core.h" namespace Vulkan { @@ -33,23 +34,47 @@ VkSurfaceFormatKHR ChooseSwapSurfaceFormat(vk::Span<VkSurfaceFormatKHR> formats) return found != formats.end() ? *found : formats[0]; } -VkPresentModeKHR ChooseSwapPresentMode(vk::Span<VkPresentModeKHR> modes) { - // Mailbox (triple buffering) doesn't lock the application like fifo (vsync), - // prefer it if vsync option is not selected - const auto found_mailbox = std::find(modes.begin(), modes.end(), VK_PRESENT_MODE_MAILBOX_KHR); - if (Settings::values.fullscreen_mode.GetValue() == Settings::FullscreenMode::Borderless && - found_mailbox != modes.end() && !Settings::values.use_vsync.GetValue()) { - return VK_PRESENT_MODE_MAILBOX_KHR; - } - if (!Settings::values.use_speed_limit.GetValue()) { - // FIFO present mode locks the framerate to the monitor's refresh rate, - // Find an alternative to surpass this limitation if FPS is unlocked. - const auto found_imm = std::find(modes.begin(), modes.end(), VK_PRESENT_MODE_IMMEDIATE_KHR); - if (found_imm != modes.end()) { - return VK_PRESENT_MODE_IMMEDIATE_KHR; +static constexpr VkPresentModeKHR ChooseSwapPresentMode(bool has_imm, bool has_mailbox, + bool has_fifo_relaxed) { + // Mailbox doesn't lock the application like FIFO (vsync) + // FIFO present mode locks the framerate to the monitor's refresh rate + Settings::VSyncMode setting = [has_imm, has_mailbox]() { + // Choose Mailbox or Immediate if unlocked and those modes are supported + const auto mode = Settings::values.vsync_mode.GetValue(); + if (Settings::values.use_speed_limit.GetValue()) { + return mode; + } + switch (mode) { + case Settings::VSyncMode::FIFO: + case Settings::VSyncMode::FIFORelaxed: + if (has_mailbox) { + return Settings::VSyncMode::Mailbox; + } else if (has_imm) { + return Settings::VSyncMode::Immediate; + } + [[fallthrough]]; + default: + return mode; } + }(); + if ((setting == Settings::VSyncMode::Mailbox && !has_mailbox) || + (setting == Settings::VSyncMode::Immediate && !has_imm) || + (setting == Settings::VSyncMode::FIFORelaxed && !has_fifo_relaxed)) { + setting = Settings::VSyncMode::FIFO; + } + + switch (setting) { + case Settings::VSyncMode::Immediate: + return VK_PRESENT_MODE_IMMEDIATE_KHR; + case Settings::VSyncMode::Mailbox: + return VK_PRESENT_MODE_MAILBOX_KHR; + case Settings::VSyncMode::FIFO: + return VK_PRESENT_MODE_FIFO_KHR; + case Settings::VSyncMode::FIFORelaxed: + return VK_PRESENT_MODE_FIFO_RELAXED_KHR; + default: + return VK_PRESENT_MODE_FIFO_KHR; } - return VK_PRESENT_MODE_FIFO_KHR; } VkExtent2D ChooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities, u32 width, u32 height) { @@ -65,6 +90,18 @@ VkExtent2D ChooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities, u32 wi return extent; } +VkCompositeAlphaFlagBitsKHR ChooseAlphaFlags(const VkSurfaceCapabilitiesKHR& capabilities) { + if (capabilities.supportedCompositeAlpha & VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR) { + return VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; + } else if (capabilities.supportedCompositeAlpha & VK_COMPOSITE_ALPHA_INHERIT_BIT_KHR) { + return VK_COMPOSITE_ALPHA_INHERIT_BIT_KHR; + } else { + LOG_ERROR(Render_Vulkan, "Unknown composite alpha flags value {:#x}", + capabilities.supportedCompositeAlpha); + return VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; + } +} + } // Anonymous namespace Swapchain::Swapchain(VkSurfaceKHR surface_, const Device& device_, Scheduler& scheduler_, @@ -87,18 +124,16 @@ void Swapchain::Create(u32 width_, u32 height_, bool srgb) { return; } - device.GetLogical().WaitIdle(); Destroy(); CreateSwapchain(capabilities, srgb); CreateSemaphores(); - CreateImageViews(); resource_ticks.clear(); resource_ticks.resize(image_count); } -void Swapchain::AcquireNextImage() { +bool Swapchain::AcquireNextImage() { const VkResult result = device.GetLogical().AcquireNextImageKHR( *swapchain, std::numeric_limits<u64>::max(), *present_semaphores[frame_index], VK_NULL_HANDLE, &image_index); @@ -115,8 +150,11 @@ void Swapchain::AcquireNextImage() { LOG_ERROR(Render_Vulkan, "vkAcquireNextImageKHR returned {}", vk::ToString(result)); break; } + scheduler.Wait(resource_ticks[image_index]); resource_ticks[image_index] = scheduler.CurrentTick(); + + return is_suboptimal || is_outdated; } void Swapchain::Present(VkSemaphore render_semaphore) { @@ -131,6 +169,7 @@ void Swapchain::Present(VkSemaphore render_semaphore) { .pImageIndices = &image_index, .pResults = nullptr, }; + std::scoped_lock lock{scheduler.submit_mutex}; switch (const VkResult result = present_queue.Present(present_info)) { case VK_SUCCESS: break; @@ -153,10 +192,17 @@ void Swapchain::Present(VkSemaphore render_semaphore) { void Swapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, bool srgb) { const auto physical_device{device.GetPhysical()}; const auto formats{physical_device.GetSurfaceFormatsKHR(surface)}; - const auto present_modes{physical_device.GetSurfacePresentModesKHR(surface)}; + const auto present_modes = physical_device.GetSurfacePresentModesKHR(surface); + has_mailbox = std::find(present_modes.begin(), present_modes.end(), + VK_PRESENT_MODE_MAILBOX_KHR) != present_modes.end(); + has_imm = std::find(present_modes.begin(), present_modes.end(), + VK_PRESENT_MODE_IMMEDIATE_KHR) != present_modes.end(); + has_fifo_relaxed = std::find(present_modes.begin(), present_modes.end(), + VK_PRESENT_MODE_FIFO_RELAXED_KHR) != present_modes.end(); - const VkSurfaceFormatKHR surface_format{ChooseSwapSurfaceFormat(formats)}; - present_mode = ChooseSwapPresentMode(present_modes); + const VkCompositeAlphaFlagBitsKHR alpha_flags{ChooseAlphaFlags(capabilities)}; + surface_format = ChooseSwapSurfaceFormat(formats); + present_mode = ChooseSwapPresentMode(has_imm, has_mailbox, has_fifo_relaxed); u32 requested_image_count{capabilities.minImageCount + 1}; // Ensure Triple buffering if possible. @@ -180,12 +226,12 @@ void Swapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, bo .imageColorSpace = surface_format.colorSpace, .imageExtent = {}, .imageArrayLayers = 1, - .imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, + .imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT, .imageSharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, .preTransform = capabilities.currentTransform, - .compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR, + .compositeAlpha = alpha_flags, .presentMode = present_mode, .clipped = VK_FALSE, .oldSwapchain = nullptr, @@ -217,7 +263,6 @@ void Swapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, bo extent = swapchain_ci.imageExtent; current_srgb = srgb; - current_fps_unlocked = !Settings::values.use_speed_limit.GetValue(); images = swapchain.GetImages(); image_count = static_cast<u32>(images.size()); @@ -228,56 +273,20 @@ void Swapchain::CreateSemaphores() { present_semaphores.resize(image_count); std::ranges::generate(present_semaphores, [this] { return device.GetLogical().CreateSemaphore(); }); -} - -void Swapchain::CreateImageViews() { - VkImageViewCreateInfo ci{ - .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .image = {}, - .viewType = VK_IMAGE_VIEW_TYPE_2D, - .format = image_view_format, - .components = - { - .r = VK_COMPONENT_SWIZZLE_IDENTITY, - .g = VK_COMPONENT_SWIZZLE_IDENTITY, - .b = VK_COMPONENT_SWIZZLE_IDENTITY, - .a = VK_COMPONENT_SWIZZLE_IDENTITY, - }, - .subresourceRange = - { - .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, - .baseMipLevel = 0, - .levelCount = 1, - .baseArrayLayer = 0, - .layerCount = 1, - }, - }; - - image_views.resize(image_count); - for (std::size_t i = 0; i < image_count; i++) { - ci.image = images[i]; - image_views[i] = device.GetLogical().CreateImageView(ci); - } + render_semaphores.resize(image_count); + std::ranges::generate(render_semaphores, + [this] { return device.GetLogical().CreateSemaphore(); }); } void Swapchain::Destroy() { frame_index = 0; present_semaphores.clear(); - framebuffers.clear(); - image_views.clear(); swapchain.reset(); } -bool Swapchain::HasFpsUnlockChanged() const { - return current_fps_unlocked != !Settings::values.use_speed_limit.GetValue(); -} - bool Swapchain::NeedsPresentModeUpdate() const { - // Mailbox present mode is the ideal for all scenarios. If it is not available, - // A different present mode is needed to support unlocked FPS above the monitor's refresh rate. - return present_mode != VK_PRESENT_MODE_MAILBOX_KHR && HasFpsUnlockChanged(); + const auto requested_mode = ChooseSwapPresentMode(has_imm, has_mailbox, has_fifo_relaxed); + return present_mode != requested_mode; } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_swapchain.h b/src/video_core/renderer_vulkan/vk_swapchain.h index caf1ff32b..bf1ea7254 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.h +++ b/src/video_core/renderer_vulkan/vk_swapchain.h @@ -27,7 +27,7 @@ public: void Create(u32 width, u32 height, bool srgb); /// Acquires the next image in the swapchain, waits as needed. - void AcquireNextImage(); + bool AcquireNextImage(); /// Presents the rendered image to the swapchain. void Present(VkSemaphore render_semaphore); @@ -52,6 +52,11 @@ public: return is_suboptimal; } + /// Returns true when the swapchain format is in the srgb color space + bool IsSrgb() const { + return current_srgb; + } + VkExtent2D GetSize() const { return extent; } @@ -64,22 +69,34 @@ public: return image_index; } + std::size_t GetFrameIndex() const { + return frame_index; + } + VkImage GetImageIndex(std::size_t index) const { return images[index]; } - VkImageView GetImageViewIndex(std::size_t index) const { - return *image_views[index]; + VkImage CurrentImage() const { + return images[image_index]; } VkFormat GetImageViewFormat() const { return image_view_format; } + VkFormat GetImageFormat() const { + return surface_format.format; + } + VkSemaphore CurrentPresentSemaphore() const { return *present_semaphores[frame_index]; } + VkSemaphore CurrentRenderSemaphore() const { + return *render_semaphores[frame_index]; + } + u32 GetWidth() const { return width; } @@ -88,6 +105,10 @@ public: return height; } + VkExtent2D GetExtent() const { + return extent; + } + private: void CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, bool srgb); void CreateSemaphores(); @@ -95,8 +116,6 @@ private: void Destroy(); - bool HasFpsUnlockChanged() const; - bool NeedsPresentModeUpdate() const; const VkSurfaceKHR surface; @@ -107,10 +126,9 @@ private: std::size_t image_count{}; std::vector<VkImage> images; - std::vector<vk::ImageView> image_views; - std::vector<vk::Framebuffer> framebuffers; std::vector<u64> resource_ticks; std::vector<vk::Semaphore> present_semaphores; + std::vector<vk::Semaphore> render_semaphores; u32 width; u32 height; @@ -121,9 +139,12 @@ private: VkFormat image_view_format{}; VkExtent2D extent{}; VkPresentModeKHR present_mode{}; + VkSurfaceFormatKHR surface_format{}; + bool has_imm{false}; + bool has_mailbox{false}; + bool has_fifo_relaxed{false}; bool current_srgb{}; - bool current_fps_unlocked{}; bool is_outdated{}; bool is_suboptimal{}; }; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index ae15f6976..99dd1260a 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1,10 +1,11 @@ // SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-License-Identifier: GPL-3.0-or-later #include <algorithm> #include <array> #include <span> #include <vector> +#include <boost/container/small_vector.hpp> #include "common/bit_cast.h" #include "common/bit_util.h" @@ -1343,14 +1344,31 @@ void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImag void Image::DownloadMemory(VkBuffer buffer, VkDeviceSize offset, std::span<const VideoCommon::BufferImageCopy> copies) { + std::array buffer_handles{ + buffer, + }; + std::array buffer_offsets{ + offset, + }; + DownloadMemory(buffer_handles, buffer_offsets, copies); +} + +void Image::DownloadMemory(std::span<VkBuffer> buffers_span, std::span<VkDeviceSize> offsets_span, + std::span<const VideoCommon::BufferImageCopy> copies) { const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); if (is_rescaled) { ScaleDown(); } - std::vector vk_copies = TransformBufferImageCopies(copies, offset, aspect_mask); + boost::container::small_vector<VkBuffer, 1> buffers_vector{}; + boost::container::small_vector<std::vector<VkBufferImageCopy>, 1> vk_copies; + for (size_t index = 0; index < buffers_span.size(); index++) { + buffers_vector.emplace_back(buffers_span[index]); + vk_copies.emplace_back( + TransformBufferImageCopies(copies, offsets_span[index], aspect_mask)); + } scheduler->RequestOutsideRenderPassOperationContext(); - scheduler->Record([buffer, image = *original_image, aspect_mask = aspect_mask, - vk_copies](vk::CommandBuffer cmdbuf) { + scheduler->Record([buffers = std::move(buffers_vector), image = *original_image, + aspect_mask = aspect_mask, vk_copies](vk::CommandBuffer cmdbuf) { const VkImageMemoryBarrier read_barrier{ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, .pNext = nullptr, @@ -1369,6 +1387,20 @@ void Image::DownloadMemory(VkBuffer buffer, VkDeviceSize offset, .layerCount = VK_REMAINING_ARRAY_LAYERS, }, }; + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, read_barrier); + + for (size_t index = 0; index < buffers.size(); index++) { + cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, buffers[index], + vk_copies[index]); + } + + const VkMemoryBarrier memory_write_barrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, + .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, + }; const VkImageMemoryBarrier image_write_barrier{ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, .pNext = nullptr, @@ -1387,15 +1419,6 @@ void Image::DownloadMemory(VkBuffer buffer, VkDeviceSize offset, .layerCount = VK_REMAINING_ARRAY_LAYERS, }, }; - const VkMemoryBarrier memory_write_barrier{ - .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, - .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, - }; - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, - 0, read_barrier); - cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, buffer, vk_copies); cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, memory_write_barrier, nullptr, image_write_barrier); }); @@ -1405,7 +1428,13 @@ void Image::DownloadMemory(VkBuffer buffer, VkDeviceSize offset, } void Image::DownloadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) { - DownloadMemory(map.buffer, map.offset, copies); + std::array buffers{ + map.buffer, + }; + std::array offsets{ + map.offset, + }; + DownloadMemory(buffers, offsets, copies); } bool Image::IsRescaled() const noexcept { @@ -1555,8 +1584,9 @@ bool Image::NeedsScaleHelper() const { ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewInfo& info, ImageId image_id_, Image& image) - : VideoCommon::ImageViewBase{info, image.info, image_id_}, device{&runtime.device}, - image_handle{image.Handle()}, samples(ConvertSampleCount(image.info.num_samples)) { + : VideoCommon::ImageViewBase{info, image.info, image_id_, image.gpu_addr}, + device{&runtime.device}, image_handle{image.Handle()}, + samples(ConvertSampleCount(image.info.num_samples)) { using Shader::TextureType; const VkImageAspectFlags aspect_mask = ImageViewAspectMask(info); @@ -1602,7 +1632,7 @@ ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewI } vk::ImageView handle = device->GetLogical().CreateImageView(ci); if (device->HasDebuggingToolAttached()) { - handle.SetObjectNameEXT(VideoCommon::Name(*this).c_str()); + handle.SetObjectNameEXT(VideoCommon::Name(*this, gpu_addr).c_str()); } image_views[static_cast<size_t>(tex_type)] = std::move(handle); }; @@ -1643,7 +1673,7 @@ ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewI ImageView::ImageView(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, const VideoCommon::ImageViewInfo& view_info, GPUVAddr gpu_addr_) - : VideoCommon::ImageViewBase{info, view_info}, gpu_addr{gpu_addr_}, + : VideoCommon::ImageViewBase{info, view_info, gpu_addr_}, buffer_size{VideoCommon::CalculateGuestSizeInBytes(info)} {} ImageView::ImageView(TextureCacheRuntime&, const VideoCommon::NullImageViewParams& params) diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index d5ee23f8d..6f360177a 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -1,5 +1,5 @@ // SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-License-Identifier: GPL-3.0-or-later #pragma once @@ -141,6 +141,9 @@ public: void DownloadMemory(VkBuffer buffer, VkDeviceSize offset, std::span<const VideoCommon::BufferImageCopy> copies); + void DownloadMemory(std::span<VkBuffer> buffers, std::span<VkDeviceSize> offsets, + std::span<const VideoCommon::BufferImageCopy> copies); + void DownloadMemory(const StagingBufferRef& map, std::span<const VideoCommon::BufferImageCopy> copies); @@ -262,7 +265,6 @@ private: VkImage image_handle = VK_NULL_HANDLE; VkImageView render_target = VK_NULL_HANDLE; VkSampleCountFlagBits samples = VK_SAMPLE_COUNT_1_BIT; - GPUVAddr gpu_addr = 0; u32 buffer_size = 0; }; @@ -371,6 +373,7 @@ struct TextureCacheParams { using Sampler = Vulkan::Sampler; using Framebuffer = Vulkan::Framebuffer; using AsyncBuffer = Vulkan::StagingBufferRef; + using BufferType = VkBuffer; }; using TextureCache = VideoCommon::TextureCache<TextureCacheParams>; diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp index 009dab0b6..0630ebda5 100644 --- a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp +++ b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp @@ -14,13 +14,18 @@ namespace Vulkan { UpdateDescriptorQueue::UpdateDescriptorQueue(const Device& device_, Scheduler& scheduler_) : device{device_}, scheduler{scheduler_} { + payload_start = payload.data(); payload_cursor = payload.data(); } UpdateDescriptorQueue::~UpdateDescriptorQueue() = default; void UpdateDescriptorQueue::TickFrame() { - payload_cursor = payload.data(); + if (++frame_index >= FRAMES_IN_FLIGHT) { + frame_index = 0; + } + payload_start = payload.data() + frame_index * FRAME_PAYLOAD_SIZE; + payload_cursor = payload_start; } void UpdateDescriptorQueue::Acquire() { @@ -28,10 +33,10 @@ void UpdateDescriptorQueue::Acquire() { // This is the maximum number of entries a single draw call might use. static constexpr size_t MIN_ENTRIES = 0x400; - if (std::distance(payload.data(), payload_cursor) + MIN_ENTRIES >= payload.max_size()) { + if (std::distance(payload_start, payload_cursor) + MIN_ENTRIES >= FRAME_PAYLOAD_SIZE) { LOG_WARNING(Render_Vulkan, "Payload overflow, waiting for worker thread"); scheduler.WaitWorker(); - payload_cursor = payload.data(); + payload_cursor = payload_start; } upload_start = payload_cursor; } diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.h b/src/video_core/renderer_vulkan/vk_update_descriptor.h index 625bcc809..1c1a7020b 100644 --- a/src/video_core/renderer_vulkan/vk_update_descriptor.h +++ b/src/video_core/renderer_vulkan/vk_update_descriptor.h @@ -29,6 +29,12 @@ struct DescriptorUpdateEntry { }; class UpdateDescriptorQueue final { + // This should be plenty for the vast majority of cases. Most desktop platforms only + // provide up to 3 swapchain images. + static constexpr size_t FRAMES_IN_FLIGHT = 5; + static constexpr size_t FRAME_PAYLOAD_SIZE = 0x10000; + static constexpr size_t PAYLOAD_SIZE = FRAME_PAYLOAD_SIZE * FRAMES_IN_FLIGHT; + public: explicit UpdateDescriptorQueue(const Device& device_, Scheduler& scheduler_); ~UpdateDescriptorQueue(); @@ -73,9 +79,11 @@ private: const Device& device; Scheduler& scheduler; + size_t frame_index{0}; DescriptorUpdateEntry* payload_cursor = nullptr; + DescriptorUpdateEntry* payload_start = nullptr; const DescriptorUpdateEntry* upload_start = nullptr; - std::array<DescriptorUpdateEntry, 0x10000> payload; + std::array<DescriptorUpdateEntry, PAYLOAD_SIZE> payload; }; } // namespace Vulkan diff --git a/src/video_core/shader_cache.cpp b/src/video_core/shader_cache.cpp index d9482371b..c5213875b 100644 --- a/src/video_core/shader_cache.cpp +++ b/src/video_core/shader_cache.cpp @@ -228,14 +228,14 @@ const ShaderInfo* ShaderCache::MakeShaderInfo(GenericEnvironment& env, VAddr cpu auto info = std::make_unique<ShaderInfo>(); if (const std::optional<u64> cached_hash{env.Analyze()}) { info->unique_hash = *cached_hash; - info->size_bytes = env.CachedSize(); + info->size_bytes = env.CachedSizeBytes(); } else { // Slow path, not really hit on commercial games // Build a control flow graph to get the real shader size Shader::ObjectPool<Shader::Maxwell::Flow::Block> flow_block; Shader::Maxwell::Flow::CFG cfg{env, flow_block, env.StartAddress()}; info->unique_hash = env.CalculateHash(); - info->size_bytes = env.ReadSize(); + info->size_bytes = env.ReadSizeBytes(); } const size_t size_bytes{info->size_bytes}; const ShaderInfo* const result{info.get()}; diff --git a/src/video_core/shader_environment.cpp b/src/video_core/shader_environment.cpp index 574760f80..c7cb56243 100644 --- a/src/video_core/shader_environment.cpp +++ b/src/video_core/shader_environment.cpp @@ -170,15 +170,19 @@ std::optional<u64> GenericEnvironment::Analyze() { void GenericEnvironment::SetCachedSize(size_t size_bytes) { cached_lowest = start_address; cached_highest = start_address + static_cast<u32>(size_bytes); - code.resize(CachedSize()); + code.resize(CachedSizeWords()); gpu_memory->ReadBlock(program_base + cached_lowest, code.data(), code.size() * sizeof(u64)); } -size_t GenericEnvironment::CachedSize() const noexcept { - return cached_highest - cached_lowest + INST_SIZE; +size_t GenericEnvironment::CachedSizeWords() const noexcept { + return CachedSizeBytes() / INST_SIZE; } -size_t GenericEnvironment::ReadSize() const noexcept { +size_t GenericEnvironment::CachedSizeBytes() const noexcept { + return static_cast<size_t>(cached_highest) - cached_lowest + INST_SIZE; +} + +size_t GenericEnvironment::ReadSizeBytes() const noexcept { return read_highest - read_lowest + INST_SIZE; } @@ -187,7 +191,7 @@ bool GenericEnvironment::CanBeSerialized() const noexcept { } u64 GenericEnvironment::CalculateHash() const { - const size_t size{ReadSize()}; + const size_t size{ReadSizeBytes()}; const auto data{std::make_unique<char[]>(size)}; gpu_memory->ReadBlock(program_base + read_lowest, data.get(), size); return Common::CityHash64(data.get(), size); @@ -198,7 +202,7 @@ void GenericEnvironment::Dump(u64 hash) { } void GenericEnvironment::Serialize(std::ofstream& file) const { - const u64 code_size{static_cast<u64>(CachedSize())}; + const u64 code_size{static_cast<u64>(CachedSizeBytes())}; const u64 num_texture_types{static_cast<u64>(texture_types.size())}; const u64 num_texture_pixel_formats{static_cast<u64>(texture_pixel_formats.size())}; const u64 num_cbuf_values{static_cast<u64>(cbuf_values.size())}; diff --git a/src/video_core/shader_environment.h b/src/video_core/shader_environment.h index d75987a52..a0f61cbda 100644 --- a/src/video_core/shader_environment.h +++ b/src/video_core/shader_environment.h @@ -48,9 +48,11 @@ public: void SetCachedSize(size_t size_bytes); - [[nodiscard]] size_t CachedSize() const noexcept; + [[nodiscard]] size_t CachedSizeWords() const noexcept; - [[nodiscard]] size_t ReadSize() const noexcept; + [[nodiscard]] size_t CachedSizeBytes() const noexcept; + + [[nodiscard]] size_t ReadSizeBytes() const noexcept; [[nodiscard]] bool CanBeSerialized() const noexcept; diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index 1a76d4178..cb51529e4 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -250,10 +250,13 @@ bool IsPixelFormatASTC(PixelFormat format) { case PixelFormat::ASTC_2D_6X6_UNORM: case PixelFormat::ASTC_2D_6X6_SRGB: case PixelFormat::ASTC_2D_10X6_UNORM: + case PixelFormat::ASTC_2D_10X6_SRGB: case PixelFormat::ASTC_2D_10X5_UNORM: case PixelFormat::ASTC_2D_10X5_SRGB: case PixelFormat::ASTC_2D_10X10_UNORM: case PixelFormat::ASTC_2D_10X10_SRGB: + case PixelFormat::ASTC_2D_12X10_UNORM: + case PixelFormat::ASTC_2D_12X10_SRGB: case PixelFormat::ASTC_2D_12X12_UNORM: case PixelFormat::ASTC_2D_12X12_SRGB: case PixelFormat::ASTC_2D_8X6_UNORM: @@ -279,11 +282,13 @@ bool IsPixelFormatSRGB(PixelFormat format) { case PixelFormat::ASTC_2D_8X5_SRGB: case PixelFormat::ASTC_2D_5X4_SRGB: case PixelFormat::ASTC_2D_5X5_SRGB: + case PixelFormat::ASTC_2D_10X6_SRGB: case PixelFormat::ASTC_2D_10X8_SRGB: case PixelFormat::ASTC_2D_6X6_SRGB: case PixelFormat::ASTC_2D_10X5_SRGB: case PixelFormat::ASTC_2D_10X10_SRGB: case PixelFormat::ASTC_2D_12X12_SRGB: + case PixelFormat::ASTC_2D_12X10_SRGB: case PixelFormat::ASTC_2D_8X6_SRGB: case PixelFormat::ASTC_2D_6X5_SRGB: return true; diff --git a/src/video_core/surface.h b/src/video_core/surface.h index 44b79af20..0225d3287 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -95,10 +95,13 @@ enum class PixelFormat { ASTC_2D_6X6_UNORM, ASTC_2D_6X6_SRGB, ASTC_2D_10X6_UNORM, + ASTC_2D_10X6_SRGB, ASTC_2D_10X5_UNORM, ASTC_2D_10X5_SRGB, ASTC_2D_10X10_UNORM, ASTC_2D_10X10_SRGB, + ASTC_2D_12X10_UNORM, + ASTC_2D_12X10_SRGB, ASTC_2D_12X12_UNORM, ASTC_2D_12X12_SRGB, ASTC_2D_8X6_UNORM, @@ -232,10 +235,13 @@ constexpr std::array<u8, MaxPixelFormat> BLOCK_WIDTH_TABLE = {{ 6, // ASTC_2D_6X6_UNORM 6, // ASTC_2D_6X6_SRGB 10, // ASTC_2D_10X6_UNORM + 10, // ASTC_2D_10X6_SRGB 10, // ASTC_2D_10X5_UNORM 10, // ASTC_2D_10X5_SRGB 10, // ASTC_2D_10X10_UNORM 10, // ASTC_2D_10X10_SRGB + 12, // ASTC_2D_12X10_UNORM + 12, // ASTC_2D_12X10_SRGB 12, // ASTC_2D_12X12_UNORM 12, // ASTC_2D_12X12_SRGB 8, // ASTC_2D_8X6_UNORM @@ -338,10 +344,13 @@ constexpr std::array<u8, MaxPixelFormat> BLOCK_HEIGHT_TABLE = {{ 6, // ASTC_2D_6X6_UNORM 6, // ASTC_2D_6X6_SRGB 6, // ASTC_2D_10X6_UNORM + 6, // ASTC_2D_10X6_SRGB 5, // ASTC_2D_10X5_UNORM 5, // ASTC_2D_10X5_SRGB 10, // ASTC_2D_10X10_UNORM 10, // ASTC_2D_10X10_SRGB + 10, // ASTC_2D_12X10_UNORM + 10, // ASTC_2D_12X10_SRGB 12, // ASTC_2D_12X12_UNORM 12, // ASTC_2D_12X12_SRGB 6, // ASTC_2D_8X6_UNORM @@ -444,10 +453,13 @@ constexpr std::array<u8, MaxPixelFormat> BITS_PER_BLOCK_TABLE = {{ 128, // ASTC_2D_6X6_UNORM 128, // ASTC_2D_6X6_SRGB 128, // ASTC_2D_10X6_UNORM + 128, // ASTC_2D_10X6_SRGB 128, // ASTC_2D_10X5_UNORM 128, // ASTC_2D_10X5_SRGB 128, // ASTC_2D_10X10_UNORM 128, // ASTC_2D_10X10_SRGB + 128, // ASTC_2D_12X10_UNORM + 128, // ASTC_2D_12X10_SRGB 128, // ASTC_2D_12X12_UNORM 128, // ASTC_2D_12X12_SRGB 128, // ASTC_2D_8X6_UNORM diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp index 5fc2b2fec..11ced6c38 100644 --- a/src/video_core/texture_cache/format_lookup_table.cpp +++ b/src/video_core/texture_cache/format_lookup_table.cpp @@ -210,6 +210,8 @@ PixelFormat PixelFormatFromTextureInfo(TextureFormat format, ComponentType red, return PixelFormat::ASTC_2D_6X6_SRGB; case Hash(TextureFormat::ASTC_2D_10X6, UNORM, LINEAR): return PixelFormat::ASTC_2D_10X6_UNORM; + case Hash(TextureFormat::ASTC_2D_10X6, UNORM, SRGB): + return PixelFormat::ASTC_2D_10X6_SRGB; case Hash(TextureFormat::ASTC_2D_10X5, UNORM, LINEAR): return PixelFormat::ASTC_2D_10X5_UNORM; case Hash(TextureFormat::ASTC_2D_10X5, UNORM, SRGB): @@ -218,6 +220,10 @@ PixelFormat PixelFormatFromTextureInfo(TextureFormat format, ComponentType red, return PixelFormat::ASTC_2D_10X10_UNORM; case Hash(TextureFormat::ASTC_2D_10X10, UNORM, SRGB): return PixelFormat::ASTC_2D_10X10_SRGB; + case Hash(TextureFormat::ASTC_2D_12X10, UNORM, LINEAR): + return PixelFormat::ASTC_2D_12X10_UNORM; + case Hash(TextureFormat::ASTC_2D_12X10, UNORM, SRGB): + return PixelFormat::ASTC_2D_12X10_SRGB; case Hash(TextureFormat::ASTC_2D_12X12, UNORM, LINEAR): return PixelFormat::ASTC_2D_12X12_UNORM; case Hash(TextureFormat::ASTC_2D_12X12, UNORM, SRGB): diff --git a/src/video_core/texture_cache/formatter.cpp b/src/video_core/texture_cache/formatter.cpp index 30f72361d..6279d8e9e 100644 --- a/src/video_core/texture_cache/formatter.cpp +++ b/src/video_core/texture_cache/formatter.cpp @@ -46,7 +46,7 @@ std::string Name(const ImageBase& image) { return "Invalid"; } -std::string Name(const ImageViewBase& image_view) { +std::string Name(const ImageViewBase& image_view, GPUVAddr addr) { const u32 width = image_view.size.width; const u32 height = image_view.size.height; const u32 depth = image_view.size.depth; @@ -56,23 +56,25 @@ std::string Name(const ImageViewBase& image_view) { const std::string level = num_levels > 1 ? fmt::format(":{}", num_levels) : ""; switch (image_view.type) { case ImageViewType::e1D: - return fmt::format("ImageView 1D {}{}", width, level); + return fmt::format("ImageView 1D 0x{:X} {}{}", addr, width, level); case ImageViewType::e2D: - return fmt::format("ImageView 2D {}x{}{}", width, height, level); + return fmt::format("ImageView 2D 0x{:X} {}x{}{}", addr, width, height, level); case ImageViewType::Cube: - return fmt::format("ImageView Cube {}x{}{}", width, height, level); + return fmt::format("ImageView Cube 0x{:X} {}x{}{}", addr, width, height, level); case ImageViewType::e3D: - return fmt::format("ImageView 3D {}x{}x{}{}", width, height, depth, level); + return fmt::format("ImageView 3D 0x{:X} {}x{}x{}{}", addr, width, height, depth, level); case ImageViewType::e1DArray: - return fmt::format("ImageView 1DArray {}{}|{}", width, level, num_layers); + return fmt::format("ImageView 1DArray 0x{:X} {}{}|{}", addr, width, level, num_layers); case ImageViewType::e2DArray: - return fmt::format("ImageView 2DArray {}x{}{}|{}", width, height, level, num_layers); + return fmt::format("ImageView 2DArray 0x{:X} {}x{}{}|{}", addr, width, height, level, + num_layers); case ImageViewType::CubeArray: - return fmt::format("ImageView CubeArray {}x{}{}|{}", width, height, level, num_layers); + return fmt::format("ImageView CubeArray 0x{:X} {}x{}{}|{}", addr, width, height, level, + num_layers); case ImageViewType::Rect: - return fmt::format("ImageView Rect {}x{}{}", width, height, level); + return fmt::format("ImageView Rect 0x{:X} {}x{}{}", addr, width, height, level); case ImageViewType::Buffer: - return fmt::format("BufferView {}", width); + return fmt::format("BufferView 0x{:X} {}", addr, width); } return "Invalid"; } diff --git a/src/video_core/texture_cache/formatter.h b/src/video_core/texture_cache/formatter.h index f1f0a057b..9ee57a076 100644 --- a/src/video_core/texture_cache/formatter.h +++ b/src/video_core/texture_cache/formatter.h @@ -179,6 +179,8 @@ struct fmt::formatter<VideoCore::Surface::PixelFormat> : fmt::formatter<fmt::str return "ASTC_2D_6X6_SRGB"; case PixelFormat::ASTC_2D_10X6_UNORM: return "ASTC_2D_10X6_UNORM"; + case PixelFormat::ASTC_2D_10X6_SRGB: + return "ASTC_2D_10X6_SRGB"; case PixelFormat::ASTC_2D_10X5_UNORM: return "ASTC_2D_10X5_UNORM"; case PixelFormat::ASTC_2D_10X5_SRGB: @@ -187,6 +189,10 @@ struct fmt::formatter<VideoCore::Surface::PixelFormat> : fmt::formatter<fmt::str return "ASTC_2D_10X10_UNORM"; case PixelFormat::ASTC_2D_10X10_SRGB: return "ASTC_2D_10X10_SRGB"; + case PixelFormat::ASTC_2D_12X10_UNORM: + return "ASTC_2D_12X10_UNORM"; + case PixelFormat::ASTC_2D_12X10_SRGB: + return "ASTC_2D_12X10_SRGB"; case PixelFormat::ASTC_2D_12X12_UNORM: return "ASTC_2D_12X12_UNORM"; case PixelFormat::ASTC_2D_12X12_SRGB: @@ -268,7 +274,7 @@ struct RenderTargets; [[nodiscard]] std::string Name(const ImageBase& image); -[[nodiscard]] std::string Name(const ImageViewBase& image_view); +[[nodiscard]] std::string Name(const ImageViewBase& image_view, GPUVAddr addr); [[nodiscard]] std::string Name(const RenderTargets& render_targets); diff --git a/src/video_core/texture_cache/image_view_base.cpp b/src/video_core/texture_cache/image_view_base.cpp index 04fb84bfa..bcad40353 100644 --- a/src/video_core/texture_cache/image_view_base.cpp +++ b/src/video_core/texture_cache/image_view_base.cpp @@ -16,8 +16,8 @@ namespace VideoCommon { ImageViewBase::ImageViewBase(const ImageViewInfo& info, const ImageInfo& image_info, - ImageId image_id_) - : image_id{image_id_}, format{info.format}, type{info.type}, range{info.range}, + ImageId image_id_, GPUVAddr addr) + : image_id{image_id_}, gpu_addr{addr}, format{info.format}, type{info.type}, range{info.range}, size{ .width = std::max(image_info.size.width >> range.base.level, 1u), .height = std::max(image_info.size.height >> range.base.level, 1u), @@ -35,8 +35,8 @@ ImageViewBase::ImageViewBase(const ImageViewInfo& info, const ImageInfo& image_i } } -ImageViewBase::ImageViewBase(const ImageInfo& info, const ImageViewInfo& view_info) - : image_id{NULL_IMAGE_ID}, format{info.format}, type{ImageViewType::Buffer}, +ImageViewBase::ImageViewBase(const ImageInfo& info, const ImageViewInfo& view_info, GPUVAddr addr) + : image_id{NULL_IMAGE_ID}, gpu_addr{addr}, format{info.format}, type{ImageViewType::Buffer}, size{ .width = info.size.width, .height = 1, diff --git a/src/video_core/texture_cache/image_view_base.h b/src/video_core/texture_cache/image_view_base.h index 69c9776e7..a25ae1d4a 100644 --- a/src/video_core/texture_cache/image_view_base.h +++ b/src/video_core/texture_cache/image_view_base.h @@ -24,9 +24,9 @@ enum class ImageViewFlagBits : u16 { DECLARE_ENUM_FLAG_OPERATORS(ImageViewFlagBits) struct ImageViewBase { - explicit ImageViewBase(const ImageViewInfo& info, const ImageInfo& image_info, - ImageId image_id); - explicit ImageViewBase(const ImageInfo& info, const ImageViewInfo& view_info); + explicit ImageViewBase(const ImageViewInfo& info, const ImageInfo& image_info, ImageId image_id, + GPUVAddr addr); + explicit ImageViewBase(const ImageInfo& info, const ImageViewInfo& view_info, GPUVAddr addr); explicit ImageViewBase(const NullImageViewParams&); [[nodiscard]] bool IsBuffer() const noexcept { @@ -34,6 +34,7 @@ struct ImageViewBase { } ImageId image_id{}; + GPUVAddr gpu_addr = 0; PixelFormat format{}; ImageViewType type{}; SubresourceRange range; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index ed5c768d8..b5297e76b 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -1,9 +1,10 @@ -// SPDX-FileCopyrightText: 2021 yuzu Emulator Project +// SPDX-FileCopyrightText: 2023 yuzu Emulator Project // SPDX-License-Identifier: GPL-3.0-or-later #pragma once #include <unordered_set> +#include <boost/container/small_vector.hpp> #include "common/alignment.h" #include "common/settings.h" @@ -17,15 +18,10 @@ namespace VideoCommon { -using Tegra::Texture::SwizzleSource; -using Tegra::Texture::TextureType; using Tegra::Texture::TICEntry; using Tegra::Texture::TSCEntry; using VideoCore::Surface::GetFormatType; -using VideoCore::Surface::IsCopyCompatible; using VideoCore::Surface::PixelFormat; -using VideoCore::Surface::PixelFormatFromDepthFormat; -using VideoCore::Surface::PixelFormatFromRenderTargetFormat; using VideoCore::Surface::SurfaceType; using namespace Common::Literals; @@ -143,6 +139,13 @@ void TextureCache<P>::TickFrame() { runtime.TickFrame(); critical_gc = 0; ++frame_tick; + + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + for (auto& buffer : async_buffers_death_ring) { + runtime.FreeDeferredStagingBuffer(buffer); + } + async_buffers_death_ring.clear(); + } } template <class P> @@ -661,25 +664,39 @@ template <class P> void TextureCache<P>::CommitAsyncFlushes() { // This is intentionally passing the value by copy if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { - const std::span<const ImageId> download_ids = uncommitted_downloads; + auto& download_ids = uncommitted_downloads; if (download_ids.empty()) { committed_downloads.emplace_back(std::move(uncommitted_downloads)); uncommitted_downloads.clear(); - async_buffers.emplace_back(std::optional<AsyncBuffer>{}); + async_buffers.emplace_back(std::move(uncommitted_async_buffers)); + uncommitted_async_buffers.clear(); return; } size_t total_size_bytes = 0; - for (const ImageId image_id : download_ids) { - total_size_bytes += slot_images[image_id].unswizzled_size_bytes; + size_t last_async_buffer_id = uncommitted_async_buffers.size(); + bool any_none_dma = false; + for (PendingDownload& download_info : download_ids) { + if (download_info.is_swizzle) { + total_size_bytes += + Common::AlignUp(slot_images[download_info.object_id].unswizzled_size_bytes, 64); + any_none_dma = true; + download_info.async_buffer_id = last_async_buffer_id; + } } - auto download_map = runtime.DownloadStagingBuffer(total_size_bytes, true); - for (const ImageId image_id : download_ids) { - Image& image = slot_images[image_id]; - const auto copies = FullDownloadCopies(image.info); - image.DownloadMemory(download_map, copies); - download_map.offset += Common::AlignUp(image.unswizzled_size_bytes, 64); + if (any_none_dma) { + auto download_map = runtime.DownloadStagingBuffer(total_size_bytes, true); + for (const PendingDownload& download_info : download_ids) { + if (download_info.is_swizzle) { + Image& image = slot_images[download_info.object_id]; + const auto copies = FullDownloadCopies(image.info); + image.DownloadMemory(download_map, copies); + download_map.offset += Common::AlignUp(image.unswizzled_size_bytes, 64); + } + } + uncommitted_async_buffers.emplace_back(download_map); } - async_buffers.emplace_back(download_map); + async_buffers.emplace_back(std::move(uncommitted_async_buffers)); + uncommitted_async_buffers.clear(); } committed_downloads.emplace_back(std::move(uncommitted_downloads)); uncommitted_downloads.clear(); @@ -691,39 +708,57 @@ void TextureCache<P>::PopAsyncFlushes() { return; } if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { - const std::span<const ImageId> download_ids = committed_downloads.front(); + const auto& download_ids = committed_downloads.front(); if (download_ids.empty()) { committed_downloads.pop_front(); async_buffers.pop_front(); return; } - auto download_map = *async_buffers.front(); - std::span<u8> download_span = download_map.mapped_span; + auto download_map = std::move(async_buffers.front()); for (size_t i = download_ids.size(); i > 0; i--) { - const ImageBase& image = slot_images[download_ids[i - 1]]; - const auto copies = FullDownloadCopies(image.info); - download_map.offset -= Common::AlignUp(image.unswizzled_size_bytes, 64); - std::span<u8> download_span_alt = download_span.subspan(download_map.offset); - SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span_alt, - swizzle_data_buffer); + auto& download_info = download_ids[i - 1]; + auto& download_buffer = download_map[download_info.async_buffer_id]; + if (download_info.is_swizzle) { + const ImageBase& image = slot_images[download_info.object_id]; + const auto copies = FullDownloadCopies(image.info); + download_buffer.offset -= Common::AlignUp(image.unswizzled_size_bytes, 64); + std::span<u8> download_span = + download_buffer.mapped_span.subspan(download_buffer.offset); + SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span, + swizzle_data_buffer); + } else { + const BufferDownload& buffer_info = slot_buffer_downloads[download_info.object_id]; + std::span<u8> download_span = + download_buffer.mapped_span.subspan(download_buffer.offset); + gpu_memory->WriteBlockUnsafe(buffer_info.address, download_span.data(), + buffer_info.size); + slot_buffer_downloads.erase(download_info.object_id); + } + } + for (auto& download_buffer : download_map) { + async_buffers_death_ring.emplace_back(download_buffer); } - runtime.FreeDeferredStagingBuffer(download_map); committed_downloads.pop_front(); async_buffers.pop_front(); } else { - const std::span<const ImageId> download_ids = committed_downloads.front(); + const auto& download_ids = committed_downloads.front(); if (download_ids.empty()) { committed_downloads.pop_front(); return; } size_t total_size_bytes = 0; - for (const ImageId image_id : download_ids) { - total_size_bytes += slot_images[image_id].unswizzled_size_bytes; + for (const PendingDownload& download_info : download_ids) { + if (download_info.is_swizzle) { + total_size_bytes += slot_images[download_info.object_id].unswizzled_size_bytes; + } } auto download_map = runtime.DownloadStagingBuffer(total_size_bytes); const size_t original_offset = download_map.offset; - for (const ImageId image_id : download_ids) { - Image& image = slot_images[image_id]; + for (const PendingDownload& download_info : download_ids) { + if (!download_info.is_swizzle) { + continue; + } + Image& image = slot_images[download_info.object_id]; const auto copies = FullDownloadCopies(image.info); image.DownloadMemory(download_map, copies); download_map.offset += image.unswizzled_size_bytes; @@ -732,8 +767,11 @@ void TextureCache<P>::PopAsyncFlushes() { runtime.Finish(); download_map.offset = original_offset; std::span<u8> download_span = download_map.mapped_span; - for (const ImageId image_id : download_ids) { - const ImageBase& image = slot_images[image_id]; + for (const PendingDownload& download_info : download_ids) { + if (!download_info.is_swizzle) { + continue; + } + const ImageBase& image = slot_images[download_info.object_id]; const auto copies = FullDownloadCopies(image.info); SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span, swizzle_data_buffer); @@ -834,6 +872,33 @@ std::pair<typename TextureCache<P>::Image*, BufferImageCopy> TextureCache<P>::Dm } template <class P> +void TextureCache<P>::DownloadImageIntoBuffer(typename TextureCache<P>::Image* image, + typename TextureCache<P>::BufferType buffer, + size_t buffer_offset, + std::span<const VideoCommon::BufferImageCopy> copies, + GPUVAddr address, size_t size) { + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + const BufferDownload new_buffer_download{address, size}; + auto slot = slot_buffer_downloads.insert(new_buffer_download); + const PendingDownload new_download{false, uncommitted_async_buffers.size(), slot}; + uncommitted_downloads.emplace_back(new_download); + auto download_map = runtime.DownloadStagingBuffer(size, true); + uncommitted_async_buffers.emplace_back(download_map); + std::array buffers{ + buffer, + download_map.buffer, + }; + std::array<u64, 2> buffer_offsets{ + buffer_offset, + download_map.offset, + }; + image->DownloadMemory(buffers, buffer_offsets, copies); + } else { + image->DownloadMemory(buffer, buffer_offset, copies); + } +} + +template <class P> void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) { if (False(image.flags & ImageFlagBits::CpuModified)) { // Only upload modified images @@ -1294,6 +1359,12 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA ScaleDown(new_image); } + std::ranges::sort(overlap_ids, [this](const ImageId lhs, const ImageId rhs) { + const ImageBase& lhs_image = slot_images[lhs]; + const ImageBase& rhs_image = slot_images[rhs]; + return lhs_image.modification_tick < rhs_image.modification_tick; + }); + for (const ImageId overlap_id : overlap_ids) { Image& overlap = slot_images[overlap_id]; if (True(overlap.flags & ImageFlagBits::GpuModified)) { @@ -2209,7 +2280,8 @@ void TextureCache<P>::BindRenderTarget(ImageViewId* old_id, ImageViewId new_id) if (new_id) { const ImageViewBase& old_view = slot_image_views[new_id]; if (True(old_view.flags & ImageViewFlagBits::PreemtiveDownload)) { - uncommitted_downloads.push_back(old_view.image_id); + const PendingDownload new_download{true, 0, old_view.image_id}; + uncommitted_downloads.emplace_back(new_download); } } *old_id = new_id; diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index 5a5b4179c..758b7e212 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2021 yuzu Emulator Project +// SPDX-FileCopyrightText: 2023 yuzu Emulator Project // SPDX-License-Identifier: GPL-3.0-or-later #pragma once @@ -40,14 +40,9 @@ struct ChannelState; namespace VideoCommon { -using Tegra::Texture::SwizzleSource; using Tegra::Texture::TICEntry; using Tegra::Texture::TSCEntry; -using VideoCore::Surface::GetFormatType; -using VideoCore::Surface::IsCopyCompatible; using VideoCore::Surface::PixelFormat; -using VideoCore::Surface::PixelFormatFromDepthFormat; -using VideoCore::Surface::PixelFormatFromRenderTargetFormat; using namespace Common::Literals; struct ImageViewInOut { @@ -119,6 +114,7 @@ class TextureCache : public VideoCommon::ChannelSetupCaches<TextureCacheChannelI using Sampler = typename P::Sampler; using Framebuffer = typename P::Framebuffer; using AsyncBuffer = typename P::AsyncBuffer; + using BufferType = typename P::BufferType; struct BlitImages { ImageId dst_id; @@ -215,6 +211,10 @@ public: const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& buffer_operand, const Tegra::DMA::ImageOperand& image_operand, ImageId image_id, bool modifies_image); + void DownloadImageIntoBuffer(Image* image, BufferType buffer, size_t buffer_offset, + std::span<const VideoCommon::BufferImageCopy> copies, + GPUVAddr address = 0, size_t size = 0); + /// Return true when a CPU region is modified from the GPU [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); @@ -424,17 +424,32 @@ private: u64 critical_memory; size_t critical_gc; + struct BufferDownload { + GPUVAddr address; + size_t size; + }; + + struct PendingDownload { + bool is_swizzle; + size_t async_buffer_id; + SlotId object_id; + }; + SlotVector<Image> slot_images; SlotVector<ImageMapView> slot_map_views; SlotVector<ImageView> slot_image_views; SlotVector<ImageAlloc> slot_image_allocs; SlotVector<Sampler> slot_samplers; SlotVector<Framebuffer> slot_framebuffers; + SlotVector<BufferDownload> slot_buffer_downloads; // TODO: This data structure is not optimal and it should be reworked - std::vector<ImageId> uncommitted_downloads; - std::deque<std::vector<ImageId>> committed_downloads; - std::deque<std::optional<AsyncBuffer>> async_buffers; + + std::vector<PendingDownload> uncommitted_downloads; + std::deque<std::vector<PendingDownload>> committed_downloads; + std::vector<AsyncBuffer> uncommitted_async_buffers; + std::deque<std::vector<AsyncBuffer>> async_buffers; + std::deque<AsyncBuffer> async_buffers_death_ring; struct LRUItemParams { using ObjectType = ImageId; diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 6f288b3f8..6ffca2af2 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -617,7 +617,9 @@ bool Device::ShouldBoostClocks() const { const bool is_steam_deck = vendor_id == 0x1002 && device_id == 0x163F; - return validated_driver && !is_steam_deck; + const bool is_debugging = this->HasDebuggingToolAttached(); + + return validated_driver && !is_steam_deck && !is_debugging; } bool Device::GetSuitability(bool requires_swapchain) { diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 7d5018151..5f1c63ff9 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -10,6 +10,7 @@ #include <vector> #include "common/common_types.h" +#include "common/settings.h" #include "video_core/vulkan_common/vulkan_wrapper.h" // Define all features which may be used by the implementation here. @@ -510,7 +511,7 @@ public: /// Returns true when a known debugging tool is attached. bool HasDebuggingToolAttached() const { - return has_renderdoc || has_nsight_graphics; + return has_renderdoc || has_nsight_graphics || Settings::values.renderer_debug.GetValue(); } /// Returns true when the device does not properly support cube compatibility. diff --git a/src/video_core/vulkan_common/vulkan_surface.cpp b/src/video_core/vulkan_common/vulkan_surface.cpp index fa9bafa20..c34599365 100644 --- a/src/video_core/vulkan_common/vulkan_surface.cpp +++ b/src/video_core/vulkan_common/vulkan_surface.cpp @@ -23,10 +23,10 @@ namespace Vulkan { -vk::SurfaceKHR CreateSurface(const vk::Instance& instance, - const Core::Frontend::EmuWindow& emu_window) { +vk::SurfaceKHR CreateSurface( + const vk::Instance& instance, + [[maybe_unused]] const Core::Frontend::EmuWindow::WindowSystemInfo& window_info) { [[maybe_unused]] const vk::InstanceDispatch& dld = instance.Dispatch(); - [[maybe_unused]] const auto& window_info = emu_window.GetWindowInfo(); VkSurfaceKHR unsafe_surface = nullptr; #ifdef _WIN32 diff --git a/src/video_core/vulkan_common/vulkan_surface.h b/src/video_core/vulkan_common/vulkan_surface.h index 5725143e6..5e18c06c4 100644 --- a/src/video_core/vulkan_common/vulkan_surface.h +++ b/src/video_core/vulkan_common/vulkan_surface.h @@ -3,15 +3,12 @@ #pragma once +#include "core/frontend/emu_window.h" #include "video_core/vulkan_common/vulkan_wrapper.h" -namespace Core::Frontend { -class EmuWindow; -} - namespace Vulkan { -[[nodiscard]] vk::SurfaceKHR CreateSurface(const vk::Instance& instance, - const Core::Frontend::EmuWindow& emu_window); +[[nodiscard]] vk::SurfaceKHR CreateSurface( + const vk::Instance& instance, const Core::Frontend::EmuWindow::WindowSystemInfo& window_info); } // namespace Vulkan |