summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorReinUsesLisp <reinuseslisp@airmail.cc>2021-01-17 00:48:58 +0100
committerReinUsesLisp <reinuseslisp@airmail.cc>2021-02-13 06:17:22 +0100
commit82c2601555b59a94d7160f2fd686cb63d32dd423 (patch)
treecd0ecd865945452fa589b572de614fc487f2f96a
parentvulkan_common: Expose interop and headless devices (diff)
downloadyuzu-82c2601555b59a94d7160f2fd686cb63d32dd423.tar
yuzu-82c2601555b59a94d7160f2fd686cb63d32dd423.tar.gz
yuzu-82c2601555b59a94d7160f2fd686cb63d32dd423.tar.bz2
yuzu-82c2601555b59a94d7160f2fd686cb63d32dd423.tar.lz
yuzu-82c2601555b59a94d7160f2fd686cb63d32dd423.tar.xz
yuzu-82c2601555b59a94d7160f2fd686cb63d32dd423.tar.zst
yuzu-82c2601555b59a94d7160f2fd686cb63d32dd423.zip
-rw-r--r--src/video_core/CMakeLists.txt6
-rw-r--r--src/video_core/buffer_cache/buffer_block.h62
-rw-r--r--src/video_core/buffer_cache/buffer_cache.cpp13
-rw-r--r--src/video_core/buffer_cache/buffer_cache.h1598
-rw-r--r--src/video_core/buffer_cache/map_interval.cpp33
-rw-r--r--src/video_core/buffer_cache/map_interval.h93
-rw-r--r--src/video_core/command_classes/vic.cpp3
-rw-r--r--src/video_core/dirty_flags.cpp29
-rw-r--r--src/video_core/dirty_flags.h8
-rw-r--r--src/video_core/dma_pusher.cpp2
-rw-r--r--src/video_core/engines/kepler_compute.cpp1
-rw-r--r--src/video_core/engines/kepler_memory.cpp1
-rw-r--r--src/video_core/engines/maxwell_3d.cpp17
-rw-r--r--src/video_core/engines/maxwell_3d.h12
-rw-r--r--src/video_core/engines/maxwell_dma.cpp3
-rw-r--r--src/video_core/fence_manager.h4
-rw-r--r--src/video_core/host_shaders/CMakeLists.txt1
-rw-r--r--src/video_core/host_shaders/vulkan_quad_array.comp28
-rw-r--r--src/video_core/host_shaders/vulkan_uint8.comp9
-rw-r--r--src/video_core/rasterizer_interface.h5
-rw-r--r--src/video_core/renderer_opengl/gl_buffer_cache.cpp257
-rw-r--r--src/video_core/renderer_opengl/gl_buffer_cache.h168
-rw-r--r--src/video_core/renderer_opengl/gl_device.cpp16
-rw-r--r--src/video_core/renderer_opengl/gl_device.h8
-rw-r--r--src/video_core/renderer_opengl/gl_fence_manager.cpp2
-rw-r--r--src/video_core/renderer_opengl/gl_fence_manager.h9
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.cpp574
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.h73
-rw-r--r--src/video_core/renderer_opengl/gl_resource_manager.cpp6
-rw-r--r--src/video_core/renderer_opengl/gl_resource_manager.h3
-rw-r--r--src/video_core/renderer_opengl/gl_shader_decompiler.cpp61
-rw-r--r--src/video_core/renderer_opengl/gl_shader_decompiler.h2
-rw-r--r--src/video_core/renderer_opengl/gl_state_tracker.cpp25
-rw-r--r--src/video_core/renderer_opengl/gl_state_tracker.h32
-rw-r--r--src/video_core/renderer_opengl/gl_stream_buffer.cpp94
-rw-r--r--src/video_core/renderer_opengl/gl_stream_buffer.h60
-rw-r--r--src/video_core/renderer_opengl/gl_texture_cache.cpp19
-rw-r--r--src/video_core/renderer_opengl/gl_texture_cache.h21
-rw-r--r--src/video_core/renderer_opengl/renderer_opengl.cpp95
-rw-r--r--src/video_core/renderer_opengl/renderer_opengl.h14
-rw-r--r--src/video_core/renderer_opengl/util_shaders.cpp18
-rw-r--r--src/video_core/renderer_opengl/util_shaders.h3
-rw-r--r--src/video_core/renderer_vulkan/maxwell_to_vk.cpp6
-rw-r--r--src/video_core/renderer_vulkan/maxwell_to_vk.h2
-rw-r--r--src/video_core/renderer_vulkan/renderer_vulkan.cpp2
-rw-r--r--src/video_core/renderer_vulkan/renderer_vulkan.h3
-rw-r--r--src/video_core/renderer_vulkan/vk_blit_screen.cpp9
-rw-r--r--src/video_core/renderer_vulkan/vk_buffer_cache.cpp366
-rw-r--r--src/video_core/renderer_vulkan/vk_buffer_cache.h107
-rw-r--r--src/video_core/renderer_vulkan/vk_compute_pass.cpp97
-rw-r--r--src/video_core/renderer_vulkan/vk_compute_pass.h24
-rw-r--r--src/video_core/renderer_vulkan/vk_fence_manager.cpp4
-rw-r--r--src/video_core/renderer_vulkan/vk_fence_manager.h11
-rw-r--r--src/video_core/renderer_vulkan/vk_rasterizer.cpp664
-rw-r--r--src/video_core/renderer_vulkan/vk_rasterizer.h64
-rw-r--r--src/video_core/renderer_vulkan/vk_scheduler.cpp14
-rw-r--r--src/video_core/renderer_vulkan/vk_scheduler.h26
-rw-r--r--src/video_core/renderer_vulkan/vk_shader_decompiler.cpp3
-rw-r--r--src/video_core/renderer_vulkan/vk_shader_decompiler.h20
-rw-r--r--src/video_core/renderer_vulkan/vk_state_tracker.cpp9
-rw-r--r--src/video_core/renderer_vulkan/vk_texture_cache.cpp131
-rw-r--r--src/video_core/renderer_vulkan/vk_texture_cache.h26
-rw-r--r--src/video_core/shader/async_shaders.h9
-rw-r--r--src/video_core/shader/decode/other.cpp1
-rw-r--r--src/video_core/shader/shader_ir.h5
-rw-r--r--src/video_core/texture_cache/texture_cache.h28
-rw-r--r--src/video_core/vulkan_common/vulkan_memory_allocator.h2
67 files changed, 2514 insertions, 2607 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index dd4c29ed3..9b931976a 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -2,10 +2,8 @@ add_subdirectory(host_shaders)
add_library(video_core STATIC
buffer_cache/buffer_base.h
- buffer_cache/buffer_block.h
+ buffer_cache/buffer_cache.cpp
buffer_cache/buffer_cache.h
- buffer_cache/map_interval.cpp
- buffer_cache/map_interval.h
cdma_pusher.cpp
cdma_pusher.h
command_classes/codecs/codec.cpp
@@ -152,8 +150,6 @@ add_library(video_core STATIC
renderer_vulkan/vk_staging_buffer_pool.h
renderer_vulkan/vk_state_tracker.cpp
renderer_vulkan/vk_state_tracker.h
- renderer_vulkan/vk_stream_buffer.cpp
- renderer_vulkan/vk_stream_buffer.h
renderer_vulkan/vk_swapchain.cpp
renderer_vulkan/vk_swapchain.h
renderer_vulkan/vk_texture_cache.cpp
diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h
deleted file mode 100644
index e9306194a..000000000
--- a/src/video_core/buffer_cache/buffer_block.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright 2019 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include "common/common_types.h"
-
-namespace VideoCommon {
-
-class BufferBlock {
-public:
- [[nodiscard]] bool Overlaps(VAddr start, VAddr end) const {
- return (cpu_addr < end) && (cpu_addr_end > start);
- }
-
- [[nodiscard]] bool IsInside(VAddr other_start, VAddr other_end) const {
- return cpu_addr <= other_start && other_end <= cpu_addr_end;
- }
-
- [[nodiscard]] std::size_t Offset(VAddr in_addr) const {
- return static_cast<std::size_t>(in_addr - cpu_addr);
- }
-
- [[nodiscard]] VAddr CpuAddr() const {
- return cpu_addr;
- }
-
- [[nodiscard]] VAddr CpuAddrEnd() const {
- return cpu_addr_end;
- }
-
- void SetCpuAddr(VAddr new_addr) {
- cpu_addr = new_addr;
- cpu_addr_end = new_addr + size;
- }
-
- [[nodiscard]] std::size_t Size() const {
- return size;
- }
-
- [[nodiscard]] u64 Epoch() const {
- return epoch;
- }
-
- void SetEpoch(u64 new_epoch) {
- epoch = new_epoch;
- }
-
-protected:
- explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} {
- SetCpuAddr(cpu_addr_);
- }
-
-private:
- VAddr cpu_addr{};
- VAddr cpu_addr_end{};
- std::size_t size{};
- u64 epoch{};
-};
-
-} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
new file mode 100644
index 000000000..ab32294c8
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -0,0 +1,13 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/microprofile.h"
+
+namespace VideoCommon {
+
+// One MICROPROFILE_DEFINE per MICROPROFILE_DECLARE found in buffer_cache.h.
+// Keep both lists in sync when adding or removing an entry.
+MICROPROFILE_DEFINE(GPU_PrepareBuffers, "GPU", "Prepare buffers", MP_RGB(224, 128, 128));
+MICROPROFILE_DEFINE(GPU_BindUploadBuffers, "GPU", "Bind and upload buffers", MP_RGB(224, 128, 128));
+MICROPROFILE_DEFINE(GPU_DownloadMemory, "GPU", "Download buffers", MP_RGB(224, 128, 128));
+
+} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 83b9ee871..e4f3c8e35 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -4,591 +4,1231 @@
#pragma once
-#include <list>
+#include <algorithm>
+#include <array>
+#include <deque>
#include <memory>
#include <mutex>
+#include <span>
#include <unordered_map>
-#include <unordered_set>
-#include <utility>
#include <vector>
#include <boost/container/small_vector.hpp>
-#include <boost/icl/interval_set.hpp>
-#include <boost/intrusive/set.hpp>
-#include "common/alignment.h"
-#include "common/assert.h"
#include "common/common_types.h"
-#include "common/logging/log.h"
-#include "core/core.h"
+#include "common/div_ceil.h"
+#include "common/microprofile.h"
+#include "common/scope_exit.h"
#include "core/memory.h"
#include "core/settings.h"
-#include "video_core/buffer_cache/buffer_block.h"
-#include "video_core/buffer_cache/map_interval.h"
+#include "video_core/buffer_cache/buffer_base.h"
+#include "video_core/delayed_destruction_ring.h"
+#include "video_core/dirty_flags.h"
+#include "video_core/engines/kepler_compute.h"
+#include "video_core/engines/maxwell_3d.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
+#include "video_core/texture_cache/slot_vector.h"
+#include "video_core/texture_cache/types.h"
namespace VideoCommon {
-template <typename Buffer, typename BufferType, typename StreamBuffer>
+MICROPROFILE_DECLARE(GPU_PrepareBuffers);
+MICROPROFILE_DECLARE(GPU_BindUploadBuffers);
+MICROPROFILE_DECLARE(GPU_DownloadMemory);
+
+using BufferId = SlotId;
+
+constexpr u32 NUM_VERTEX_BUFFERS = 32;
+constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4;
+constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18;
+constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8;
+constexpr u32 NUM_STORAGE_BUFFERS = 16;
+constexpr u32 NUM_STAGES = 5;
+
+template <typename P>
class BufferCache {
- using IntervalSet = boost::icl::interval_set<VAddr>;
- using IntervalType = typename IntervalSet::interval_type;
- using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>;
+ // Page size for caching purposes.
+ // This is unrelated to the CPU page size and it can be changed as it seems optimal.
+ static constexpr u32 PAGE_BITS = 16;
+ static constexpr u64 PAGE_SIZE = u64{1} << PAGE_BITS;
- static constexpr u64 WRITE_PAGE_BIT = 11;
- static constexpr u64 BLOCK_PAGE_BITS = 21;
- static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS;
+ static constexpr bool IS_OPENGL = P::IS_OPENGL;
+ static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS =
+ P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS;
+ static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT =
+ P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT;
+ static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX;
+ static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX;
+ static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS;
-public:
- struct BufferInfo {
- BufferType handle;
- u64 offset;
- u64 address;
+ static constexpr BufferId NULL_BUFFER_ID{0};
+
+ using Maxwell = Tegra::Engines::Maxwell3D::Regs;
+
+ using Runtime = typename P::Runtime;
+ using Buffer = typename P::Buffer;
+
+ struct Empty {};
+
+ struct Binding {
+ VAddr cpu_addr{};
+ u32 size{};
+ BufferId buffer_id;
};
- BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
- bool is_written = false, bool use_fast_cbuf = false) {
- std::lock_guard lock{mutex};
+ static constexpr Binding NULL_BINDING{
+ .cpu_addr = 0,
+ .size = 0,
+ .buffer_id = NULL_BUFFER_ID,
+ };
- const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
- if (!cpu_addr) {
- return GetEmptyBuffer(size);
- }
+public:
+ static constexpr size_t SKIP_CACHE_SIZE = 4096;
- // Cache management is a big overhead, so only cache entries with a given size.
- // TODO: Figure out which size is the best for given games.
- constexpr std::size_t max_stream_size = 0x800;
- if (use_fast_cbuf || size < max_stream_size) {
- if (!is_written && !IsRegionWritten(*cpu_addr, *cpu_addr + size - 1)) {
- const bool is_granular = gpu_memory.IsGranularRange(gpu_addr, size);
- if (use_fast_cbuf) {
- u8* dest;
- if (is_granular) {
- dest = gpu_memory.GetPointer(gpu_addr);
- } else {
- staging_buffer.resize(size);
- dest = staging_buffer.data();
- gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size);
- }
- return ConstBufferUpload(dest, size);
- }
- if (is_granular) {
- u8* const host_ptr = gpu_memory.GetPointer(gpu_addr);
- return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) {
- std::memcpy(dest, host_ptr, size);
- });
- } else {
- return StreamBufferUpload(size, alignment, [this, gpu_addr, size](u8* dest) {
- gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size);
- });
- }
- }
- }
+ explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
+ Tegra::Engines::Maxwell3D& maxwell3d_,
+ Tegra::Engines::KeplerCompute& kepler_compute_,
+ Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
+ Runtime& runtime_);
- Buffer* const block = GetBlock(*cpu_addr, size);
- MapInterval* const map = MapAddress(block, gpu_addr, *cpu_addr, size);
- if (!map) {
- return GetEmptyBuffer(size);
- }
- if (is_written) {
- map->MarkAsModified(true, GetModifiedTicks());
- if (Settings::IsGPULevelHigh() &&
- Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
- MarkForAsyncFlush(map);
- }
- if (!map->is_written) {
- map->is_written = true;
- MarkRegionAsWritten(map->start, map->end - 1);
- }
- }
+ void TickFrame();
- return BufferInfo{block->Handle(), block->Offset(*cpu_addr), block->Address()};
- }
+ void WriteMemory(VAddr cpu_addr, u64 size);
- /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
- BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
- std::size_t alignment = 4) {
- std::lock_guard lock{mutex};
- return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) {
- std::memcpy(dest, raw_pointer, size);
- });
- }
+ void CachedWriteMemory(VAddr cpu_addr, u64 size);
- /// Prepares the buffer cache for data uploading
- /// @param max_size Maximum number of bytes that will be uploaded
- /// @return True when a stream buffer invalidation was required, false otherwise
- void Map(std::size_t max_size) {
- std::lock_guard lock{mutex};
+ void DownloadMemory(VAddr cpu_addr, u64 size);
- std::tie(buffer_ptr, buffer_offset_base) = stream_buffer.Map(max_size, 4);
- buffer_offset = buffer_offset_base;
- }
+ void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size);
- /// Finishes the upload stream
- void Unmap() {
- std::lock_guard lock{mutex};
- stream_buffer.Unmap(buffer_offset - buffer_offset_base);
- }
+ void UpdateGraphicsBuffers(bool is_indexed);
- /// Function called at the end of each frame, inteded for deferred operations
- void TickFrame() {
- ++epoch;
+ void UpdateComputeBuffers();
- while (!pending_destruction.empty()) {
- // Delay at least 4 frames before destruction.
- // This is due to triple buffering happening on some drivers.
- static constexpr u64 epochs_to_destroy = 5;
- if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) {
- break;
- }
- pending_destruction.pop();
- }
- }
+ void BindHostGeometryBuffers(bool is_indexed);
- /// Write any cached resources overlapping the specified region back to memory
- void FlushRegion(VAddr addr, std::size_t size) {
- std::lock_guard lock{mutex};
+ void BindHostStageBuffers(size_t stage);
- VectorMapInterval objects = GetMapsInRange(addr, size);
- std::sort(objects.begin(), objects.end(),
- [](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; });
- for (MapInterval* object : objects) {
- if (object->is_modified && object->is_registered) {
- mutex.unlock();
- FlushMap(object);
- mutex.lock();
- }
- }
- }
+ void BindHostComputeBuffers();
- bool MustFlushRegion(VAddr addr, std::size_t size) {
- std::lock_guard lock{mutex};
+ void SetEnabledUniformBuffers(size_t stage, u32 enabled);
- const VectorMapInterval objects = GetMapsInRange(addr, size);
- return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) {
- return map->is_modified && map->is_registered;
- });
- }
+ void SetEnabledComputeUniformBuffers(u32 enabled);
- /// Mark the specified region as being invalidated
- void InvalidateRegion(VAddr addr, u64 size) {
- std::lock_guard lock{mutex};
+ void UnbindGraphicsStorageBuffers(size_t stage);
- for (auto& object : GetMapsInRange(addr, size)) {
- if (object->is_registered) {
- Unregister(object);
- }
- }
- }
+ void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
+ bool is_written);
- void OnCPUWrite(VAddr addr, std::size_t size) {
- std::lock_guard lock{mutex};
+ void UnbindComputeStorageBuffers();
- for (MapInterval* object : GetMapsInRange(addr, size)) {
- if (object->is_memory_marked && object->is_registered) {
- UnmarkMemory(object);
- object->is_sync_pending = true;
- marked_for_unregister.emplace_back(object);
- }
- }
- }
+ void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
+ bool is_written);
- void SyncGuestHost() {
- std::lock_guard lock{mutex};
+ void FlushCachedWrites();
- for (auto& object : marked_for_unregister) {
- if (object->is_registered) {
- object->is_sync_pending = false;
- Unregister(object);
- }
+ /// Return true when there are uncommitted buffers to be downloaded
+ [[nodiscard]] bool HasUncommittedFlushes() const noexcept;
+
+ /// Return true when the caller should wait for async downloads
+ [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept;
+
+ /// Commit asynchronous downloads
+ void CommitAsyncFlushes();
+
+ /// Pop asynchronous downloads
+ void PopAsyncFlushes();
+
+ /// Return true when a CPU region is modified from the GPU
+ [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
+
+ std::mutex mutex;
+
+private:
+ /// Calls func(index) once for every set bit of enabled_mask, from the lowest set bit upwards.
+ /// @param enabled_mask Bit mask to walk; taken by value and consumed by the loop
+ /// @param func Callable invoked with the zero-based position of each set bit
+ template <typename Func>
+ static void ForEachEnabledBit(u32 enabled_mask, Func&& func) {
+ for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) {
+ // Skip the run of cleared bits in one step so bit 0 of the mask is the next set bit
+ const int disabled_bits = std::countr_zero(enabled_mask);
+ index += disabled_bits;
+ enabled_mask >>= disabled_bits;
+ func(index);
+ }
- marked_for_unregister.clear();
}
- void CommitAsyncFlushes() {
- if (uncommitted_flushes) {
- auto commit_list = std::make_shared<std::list<MapInterval*>>();
- for (MapInterval* map : *uncommitted_flushes) {
- if (map->is_registered && map->is_modified) {
- // TODO(Blinkhawk): Implement backend asynchronous flushing
- // AsyncFlushMap(map)
- commit_list->push_back(map);
- }
- }
- if (!commit_list->empty()) {
- committed_flushes.push_back(commit_list);
- } else {
- committed_flushes.emplace_back();
+ /// Calls func(buffer_id, buffer) for each buffer that page_table references within the cache
+ /// pages spanned by [cpu_addr, cpu_addr + size).
+ /// @param cpu_addr Start address of the range
+ /// @param size Size of the range in bytes
+ /// @param func Callable receiving the BufferId and a reference to the Buffer
+ template <typename Func>
+ void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) {
+ const u64 page_end = Common::DivCeil(cpu_addr + size, PAGE_SIZE);
+ for (u64 page = cpu_addr >> PAGE_BITS; page < page_end;) {
+ const BufferId buffer_id = page_table[page];
+ if (!buffer_id) {
+ // A falsy id is treated as "nothing on this page"; move on to the next page
+ ++page;
+ continue;
}
- } else {
- committed_flushes.emplace_back();
+ Buffer& buffer = slot_buffers[buffer_id];
+ func(buffer_id, buffer);
+
+ // Resume at the first page boundary at or past the end of the buffer just visited
+ const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
+ page = Common::DivCeil(end_addr, PAGE_SIZE);
}
- uncommitted_flushes.reset();
}
- bool ShouldWaitAsyncFlushes() const {
- return !committed_flushes.empty() && committed_flushes.front() != nullptr;
+ /// Returns true when cpu_addr and cpu_addr + size are equal once the bits set in
+ /// Core::Memory::PAGE_MASK have been cleared from both.
+ // NOTE(review): cpu_addr + size is one past the last byte of the range. Assuming PAGE_MASK is
+ // the in-page offset mask, a range that ends exactly on a page boundary compares unequal here
+ // even though all of its bytes share one page - confirm this conservative result is intended.
+ static bool IsRangeGranular(VAddr cpu_addr, size_t size) {
+ return (cpu_addr & ~Core::Memory::PAGE_MASK) ==
+ ((cpu_addr + size) & ~Core::Memory::PAGE_MASK);
}
- bool HasUncommittedFlushes() const {
- return uncommitted_flushes != nullptr;
- }
+ void BindHostIndexBuffer();
- void PopAsyncFlushes() {
- if (committed_flushes.empty()) {
- return;
- }
- auto& flush_list = committed_flushes.front();
- if (!flush_list) {
- committed_flushes.pop_front();
- return;
- }
- for (MapInterval* map : *flush_list) {
- if (map->is_registered) {
- // TODO(Blinkhawk): Replace this for reading the asynchronous flush
- FlushMap(map);
- }
- }
- committed_flushes.pop_front();
- }
+ void BindHostVertexBuffers();
- virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0;
+ void BindHostGraphicsUniformBuffers(size_t stage);
-protected:
- explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
- Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
- StreamBuffer& stream_buffer_)
- : rasterizer{rasterizer_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_},
- stream_buffer{stream_buffer_} {}
+ void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind);
- ~BufferCache() = default;
+ void BindHostGraphicsStorageBuffers(size_t stage);
- virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0;
+ void BindHostTransformFeedbackBuffers();
- virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) {
- return {};
- }
+ void BindHostComputeUniformBuffers();
- /// Register an object into the cache
- MapInterval* Register(MapInterval new_map, bool inherit_written = false) {
- const VAddr cpu_addr = new_map.start;
- if (!cpu_addr) {
- LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}",
- new_map.gpu_addr);
- return nullptr;
- }
- const std::size_t size = new_map.end - new_map.start;
- new_map.is_registered = true;
- rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1);
- new_map.is_memory_marked = true;
- if (inherit_written) {
- MarkRegionAsWritten(new_map.start, new_map.end - 1);
- new_map.is_written = true;
- }
- MapInterval* const storage = mapped_addresses_allocator.Allocate();
- *storage = new_map;
- mapped_addresses.insert(*storage);
- return storage;
- }
+ void BindHostComputeStorageBuffers();
- void UnmarkMemory(MapInterval* map) {
- if (!map->is_memory_marked) {
- return;
- }
- const std::size_t size = map->end - map->start;
- rasterizer.UpdatePagesCachedCount(map->start, size, -1);
- map->is_memory_marked = false;
- }
-
- /// Unregisters an object from the cache
- void Unregister(MapInterval* map) {
- UnmarkMemory(map);
- map->is_registered = false;
- if (map->is_sync_pending) {
- map->is_sync_pending = false;
- marked_for_unregister.remove(map);
+ void DoUpdateGraphicsBuffers(bool is_indexed);
+
+ void DoUpdateComputeBuffers();
+
+ void UpdateIndexBuffer();
+
+ void UpdateVertexBuffers();
+
+ void UpdateVertexBuffer(u32 index);
+
+ void UpdateUniformBuffers(size_t stage);
+
+ void UpdateStorageBuffers(size_t stage);
+
+ void UpdateTransformFeedbackBuffers();
+
+ void UpdateTransformFeedbackBuffer(u32 index);
+
+ void UpdateComputeUniformBuffers();
+
+ void UpdateComputeStorageBuffers();
+
+ void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size);
+
+ [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size);
+
+ [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size);
+
+ void Register(BufferId buffer_id);
+
+ void Unregister(BufferId buffer_id);
+
+ template <bool insert>
+ void ChangeRegister(BufferId buffer_id);
+
+ void SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size);
+
+ void SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size);
+
+ void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
+ std::span<BufferCopy> copies);
+
+ void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy,
+ std::span<const BufferCopy> copies);
+
+ void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
+ std::span<const BufferCopy> copies);
+
+ void DeleteBuffer(BufferId buffer_id);
+
+ void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id);
+
+ void NotifyBufferDeletion();
+
+ [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const;
+
+ [[nodiscard]] std::span<const u8> ImmediateBufferWithData(VAddr cpu_addr, size_t size);
+
+ [[nodiscard]] std::span<u8> ImmediateBuffer(size_t wanted_capacity);
+
+ [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept;
+
+ VideoCore::RasterizerInterface& rasterizer;
+ Tegra::Engines::Maxwell3D& maxwell3d;
+ Tegra::Engines::KeplerCompute& kepler_compute;
+ Tegra::MemoryManager& gpu_memory;
+ Core::Memory::Memory& cpu_memory;
+ Runtime& runtime;
+
+ SlotVector<Buffer> slot_buffers;
+ DelayedDestructionRing<Buffer, 8> delayed_destruction_ring;
+
+ u32 last_index_count = 0;
+
+ Binding index_buffer;
+ std::array<Binding, NUM_VERTEX_BUFFERS> vertex_buffers;
+ std::array<std::array<Binding, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES> uniform_buffers;
+ std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers;
+ std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers;
+
+ std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers;
+ std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers;
+
+ std::array<u32, NUM_STAGES> enabled_uniform_buffers{};
+ u32 enabled_compute_uniform_buffers = 0;
+
+ std::array<u32, NUM_STAGES> enabled_storage_buffers{};
+ std::array<u32, NUM_STAGES> written_storage_buffers{};
+ u32 enabled_compute_storage_buffers = 0;
+ u32 written_compute_storage_buffers = 0;
+
+ std::array<u32, NUM_STAGES> fast_bound_uniform_buffers{};
+
+ bool has_deleted_buffers = false;
+
+ std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, std::array<u32, NUM_STAGES>, Empty>
+ dirty_uniform_buffers{};
+
+ std::vector<BufferId> cached_write_buffer_ids;
+
+ // TODO: This data structure is not optimal and it should be reworked
+ std::vector<BufferId> uncommitted_downloads;
+ std::deque<std::vector<BufferId>> committed_downloads;
+
+ size_t immediate_buffer_capacity = 0;
+ std::unique_ptr<u8[]> immediate_buffer_alloc;
+
+ std::array<BufferId, ((1ULL << 39) >> PAGE_BITS)> page_table;
+};
+
+/// Stores references to the rasterizer, both engines, both memory interfaces and the backend
+/// runtime, then inserts the first entry of slot_buffers.
+template <class P>
+BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
+ Tegra::Engines::Maxwell3D& maxwell3d_,
+ Tegra::Engines::KeplerCompute& kepler_compute_,
+ Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
+ Runtime& runtime_)
+ : rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, kepler_compute{kepler_compute_},
+ gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} {
+ // Ensure the first slot is used for the null buffer
+ // The id returned by insert is discarded on purpose, hence the cast to void
+ void(slot_buffers.insert(runtime, NullBufferParams{}));
+}
+
+/// Advances delayed_destruction_ring by one tick.
+template <class P>
+void BufferCache<P>::TickFrame() {
+ delayed_destruction_ring.Tick();
+}
+
+/// Marks [cpu_addr, cpu_addr + size) as CPU modified on every buffer found in that range.
+/// @param cpu_addr Start address of the written region
+/// @param size Size of the written region in bytes
+template <class P>
+void BufferCache<P>::WriteMemory(VAddr cpu_addr, u64 size) {
+ ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) {
+ // The full region is forwarded unchanged to each buffer
+ buffer.MarkRegionAsCpuModified(cpu_addr, size);
+ });
+}
+
+/// Forwards a CPU write of [cpu_addr, cpu_addr + size) to CachedCpuWrite on every buffer found in
+/// that range, remembering which buffers received cached writes.
+/// @param cpu_addr Start address of the written region
+/// @param size Size of the written region in bytes
+template <class P>
+void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) {
+ ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
+ // Record the id only if the buffer reported no cached writes before this one
+ if (!buffer.HasCachedWrites()) {
+ cached_write_buffer_ids.push_back(buffer_id);
}
- if (map->is_written) {
- UnmarkRegionAsWritten(map->start, map->end - 1);
+ buffer.CachedCpuWrite(cpu_addr, size);
+ });
+}
+
+/// Writes back to guest memory the ranges that each buffer in [cpu_addr, cpu_addr + size)
+/// reports through ForEachDownloadRange.
+/// With USE_MEMORY_MAPS the ranges are copied into a staging buffer obtained from the runtime
+/// and read from its mapped span; otherwise each range is downloaded through a scratch buffer.
+/// @param cpu_addr Start address of the region to download
+/// @param size Size of the region in bytes
+template <class P>
+void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) {
+ ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) {
+ boost::container::small_vector<BufferCopy, 1> copies;
+ u64 total_size_bytes = 0;
+ u64 largest_copy = 0;
+ buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
+ // dst_offset is the running total, so copies are packed back to back in the destination
+ copies.push_back(BufferCopy{
+ .src_offset = range_offset,
+ .dst_offset = total_size_bytes,
+ .size = range_size,
+ });
+ total_size_bytes += range_size;
+ largest_copy = std::max(largest_copy, range_size);
+ });
+ // Nothing was reported for this buffer, skip it
+ if (total_size_bytes == 0) {
+ return;
}
- const auto it = mapped_addresses.find(*map);
- ASSERT(it != mapped_addresses.end());
- mapped_addresses.erase(it);
- mapped_addresses_allocator.Release(map);
- }
-
-private:
- MapInterval* MapAddress(Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) {
- const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size);
- if (overlaps.empty()) {
- const VAddr cpu_addr_end = cpu_addr + size;
- if (gpu_memory.IsGranularRange(gpu_addr, size)) {
- u8* const host_ptr = gpu_memory.GetPointer(gpu_addr);
- block->Upload(block->Offset(cpu_addr), size, host_ptr);
- } else {
- staging_buffer.resize(size);
- gpu_memory.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
- block->Upload(block->Offset(cpu_addr), size, staging_buffer.data());
+ MICROPROFILE_SCOPE(GPU_DownloadMemory);
+
+ if constexpr (USE_MEMORY_MAPS) {
+ auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
+ const u8* const mapped_memory = download_staging.mapped_span.data();
+ const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size());
+ runtime.CopyBuffer(download_staging.buffer, buffer, copies_span);
+ // NOTE(review): presumably Finish() waits for the copy before mapped_memory is read - confirm
+ runtime.Finish();
+ for (const BufferCopy& copy : copies) {
+ // src_offset is relative to the buffer, dst_offset is relative to the staging memory
+ const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
+ const u8* copy_mapped_memory = mapped_memory + copy.dst_offset;
+ cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size);
}
- return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr));
- }
-
- const VAddr cpu_addr_end = cpu_addr + size;
- if (overlaps.size() == 1) {
- MapInterval* const current_map = overlaps[0];
- if (current_map->IsInside(cpu_addr, cpu_addr_end)) {
- return current_map;
+ } else {
+ // One scratch buffer sized for the largest range is reused for every copy
+ const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
+ for (const BufferCopy& copy : copies) {
+ buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
+ const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
+ cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size);
}
}
- VAddr new_start = cpu_addr;
- VAddr new_end = cpu_addr_end;
- bool write_inheritance = false;
- bool modified_inheritance = false;
- // Calculate new buffer parameters
- for (MapInterval* overlap : overlaps) {
- new_start = std::min(overlap->start, new_start);
- new_end = std::max(overlap->end, new_end);
- write_inheritance |= overlap->is_written;
- modified_inheritance |= overlap->is_modified;
+ });
+}
+
+/// Records the uniform buffer binding for the given stage and index.
+/// @param stage Index into uniform_buffers selecting the stage
+/// @param index Index of the uniform buffer within that stage
+/// @param gpu_addr GPU virtual address of the buffer
+/// @param size Size stored in the binding
+template <class P>
+void BufferCache<P>::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
+ u32 size) {
+ const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
+ if (!cpu_addr) {
+ // The GPU address has no CPU address; store the null binding instead
+ uniform_buffers[stage][index] = NULL_BINDING;
+ return;
+ }
+ // The buffer id is left default constructed here; only the address and size are recorded
+ const Binding binding{
+ .cpu_addr = *cpu_addr,
+ .size = size,
+ .buffer_id = BufferId{},
+ };
+ uniform_buffers[stage][index] = binding;
+}
+
+/// Runs DoUpdateGraphicsBuffers, repeating the pass for as long as has_deleted_buffers ends up
+/// set after it.
+/// @param is_indexed Forwarded unchanged to DoUpdateGraphicsBuffers
+template <class P>
+void BufferCache<P>::UpdateGraphicsBuffers(bool is_indexed) {
+ MICROPROFILE_SCOPE(GPU_PrepareBuffers);
+ do {
+ // The flag is cleared before each pass
+ // NOTE(review): presumably it is raised when a buffer is deleted during the pass - confirm
+ has_deleted_buffers = false;
+ DoUpdateGraphicsBuffers(is_indexed);
+ } while (has_deleted_buffers);
+}
+
+/// Runs DoUpdateComputeBuffers, repeating the pass for as long as has_deleted_buffers ends up set
+/// after it.
+template <class P>
+void BufferCache<P>::UpdateComputeBuffers() {
+ MICROPROFILE_SCOPE(GPU_PrepareBuffers);
+ do {
+ // The flag is cleared before each pass
+ // NOTE(review): presumably it is raised when a buffer is deleted during the pass - confirm
+ has_deleted_buffers = false;
+ DoUpdateComputeBuffers();
+ } while (has_deleted_buffers);
+}
+
+/// Binds the index buffer (or a quad array index buffer), then the vertex buffers and the
+/// transform feedback buffers.
+/// @param is_indexed When true BindHostIndexBuffer is called
+template <class P>
+void BufferCache<P>::BindHostGeometryBuffers(bool is_indexed) {
+ MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
+ if (is_indexed) {
+ BindHostIndexBuffer();
+ } else if constexpr (!HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) {
+ // Only for non-indexed draws on backends without full index and primitive support:
+ // when the topology is Quads, ask the runtime to bind a quad array index buffer
+ const auto& regs = maxwell3d.regs;
+ if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) {
+ runtime.BindQuadArrayIndexBuffer(regs.vertex_buffer.first, regs.vertex_buffer.count);
}
- GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr;
- for (auto& overlap : overlaps) {
- Unregister(overlap);
+ }
+ BindHostVertexBuffers();
+ BindHostTransformFeedbackBuffers();
+}
+
+/// Binds the graphics uniform buffers and then the graphics storage buffers of one stage.
+/// @param stage Stage index forwarded to both bind calls
+template <class P>
+void BufferCache<P>::BindHostStageBuffers(size_t stage) {
+ MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
+ BindHostGraphicsUniformBuffers(stage);
+ BindHostGraphicsStorageBuffers(stage);
+}
+
+/// Binds the compute uniform buffers and then the compute storage buffers.
+template <class P>
+void BufferCache<P>::BindHostComputeBuffers() {
+ MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
+ BindHostComputeUniformBuffers();
+ BindHostComputeStorageBuffers();
+}
+
+/// Stores the mask of enabled uniform buffers for a stage.
+/// @param stage Index into enabled_uniform_buffers
+/// @param enabled New mask stored for that stage
+template <class P>
+void BufferCache<P>::SetEnabledUniformBuffers(size_t stage, u32 enabled) {
+ if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
+ // When the mask changes, set every bit of the stage's dirty mask
+ if (enabled_uniform_buffers[stage] != enabled) {
+ dirty_uniform_buffers[stage] = ~u32{0};
}
- UpdateBlock(block, new_start, new_end, overlaps);
-
- const MapInterval new_map{new_start, new_end, new_gpu_addr};
- MapInterval* const map = Register(new_map, write_inheritance);
- if (!map) {
- return nullptr;
+ }
+ enabled_uniform_buffers[stage] = enabled;
+}
+
+template <class P> // Stores the enabled compute uniform buffer mask
+void BufferCache<P>::SetEnabledComputeUniformBuffers(u32 enabled) {
+ enabled_compute_uniform_buffers = enabled;
+}
+
+template <class P> // Clears the enabled and written storage buffer masks of a graphics stage
+void BufferCache<P>::UnbindGraphicsStorageBuffers(size_t stage) {
+ enabled_storage_buffers[stage] = 0;
+ written_storage_buffers[stage] = 0;
+}
+
+/// Enables a graphics storage buffer slot and records its binding.
+/// The binding is built by StorageBufferBinding from the address of the stage's const
+/// buffer `cbuf_index` plus `cbuf_offset`.
+template <class P>
+void BufferCache<P>::BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index,
+                                               u32 cbuf_offset, bool is_written) {
+    const u32 slot_bit = 1U << ssbo_index;
+    enabled_storage_buffers[stage] |= slot_bit;
+    if (is_written) {
+        written_storage_buffers[stage] |= slot_bit;
+    }
+
+    const auto& cbuf = maxwell3d.state.shader_stages[stage].const_buffers[cbuf_index];
+    storage_buffers[stage][ssbo_index] = StorageBufferBinding(cbuf.address + cbuf_offset);
+}
+
+template <class P> // Clears the enabled and written compute storage buffer masks
+void BufferCache<P>::UnbindComputeStorageBuffers() {
+ enabled_compute_storage_buffers = 0;
+ written_compute_storage_buffers = 0;
+}
+
+/// Enables a compute storage buffer slot and records its binding.
+/// The binding is built by StorageBufferBinding from the address of the launch description's
+/// const buffer `cbuf_index` plus `cbuf_offset`.
+template <class P>
+void BufferCache<P>::BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
+                                              bool is_written) {
+    const u32 slot_bit = 1U << ssbo_index;
+    enabled_compute_storage_buffers |= slot_bit;
+    if (is_written) {
+        written_compute_storage_buffers |= slot_bit;
+    }
+
+    const auto& launch_desc = kepler_compute.launch_description;
+    // The referenced const buffer has to be enabled in the launch description
+    ASSERT(((launch_desc.const_buffer_enable_mask >> cbuf_index) & 1) != 0);
+
+    const auto& cbuf = launch_desc.const_buffer_config[cbuf_index];
+    compute_storage_buffers[ssbo_index] = StorageBufferBinding(cbuf.Address() + cbuf_offset);
+}
+
+/// Calls FlushCachedWrites on every buffer listed in cached_write_buffer_ids and then
+/// empties that list.
+template <class P>
+void BufferCache<P>::FlushCachedWrites() {
+    std::ranges::for_each(cached_write_buffer_ids, [this](const BufferId buffer_id) {
+        slot_buffers[buffer_id].FlushCachedWrites();
+    });
+    cached_write_buffer_ids.clear();
+}
+
+template <class P> // Returns true when uncommitted_downloads holds at least one buffer id
+bool BufferCache<P>::HasUncommittedFlushes() const noexcept {
+ return !uncommitted_downloads.empty();
+}
+
+/// Returns true when committed_downloads is not empty and its front entry lists at least
+/// one buffer.
+template <class P>
+bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept {
+    if (committed_downloads.empty()) {
+        return false;
+    }
+    return !committed_downloads.front().empty();
+}
+
+template <class P> // Pushes the current uncommitted download list to the front of committed_downloads
+void BufferCache<P>::CommitAsyncFlushes() {
+ // This is intentionally passing the value by copy
+ committed_downloads.push_front(uncommitted_downloads);
+ uncommitted_downloads.clear(); // Start a fresh uncommitted list
+}
+
+template <class P> // Downloads the ranges of the oldest committed download list into CPU memory and pops that list
+void BufferCache<P>::PopAsyncFlushes() {
+ if (committed_downloads.empty()) {
+ return;
+ }
+ auto scope_exit_pop_download = detail::ScopeExit([this] { committed_downloads.pop_back(); }); // NOTE(review): presumably runs the pop on every exit path below - confirm against detail::ScopeExit
+ const std::span<const BufferId> download_ids = committed_downloads.back();
+ if (download_ids.empty()) {
+ return;
+ }
+ MICROPROFILE_SCOPE(GPU_DownloadMemory);
+
+ boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads; // One entry per download range, paired with its source buffer
+ u64 total_size_bytes = 0; // Running sum of range sizes, also used as the next destination offset
+ u64 largest_copy = 0; // Size of the biggest single range
+ for (const BufferId buffer_id : download_ids) {
+ slot_buffers[buffer_id].ForEachDownloadRange([&](u64 range_offset, u64 range_size) {
+ downloads.push_back({
+ BufferCopy{
+ .src_offset = range_offset,
+ .dst_offset = total_size_bytes,
+ .size = range_size,
+ },
+ buffer_id,
+ });
+ total_size_bytes += range_size;
+ largest_copy = std::max(largest_copy, range_size);
+ });
+ }
+ if (downloads.empty()) {
+ return;
+ }
+ if constexpr (USE_MEMORY_MAPS) { // Staging path: copy every range into one staging buffer, then write back to CPU memory
+ auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
+ for (const auto [copy, buffer_id] : downloads) {
+ const std::array copies{copy};
+ runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies);
}
- if (modified_inheritance) {
- map->MarkAsModified(true, GetModifiedTicks());
- if (Settings::IsGPULevelHigh() &&
- Settings::values.use_asynchronous_gpu_emulation.GetValue()) {
- MarkForAsyncFlush(map);
- }
+ runtime.Finish(); // NOTE(review): presumably waits for the copies above before the mapped span is read - confirm
+ for (const auto [copy, buffer_id] : downloads) {
+ const Buffer& buffer = slot_buffers[buffer_id];
+ const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; // src_offset is relative to the start of the buffer
+ const u8* read_mapped_memory = download_staging.mapped_span.data() + copy.dst_offset;
+ cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size);
+ }
+ } else { // Immediate path: download each range through a scratch buffer sized for the largest range
+ const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy);
+ for (const auto [copy, buffer_id] : downloads) {
+ Buffer& buffer = slot_buffers[buffer_id];
+ buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size));
+ const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
+ cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size);
}
- return map;
}
-
- void UpdateBlock(Buffer* block, VAddr start, VAddr end, const VectorMapInterval& overlaps) {
- const IntervalType base_interval{start, end};
- IntervalSet interval_set{};
- interval_set.add(base_interval);
- for (auto& overlap : overlaps) {
- const IntervalType subtract{overlap->start, overlap->end};
- interval_set.subtract(subtract);
+}
+
+template <class P> // Returns true when any buffer registered in the pages of [addr, addr + size) reports the region as GPU modified
+bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
+ const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE);
+ for (u64 page = addr >> PAGE_BITS; page < page_end;) { // The page is advanced inside the body
+ const BufferId image_id = page_table[page]; // Despite the name this is a buffer id
+ if (!image_id) { // No buffer registered on this page
+ ++page;
+ continue;
}
- for (auto& interval : interval_set) {
- const std::size_t size = interval.upper() - interval.lower();
- if (size == 0) {
- continue;
- }
- staging_buffer.resize(size);
- cpu_memory.ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
- block->Upload(block->Offset(interval.lower()), size, staging_buffer.data());
+ Buffer& buffer = slot_buffers[image_id];
+ if (buffer.IsRegionGpuModified(addr, size)) {
+ return true;
}
+ const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
+ page = Common::DivCeil(end_addr, PAGE_SIZE); // Skip to the first page past this buffer
}
-
- VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) {
- VectorMapInterval result;
- if (size == 0) {
- return result;
+ return false;
+}
+
+/// Synchronizes the buffer backing the index buffer binding and binds it on the runtime.
+/// Without full index and primitive support, the draw topology and the index array
+/// format, first and count registers are passed to the runtime as well.
+template <class P>
+void BufferCache<P>::BindHostIndexBuffer() {
+    const VAddr cpu_addr = index_buffer.cpu_addr;
+    const u32 size = index_buffer.size;
+    Buffer& buffer = slot_buffers[index_buffer.buffer_id];
+    const u32 offset = buffer.Offset(cpu_addr);
+    SynchronizeBuffer(buffer, cpu_addr, size);
+    if constexpr (!HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) {
+        auto& regs = maxwell3d.regs;
+        runtime.BindIndexBuffer(regs.draw.topology, regs.index_array.format,
+                                regs.index_array.first, regs.index_array.count, buffer, offset,
+                                size);
+    } else {
+        runtime.BindIndexBuffer(buffer, offset, size);
+    }
+}
+
+template <class P> // Synchronizes every vertex buffer binding and rebinds the ones whose dirty flag is set
+void BufferCache<P>::BindHostVertexBuffers() {
+ auto& flags = maxwell3d.dirty.flags;
+ for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
+ const Binding& binding = vertex_buffers[index];
+ Buffer& buffer = slot_buffers[binding.buffer_id];
+ SynchronizeBuffer(buffer, binding.cpu_addr, binding.size); // Runs for every slot, dirty or not
+ if (!flags[Dirty::VertexBuffer0 + index]) { // Slot is not dirty, skip the rebind
+ continue;
}
+ flags[Dirty::VertexBuffer0 + index] = false;
+
+ const u32 stride = maxwell3d.regs.vertex_array[index].stride;
+ const u32 offset = buffer.Offset(binding.cpu_addr);
+ runtime.BindVertexBuffer(index, buffer, offset, binding.size, stride);
+ }
+}
- const VAddr addr_end = addr + size;
- auto it = mapped_addresses.lower_bound(addr);
- if (it != mapped_addresses.begin()) {
- --it;
+template <class P> // Binds every enabled uniform buffer of a graphics stage
+void BufferCache<P>::BindHostGraphicsUniformBuffers(size_t stage) {
+ u32 dirty = ~0U; // Without persistent bindings every slot is treated as dirty
+ if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
+ dirty = std::exchange(dirty_uniform_buffers[stage], 0); // Take the stage's dirty mask and reset it
+ }
+ u32 binding_index = 0;
+ ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) {
+ const bool needs_bind = ((dirty >> index) & 1) != 0;
+ BindHostGraphicsUniformBuffer(stage, index, binding_index, needs_bind);
+ if constexpr (NEEDS_BIND_UNIFORM_INDEX) { // binding_index only advances when the runtime needs an explicit index
+ ++binding_index;
}
- while (it != mapped_addresses.end() && it->start < addr_end) {
- if (it->Overlaps(addr, addr_end)) {
- result.push_back(&*it);
+ });
+}
+
+template <class P> // Binds one graphics uniform buffer through the fast, stream or cached path
+void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index,
+ bool needs_bind) {
+ const Binding& binding = uniform_buffers[stage][index];
+ const VAddr cpu_addr = binding.cpu_addr;
+ const u32 size = binding.size;
+ Buffer& buffer = slot_buffers[binding.buffer_id];
+ if constexpr (IS_OPENGL) {
+ if (size <= SKIP_CACHE_SIZE && !buffer.IsRegionGpuModified(cpu_addr, size)) { // Small binding without GPU modifications: the cached buffer is not used
+ if (runtime.HasFastBufferSubData()) {
+ // Fast path for Nvidia
+ if (!HasFastUniformBufferBound(stage, binding_index)) {
+ // We only have to bind when the currently bound buffer is not the fast version
+ fast_bound_uniform_buffers[stage] |= 1U << binding_index;
+ runtime.BindFastUniformBuffer(stage, binding_index, size);
+ }
+ const auto span = ImmediateBufferWithData(cpu_addr, size);
+ runtime.PushFastUniformBuffer(stage, binding_index, span);
+ } else {
+ // Stream buffer path to avoid stalling on non-Nvidia drivers
+ const auto span = runtime.BindMappedUniformBuffer(stage, binding_index, size);
+ cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size); // Read guest memory into the span returned by the runtime
}
- ++it;
+ return;
}
- return result;
}
-
- /// Returns a ticks counter used for tracking when cached objects were last modified
- u64 GetModifiedTicks() {
- return ++modified_ticks;
+ // Classic cached path
+ SynchronizeBuffer(buffer, cpu_addr, size);
+ if (!needs_bind && !HasFastUniformBufferBound(stage, binding_index)) {
+ // Skip binding if it's not needed and if the bound buffer is not the fast version
+ // This exists to avoid instances where the fast buffer is bound and a GPU write happens
+ return;
}
+ fast_bound_uniform_buffers[stage] &= ~(1U << binding_index); // The cached buffer is bound below, so clear the fast-bound bit
- void FlushMap(MapInterval* map) {
- const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS);
- ASSERT_OR_EXECUTE(it != blocks.end(), return;);
-
- std::shared_ptr<Buffer> block = it->second;
-
- const std::size_t size = map->end - map->start;
- staging_buffer.resize(size);
- block->Download(block->Offset(map->start), size, staging_buffer.data());
- cpu_memory.WriteBlockUnsafe(map->start, staging_buffer.data(), size);
- map->MarkAsModified(false, 0);
+ const u32 offset = buffer.Offset(cpu_addr);
+ if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
+ runtime.BindUniformBuffer(stage, binding_index, buffer, offset, size);
+ } else {
+ runtime.BindUniformBuffer(buffer, offset, size);
}
+}
+
+/// Synchronizes and binds every enabled storage buffer of a graphics stage.
+/// When the runtime needs explicit binding indices they are handed out sequentially,
+/// starting at zero, in the order the enabled bits are visited.
+template <class P>
+void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) {
+    u32 next_binding = 0;
+    const auto bind_slot = [&](u32 index) {
+        const Binding& binding = storage_buffers[stage][index];
+        const VAddr cpu_addr = binding.cpu_addr;
+        const u32 size = binding.size;
+        Buffer& buffer = slot_buffers[binding.buffer_id];
+        SynchronizeBuffer(buffer, cpu_addr, size);
+
+        const u32 offset = buffer.Offset(cpu_addr);
+        const bool is_written = (written_storage_buffers[stage] & (1U << index)) != 0;
+        if constexpr (NEEDS_BIND_STORAGE_INDEX) {
+            runtime.BindStorageBuffer(stage, next_binding++, buffer, offset, size, is_written);
+        } else {
+            runtime.BindStorageBuffer(buffer, offset, size, is_written);
+        }
+    };
+    ForEachEnabledBit(enabled_storage_buffers[stage], bind_slot);
+}
- template <typename Callable>
- BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) {
- AlignBuffer(alignment);
- const std::size_t uploaded_offset = buffer_offset;
- callable(buffer_ptr);
-
- buffer_ptr += size;
- buffer_offset += size;
- return BufferInfo{stream_buffer.Handle(), uploaded_offset, stream_buffer.Address()};
+template <class P> // Synchronizes and binds every transform feedback buffer when tfb_enabled is non-zero
+void BufferCache<P>::BindHostTransformFeedbackBuffers() {
+ if (maxwell3d.regs.tfb_enabled == 0) { // Transform feedback is off, nothing to bind
+ return;
}
-
- void AlignBuffer(std::size_t alignment) {
- // Align the offset, not the mapped pointer
- const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment);
- buffer_ptr += offset_aligned - buffer_offset;
- buffer_offset = offset_aligned;
+ for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
+ const Binding& binding = transform_feedback_buffers[index];
+ Buffer& buffer = slot_buffers[binding.buffer_id];
+ const u32 size = binding.size;
+ SynchronizeBuffer(buffer, binding.cpu_addr, size);
+
+ const u32 offset = buffer.Offset(binding.cpu_addr);
+ runtime.BindTransformFeedbackBuffer(index, buffer, offset, size);
}
+}
- std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) {
- const std::size_t old_size = buffer->Size();
- const std::size_t new_size = old_size + BLOCK_PAGE_SIZE;
- const VAddr cpu_addr = buffer->CpuAddr();
- std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size);
- new_buffer->CopyFrom(*buffer, 0, 0, old_size);
- QueueDestruction(std::move(buffer));
-
- const VAddr cpu_addr_end = cpu_addr + new_size - 1;
- const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
- for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
- blocks.insert_or_assign(page_start, new_buffer);
+template <class P> // Synchronizes and binds every enabled compute uniform buffer
+void BufferCache<P>::BindHostComputeUniformBuffers() {
+ if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
+ // Mark all uniform buffers as dirty
+ dirty_uniform_buffers.fill(~u32{0});
+ }
+ u32 binding_index = 0; // Only advanced when the runtime needs an explicit binding index
+ ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) {
+ const Binding& binding = compute_uniform_buffers[index];
+ Buffer& buffer = slot_buffers[binding.buffer_id];
+ const u32 size = binding.size;
+ SynchronizeBuffer(buffer, binding.cpu_addr, size);
+
+ const u32 offset = buffer.Offset(binding.cpu_addr);
+ if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
+ runtime.BindComputeUniformBuffer(binding_index, buffer, offset, size);
+ ++binding_index;
+ } else {
+ runtime.BindUniformBuffer(buffer, offset, size);
}
+ });
+}
+
+/// Synchronizes and binds every enabled compute storage buffer.
+/// When the runtime needs explicit binding indices they are handed out sequentially,
+/// starting at zero, in the order the enabled bits are visited.
+template <class P>
+void BufferCache<P>::BindHostComputeStorageBuffers() {
+    u32 next_binding = 0;
+    const auto bind_slot = [&](u32 index) {
+        const Binding& binding = compute_storage_buffers[index];
+        const VAddr cpu_addr = binding.cpu_addr;
+        const u32 size = binding.size;
+        Buffer& buffer = slot_buffers[binding.buffer_id];
+        SynchronizeBuffer(buffer, cpu_addr, size);
+
+        const u32 offset = buffer.Offset(cpu_addr);
+        const bool is_written = (written_compute_storage_buffers & (1U << index)) != 0;
+        if constexpr (NEEDS_BIND_STORAGE_INDEX) {
+            runtime.BindComputeStorageBuffer(next_binding++, buffer, offset, size, is_written);
+        } else {
+            runtime.BindStorageBuffer(buffer, offset, size, is_written);
+        }
+    };
+    ForEachEnabledBit(enabled_compute_storage_buffers, bind_slot);
+}
- return new_buffer;
+template <class P> // One update pass over the index, vertex, transform feedback, uniform and storage buffer bindings
+void BufferCache<P>::DoUpdateGraphicsBuffers(bool is_indexed) {
+ if (is_indexed) { // The index buffer is only updated for indexed draws
+ UpdateIndexBuffer();
}
+ UpdateVertexBuffers();
+ UpdateTransformFeedbackBuffers();
+ for (size_t stage = 0; stage < NUM_STAGES; ++stage) {
+ UpdateUniformBuffers(stage);
+ UpdateStorageBuffers(stage);
+ }
+}
+
+template <class P> // One update pass over the compute uniform and storage buffer bindings
+void BufferCache<P>::DoUpdateComputeBuffers() {
+ UpdateComputeUniformBuffers();
+ UpdateComputeStorageBuffers();
+}
+
+/// Refreshes the index buffer binding from the index array registers.
+/// The binding covers the smaller of the register address range and the size implied by the
+/// index count and format; an empty or unmapped range stores NULL_BINDING.
+template <class P>
+void BufferCache<P>::UpdateIndexBuffer() {
+    // The index count can change without the dirty flag being raised, so it is compared
+    // against the last seen value in addition to checking the flag
+    auto& flags = maxwell3d.dirty.flags;
+    const auto& index_array = maxwell3d.regs.index_array;
+    const bool is_dirty = flags[Dirty::IndexBuffer];
+    if (!is_dirty && index_array.count == last_index_count) {
+        return;
+    }
+    flags[Dirty::IndexBuffer] = false;
+    last_index_count = index_array.count;
+
+    const GPUVAddr gpu_addr_begin = index_array.StartAddress();
+    const GPUVAddr gpu_addr_end = index_array.EndAddress();
+    const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin);
+    const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
+    const u32 draw_size = index_array.count * index_array.FormatSizeInBytes();
+    const u32 size = std::min(address_size, draw_size);
+    if (!cpu_addr || size == 0) {
+        index_buffer = NULL_BINDING;
+        return;
+    }
+    const BufferId buffer_id = FindBuffer(*cpu_addr, size);
+    index_buffer = Binding{
+        .cpu_addr = *cpu_addr,
+        .size = size,
+        .buffer_id = buffer_id,
+    };
+}
- std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first,
- std::shared_ptr<Buffer> second) {
- const std::size_t size_1 = first->Size();
- const std::size_t size_2 = second->Size();
- const VAddr first_addr = first->CpuAddr();
- const VAddr second_addr = second->CpuAddr();
- const VAddr new_addr = std::min(first_addr, second_addr);
- const std::size_t new_size = size_1 + size_2;
-
- std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size);
- new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1);
- new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2);
- QueueDestruction(std::move(first));
- QueueDestruction(std::move(second));
+template <class P> // Updates every vertex buffer binding when the VertexBuffers dirty flag is set
+void BufferCache<P>::UpdateVertexBuffers() {
+ auto& flags = maxwell3d.dirty.flags;
+ if (!maxwell3d.dirty.flags[Dirty::VertexBuffers]) { // Reads the same flag set that `flags` refers to
+ return;
+ }
+ flags[Dirty::VertexBuffers] = false;
- const VAddr cpu_addr_end = new_addr + new_size - 1;
- const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
- for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
- blocks.insert_or_assign(page_start, new_buffer);
- }
- return new_buffer;
+ for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
+ UpdateVertexBuffer(index); // Each slot checks its own dirty flag
}
+}
- Buffer* GetBlock(VAddr cpu_addr, std::size_t size) {
- std::shared_ptr<Buffer> found;
+/// Refreshes one vertex buffer binding from the vertex array registers when its dirty flag
+/// is set. The dirty flag itself is left untouched here.
+/// A disabled array, an empty range or an unmapped address stores NULL_BINDING.
+template <class P>
+void BufferCache<P>::UpdateVertexBuffer(u32 index) {
+    const bool is_dirty = maxwell3d.dirty.flags[Dirty::VertexBuffer0 + index];
+    if (!is_dirty) {
+        return;
+    }
+    const auto& array = maxwell3d.regs.vertex_array[index];
+    const auto& limit = maxwell3d.regs.vertex_array_limit[index];
+    const GPUVAddr gpu_addr_begin = array.StartAddress();
+    // The limit address is inclusive, so the end is one past it
+    const GPUVAddr gpu_addr_end = limit.LimitAddress() + 1;
+    const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin);
+    // TODO: Analyze stride and number of vertices
+    const u32 size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
+    const bool is_usable = array.enable != 0 && size != 0 && cpu_addr.has_value();
+    if (!is_usable) {
+        vertex_buffers[index] = NULL_BINDING;
+        return;
+    }
+    const BufferId buffer_id = FindBuffer(*cpu_addr, size);
+    vertex_buffers[index] = Binding{
+        .cpu_addr = *cpu_addr,
+        .size = size,
+        .buffer_id = buffer_id,
+    };
+}
+
+/// Resolves the buffer id of every enabled uniform buffer binding of a stage that does not
+/// have one yet. With persistent bindings, each newly resolved slot is flagged dirty.
+template <class P>
+void BufferCache<P>::UpdateUniformBuffers(size_t stage) {
+    const auto resolve = [&](u32 index) {
+        Binding& binding = uniform_buffers[stage][index];
+        if (!binding.buffer_id) {
+            if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
+                dirty_uniform_buffers[stage] |= 1U << index;
+            }
+            binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size);
+        }
+    };
+    ForEachEnabledBit(enabled_uniform_buffers[stage], resolve);
+}
+
+/// Resolves the buffer id of every enabled storage buffer binding of a stage and passes the
+/// ones present in the stage's written mask to MarkWrittenBuffer.
+template <class P>
+void BufferCache<P>::UpdateStorageBuffers(size_t stage) {
+    const u32 written_mask = written_storage_buffers[stage];
+    const auto resolve = [&](u32 index) {
+        Binding& binding = storage_buffers[stage][index];
+        const BufferId resolved_id = FindBuffer(binding.cpu_addr, binding.size);
+        binding.buffer_id = resolved_id;
+
+        const bool is_written = (written_mask & (1U << index)) != 0;
+        if (is_written) {
+            MarkWrittenBuffer(resolved_id, binding.cpu_addr, binding.size);
+        }
+    };
+    ForEachEnabledBit(enabled_storage_buffers[stage], resolve);
+}
- const VAddr cpu_addr_end = cpu_addr + size - 1;
- const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
- for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
- auto it = blocks.find(page_start);
- if (it == blocks.end()) {
- if (found) {
- found = EnlargeBlock(found);
- continue;
- }
- const VAddr start_addr = page_start << BLOCK_PAGE_BITS;
- found = CreateBlock(start_addr, BLOCK_PAGE_SIZE);
- blocks.insert_or_assign(page_start, found);
- continue;
- }
- if (!found) {
- found = it->second;
- continue;
- }
- if (found != it->second) {
- found = MergeBlocks(std::move(found), it->second);
+/// Updates every transform feedback buffer binding when tfb_enabled is non-zero.
+template <class P>
+void BufferCache<P>::UpdateTransformFeedbackBuffers() {
+    if (maxwell3d.regs.tfb_enabled != 0) {
+        for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
+            UpdateTransformFeedbackBuffer(index);
+        }
+    }
+}
+
+/// Refreshes one transform feedback buffer binding from the tfb_bindings registers and
+/// passes the resolved buffer to MarkWrittenBuffer.
+/// A disabled binding, an empty size or an unmapped address stores NULL_BINDING.
+template <class P>
+void BufferCache<P>::UpdateTransformFeedbackBuffer(u32 index) {
+    const auto& regs_binding = maxwell3d.regs.tfb_bindings[index];
+    const GPUVAddr gpu_addr = regs_binding.Address() + regs_binding.buffer_offset;
+    const u32 size = regs_binding.buffer_size;
+    const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
+    const bool is_usable = regs_binding.buffer_enable != 0 && size != 0 && cpu_addr.has_value();
+    if (!is_usable) {
+        transform_feedback_buffers[index] = NULL_BINDING;
+        return;
+    }
+    const VAddr buffer_cpu_addr = *cpu_addr;
+    const BufferId buffer_id = FindBuffer(buffer_cpu_addr, size);
+    transform_feedback_buffers[index] = Binding{
+        .cpu_addr = buffer_cpu_addr,
+        .size = size,
+        .buffer_id = buffer_id,
+    };
+    MarkWrittenBuffer(buffer_id, buffer_cpu_addr, size);
+}
+
+template <class P> // Rebuilds every enabled compute uniform buffer binding from the launch description
+void BufferCache<P>::UpdateComputeUniformBuffers() {
+ ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) {
+ Binding& binding = compute_uniform_buffers[index];
+ binding = NULL_BINDING; // Start from the null binding; only filled in below when the const buffer is enabled and mapped
+ const auto& launch_desc = kepler_compute.launch_description;
+ if (((launch_desc.const_buffer_enable_mask >> index) & 1) != 0) {
+ const auto& cbuf = launch_desc.const_buffer_config[index];
+ const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(cbuf.Address());
+ if (cpu_addr) {
+ binding.cpu_addr = *cpu_addr;
+ binding.size = cbuf.size;
}
}
- return found.get();
+ binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size); // Runs for null bindings too
+ });
+}
+
+/// Resolves the buffer id of every enabled compute storage buffer binding and passes the
+/// ones present in the written mask to MarkWrittenBuffer.
+template <class P>
+void BufferCache<P>::UpdateComputeStorageBuffers() {
+    const auto resolve = [&](u32 index) {
+        Binding& binding = compute_storage_buffers[index];
+        const BufferId resolved_id = FindBuffer(binding.cpu_addr, binding.size);
+        binding.buffer_id = resolved_id;
+
+        const bool is_written = (written_compute_storage_buffers & (1U << index)) != 0;
+        if (is_written) {
+            MarkWrittenBuffer(resolved_id, binding.cpu_addr, binding.size);
+        }
+    };
+    ForEachEnabledBit(enabled_compute_storage_buffers, resolve);
+}
+
+template <class P> // Marks a region as GPU modified and, with high GPU accuracy plus async GPU emulation, queues the buffer for download
+void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size) {
+ Buffer& buffer = slot_buffers[buffer_id];
+ buffer.MarkRegionAsGpuModified(cpu_addr, size);
+
+ const bool is_accuracy_high = Settings::IsGPULevelHigh();
+ const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue();
+ if (!is_accuracy_high || !is_async) { // Downloads are only queued when both settings are on
+ return;
}
+ if (std::ranges::find(uncommitted_downloads, buffer_id) != uncommitted_downloads.end()) {
+ // Already inserted
+ return;
+ }
+ uncommitted_downloads.push_back(buffer_id);
+}
- void MarkRegionAsWritten(VAddr start, VAddr end) {
- const u64 page_end = end >> WRITE_PAGE_BIT;
- for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
- if (const auto [it, inserted] = written_pages.emplace(page_start, 1); !inserted) {
- ++it->second;
- }
+/// Returns the id of a buffer containing [cpu_addr, cpu_addr + size).
+/// A zero address returns NULL_BUFFER_ID. When the page of `cpu_addr` has no registered
+/// buffer, or that buffer does not contain the whole range, CreateBuffer is called.
+template <class P>
+BufferId BufferCache<P>::FindBuffer(VAddr cpu_addr, u32 size) {
+    if (cpu_addr == 0) {
+        return NULL_BUFFER_ID;
+    }
+    const BufferId buffer_id = page_table[cpu_addr >> PAGE_BITS];
+    if (buffer_id) {
+        const Buffer& buffer = slot_buffers[buffer_id];
+        if (buffer.IsInBounds(cpu_addr, size)) {
+            return buffer_id;
+        }
+    }
+    return CreateBuffer(cpu_addr, size);
+}
+
+/// Creates a buffer covering [cpu_addr, cpu_addr + wanted_size) together with every
+/// registered buffer that overlaps it. The download ranges of each overlap are copied into
+/// the new buffer, pending downloads are redirected to it, and the overlap is deleted.
+/// Returns the id of the new, registered buffer.
+template <class P>
+BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) {
+ std::vector<BufferId> overlap_ids;
+ VAddr cpu_addr_begin = cpu_addr;
+ VAddr cpu_addr_end = cpu_addr + wanted_size;
+ for (; cpu_addr >> PAGE_BITS < Common::DivCeil(cpu_addr_end, PAGE_SIZE);
+ cpu_addr += PAGE_SIZE) {
+ const BufferId overlap_id = page_table[cpu_addr >> PAGE_BITS];
+ if (!overlap_id) {
+ continue;
+ }
+ Buffer& overlap = slot_buffers[overlap_id];
+ if (overlap.IsPicked()) { // Already collected through an earlier page
+ continue;
+ }
+ overlap.Pick();
+ overlap_ids.push_back(overlap_id);
+ const VAddr overlap_cpu_addr = overlap.CpuAddr();
+ if (overlap_cpu_addr < cpu_addr_begin) {
+ cpu_addr = cpu_addr_begin = overlap_cpu_addr; // The range grew downwards: restart the page walk from the new beginning
}
+ cpu_addr_end = std::max(cpu_addr_end, overlap_cpu_addr + overlap.SizeBytes());
}
-
- void UnmarkRegionAsWritten(VAddr start, VAddr end) {
- const u64 page_end = end >> WRITE_PAGE_BIT;
- for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
- auto it = written_pages.find(page_start);
- if (it != written_pages.end()) {
- if (it->second > 1) {
- --it->second;
- } else {
- written_pages.erase(it);
- }
- }
+ const u32 size = static_cast<u32>(cpu_addr_end - cpu_addr_begin);
+ const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, cpu_addr_begin, size);
+ Buffer& new_buffer = slot_buffers[new_buffer_id];
+
+ for (const BufferId overlap_id : overlap_ids) {
+ Buffer& overlap = slot_buffers[overlap_id];
+ overlap.Unpick();
+
+ std::vector<BufferCopy> copies;
+ const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr();
+ overlap.ForEachDownloadRange([&](u64 begin, u64 range_size) {
+ copies.push_back(BufferCopy{
+ .src_offset = begin,
+ .dst_offset = dst_base_offset + begin,
+ .size = range_size,
+ });
+ // `begin` is an offset inside `overlap`, while the region trackers take CPU addresses
+ const VAddr range_cpu_addr = overlap.CpuAddr() + begin;
+ new_buffer.UnmarkRegionAsCpuModified(range_cpu_addr, range_size);
+ new_buffer.MarkRegionAsGpuModified(range_cpu_addr, range_size);
+ });
+ if (!copies.empty()) {
+ runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies);
+ }
+ ReplaceBufferDownloads(overlap_id, new_buffer_id);
+ DeleteBuffer(overlap_id);
+ }
+ Register(new_buffer_id);
+ return new_buffer_id;
+}
+
+template <class P> // Writes the buffer id into every page table entry the buffer covers
+void BufferCache<P>::Register(BufferId buffer_id) {
+ ChangeRegister<true>(buffer_id);
+}
+
+template <class P> // Resets every page table entry the buffer covers to a default BufferId
+void BufferCache<P>::Unregister(BufferId buffer_id) {
+ ChangeRegister<false>(buffer_id);
+}
+
+template <class P> // Shared implementation of Register (insert = true) and Unregister (insert = false)
+template <bool insert>
+void BufferCache<P>::ChangeRegister(BufferId buffer_id) {
+ const Buffer& buffer = slot_buffers[buffer_id];
+ const VAddr cpu_addr_begin = buffer.CpuAddr();
+ const VAddr cpu_addr_end = cpu_addr_begin + buffer.SizeBytes();
+ const u64 page_begin = cpu_addr_begin / PAGE_SIZE;
+ const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE); // Exclusive: first page past the buffer
+ for (u64 page = page_begin; page != page_end; ++page) {
+ if constexpr (insert) {
+ page_table[page] = buffer_id;
+ } else {
+ page_table[page] = BufferId{};
}
}
+}
- bool IsRegionWritten(VAddr start, VAddr end) const {
- const u64 page_end = end >> WRITE_PAGE_BIT;
- for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
- if (written_pages.contains(page_start)) {
- return true;
+/// Synchronizes a region of a buffer through SynchronizeBufferImpl.
+/// Buffers whose CPU address is zero are skipped.
+template <class P>
+void BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) {
+    if (buffer.CpuAddr() != 0) {
+        SynchronizeBufferImpl(buffer, cpu_addr, size);
+    }
+}
+
+/// Collects the upload ranges of a buffer region into a copy list and hands it to
+/// UploadMemory. Source offsets are packed back to back in visiting order; nothing is
+/// uploaded when the collected ranges add up to zero bytes.
+template <class P>
+void BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size) {
+    boost::container::small_vector<BufferCopy, 4> copies;
+    u64 staging_cursor = 0;
+    u64 largest_copy = 0;
+    const auto collect = [&](u64 range_offset, u64 range_size) {
+        copies.push_back(BufferCopy{
+            .src_offset = staging_cursor,
+            .dst_offset = range_offset,
+            .size = range_size,
+        });
+        staging_cursor += range_size;
+        if (largest_copy < range_size) {
+            largest_copy = range_size;
+        }
+    };
+    buffer.ForEachUploadRange(cpu_addr, size, collect);
+    if (staging_cursor != 0) {
+        const std::span<BufferCopy> copies_span(copies.data(), copies.size());
+        UploadMemory(buffer, staging_cursor, largest_copy, copies_span);
+    }
+}
+
+/// Uploads the given copies through the mapped path when USE_MEMORY_MAPS is set and
+/// through the immediate path otherwise.
+template <class P>
+void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy,
+                                  std::span<BufferCopy> copies) {
+    if constexpr (!USE_MEMORY_MAPS) {
+        ImmediateUploadMemory(buffer, largest_copy, copies);
+    } else {
+        MappedUploadMemory(buffer, total_size_bytes, copies);
+    }
+}
+
+template <class P> // Uploads each copy with Buffer::ImmediateUpload, from guest memory directly or through a scratch buffer
+void BufferCache<P>::ImmediateUploadMemory(Buffer& buffer, u64 largest_copy,
+ std::span<const BufferCopy> copies) {
+ std::span<u8> immediate_buffer; // Scratch buffer, requested lazily on first use
+ for (const BufferCopy& copy : copies) {
+ std::span<const u8> upload_span;
+ const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset;
+ if (IsRangeGranular(cpu_addr, copy.size)) {
+ upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size); // Upload straight from the pointer returned by cpu_memory
+ } else {
+ if (immediate_buffer.empty()) {
+ immediate_buffer = ImmediateBuffer(largest_copy); // Sized for the largest copy so it fits every copy in the list
}
+ cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size);
+ upload_span = immediate_buffer.subspan(0, copy.size);
}
- return false;
+ buffer.ImmediateUpload(copy.dst_offset, upload_span);
}
-
- void QueueDestruction(std::shared_ptr<Buffer> buffer) {
- buffer->SetEpoch(epoch);
- pending_destruction.push(std::move(buffer));
+}
+
+template <class P> // Reads every copy from guest memory into an upload staging buffer, then copies it into the buffer in one call
+void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
+ std::span<const BufferCopy> copies) {
+ auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes);
+ const std::span<u8> staging_pointer = upload_staging.mapped_span;
+ for (const BufferCopy& copy : copies) {
+ const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; // dst_offset is relative to the start of the buffer
+ u8* const src_pointer = staging_pointer.data() + copy.src_offset; // src_offset is the position inside the staging span
+ cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size);
}
-
- void MarkForAsyncFlush(MapInterval* map) {
- if (!uncommitted_flushes) {
- uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>();
+ runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
+}
+
+/// Removes a buffer from the cache.
+/// Every binding that refers to it is reset to a default buffer id, it is dropped from the
+/// cached write list, unregistered from the page table and moved into the delayed
+/// destruction ring. Finally NotifyBufferDeletion is called.
+template <class P>
+void BufferCache<P>::DeleteBuffer(BufferId buffer_id) {
+    const auto reset_if_deleted = [buffer_id](Binding& binding) {
+        if (binding.buffer_id == buffer_id) {
+            binding.buffer_id = BufferId{};
+        }
+    };
+    const auto reset_all = [&reset_if_deleted](std::span<Binding> bindings) {
+        for (Binding& binding : bindings) {
+            reset_if_deleted(binding);
+        }
+    };
+    reset_if_deleted(index_buffer);
+    reset_all(vertex_buffers);
+    for (auto& stage_bindings : uniform_buffers) {
+        reset_all(stage_bindings);
+    }
+    for (auto& stage_bindings : storage_buffers) {
+        reset_all(stage_bindings);
+    }
+    reset_all(transform_feedback_buffers);
+    reset_all(compute_uniform_buffers);
+    reset_all(compute_storage_buffers);
+    std::erase(cached_write_buffer_ids, buffer_id);
+
+    // Mark the whole buffer as CPU written to stop tracking CPU writes
+    Buffer& deleted_buffer = slot_buffers[buffer_id];
+    deleted_buffer.MarkRegionAsCpuModified(deleted_buffer.CpuAddr(), deleted_buffer.SizeBytes());
+
+    Unregister(buffer_id);
+    delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id]));
+
+    NotifyBufferDeletion();
+}
+
+template <class P> // Replaces old_buffer_id with new_buffer_id in the uncommitted and committed download lists
+void BufferCache<P>::ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id) {
+ const auto replace = [old_buffer_id, new_buffer_id](std::vector<BufferId>& buffers) {
+ std::ranges::replace(buffers, old_buffer_id, new_buffer_id);
+ if (auto it = std::ranges::find(buffers, new_buffer_id); it != buffers.end()) {
+ buffers.erase(std::remove(it + 1, buffers.end(), new_buffer_id), buffers.end()); // Keep only the first occurrence of new_buffer_id
}
- uncommitted_flushes->insert(map);
+ };
+ replace(uncommitted_downloads);
+ std::ranges::for_each(committed_downloads, replace);
+}
+
+template <class P> // Marks uniform, index and vertex buffer state dirty and sets has_deleted_buffers
+void BufferCache<P>::NotifyBufferDeletion() {
+ if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
+ dirty_uniform_buffers.fill(~u32{0}); // Every uniform buffer slot of every stage becomes dirty
}
+ auto& flags = maxwell3d.dirty.flags;
+ flags[Dirty::IndexBuffer] = true;
+ flags[Dirty::VertexBuffers] = true;
+ for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
+ flags[Dirty::VertexBuffer0 + index] = true;
+ }
+ has_deleted_buffers = true; // Makes the UpdateGraphicsBuffers/UpdateComputeBuffers loops run another pass
+}
+
+/// Builds a storage buffer binding from GPU memory at `ssbo_addr`: a 64-bit GPU address is
+/// read at that address and a 32-bit size eight bytes after it.
+/// Returns NULL_BINDING when the address is unmapped or the size is zero; otherwise the
+/// binding carries a default-constructed buffer id.
+template <class P>
+typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr ssbo_addr) const {
+    const GPUVAddr gpu_addr = gpu_memory.Read<u64>(ssbo_addr);
+    const u32 size = gpu_memory.Read<u32>(ssbo_addr + 8);
+    const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
+    if (cpu_addr.has_value() && size != 0) {
+        return Binding{
+            .cpu_addr = *cpu_addr,
+            .size = size,
+            .buffer_id = BufferId{},
+        };
+    }
+    return NULL_BINDING;
+}
+
+/// Returns a span holding `size` bytes of guest memory starting at `cpu_addr`.
+/// The span points at the pointer returned by cpu_memory when the range is granular or
+/// when the pointer for the end address equals that pointer plus `size`; otherwise the
+/// data is read into the immediate buffer and that is returned.
+template <class P>
+std::span<const u8> BufferCache<P>::ImmediateBufferWithData(VAddr cpu_addr, size_t size) {
+    u8* const base_pointer = cpu_memory.GetPointer(cpu_addr);
+    const bool can_use_pointer =
+        IsRangeGranular(cpu_addr, size) ||
+        base_pointer + size == cpu_memory.GetPointer(cpu_addr + size);
+    if (!can_use_pointer) {
+        const std::span<u8> staging = ImmediateBuffer(size);
+        cpu_memory.ReadBlockUnsafe(cpu_addr, staging.data(), size);
+        return staging;
+    }
+    return std::span(base_pointer, size);
+}
- VideoCore::RasterizerInterface& rasterizer;
- Tegra::MemoryManager& gpu_memory;
- Core::Memory::Memory& cpu_memory;
- StreamBuffer& stream_buffer;
-
- u8* buffer_ptr = nullptr;
- u64 buffer_offset = 0;
- u64 buffer_offset_base = 0;
-
- MapIntervalAllocator mapped_addresses_allocator;
- boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>>
- mapped_addresses;
-
- std::unordered_map<u64, u32> written_pages;
- std::unordered_map<u64, std::shared_ptr<Buffer>> blocks;
-
- std::queue<std::shared_ptr<Buffer>> pending_destruction;
- u64 epoch = 0;
- u64 modified_ticks = 0;
-
- std::vector<u8> staging_buffer;
-
- std::list<MapInterval*> marked_for_unregister;
-
- std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes;
- std::list<std::shared_ptr<std::list<MapInterval*>>> committed_flushes;
-
- std::recursive_mutex mutex;
-};
+/// Returns a span of `wanted_capacity` bytes over the immediate buffer, reallocating it
+/// when the current capacity is too small. Reallocation does not preserve the old contents.
+template <class P>
+std::span<u8> BufferCache<P>::ImmediateBuffer(size_t wanted_capacity) {
+    if (wanted_capacity > immediate_buffer_capacity) {
+        // Allocate first: if this throws, the recorded capacity must still describe the
+        // allocation that is actually held
+        immediate_buffer_alloc = std::make_unique<u8[]>(wanted_capacity);
+        immediate_buffer_capacity = wanted_capacity;
+    }
+    return std::span<u8>(immediate_buffer_alloc.get(), wanted_capacity);
+}
+
+/// Returns true when the fast uniform buffer bit of `binding_index` is set for the stage.
+/// Always false outside OpenGL, the only backend with fast uniform buffers.
+template <class P>
+bool BufferCache<P>::HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept {
+    if constexpr (!IS_OPENGL) {
+        return false;
+    } else {
+        return (fast_bound_uniform_buffers[stage] & (1U << binding_index)) != 0;
+    }
+}
} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/map_interval.cpp b/src/video_core/buffer_cache/map_interval.cpp
deleted file mode 100644
index 62587e18a..000000000
--- a/src/video_core/buffer_cache/map_interval.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright 2020 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include <algorithm>
-#include <array>
-#include <cstddef>
-#include <memory>
-
-#include "video_core/buffer_cache/map_interval.h"
-
-namespace VideoCommon {
-
-MapIntervalAllocator::MapIntervalAllocator() {
- FillFreeList(first_chunk);
-}
-
-MapIntervalAllocator::~MapIntervalAllocator() = default;
-
-void MapIntervalAllocator::AllocateNewChunk() {
- *new_chunk = std::make_unique<Chunk>();
- FillFreeList(**new_chunk);
- new_chunk = &(*new_chunk)->next;
-}
-
-void MapIntervalAllocator::FillFreeList(Chunk& chunk) {
- const std::size_t old_size = free_list.size();
- free_list.resize(old_size + chunk.data.size());
- std::transform(chunk.data.rbegin(), chunk.data.rend(), free_list.begin() + old_size,
- [](MapInterval& interval) { return &interval; });
-}
-
-} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h
deleted file mode 100644
index ef974b08a..000000000
--- a/src/video_core/buffer_cache/map_interval.h
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright 2019 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <array>
-#include <cstddef>
-#include <memory>
-#include <vector>
-
-#include <boost/intrusive/set_hook.hpp>
-
-#include "common/common_types.h"
-#include "video_core/gpu.h"
-
-namespace VideoCommon {
-
-struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> {
- MapInterval() = default;
-
- /*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {}
-
- explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept
- : start{start_}, end{end_}, gpu_addr{gpu_addr_} {}
-
- bool IsInside(VAddr other_start, VAddr other_end) const noexcept {
- return start <= other_start && other_end <= end;
- }
-
- bool Overlaps(VAddr other_start, VAddr other_end) const noexcept {
- return start < other_end && other_start < end;
- }
-
- void MarkAsModified(bool is_modified_, u64 ticks_) noexcept {
- is_modified = is_modified_;
- ticks = ticks_;
- }
-
- boost::intrusive::set_member_hook<> member_hook_;
- VAddr start = 0;
- VAddr end = 0;
- GPUVAddr gpu_addr = 0;
- u64 ticks = 0;
- bool is_written = false;
- bool is_modified = false;
- bool is_registered = false;
- bool is_memory_marked = false;
- bool is_sync_pending = false;
-};
-
-struct MapIntervalCompare {
- constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept {
- return lhs.start < rhs.start;
- }
-};
-
-class MapIntervalAllocator {
-public:
- MapIntervalAllocator();
- ~MapIntervalAllocator();
-
- MapInterval* Allocate() {
- if (free_list.empty()) {
- AllocateNewChunk();
- }
- MapInterval* const interval = free_list.back();
- free_list.pop_back();
- return interval;
- }
-
- void Release(MapInterval* interval) {
- free_list.push_back(interval);
- }
-
-private:
- struct Chunk {
- std::unique_ptr<Chunk> next;
- std::array<MapInterval, 0x8000> data;
- };
-
- void AllocateNewChunk();
-
- void FillFreeList(Chunk& chunk);
-
- std::vector<MapInterval*> free_list;
-
- Chunk first_chunk;
-
- std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
-};
-
-} // namespace VideoCommon
diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp
index 55e632346..2b7569335 100644
--- a/src/video_core/command_classes/vic.cpp
+++ b/src/video_core/command_classes/vic.cpp
@@ -110,12 +110,10 @@ void Vic::Execute() {
converted_frame_buffer.get(), block_height, 0, 0);
gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size);
- gpu.Maxwell3D().OnMemoryWrite();
} else {
// send pitch linear frame
gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr,
linear_size);
- gpu.Maxwell3D().OnMemoryWrite();
}
break;
}
@@ -163,7 +161,6 @@ void Vic::Execute() {
}
gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(),
chroma_buffer.size());
- gpu.Maxwell3D().OnMemoryWrite();
break;
}
default:
diff --git a/src/video_core/dirty_flags.cpp b/src/video_core/dirty_flags.cpp
index b1eaac00c..7149af290 100644
--- a/src/video_core/dirty_flags.cpp
+++ b/src/video_core/dirty_flags.cpp
@@ -12,13 +12,30 @@
#define NUM(field_name) (sizeof(::Tegra::Engines::Maxwell3D::Regs::field_name) / (sizeof(u32)))
namespace VideoCommon::Dirty {
-
+namespace {
using Tegra::Engines::Maxwell3D;
-void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables) {
+/// Registers every vertex array's registers (its vertex_array entry and its vertex_array_limit
+/// entry) under both the per-array flag VertexBuffer0 + i and the shared VertexBuffers flag.
+void SetupDirtyVertexBuffers(Maxwell3D::DirtyState::Tables& tables) {
+    static constexpr std::size_t num_array = 3;
+    static constexpr std::size_t num_limit = NUM(vertex_array_limit[0]);
+    for (std::size_t i = 0; i < Maxwell3D::Regs::NumVertexArrays; ++i) {
+        const std::size_t array_offset = OFF(vertex_array) + i * NUM(vertex_array[0]);
+        const std::size_t limit_offset = OFF(vertex_array_limit) + i * num_limit;
+
+        FillBlock(tables, array_offset, num_array, VertexBuffer0 + i, VertexBuffers);
+        // Cover only this array's limit registers. Passing the size of the whole
+        // vertex_array_limit array would run past entry i and tag registers that belong to other
+        // arrays, and registers located after the limit array, with this array's flag.
+        FillBlock(tables, limit_offset, num_limit, VertexBuffer0 + i, VertexBuffers);
+    }
+}
+
+/// Tags every index_array register with the IndexBuffer flag in the first dirty table.
+void SetupIndexBuffer(Maxwell3D::DirtyState::Tables& tables) {
+    auto& first_table = tables[0];
+    FillBlock(first_table, OFF(index_array), NUM(index_array), IndexBuffer);
+}
+
+/// Tags the tic and tsc register ranges with the Descriptors flag in the first dirty table.
+void SetupDirtyDescriptors(Maxwell3D::DirtyState::Tables& tables) {
FillBlock(tables[0], OFF(tic), NUM(tic), Descriptors);
FillBlock(tables[0], OFF(tsc), NUM(tsc), Descriptors);
+}
+void SetupDirtyRenderTargets(Maxwell3D::DirtyState::Tables& tables) {
static constexpr std::size_t num_per_rt = NUM(rt[0]);
static constexpr std::size_t begin = OFF(rt);
static constexpr std::size_t num = num_per_rt * Maxwell3D::Regs::NumRenderTargets;
@@ -41,5 +58,13 @@ void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tabl
FillBlock(table, OFF(zeta), NUM(zeta), flag);
}
}
+} // Anonymous namespace
+
+/// Fills the dirty tables for vertex buffers, the index buffer, descriptors and render targets,
+/// in that order.
+void SetupDirtyFlags(Maxwell3D::DirtyState::Tables& tables) {
+    using SetupStep = void (*)(Maxwell3D::DirtyState::Tables&);
+    static constexpr SetupStep steps[]{
+        SetupDirtyVertexBuffers,
+        SetupIndexBuffer,
+        SetupDirtyDescriptors,
+        SetupDirtyRenderTargets,
+    };
+    for (const SetupStep step : steps) {
+        step(tables);
+    }
+}
} // namespace VideoCommon::Dirty
diff --git a/src/video_core/dirty_flags.h b/src/video_core/dirty_flags.h
index 875527ddd..702688ace 100644
--- a/src/video_core/dirty_flags.h
+++ b/src/video_core/dirty_flags.h
@@ -30,6 +30,12 @@ enum : u8 {
ColorBuffer7,
ZetaBuffer,
+ VertexBuffers,
+ VertexBuffer0,
+ VertexBuffer31 = VertexBuffer0 + 31,
+
+ IndexBuffer,
+
LastCommonEntry,
};
@@ -47,6 +53,6 @@ void FillBlock(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables, std::size_
FillBlock(tables[1], begin, num, index_b);
}
-void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables);
+void SetupDirtyFlags(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables);
} // namespace VideoCommon::Dirty
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index 2c8b20024..8b33c04ab 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -23,8 +23,6 @@ void DmaPusher::DispatchCalls() {
MICROPROFILE_SCOPE(DispatchCalls);
gpu.SyncGuestHost();
- // On entering GPU code, assume all memory may be touched by the ARM core.
- gpu.Maxwell3D().OnMemoryWrite();
dma_pushbuffer_subindex = 0;
diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp
index ed29fc7ac..a9b75091e 100644
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -39,7 +39,6 @@ void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_cal
case KEPLER_COMPUTE_REG_INDEX(data_upload): {
upload_state.ProcessData(method_argument, is_last_call);
if (is_last_call) {
- system.GPU().Maxwell3D().OnMemoryWrite();
}
break;
}
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index 9911140e9..560551157 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -33,7 +33,6 @@ void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call
case KEPLERMEMORY_REG_INDEX(data): {
upload_state.ProcessData(method_argument, is_last_call);
if (is_last_call) {
- system.GPU().Maxwell3D().OnMemoryWrite();
}
break;
}
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index d6ba9da5c..75517a4f7 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -223,7 +223,6 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume
case MAXWELL3D_REG_INDEX(data_upload):
upload_state.ProcessData(argument, is_last_call);
if (is_last_call) {
- OnMemoryWrite();
}
return;
case MAXWELL3D_REG_INDEX(fragment_barrier):
@@ -570,17 +569,18 @@ std::optional<u64> Maxwell3D::GetQueryResult() {
}
}
-void Maxwell3D::ProcessCBBind(std::size_t stage_index) {
+void Maxwell3D::ProcessCBBind(size_t stage_index) {
// Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage.
- auto& shader = state.shader_stages[stage_index];
- auto& bind_data = regs.cb_bind[stage_index];
-
+ const auto& bind_data = regs.cb_bind[stage_index];
+ // bind_data.index is read from the cb_bind register state and indexes const_buffers below,
+ // so the range check has to stay ahead of that access.
ASSERT(bind_data.index < Regs::MaxConstBuffers);
- auto& buffer = shader.const_buffers[bind_data.index];
-
+ auto& buffer = state.shader_stages[stage_index].const_buffers[bind_data.index];
buffer.enabled = bind_data.valid.Value() != 0;
buffer.address = regs.const_buffer.BufferAddress();
buffer.size = regs.const_buffer.cb_size;
+
+ // A disabled binding is reported to the rasterizer as address 0 with size 0.
+ const bool is_enabled = bind_data.valid.Value() != 0;
+ const GPUVAddr gpu_addr = is_enabled ? regs.const_buffer.BufferAddress() : 0;
+ const u32 size = is_enabled ? regs.const_buffer.cb_size : 0;
+ rasterizer->BindGraphicsUniformBuffer(stage_index, bind_data.index, gpu_addr, size);
}
void Maxwell3D::ProcessCBData(u32 value) {
@@ -635,7 +635,6 @@ void Maxwell3D::FinishCBData() {
const u32 id = cb_data_state.id;
memory_manager.WriteBlock(address, cb_data_state.buffer[id].data(), size);
- OnMemoryWrite();
cb_data_state.id = null_cb_data;
cb_data_state.current = null_cb_data;
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index cc94d2678..ffed42a29 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -1314,8 +1314,7 @@ public:
+ /// Returns the 64-bit address assembled from limit_high (upper 32 bits) and limit_low.
+ /// The value is returned as stored; no +1 is applied any more.
GPUVAddr LimitAddress() const {
return static_cast<GPUVAddr>((static_cast<GPUVAddr>(limit_high) << 32) |
- limit_low) +
- 1;
+ limit_low);
}
} vertex_array_limit[NumVertexArrays];
@@ -1403,6 +1402,7 @@ public:
};
std::array<ShaderStageInfo, Regs::MaxShaderStage> shader_stages;
+
u32 current_instance = 0; ///< Current instance to be used to simulate instanced rendering.
};
@@ -1452,11 +1452,6 @@ public:
return *rasterizer;
}
- /// Notify a memory write has happened.
- void OnMemoryWrite() {
- dirty.flags |= dirty.on_write_stores;
- }
-
enum class MMEDrawMode : u32 {
Undefined,
Array,
@@ -1478,7 +1473,6 @@ public:
using Tables = std::array<Table, 2>;
Flags flags;
- Flags on_write_stores;
Tables tables{};
} dirty;
@@ -1541,7 +1535,7 @@ private:
void FinishCBData();
/// Handles a write to the CB_BIND register.
- void ProcessCBBind(std::size_t stage_index);
+ void ProcessCBBind(size_t stage_index);
/// Handles a write to the VERTEX_END_GL register, triggering a draw.
void DrawArrays();
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index ba750748c..a2f19559f 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -60,9 +60,6 @@ void MaxwellDMA::Launch() {
return;
}
- // All copies here update the main memory, so mark all rasterizer states as invalid.
- system.GPU().Maxwell3D().OnMemoryWrite();
-
if (is_src_pitch && is_dst_pitch) {
CopyPitchToPitch();
} else {
diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h
index 3512283ff..f055b61e9 100644
--- a/src/video_core/fence_manager.h
+++ b/src/video_core/fence_manager.h
@@ -143,22 +143,26 @@ private:
}
+ /// Returns true if the texture, buffer or query cache reports async flushes to wait on.
bool ShouldWait() const {
+ // Both cache mutexes are taken together and held for the whole check.
+ std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
return texture_cache.ShouldWaitAsyncFlushes() || buffer_cache.ShouldWaitAsyncFlushes() ||
query_cache.ShouldWaitAsyncFlushes();
}
+ /// Returns true if the texture, buffer or query cache has uncommitted flushes.
bool ShouldFlush() const {
+ // Both cache mutexes are taken together and held for the whole check.
+ std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
return texture_cache.HasUncommittedFlushes() || buffer_cache.HasUncommittedFlushes() ||
query_cache.HasUncommittedFlushes();
}
+ /// Pops async flushes on the texture, buffer and query caches, in that order.
void PopAsyncFlushes() {
+ // Both cache mutexes are taken together and held across all three calls.
+ std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
texture_cache.PopAsyncFlushes();
buffer_cache.PopAsyncFlushes();
query_cache.PopAsyncFlushes();
}
void CommitAsyncFlushes() {
+ std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
texture_cache.CommitAsyncFlushes();
buffer_cache.CommitAsyncFlushes();
query_cache.CommitAsyncFlushes();
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index 28f2b8614..970120acc 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -12,7 +12,6 @@ set(SHADER_FILES
vulkan_blit_depth_stencil.frag
vulkan_present.frag
vulkan_present.vert
- vulkan_quad_array.comp
vulkan_quad_indexed.comp
vulkan_uint8.comp
)
diff --git a/src/video_core/host_shaders/vulkan_quad_array.comp b/src/video_core/host_shaders/vulkan_quad_array.comp
deleted file mode 100644
index 212f4e998..000000000
--- a/src/video_core/host_shaders/vulkan_quad_array.comp
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright 2019 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#version 460 core
-
-layout (local_size_x = 1024) in;
-
-layout (std430, set = 0, binding = 0) buffer OutputBuffer {
- uint output_indexes[];
-};
-
-layout (push_constant) uniform PushConstants {
- uint first;
-};
-
-void main() {
- uint primitive = gl_GlobalInvocationID.x;
- if (primitive * 6 >= output_indexes.length()) {
- return;
- }
-
- const uint quad_map[6] = uint[](0, 1, 2, 0, 2, 3);
- for (uint vertex = 0; vertex < 6; ++vertex) {
- uint index = first + primitive * 4 + quad_map[vertex];
- output_indexes[primitive * 6 + vertex] = index;
- }
-}
diff --git a/src/video_core/host_shaders/vulkan_uint8.comp b/src/video_core/host_shaders/vulkan_uint8.comp
index ad74d7af9..872291670 100644
--- a/src/video_core/host_shaders/vulkan_uint8.comp
+++ b/src/video_core/host_shaders/vulkan_uint8.comp
@@ -16,9 +16,16 @@ layout (std430, set = 0, binding = 1) writeonly buffer OutputBuffer {
uint16_t output_indexes[];
};
+// Reads input_indexes[id] widened to uint, remapping the value 0xFF to 0xFFFF.
+uint AssembleIndex(uint id) {
+    // Most primitive restart indices are 0xFF
+    // Hardcode this to 0xFF for now
+    uint index = uint(input_indexes[id]);
+    if (index == 0xFF) {
+        return 0xFFFFu;
+    }
+    return index;
+}
+
+// Each invocation converts one entry of input_indexes into a 16-bit entry of output_indexes.
void main() {
uint id = gl_GlobalInvocationID.x;
+ // Invocations whose id is past the end of the input write nothing.
if (id < input_indexes.length()) {
- output_indexes[id] = uint16_t(input_indexes[id]);
+ output_indexes[id] = uint16_t(AssembleIndex(id));
}
}
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index 0cb0f387d..50491b758 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -7,6 +7,7 @@
#include <atomic>
#include <functional>
#include <optional>
+#include <span>
#include "common/common_types.h"
#include "video_core/engines/fermi_2d.h"
#include "video_core/gpu.h"
@@ -49,6 +50,10 @@ public:
/// Records a GPU query and caches it
virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0;
+ /// Signal a uniform buffer binding for a graphics shader stage.
+ /// Maxwell3D::ProcessCBBind passes gpu_addr = 0 and size = 0 when the binding is disabled.
+ virtual void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
+ u32 size) = 0;
+
/// Signal a GPU based semaphore as a fence
virtual void SignalSemaphore(GPUVAddr addr, u32 value) = 0;
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 5772cad87..889ad6c56 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -2,98 +2,235 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
-#include <memory>
+#include <span>
-#include <glad/glad.h>
-
-#include "common/assert.h"
-#include "common/microprofile.h"
#include "video_core/buffer_cache/buffer_cache.h"
-#include "video_core/engines/maxwell_3d.h"
-#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_opengl/gl_buffer_cache.h"
#include "video_core/renderer_opengl/gl_device.h"
-#include "video_core/renderer_opengl/gl_rasterizer.h"
-#include "video_core/renderer_opengl/gl_resource_manager.h"
+#include "video_core/vulkan_common/vulkan_device.h"
+#include "video_core/vulkan_common/vulkan_instance.h"
+#include "video_core/vulkan_common/vulkan_library.h"
+#include "video_core/vulkan_common/vulkan_memory_allocator.h"
namespace OpenGL {
+namespace {
+/// The four 32-bit words passed to glProgramLocalParametersI4uivNV to describe one storage
+/// buffer: a 64-bit address, a length and one word of padding.
+struct BindlessSSBO {
+    GLuint64EXT address;
+    GLsizei length;
+    GLsizei padding;
+};
+static_assert(sizeof(BindlessSSBO) == sizeof(GLuint) * 4);
+
+/// Parameter buffer binding target for each graphics stage, passed to glBindBufferRangeNV when
+/// binding uniform buffers for assembly shaders. Entries follow the stage order of PROGRAM_LUT.
+constexpr std::array PABO_LUT{
+    GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV,          GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
+    GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
+    GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV,
+};
+
+/// Assembly program target for each graphics stage, passed to glProgramLocalParametersI4uivNV.
+constexpr std::array PROGRAM_LUT{
+    GL_VERTEX_PROGRAM_NV,   GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV,
+    GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV,
+};
+} // Anonymous namespace
+
+/// Null-buffer constructor: only forwards `null_params` to BufferBase. No GL object is created
+/// here and the runtime argument is unused.
+Buffer::Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params)
+ : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(null_params) {}
+
+/// Creates and labels the GL buffer object, gives it SizeBytes() bytes of storage and, when the
+/// runtime uses unified vertex buffers, queries its GPU address.
+Buffer::Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_,
+               VAddr cpu_addr_, u64 size_bytes_)
+    : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(rasterizer_, cpu_addr_, size_bytes_) {
+    buffer.Create();
+
+    // Label the object with the CPU address it mirrors
+    const std::string label = fmt::format("Buffer 0x{:x}", CpuAddr());
+    glObjectLabel(GL_BUFFER, buffer.handle, static_cast<GLsizei>(label.size()), label.data());
+
+    const bool import_external_memory = runtime.device.UseAssemblyShaders();
+    if (!import_external_memory) {
+        glNamedBufferData(buffer.handle, SizeBytes(), nullptr, GL_DYNAMIC_DRAW);
+    } else {
+        // Assembly shaders: back the buffer with memory committed through the Vulkan allocator
+        CreateMemoryObjects(runtime);
+        glNamedBufferStorageMemEXT(buffer.handle, SizeBytes(), memory_commit.ExportOpenGLHandle(),
+                                   memory_commit.Offset());
+    }
+
+    if (runtime.has_unified_vertex_buffers) {
+        glGetNamedBufferParameterui64vNV(buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &address);
+    }
+}
-using Maxwell = Tegra::Engines::Maxwell3D::Regs;
+/// Copies `data` into the GL buffer starting at byte `offset` with glNamedBufferSubData.
+void Buffer::ImmediateUpload(size_t offset, std::span<const u8> data) noexcept {
+    const auto gl_offset = static_cast<GLintptr>(offset);
+    const auto gl_size = static_cast<GLsizeiptr>(data.size_bytes());
+    glNamedBufferSubData(buffer.handle, gl_offset, gl_size, data.data());
+}
-MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
+/// Reads `data.size_bytes()` bytes from the GL buffer, starting at byte `offset`, into `data`
+/// with glGetNamedBufferSubData.
+void Buffer::ImmediateDownload(size_t offset, std::span<u8> data) noexcept {
+    const auto gl_offset = static_cast<GLintptr>(offset);
+    const auto gl_size = static_cast<GLsizeiptr>(data.size_bytes());
+    glGetNamedBufferSubData(buffer.handle, gl_offset, gl_size, data.data());
+}
-Buffer::Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_)
- : BufferBlock{cpu_addr_, size_} {
- gl_buffer.Create();
- glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size_), nullptr, GL_DYNAMIC_DRAW);
- if (device_.UseAssemblyShaders() || device_.HasVertexBufferUnifiedMemory()) {
- glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE);
- glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
+/// Makes the buffer resident with `access`, promoting an existing residency if needed.
+/// Does nothing when `access` is not greater than the current residency access or when the
+/// buffer has no GL handle.
+void Buffer::MakeResident(GLenum access) noexcept {
+ // Abuse GLenum's order to exit early
+ // GL_NONE (default) < GL_READ_ONLY < GL_READ_WRITE
+ if (access <= current_residency_access || buffer.handle == 0) {
+ return;
+ }
+ // Record the new access and inspect the previous one in a single step
+ if (std::exchange(current_residency_access, access) != GL_NONE) {
+ // If the buffer is already resident, remove its residency before promoting it
+ glMakeNamedBufferNonResidentNV(buffer.handle);
}
+ glMakeNamedBufferResidentNV(buffer.handle, access);
}
-Buffer::~Buffer() = default;
+/// Returns a GL buffer name whose storage begins `offset` bytes into this buffer's memory commit.
+/// Offset 0 yields the main buffer; any other offset is created on first use and kept in `subs`.
+GLuint Buffer::SubBuffer(u32 offset) {
+    if (offset == 0) {
+        return buffer.handle;
+    }
+    // Reuse a sub buffer previously created for this offset
+    for (const auto& cached : subs) {
+        if (cached.second == offset) {
+            return cached.first.handle;
+        }
+    }
+    OGLBuffer new_sub;
+    new_sub.Create();
+    glNamedBufferStorageMemEXT(new_sub.handle, SizeBytes() - offset,
+                               memory_commit.ExportOpenGLHandle(), memory_commit.Offset() + offset);
+    auto& entry = subs.emplace_back(std::move(new_sub), offset);
+    return entry.first.handle;
+}
-void Buffer::Upload(std::size_t offset, std::size_t data_size, const u8* data) {
- glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset),
- static_cast<GLsizeiptr>(data_size), data);
+/// Commits device-local memory through the runtime's Vulkan allocator and stores it in
+/// `memory_commit`. The size comes from the memory requirements of a local Vulkan buffer of
+/// SizeBytes() bytes; that Vulkan buffer is not kept past this function.
+void Buffer::CreateMemoryObjects(BufferCacheRuntime& runtime) {
+    static constexpr VkBufferUsageFlags USAGE =
+        VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
+        VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT |
+        VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+        VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
+    const VkBufferCreateInfo buffer_ci{
+        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+        .pNext = nullptr,
+        .flags = 0,
+        .size = SizeBytes(),
+        .usage = USAGE,
+        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+        .queueFamilyIndexCount = 0,
+        .pQueueFamilyIndices = nullptr,
+    };
+    auto& logical = runtime.vulkan_device->GetLogical();
+    auto vulkan_buffer = logical.CreateBuffer(buffer_ci);
+    const VkMemoryRequirements requirements = logical.GetBufferMemoryRequirements(*vulkan_buffer);
+    memory_commit =
+        runtime.vulkan_memory_allocator->Commit(requirements, Vulkan::MemoryUsage::DeviceLocal);
+}
-void Buffer::Download(std::size_t offset, std::size_t data_size, u8* data) {
- MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
- const GLsizeiptr gl_size = static_cast<GLsizeiptr>(data_size);
- const GLintptr gl_offset = static_cast<GLintptr>(offset);
- if (read_buffer.handle == 0) {
- read_buffer.Create();
- glNamedBufferData(read_buffer.handle, static_cast<GLsizeiptr>(Size()), nullptr,
- GL_STREAM_READ);
+/// Stores the device and Vulkan handles, caches GL_MAX_VERTEX_ATTRIBS and two device capability
+/// flags, and creates one GL_STREAM_DRAW buffer of BufferCache::SKIP_CACHE_SIZE bytes for every
+/// entry of `fast_uniforms`.
+BufferCacheRuntime::BufferCacheRuntime(const Device& device_, const Vulkan::Device* vulkan_device_,
+ Vulkan::MemoryAllocator* vulkan_memory_allocator_)
+ : device{device_}, vulkan_device{vulkan_device_},
+ vulkan_memory_allocator{vulkan_memory_allocator_},
+ // The stream buffer is only constructed when the device lacks fast buffer sub data
+ stream_buffer{device.HasFastBufferSubData() ? std::nullopt
+ : std::make_optional<StreamBuffer>()} {
+ GLint gl_max_attributes;
+ glGetIntegerv(GL_MAX_VERTEX_ATTRIBS, &gl_max_attributes);
+ max_attributes = static_cast<u32>(gl_max_attributes);
+ use_assembly_shaders = device.UseAssemblyShaders();
+ has_unified_vertex_buffers = device.HasVertexBufferUnifiedMemory();
+
+ for (auto& stage_uniforms : fast_uniforms) {
+ for (OGLBuffer& buffer : stage_uniforms) {
+ buffer.Create();
+ glNamedBufferData(buffer.handle, BufferCache::SKIP_CACHE_SIZE, nullptr, GL_STREAM_DRAW);
+ }
}
- glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
- glCopyNamedBufferSubData(gl_buffer.handle, read_buffer.handle, gl_offset, gl_offset, gl_size);
- glGetNamedBufferSubData(read_buffer.handle, gl_offset, gl_size, data);
}
-void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
- std::size_t copy_size) {
- glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset),
- static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(copy_size));
+/// Issues one glCopyNamedBufferSubData from `src_buffer` to `dst_buffer` for every entry of
+/// `copies`, using that entry's src_offset, dst_offset and size.
+void BufferCacheRuntime::CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer,
+ std::span<const VideoCommon::BufferCopy> copies) {
+ for (const VideoCommon::BufferCopy& copy : copies) {
+ glCopyNamedBufferSubData(
+ src_buffer.Handle(), dst_buffer.Handle(), static_cast<GLintptr>(copy.src_offset),
+ static_cast<GLintptr>(copy.dst_offset), static_cast<GLsizeiptr>(copy.size));
+ }
}
-OGLBufferCache::OGLBufferCache(VideoCore::RasterizerInterface& rasterizer_,
- Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
- const Device& device_, OGLStreamBuffer& stream_buffer_,
- StateTracker& state_tracker)
- : GenericBufferCache{rasterizer_, gpu_memory_, cpu_memory_, stream_buffer_}, device{device_} {
- if (!device.HasFastBufferSubData()) {
- return;
+/// Binds `buffer` as the index buffer.
+/// With unified vertex buffers the buffer is made resident and bound by GPU address range.
+/// Otherwise it is bound to GL_ELEMENT_ARRAY_BUFFER and `offset` is saved in
+/// index_buffer_offset; `size` is not used on that path.
+void BufferCacheRuntime::BindIndexBuffer(Buffer& buffer, u32 offset, u32 size) {
+ if (has_unified_vertex_buffers) {
+ buffer.MakeResident(GL_READ_ONLY);
+ glBufferAddressRangeNV(GL_ELEMENT_ARRAY_ADDRESS_NV, 0, buffer.HostGpuAddr() + offset,
+ static_cast<GLsizeiptr>(size));
+ } else {
+ glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer.Handle());
+ index_buffer_offset = offset;
}
+}
- static constexpr GLsizeiptr size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
- glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
- for (const GLuint cbuf : cbufs) {
- glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW);
+/// Binds `buffer` as vertex buffer `index` with the given stride.
+/// Requests with index >= max_attributes are ignored.
+void BufferCacheRuntime::BindVertexBuffer(u32 index, Buffer& buffer, u32 offset, u32 size,
+ u32 stride) {
+ if (index >= max_attributes) {
+ return;
+ }
+ if (has_unified_vertex_buffers) {
+ buffer.MakeResident(GL_READ_ONLY);
+ // Only the stride goes through glBindVertexBuffer here (buffer 0, offset 0); the data
+ // location is supplied as a GPU address range instead.
+ glBindVertexBuffer(index, 0, 0, static_cast<GLsizei>(stride));
+ glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, index,
+ buffer.HostGpuAddr() + offset, static_cast<GLsizeiptr>(size));
+ } else {
+ glBindVertexBuffer(index, buffer.Handle(), static_cast<GLintptr>(offset),
+ static_cast<GLsizei>(stride));
}
}
-OGLBufferCache::~OGLBufferCache() {
- glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
+/// Binds `size` bytes of `buffer` starting at `offset` as uniform buffer `binding_index` of
+/// graphics stage `stage`.
+void BufferCacheRuntime::BindUniformBuffer(size_t stage, u32 binding_index, Buffer& buffer,
+ u32 offset, u32 size) {
+ if (use_assembly_shaders) {
+ // The sub buffer already starts at `offset`, so the range is bound from 0
+ const GLuint sub_buffer = buffer.SubBuffer(offset);
+ glBindBufferRangeNV(PABO_LUT[stage], binding_index, sub_buffer, 0,
+ static_cast<GLsizeiptr>(size));
+ } else {
+ // The GL binding point is the stage's base uniform binding plus binding_index
+ const GLuint base_binding = device.GetBaseBindings(stage).uniform_buffer;
+ const GLuint binding = base_binding + binding_index;
+ glBindBufferRange(GL_UNIFORM_BUFFER, binding, buffer.Handle(),
+ static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
+ }
}
-std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
- return std::make_shared<Buffer>(device, cpu_addr, size);
+/// Binds `size` bytes of `buffer` starting at `offset` as compute uniform buffer
+/// `binding_index`.
+void BufferCacheRuntime::BindComputeUniformBuffer(u32 binding_index, Buffer& buffer, u32 offset,
+ u32 size) {
+ if (use_assembly_shaders) {
+ // The sub buffer already starts at `offset`, so the range is bound from 0
+ glBindBufferRangeNV(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding_index,
+ buffer.SubBuffer(offset), 0, static_cast<GLsizeiptr>(size));
+ } else {
+ // binding_index is used directly as the GL binding point on this path
+ glBindBufferRange(GL_UNIFORM_BUFFER, binding_index, buffer.Handle(),
+ static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
+ }
}
-OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) {
- return {0, 0, 0};
+/// Binds `size` bytes of `buffer` starting at `offset` as storage buffer `binding_index` of
+/// graphics stage `stage`. `is_written` selects the residency access on the assembly path.
+void BufferCacheRuntime::BindStorageBuffer(size_t stage, u32 binding_index, Buffer& buffer,
+ u32 offset, u32 size, bool is_written) {
+ if (use_assembly_shaders) {
+ // Assembly shaders receive the buffer as an address/length pair in a local parameter
+ const BindlessSSBO ssbo{
+ .address = buffer.HostGpuAddr() + offset,
+ .length = static_cast<GLsizei>(size),
+ .padding = 0,
+ };
+ buffer.MakeResident(is_written ? GL_READ_WRITE : GL_READ_ONLY);
+ glProgramLocalParametersI4uivNV(PROGRAM_LUT[stage], binding_index, 1,
+ reinterpret_cast<const GLuint*>(&ssbo));
+ } else {
+ // The GL binding point is the stage's base storage binding plus binding_index
+ const GLuint base_binding = device.GetBaseBindings(stage).shader_storage_buffer;
+ const GLuint binding = base_binding + binding_index;
+ glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, buffer.Handle(),
+ static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
+ }
}
-OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,
- std::size_t size) {
- DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));
- const GLuint cbuf = cbufs[cbuf_cursor++];
+/// Binds `size` bytes of `buffer` starting at `offset` as compute storage buffer
+/// `binding_index`. `is_written` selects the residency access on the assembly path.
+void BufferCacheRuntime::BindComputeStorageBuffer(u32 binding_index, Buffer& buffer, u32 offset,
+                                                  u32 size, bool is_written) {
+    if (!use_assembly_shaders) {
+        if (size == 0) {
+            // Empty range: bind buffer 0 to the slot instead
+            glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding_index, 0, 0, 0);
+        } else {
+            glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding_index, buffer.Handle(),
+                              static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
+        }
+        return;
+    }
+    // Assembly shaders receive the buffer as an address/length pair in a local parameter
+    const BindlessSSBO descriptor{
+        .address = buffer.HostGpuAddr() + offset,
+        .length = static_cast<GLsizei>(size),
+        .padding = 0,
+    };
+    const GLenum residency = is_written ? GL_READ_WRITE : GL_READ_ONLY;
+    buffer.MakeResident(residency);
+    glProgramLocalParametersI4uivNV(GL_COMPUTE_PROGRAM_NV, binding_index, 1,
+                                    reinterpret_cast<const GLuint*>(&descriptor));
+}
- glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer);
- return {cbuf, 0, 0};
+/// Binds `size` bytes of `buffer` starting at `offset` to transform feedback binding `index`.
+void BufferCacheRuntime::BindTransformFeedbackBuffer(u32 index, Buffer& buffer, u32 offset,
+ u32 size) {
+ glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, index, buffer.Handle(),
+ static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
}
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index 17ee90316..f4d8871a9 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -5,79 +5,167 @@
#pragma once
#include <array>
-#include <memory>
+#include <span>
+#include "common/alignment.h"
#include "common/common_types.h"
+#include "common/dynamic_library.h"
#include "video_core/buffer_cache/buffer_cache.h"
-#include "video_core/engines/maxwell_3d.h"
+#include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_stream_buffer.h"
+#include "video_core/vulkan_common/vulkan_device.h"
+#include "video_core/vulkan_common/vulkan_memory_allocator.h"
-namespace Core {
-class System;
-}
+namespace Vulkan {
+class Device;
+class MemoryAllocator;
+} // namespace Vulkan
namespace OpenGL {
-class Device;
-class OGLStreamBuffer;
-class RasterizerOpenGL;
-class StateTracker;
+class BufferCacheRuntime;
-class Buffer : public VideoCommon::BufferBlock {
+class Buffer : public VideoCommon::BufferBase<VideoCore::RasterizerInterface> {
public:
- explicit Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_);
- ~Buffer();
+ explicit Buffer(BufferCacheRuntime&, VideoCore::RasterizerInterface& rasterizer, VAddr cpu_addr,
+ u64 size_bytes);
+ explicit Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams);
- void Upload(std::size_t offset, std::size_t data_size, const u8* data);
+ void ImmediateUpload(size_t offset, std::span<const u8> data) noexcept;
- void Download(std::size_t offset, std::size_t data_size, u8* data);
+ void ImmediateDownload(size_t offset, std::span<u8> data) noexcept;
- void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
- std::size_t copy_size);
+ void MakeResident(GLenum access) noexcept;
- GLuint Handle() const noexcept {
- return gl_buffer.handle;
+ [[nodiscard]] GLuint SubBuffer(u32 offset);
+
+ [[nodiscard]] GLuint64EXT HostGpuAddr() const noexcept {
+ return address;
}
- u64 Address() const noexcept {
- return gpu_address;
+ [[nodiscard]] GLuint Handle() const noexcept {
+ return buffer.handle;
}
private:
- OGLBuffer gl_buffer;
- OGLBuffer read_buffer;
- u64 gpu_address = 0;
+ void CreateMemoryObjects(BufferCacheRuntime& runtime);
+
+ GLuint64EXT address = 0;
+ Vulkan::MemoryCommit memory_commit;
+ OGLBuffer buffer;
+ GLenum current_residency_access = GL_NONE;
+ std::vector<std::pair<OGLBuffer, u32>> subs;
};
-using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
-class OGLBufferCache final : public GenericBufferCache {
+class BufferCacheRuntime {
+ friend Buffer;
+
public:
- explicit OGLBufferCache(VideoCore::RasterizerInterface& rasterizer,
- Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory,
- const Device& device, OGLStreamBuffer& stream_buffer,
- StateTracker& state_tracker);
- ~OGLBufferCache();
+ static constexpr u8 INVALID_BINDING = std::numeric_limits<u8>::max();
+
+ explicit BufferCacheRuntime(const Device& device_, const Vulkan::Device* vulkan_device_,
+ Vulkan::MemoryAllocator* vulkan_memory_allocator_);
+
+ void CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer,
+ std::span<const VideoCommon::BufferCopy> copies);
+
+ void BindIndexBuffer(Buffer& buffer, u32 offset, u32 size);
+
+ void BindVertexBuffer(u32 index, Buffer& buffer, u32 offset, u32 size, u32 stride);
+
+ void BindUniformBuffer(size_t stage, u32 binding_index, Buffer& buffer, u32 offset, u32 size);
+
+ void BindComputeUniformBuffer(u32 binding_index, Buffer& buffer, u32 offset, u32 size);
+
+ void BindStorageBuffer(size_t stage, u32 binding_index, Buffer& buffer, u32 offset, u32 size,
+ bool is_written);
+
+ void BindComputeStorageBuffer(u32 binding_index, Buffer& buffer, u32 offset, u32 size,
+ bool is_written);
- BufferInfo GetEmptyBuffer(std::size_t) override;
+ void BindTransformFeedbackBuffer(u32 index, Buffer& buffer, u32 offset, u32 size);
- void Acquire() noexcept {
- cbuf_cursor = 0;
+ void BindFastUniformBuffer(size_t stage, u32 binding_index, u32 size) {
+ if (use_assembly_shaders) {
+ const GLuint handle = fast_uniforms[stage][binding_index].handle;
+ const GLsizeiptr gl_size = static_cast<GLsizeiptr>(size);
+ glBindBufferRangeNV(PABO_LUT[stage], binding_index, handle, 0, gl_size);
+ } else {
+ const GLuint base_binding = device.GetBaseBindings(stage).uniform_buffer;
+ const GLuint binding = base_binding + binding_index;
+ glBindBufferRange(GL_UNIFORM_BUFFER, binding,
+ fast_uniforms[stage][binding_index].handle, 0,
+ static_cast<GLsizeiptr>(size));
+ }
}
-protected:
- std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
+ void PushFastUniformBuffer(size_t stage, u32 binding_index, std::span<const u8> data) {
+ if (use_assembly_shaders) {
+ glProgramBufferParametersIuivNV(
+ PABO_LUT[stage], binding_index, 0,
+ static_cast<GLsizei>(data.size_bytes() / sizeof(GLuint)),
+ reinterpret_cast<const GLuint*>(data.data()));
+ } else {
+ glNamedBufferSubData(fast_uniforms[stage][binding_index].handle, 0,
+ static_cast<GLsizeiptr>(data.size_bytes()), data.data());
+ }
+ }
+
+ std::span<u8> BindMappedUniformBuffer(size_t stage, u32 binding_index, u32 size) noexcept {
+ const auto [mapped_span, offset] = stream_buffer->Request(static_cast<size_t>(size));
+ const GLuint base_binding = device.GetBaseBindings(stage).uniform_buffer;
+ const GLuint binding = base_binding + binding_index;
+ glBindBufferRange(GL_UNIFORM_BUFFER, binding, stream_buffer->Handle(),
+ static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
+ return mapped_span;
+ }
- BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override;
+ [[nodiscard]] const GLvoid* IndexOffset() const noexcept {
+ return reinterpret_cast<const GLvoid*>(static_cast<uintptr_t>(index_buffer_offset));
+ }
+
+ [[nodiscard]] bool HasFastBufferSubData() const noexcept {
+ return device.HasFastBufferSubData();
+ }
private:
- static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
- Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
+ static constexpr std::array PABO_LUT{
+ GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
+ GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
+ GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV,
+ };
const Device& device;
+ const Vulkan::Device* vulkan_device;
+ Vulkan::MemoryAllocator* vulkan_memory_allocator;
+ std::optional<StreamBuffer> stream_buffer;
+
+ u32 max_attributes = 0;
- std::size_t cbuf_cursor = 0;
- std::array<GLuint, NUM_CBUFS> cbufs{};
+ bool use_assembly_shaders = false;
+ bool has_unified_vertex_buffers = false;
+
+ std::array<std::array<OGLBuffer, VideoCommon::NUM_GRAPHICS_UNIFORM_BUFFERS>,
+ VideoCommon::NUM_STAGES>
+ fast_uniforms;
+
+ u32 index_buffer_offset = 0;
+};
+
+struct BufferCacheParams {
+ using Runtime = OpenGL::BufferCacheRuntime;
+ using Buffer = OpenGL::Buffer;
+
+ static constexpr bool IS_OPENGL = true;
+ static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = true;
+ static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = true;
+ static constexpr bool NEEDS_BIND_UNIFORM_INDEX = true;
+ static constexpr bool NEEDS_BIND_STORAGE_INDEX = true;
+ static constexpr bool USE_MEMORY_MAPS = false;
};
+using BufferCache = VideoCommon::BufferCache<BufferCacheParams>;
+
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 04c267ee4..0f492f006 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -21,9 +21,7 @@
#include "video_core/renderer_opengl/gl_resource_manager.h"
namespace OpenGL {
-
namespace {
-
// One uniform block is reserved for emulation purposes
constexpr u32 ReservedUniformBlocks = 1;
@@ -197,11 +195,13 @@ bool IsASTCSupported() {
const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED");
return nsight || HasExtension(extensions, "GL_EXT_debug_tool");
}
-
} // Anonymous namespace
-Device::Device()
- : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
+Device::Device(bool has_vulkan_instance) {
+ if (!GLAD_GL_VERSION_4_6) {
+ LOG_ERROR(Render_OpenGL, "OpenGL 4.6 is not available");
+ throw std::runtime_error{"Insufficient version"};
+ }
const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
const std::vector extensions = GetExtensions();
@@ -217,6 +217,9 @@ Device::Device()
"Beta driver 443.24 is known to have issues. There might be performance issues.");
disable_fast_buffer_sub_data = true;
}
+
+ max_uniform_buffers = BuildMaxUniformBuffers();
+ base_bindings = BuildBaseBindings();
uniform_buffer_alignment = GetInteger<size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
shader_storage_alignment = GetInteger<size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
@@ -243,7 +246,8 @@ Device::Device()
use_assembly_shaders = Settings::values.use_assembly_shaders.GetValue() &&
GLAD_GL_NV_gpu_program5 && GLAD_GL_NV_compute_program5 &&
- GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2;
+ GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2 &&
+ has_vulkan_instance;
use_asynchronous_shaders = Settings::values.use_asynchronous_shaders.GetValue();
use_driver_cache = is_nvidia;
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index 9141de635..eb62ae52d 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -10,18 +10,16 @@
namespace OpenGL {
-static constexpr u32 EmulationUniformBlockBinding = 0;
-
-class Device final {
+class Device {
public:
- struct BaseBindings final {
+ struct BaseBindings {
u32 uniform_buffer{};
u32 shader_storage_buffer{};
u32 sampler{};
u32 image{};
};
- explicit Device();
+ explicit Device(bool has_vulkan_instance);
explicit Device(std::nullptr_t);
u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept {
diff --git a/src/video_core/renderer_opengl/gl_fence_manager.cpp b/src/video_core/renderer_opengl/gl_fence_manager.cpp
index 3e9c922f5..151290101 100644
--- a/src/video_core/renderer_opengl/gl_fence_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp
@@ -47,7 +47,7 @@ void GLInnerFence::Wait() {
FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_,
Tegra::GPU& gpu_, TextureCache& texture_cache_,
- OGLBufferCache& buffer_cache_, QueryCache& query_cache_)
+ BufferCache& buffer_cache_, QueryCache& query_cache_)
: GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_} {}
Fence FenceManagerOpenGL::CreateFence(u32 value, bool is_stubbed) {
diff --git a/src/video_core/renderer_opengl/gl_fence_manager.h b/src/video_core/renderer_opengl/gl_fence_manager.h
index 30dbee613..e714aa115 100644
--- a/src/video_core/renderer_opengl/gl_fence_manager.h
+++ b/src/video_core/renderer_opengl/gl_fence_manager.h
@@ -32,14 +32,13 @@ private:
};
using Fence = std::shared_ptr<GLInnerFence>;
-using GenericFenceManager =
- VideoCommon::FenceManager<Fence, TextureCache, OGLBufferCache, QueryCache>;
+using GenericFenceManager = VideoCommon::FenceManager<Fence, TextureCache, BufferCache, QueryCache>;
class FenceManagerOpenGL final : public GenericFenceManager {
public:
- explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_,
- TextureCache& texture_cache_, OGLBufferCache& buffer_cache_,
- QueryCache& query_cache_);
+ explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu,
+ TextureCache& texture_cache, BufferCache& buffer_cache,
+ QueryCache& query_cache);
protected:
Fence CreateFence(u32 value, bool is_stubbed) override;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index ea4ca9a82..52499ee4c 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -44,17 +44,10 @@ using VideoCore::Surface::PixelFormat;
using VideoCore::Surface::SurfaceTarget;
using VideoCore::Surface::SurfaceType;
-MICROPROFILE_DEFINE(OpenGL_VAO, "OpenGL", "Vertex Format Setup", MP_RGB(128, 128, 192));
-MICROPROFILE_DEFINE(OpenGL_VB, "OpenGL", "Vertex Buffer Setup", MP_RGB(128, 128, 192));
-MICROPROFILE_DEFINE(OpenGL_Shader, "OpenGL", "Shader Setup", MP_RGB(128, 128, 192));
-MICROPROFILE_DEFINE(OpenGL_UBO, "OpenGL", "Const Buffer Setup", MP_RGB(128, 128, 192));
-MICROPROFILE_DEFINE(OpenGL_Index, "OpenGL", "Index Buffer Setup", MP_RGB(128, 128, 192));
-MICROPROFILE_DEFINE(OpenGL_Texture, "OpenGL", "Texture Setup", MP_RGB(128, 128, 192));
-MICROPROFILE_DEFINE(OpenGL_Framebuffer, "OpenGL", "Framebuffer Setup", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192));
+MICROPROFILE_DEFINE(OpenGL_Clears, "OpenGL", "Clears", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(128, 128, 192));
-MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100));
-MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255, 100, 100));
+MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Management", MP_RGB(100, 255, 100));
namespace {
@@ -101,20 +94,6 @@ TextureHandle GetTextureInfo(const Engine& engine, bool via_header_index, const
return TextureHandle(engine.AccessConstBuffer32(shader_type, buffer, offset), via_header_index);
}
-std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
- const ConstBufferEntry& entry) {
- if (!entry.IsIndirect()) {
- return entry.GetSize();
- }
- if (buffer.size > Maxwell::MaxConstBufferSize) {
- LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", buffer.size,
- Maxwell::MaxConstBufferSize);
- return Maxwell::MaxConstBufferSize;
- }
-
- return buffer.size;
-}
-
/// Translates hardware transform feedback indices
/// @param location Hardware location
/// @return Pair of ARB_transform_feedback3 token stream first and third arguments
@@ -147,14 +126,6 @@ void oglEnable(GLenum cap, bool state) {
(state ? glEnable : glDisable)(cap);
}
-void UpdateBindlessSSBOs(GLenum target, const BindlessSSBO* ssbos, size_t num_ssbos) {
- if (num_ssbos == 0) {
- return;
- }
- glProgramLocalParametersI4uivNV(target, 0, static_cast<GLsizei>(num_ssbos),
- reinterpret_cast<const GLuint*>(ssbos));
-}
-
ImageViewType ImageViewTypeFromEntry(const SamplerEntry& entry) {
if (entry.is_buffer) {
return ImageViewType::Buffer;
@@ -196,49 +167,35 @@ ImageViewType ImageViewTypeFromEntry(const ImageEntry& entry) {
RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_,
Core::Memory::Memory& cpu_memory_, const Device& device_,
+ const Vulkan::Device* vulkan_device,
+ Vulkan::MemoryAllocator* vulkan_memory_allocator,
ScreenInfo& screen_info_, ProgramManager& program_manager_,
StateTracker& state_tracker_)
: RasterizerAccelerated(cpu_memory_), gpu(gpu_), maxwell3d(gpu.Maxwell3D()),
kepler_compute(gpu.KeplerCompute()), gpu_memory(gpu.MemoryManager()), device(device_),
screen_info(screen_info_), program_manager(program_manager_), state_tracker(state_tracker_),
- stream_buffer(device, state_tracker),
texture_cache_runtime(device, program_manager, state_tracker),
texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory),
+ buffer_cache_runtime(device, vulkan_device, vulkan_memory_allocator),
+ buffer_cache(*this, maxwell3d, kepler_compute, gpu_memory, cpu_memory_, buffer_cache_runtime),
shader_cache(*this, emu_window_, gpu, maxwell3d, kepler_compute, gpu_memory, device),
query_cache(*this, maxwell3d, gpu_memory),
- buffer_cache(*this, gpu_memory, cpu_memory_, device, stream_buffer, state_tracker),
fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache),
async_shaders(emu_window_) {
- unified_uniform_buffer.Create();
- glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0);
-
- if (device.UseAssemblyShaders()) {
- glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
- for (const GLuint cbuf : staging_cbufs) {
- glNamedBufferStorage(cbuf, static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize),
- nullptr, 0);
- }
- }
if (device.UseAsynchronousShaders()) {
async_shaders.AllocateWorkers();
}
}
-RasterizerOpenGL::~RasterizerOpenGL() {
- if (device.UseAssemblyShaders()) {
- glDeleteBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
- }
-}
+RasterizerOpenGL::~RasterizerOpenGL() = default;
-void RasterizerOpenGL::SetupVertexFormat() {
+void RasterizerOpenGL::SyncVertexFormats() {
auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::VertexFormats]) {
return;
}
flags[Dirty::VertexFormats] = false;
- MICROPROFILE_SCOPE(OpenGL_VAO);
-
// Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL. Enables
// the first 16 vertex attributes always, as we don't know which ones are actually used until
// shader time. Note, Tegra technically supports 32, but we're capping this to 16 for now to
@@ -274,55 +231,7 @@ void RasterizerOpenGL::SetupVertexFormat() {
}
}
-void RasterizerOpenGL::SetupVertexBuffer() {
- auto& flags = maxwell3d.dirty.flags;
- if (!flags[Dirty::VertexBuffers]) {
- return;
- }
- flags[Dirty::VertexBuffers] = false;
-
- MICROPROFILE_SCOPE(OpenGL_VB);
-
- const bool use_unified_memory = device.HasVertexBufferUnifiedMemory();
-
- // Upload all guest vertex arrays sequentially to our buffer
- const auto& regs = maxwell3d.regs;
- for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) {
- if (!flags[Dirty::VertexBuffer0 + index]) {
- continue;
- }
- flags[Dirty::VertexBuffer0 + index] = false;
-
- const auto& vertex_array = regs.vertex_array[index];
- if (!vertex_array.IsEnabled()) {
- continue;
- }
-
- const GPUVAddr start = vertex_array.StartAddress();
- const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
- ASSERT(end >= start);
-
- const GLuint gl_index = static_cast<GLuint>(index);
- const u64 size = end - start;
- if (size == 0) {
- glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
- if (use_unified_memory) {
- glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0);
- }
- continue;
- }
- const auto info = buffer_cache.UploadMemory(start, size);
- if (use_unified_memory) {
- glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
- glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index,
- info.address + info.offset, size);
- } else {
- glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride);
- }
- }
-}
-
-void RasterizerOpenGL::SetupVertexInstances() {
+void RasterizerOpenGL::SyncVertexInstances() {
auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::VertexInstances]) {
return;
@@ -343,17 +252,7 @@ void RasterizerOpenGL::SetupVertexInstances() {
}
}
-GLintptr RasterizerOpenGL::SetupIndexBuffer() {
- MICROPROFILE_SCOPE(OpenGL_Index);
- const auto& regs = maxwell3d.regs;
- const std::size_t size = CalculateIndexBufferSize();
- const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
- glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle);
- return info.offset;
-}
-
-void RasterizerOpenGL::SetupShaders() {
- MICROPROFILE_SCOPE(OpenGL_Shader);
+void RasterizerOpenGL::SetupShaders(bool is_indexed) {
u32 clip_distances = 0;
std::array<Shader*, Maxwell::MaxShaderStage> shaders{};
@@ -410,11 +309,19 @@ void RasterizerOpenGL::SetupShaders() {
const size_t stage = index == 0 ? 0 : index - 1;
shaders[stage] = shader;
- SetupDrawConstBuffers(stage, shader);
- SetupDrawGlobalMemory(stage, shader);
SetupDrawTextures(shader, stage);
SetupDrawImages(shader, stage);
+ buffer_cache.SetEnabledUniformBuffers(stage, shader->GetEntries().enabled_uniform_buffers);
+
+ buffer_cache.UnbindGraphicsStorageBuffers(stage);
+ u32 ssbo_index = 0;
+ for (const auto& buffer : shader->GetEntries().global_memory_entries) {
+ buffer_cache.BindGraphicsStorageBuffer(stage, ssbo_index, buffer.cbuf_index,
+ buffer.cbuf_offset, buffer.is_written);
+ ++ssbo_index;
+ }
+
// Workaround for Intel drivers.
// When a clip distance is enabled but not set in the shader it crops parts of the screen
// (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the
@@ -430,43 +337,26 @@ void RasterizerOpenGL::SetupShaders() {
SyncClipEnabled(clip_distances);
maxwell3d.dirty.flags[Dirty::Shaders] = false;
+ buffer_cache.UpdateGraphicsBuffers(is_indexed);
+
const std::span indices_span(image_view_indices.data(), image_view_indices.size());
texture_cache.FillGraphicsImageViews(indices_span, image_view_ids);
+ buffer_cache.BindHostGeometryBuffers(is_indexed);
+
size_t image_view_index = 0;
size_t texture_index = 0;
size_t image_index = 0;
for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) {
const Shader* const shader = shaders[stage];
- if (shader) {
- const auto base = device.GetBaseBindings(stage);
- BindTextures(shader->GetEntries(), base.sampler, base.image, image_view_index,
- texture_index, image_index);
- }
- }
-}
-
-std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
- const auto& regs = maxwell3d.regs;
-
- std::size_t size = 0;
- for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
- if (!regs.vertex_array[index].IsEnabled())
+ if (!shader) {
continue;
-
- const GPUVAddr start = regs.vertex_array[index].StartAddress();
- const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
-
- size += end - start;
- ASSERT(end >= start);
+ }
+ buffer_cache.BindHostStageBuffers(stage);
+ const auto& base = device.GetBaseBindings(stage);
+ BindTextures(shader->GetEntries(), base.sampler, base.image, image_view_index,
+ texture_index, image_index);
}
-
- return size;
-}
-
-std::size_t RasterizerOpenGL::CalculateIndexBufferSize() const {
- return static_cast<std::size_t>(maxwell3d.regs.index_array.count) *
- static_cast<std::size_t>(maxwell3d.regs.index_array.FormatSizeInBytes());
}
void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading,
@@ -475,6 +365,7 @@ void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& s
}
void RasterizerOpenGL::Clear() {
+ MICROPROFILE_SCOPE(OpenGL_Clears);
if (!maxwell3d.ShouldExecute()) {
return;
}
@@ -525,11 +416,9 @@ void RasterizerOpenGL::Clear() {
}
UNIMPLEMENTED_IF(regs.clear_flags.viewport);
- {
- auto lock = texture_cache.AcquireLock();
- texture_cache.UpdateRenderTargets(true);
- state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle());
- }
+ std::scoped_lock lock{texture_cache.mutex};
+ texture_cache.UpdateRenderTargets(true);
+ state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle());
if (use_color) {
glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color);
@@ -541,7 +430,6 @@ void RasterizerOpenGL::Clear() {
} else if (use_stencil) {
glClearBufferiv(GL_STENCIL, 0, &regs.clear_stencil);
}
-
++num_queued_commands;
}
@@ -550,75 +438,12 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
query_cache.UpdateCounters();
- SyncViewport();
- SyncRasterizeEnable();
- SyncPolygonModes();
- SyncColorMask();
- SyncFragmentColorClampState();
- SyncMultiSampleState();
- SyncDepthTestState();
- SyncDepthClamp();
- SyncStencilTestState();
- SyncBlendState();
- SyncLogicOpState();
- SyncCullMode();
- SyncPrimitiveRestart();
- SyncScissorTest();
- SyncPointState();
- SyncLineState();
- SyncPolygonOffset();
- SyncAlphaTest();
- SyncFramebufferSRGB();
-
- buffer_cache.Acquire();
- current_cbuf = 0;
-
- std::size_t buffer_size = CalculateVertexArraysSize();
-
- // Add space for index buffer
- if (is_indexed) {
- buffer_size = Common::AlignUp(buffer_size, 4) + CalculateIndexBufferSize();
- }
-
- // Uniform space for the 5 shader stages
- buffer_size =
- Common::AlignUp<std::size_t>(buffer_size, 4) +
- (sizeof(MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage;
-
- // Add space for at least 18 constant buffers
- buffer_size += Maxwell::MaxConstBuffers *
- (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
-
- // Prepare the vertex array.
- buffer_cache.Map(buffer_size);
-
- // Prepare vertex array format.
- SetupVertexFormat();
-
- // Upload vertex and index data.
- SetupVertexBuffer();
- SetupVertexInstances();
- GLintptr index_buffer_offset = 0;
- if (is_indexed) {
- index_buffer_offset = SetupIndexBuffer();
- }
-
- // Setup emulation uniform buffer.
- if (!device.UseAssemblyShaders()) {
- MaxwellUniformData ubo;
- ubo.SetFromRegs(maxwell3d);
- const auto info =
- buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
- glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset,
- static_cast<GLsizeiptr>(sizeof(ubo)));
- }
+ SyncState();
// Setup shaders and their used resources.
- auto lock = texture_cache.AcquireLock();
- SetupShaders();
+ std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
+ SetupShaders(is_indexed);
- // Signal the buffer cache that we are not going to upload more things.
- buffer_cache.Unmap();
texture_cache.UpdateRenderTargets(false);
state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle());
program_manager.BindGraphicsPipeline();
@@ -632,7 +457,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
if (is_indexed) {
const GLint base_vertex = static_cast<GLint>(maxwell3d.regs.vb_element_base);
const GLsizei num_vertices = static_cast<GLsizei>(maxwell3d.regs.index_array.count);
- const GLvoid* offset = reinterpret_cast<const GLvoid*>(index_buffer_offset);
+ const GLvoid* const offset = buffer_cache_runtime.IndexOffset();
const GLenum format = MaxwellToGL::IndexFormat(maxwell3d.regs.index_array.format);
if (num_instances == 1 && base_instance == 0 && base_vertex == 0) {
glDrawElements(primitive_mode, num_vertices, format, offset);
@@ -672,22 +497,22 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
}
void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
- buffer_cache.Acquire();
- current_cbuf = 0;
-
Shader* const kernel = shader_cache.GetComputeKernel(code_addr);
- auto lock = texture_cache.AcquireLock();
+ std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
BindComputeTextures(kernel);
- const size_t buffer_size = Tegra::Engines::KeplerCompute::NumConstBuffers *
- (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
- buffer_cache.Map(buffer_size);
-
- SetupComputeConstBuffers(kernel);
- SetupComputeGlobalMemory(kernel);
-
- buffer_cache.Unmap();
+ const auto& entries = kernel->GetEntries();
+ buffer_cache.SetEnabledComputeUniformBuffers(entries.enabled_uniform_buffers);
+ buffer_cache.UnbindComputeStorageBuffers();
+ u32 ssbo_index = 0;
+ for (const auto& buffer : entries.global_memory_entries) {
+ buffer_cache.BindComputeStorageBuffer(ssbo_index, buffer.cbuf_index, buffer.cbuf_offset,
+ buffer.is_written);
+ ++ssbo_index;
+ }
+ buffer_cache.UpdateComputeBuffers();
+ buffer_cache.BindHostComputeBuffers();
const auto& launch_desc = kepler_compute.launch_description;
glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
@@ -703,6 +528,12 @@ void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type,
query_cache.Query(gpu_addr, type, timestamp);
}
+void RasterizerOpenGL::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
+ u32 size) {
+ std::scoped_lock lock{buffer_cache.mutex};
+ buffer_cache.BindGraphicsUniformBuffer(stage, index, gpu_addr, size);
+}
+
void RasterizerOpenGL::FlushAll() {}
void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {
@@ -711,19 +542,23 @@ void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {
return;
}
{
- auto lock = texture_cache.AcquireLock();
+ std::scoped_lock lock{texture_cache.mutex};
texture_cache.DownloadMemory(addr, size);
}
- buffer_cache.FlushRegion(addr, size);
+ {
+ std::scoped_lock lock{buffer_cache.mutex};
+ buffer_cache.DownloadMemory(addr, size);
+ }
query_cache.FlushRegion(addr, size);
}
bool RasterizerOpenGL::MustFlushRegion(VAddr addr, u64 size) {
+ std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
if (!Settings::IsGPULevelHigh()) {
- return buffer_cache.MustFlushRegion(addr, size);
+ return buffer_cache.IsRegionGpuModified(addr, size);
}
return texture_cache.IsRegionGpuModified(addr, size) ||
- buffer_cache.MustFlushRegion(addr, size);
+ buffer_cache.IsRegionGpuModified(addr, size);
}
void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
@@ -732,11 +567,14 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
return;
}
{
- auto lock = texture_cache.AcquireLock();
+ std::scoped_lock lock{texture_cache.mutex};
texture_cache.WriteMemory(addr, size);
}
+ {
+ std::scoped_lock lock{buffer_cache.mutex};
+ buffer_cache.WriteMemory(addr, size);
+ }
shader_cache.InvalidateRegion(addr, size);
- buffer_cache.InvalidateRegion(addr, size);
query_cache.InvalidateRegion(addr, size);
}
@@ -745,26 +583,35 @@ void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) {
if (addr == 0 || size == 0) {
return;
}
+ shader_cache.OnCPUWrite(addr, size);
{
- auto lock = texture_cache.AcquireLock();
+ std::scoped_lock lock{texture_cache.mutex};
texture_cache.WriteMemory(addr, size);
}
- shader_cache.OnCPUWrite(addr, size);
- buffer_cache.OnCPUWrite(addr, size);
+ {
+ std::scoped_lock lock{buffer_cache.mutex};
+ buffer_cache.CachedWriteMemory(addr, size);
+ }
}
void RasterizerOpenGL::SyncGuestHost() {
MICROPROFILE_SCOPE(OpenGL_CacheManagement);
- buffer_cache.SyncGuestHost();
shader_cache.SyncGuestHost();
+ {
+ std::scoped_lock lock{buffer_cache.mutex};
+ buffer_cache.FlushCachedWrites();
+ }
}
void RasterizerOpenGL::UnmapMemory(VAddr addr, u64 size) {
{
- auto lock = texture_cache.AcquireLock();
+ std::scoped_lock lock{texture_cache.mutex};
texture_cache.UnmapMemory(addr, size);
}
- buffer_cache.OnCPUWrite(addr, size);
+ {
+ std::scoped_lock lock{buffer_cache.mutex};
+ buffer_cache.WriteMemory(addr, size);
+ }
shader_cache.OnCPUWrite(addr, size);
}
@@ -799,14 +646,7 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) {
}
void RasterizerOpenGL::WaitForIdle() {
- // Place a barrier on everything that is not framebuffer related.
- // This is related to another flag that is not currently implemented.
- glMemoryBarrier(GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT | GL_ELEMENT_ARRAY_BARRIER_BIT |
- GL_UNIFORM_BARRIER_BIT | GL_TEXTURE_FETCH_BARRIER_BIT |
- GL_SHADER_IMAGE_ACCESS_BARRIER_BIT | GL_COMMAND_BARRIER_BIT |
- GL_PIXEL_BUFFER_BARRIER_BIT | GL_TEXTURE_UPDATE_BARRIER_BIT |
- GL_BUFFER_UPDATE_BARRIER_BIT | GL_TRANSFORM_FEEDBACK_BARRIER_BIT |
- GL_SHADER_STORAGE_BARRIER_BIT | GL_QUERY_BUFFER_BARRIER_BIT);
+ glMemoryBarrier(GL_ALL_BARRIER_BITS);
}
void RasterizerOpenGL::FragmentBarrier() {
@@ -831,18 +671,21 @@ void RasterizerOpenGL::TickFrame() {
num_queued_commands = 0;
fence_manager.TickFrame();
- buffer_cache.TickFrame();
{
- auto lock = texture_cache.AcquireLock();
+ std::scoped_lock lock{texture_cache.mutex};
texture_cache.TickFrame();
}
+ {
+ std::scoped_lock lock{buffer_cache.mutex};
+ buffer_cache.TickFrame();
+ }
}
bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src,
const Tegra::Engines::Fermi2D::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) {
MICROPROFILE_SCOPE(OpenGL_Blits);
- auto lock = texture_cache.AcquireLock();
+ std::scoped_lock lock{texture_cache.mutex};
texture_cache.BlitImage(dst, src, copy_config);
return true;
}
@@ -854,7 +697,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
}
MICROPROFILE_SCOPE(OpenGL_CacheManagement);
- auto lock = texture_cache.AcquireLock();
+ std::scoped_lock lock{texture_cache.mutex};
ImageView* const image_view{texture_cache.TryFindFramebufferImageView(framebuffer_addr)};
if (!image_view) {
return false;
@@ -921,166 +764,6 @@ void RasterizerOpenGL::BindTextures(const ShaderEntries& entries, GLuint base_te
}
}
-void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) {
- static constexpr std::array PARAMETER_LUT{
- GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
- GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
- GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV,
- };
- MICROPROFILE_SCOPE(OpenGL_UBO);
- const auto& stages = maxwell3d.state.shader_stages;
- const auto& shader_stage = stages[stage_index];
- const auto& entries = shader->GetEntries();
- const bool use_unified = entries.use_unified_uniforms;
- const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE;
-
- const auto base_bindings = device.GetBaseBindings(stage_index);
- u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer;
- for (const auto& entry : entries.const_buffers) {
- const u32 index = entry.GetIndex();
- const auto& buffer = shader_stage.const_buffers[index];
- SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified,
- base_unified_offset + index * Maxwell::MaxConstBufferSize);
- ++binding;
- }
- if (use_unified) {
- const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer +
- entries.global_memory_entries.size());
- glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle,
- base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE);
- }
-}
-
-void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) {
- MICROPROFILE_SCOPE(OpenGL_UBO);
- const auto& launch_desc = kepler_compute.launch_description;
- const auto& entries = kernel->GetEntries();
- const bool use_unified = entries.use_unified_uniforms;
-
- u32 binding = 0;
- for (const auto& entry : entries.const_buffers) {
- const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
- const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
- Tegra::Engines::ConstBufferInfo buffer;
- buffer.address = config.Address();
- buffer.size = config.size;
- buffer.enabled = mask[entry.GetIndex()];
- SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry,
- use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize);
- ++binding;
- }
- if (use_unified) {
- const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size());
- glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0,
- NUM_CONST_BUFFERS_BYTES_PER_STAGE);
- }
-}
-
-void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
- const Tegra::Engines::ConstBufferInfo& buffer,
- const ConstBufferEntry& entry, bool use_unified,
- std::size_t unified_offset) {
- if (!buffer.enabled) {
- // Set values to zero to unbind buffers
- if (device.UseAssemblyShaders()) {
- glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0);
- } else {
- glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float));
- }
- return;
- }
-
- // Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140
- // UBO alignment requirements.
- const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
-
- const bool fast_upload = !use_unified && device.HasFastBufferSubData();
-
- const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();
- const GPUVAddr gpu_addr = buffer.address;
- auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
-
- if (device.UseAssemblyShaders()) {
- UNIMPLEMENTED_IF(use_unified);
- if (info.offset != 0) {
- const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
- glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size);
- info.handle = staging_cbuf;
- info.offset = 0;
- }
- glBindBufferRangeNV(stage, binding, info.handle, info.offset, size);
- return;
- }
-
- if (use_unified) {
- glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset,
- unified_offset, size);
- } else {
- glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size);
- }
-}
-
-void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) {
- static constexpr std::array TARGET_LUT = {
- GL_VERTEX_PROGRAM_NV, GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV,
- GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV,
- };
- const auto& cbufs{maxwell3d.state.shader_stages[stage_index]};
- const auto& entries{shader->GetEntries().global_memory_entries};
-
- std::array<BindlessSSBO, 32> ssbos;
- ASSERT(entries.size() < ssbos.size());
-
- const bool assembly_shaders = device.UseAssemblyShaders();
- u32 binding = assembly_shaders ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer;
- for (const auto& entry : entries) {
- const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset};
- const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)};
- const u32 size{gpu_memory.Read<u32>(addr + 8)};
- SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]);
- ++binding;
- }
- if (assembly_shaders) {
- UpdateBindlessSSBOs(TARGET_LUT[stage_index], ssbos.data(), entries.size());
- }
-}
-
-void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) {
- const auto& cbufs{kepler_compute.launch_description.const_buffer_config};
- const auto& entries{kernel->GetEntries().global_memory_entries};
-
- std::array<BindlessSSBO, 32> ssbos;
- ASSERT(entries.size() < ssbos.size());
-
- u32 binding = 0;
- for (const auto& entry : entries) {
- const GPUVAddr addr{cbufs[entry.cbuf_index].Address() + entry.cbuf_offset};
- const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)};
- const u32 size{gpu_memory.Read<u32>(addr + 8)};
- SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]);
- ++binding;
- }
- if (device.UseAssemblyShaders()) {
- UpdateBindlessSSBOs(GL_COMPUTE_PROGRAM_NV, ssbos.data(), ssbos.size());
- }
-}
-
-void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry,
- GPUVAddr gpu_addr, size_t size, BindlessSSBO* ssbo) {
- const size_t alignment{device.GetShaderStorageBufferAlignment()};
- const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
- if (device.UseAssemblyShaders()) {
- *ssbo = BindlessSSBO{
- .address = static_cast<GLuint64EXT>(info.address + info.offset),
- .length = static_cast<GLsizei>(size),
- .padding = 0,
- };
- } else {
- glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset,
- static_cast<GLsizeiptr>(size));
- }
-}
-
void RasterizerOpenGL::SetupDrawTextures(const Shader* shader, size_t stage_index) {
const bool via_header_index =
maxwell3d.regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
@@ -1128,6 +811,30 @@ void RasterizerOpenGL::SetupComputeImages(const Shader* shader) {
}
}
+void RasterizerOpenGL::SyncState() { // Single entry point that invokes every Sync* routine below once, in this fixed order.
+ SyncViewport();
+ SyncRasterizeEnable();
+ SyncPolygonModes();
+ SyncColorMask();
+ SyncFragmentColorClampState();
+ SyncMultiSampleState();
+ SyncDepthTestState();
+ SyncDepthClamp();
+ SyncStencilTestState();
+ SyncBlendState();
+ SyncLogicOpState();
+ SyncCullMode();
+ SyncPrimitiveRestart();
+ SyncScissorTest();
+ SyncPointState();
+ SyncLineState();
+ SyncPolygonOffset();
+ SyncAlphaTest();
+ SyncFramebufferSRGB();
+ SyncVertexFormats(); // Vertex formats and instances are synced here alongside the rest of the state.
+ SyncVertexInstances();
+}
+
void RasterizerOpenGL::SyncViewport() {
auto& flags = maxwell3d.dirty.flags;
const auto& regs = maxwell3d.regs;
@@ -1163,9 +870,11 @@ void RasterizerOpenGL::SyncViewport() {
if (regs.screen_y_control.y_negate != 0) {
flip_y = !flip_y;
}
- glClipControl(flip_y ? GL_UPPER_LEFT : GL_LOWER_LEFT,
- regs.depth_mode == Maxwell::DepthMode::ZeroToOne ? GL_ZERO_TO_ONE
- : GL_NEGATIVE_ONE_TO_ONE);
+ const bool is_zero_to_one = regs.depth_mode == Maxwell::DepthMode::ZeroToOne;
+ const GLenum origin = flip_y ? GL_UPPER_LEFT : GL_LOWER_LEFT;
+ const GLenum depth = is_zero_to_one ? GL_ZERO_TO_ONE : GL_NEGATIVE_ONE_TO_ONE;
+ state_tracker.ClipControl(origin, depth);
+ state_tracker.SetYNegate(regs.screen_y_control.y_negate != 0);
}
if (dirty_viewport) {
@@ -1649,36 +1358,13 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
if (regs.tfb_enabled == 0) {
return;
}
-
if (device.UseAssemblyShaders()) {
SyncTransformFeedback();
}
-
UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
-
- for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) {
- const auto& binding = regs.tfb_bindings[index];
- if (!binding.buffer_enable) {
- if (enabled_transform_feedback_buffers[index]) {
- glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), 0, 0,
- 0);
- }
- enabled_transform_feedback_buffers[index] = false;
- continue;
- }
- enabled_transform_feedback_buffers[index] = true;
-
- auto& tfb_buffer = transform_feedback_buffers[index];
- tfb_buffer.Create();
-
- const GLuint handle = tfb_buffer.handle;
- const std::size_t size = binding.buffer_size;
- glNamedBufferData(handle, static_cast<GLsizeiptr>(size), nullptr, GL_STREAM_COPY);
- glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), handle, 0,
- static_cast<GLsizeiptr>(size));
- }
+ UNIMPLEMENTED_IF(primitive_mode != GL_POINTS);
// We may have to call BeginTransformFeedbackNV here since they seem to call different
// implementations on Nvidia's driver (the pointer is different) but we are using
@@ -1692,23 +1378,7 @@ void RasterizerOpenGL::EndTransformFeedback() {
if (regs.tfb_enabled == 0) {
return;
}
-
glEndTransformFeedback();
-
- for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) {
- const auto& binding = regs.tfb_bindings[index];
- if (!binding.buffer_enable) {
- continue;
- }
- UNIMPLEMENTED_IF(binding.buffer_offset != 0);
-
- const GLuint handle = transform_feedback_buffers[index].handle;
- const GPUVAddr gpu_addr = binding.Address();
- const std::size_t size = binding.buffer_size;
- const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
- glCopyNamedBufferSubData(handle, info.handle, 0, info.offset,
- static_cast<GLsizeiptr>(size));
- }
}
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 82e03e677..31d69a94c 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -30,7 +30,6 @@
#include "video_core/renderer_opengl/gl_shader_decompiler.h"
#include "video_core/renderer_opengl/gl_shader_manager.h"
#include "video_core/renderer_opengl/gl_state_tracker.h"
-#include "video_core/renderer_opengl/gl_stream_buffer.h"
#include "video_core/renderer_opengl/gl_texture_cache.h"
#include "video_core/shader/async_shaders.h"
#include "video_core/textures/texture.h"
@@ -47,6 +46,11 @@ namespace Tegra {
class MemoryManager;
}
+namespace Vulkan {
+class Device;
+class MemoryAllocator;
+} // namespace Vulkan
+
namespace OpenGL {
struct ScreenInfo;
@@ -63,6 +67,8 @@ class RasterizerOpenGL : public VideoCore::RasterizerAccelerated {
public:
explicit RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_,
Core::Memory::Memory& cpu_memory_, const Device& device_,
+ const Vulkan::Device* vulkan_device,
+ Vulkan::MemoryAllocator* vulkan_memory_allocator,
ScreenInfo& screen_info_, ProgramManager& program_manager_,
StateTracker& state_tracker_);
~RasterizerOpenGL() override;
@@ -72,6 +78,7 @@ public:
void DispatchCompute(GPUVAddr code_addr) override;
void ResetCounter(VideoCore::QueryType type) override;
void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override;
+ void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override;
void FlushAll() override;
void FlushRegion(VAddr addr, u64 size) override;
bool MustFlushRegion(VAddr addr, u64 size) override;
@@ -119,27 +126,6 @@ private:
void BindTextures(const ShaderEntries& entries, GLuint base_texture, GLuint base_image,
size_t& image_view_index, size_t& texture_index, size_t& image_index);
- /// Configures the current constbuffers to use for the draw command.
- void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader);
-
- /// Configures the current constbuffers to use for the kernel invocation.
- void SetupComputeConstBuffers(Shader* kernel);
-
- /// Configures a constant buffer.
- void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
- const ConstBufferEntry& entry, bool use_unified,
- std::size_t unified_offset);
-
- /// Configures the current global memory entries to use for the draw command.
- void SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader);
-
- /// Configures the current global memory entries to use for the kernel invocation.
- void SetupComputeGlobalMemory(Shader* kernel);
-
- /// Configures a global memory buffer.
- void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
- size_t size, BindlessSSBO* ssbo);
-
/// Configures the current textures to use for the draw command.
void SetupDrawTextures(const Shader* shader, size_t stage_index);
@@ -152,6 +138,9 @@ private:
/// Configures images in a compute shader.
void SetupComputeImages(const Shader* shader);
+ /// Syncs state to match guest's
+ void SyncState();
+
/// Syncs the viewport and depth range to match the guest state
void SyncViewport();
@@ -215,6 +204,12 @@ private:
/// Syncs the framebuffer sRGB state to match the guest state
void SyncFramebufferSRGB();
+ /// Syncs vertex formats to match the guest state
+ void SyncVertexFormats();
+
+ /// Syncs vertex instances to match the guest state
+ void SyncVertexInstances();
+
/// Syncs transform feedback state to match guest state
/// @note Only valid on assembly shaders
void SyncTransformFeedback();
@@ -225,19 +220,7 @@ private:
/// End a transform feedback
void EndTransformFeedback();
- std::size_t CalculateVertexArraysSize() const;
-
- std::size_t CalculateIndexBufferSize() const;
-
- /// Updates the current vertex format
- void SetupVertexFormat();
-
- void SetupVertexBuffer();
- void SetupVertexInstances();
-
- GLintptr SetupIndexBuffer();
-
- void SetupShaders();
+ void SetupShaders(bool is_indexed);
Tegra::GPU& gpu;
Tegra::Engines::Maxwell3D& maxwell3d;
@@ -249,12 +232,12 @@ private:
ProgramManager& program_manager;
StateTracker& state_tracker;
- OGLStreamBuffer stream_buffer;
TextureCacheRuntime texture_cache_runtime;
TextureCache texture_cache;
+ BufferCacheRuntime buffer_cache_runtime;
+ BufferCache buffer_cache;
ShaderCacheOpenGL shader_cache;
QueryCache query_cache;
- OGLBufferCache buffer_cache;
FenceManagerOpenGL fence_manager;
VideoCommon::Shader::AsyncShaders async_shaders;
@@ -262,20 +245,8 @@ private:
boost::container::static_vector<u32, MAX_IMAGE_VIEWS> image_view_indices;
std::array<ImageViewId, MAX_IMAGE_VIEWS> image_view_ids;
boost::container::static_vector<GLuint, MAX_TEXTURES> sampler_handles;
- std::array<GLuint, MAX_TEXTURES> texture_handles;
- std::array<GLuint, MAX_IMAGES> image_handles;
-
- std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
- transform_feedback_buffers;
- std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
- enabled_transform_feedback_buffers;
-
- static constexpr std::size_t NUM_CONSTANT_BUFFERS =
- Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
- Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
- std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{};
- std::size_t current_cbuf = 0;
- OGLBuffer unified_uniform_buffer;
+ std::array<GLuint, MAX_TEXTURES> texture_handles{};
+ std::array<GLuint, MAX_IMAGES> image_handles{};
/// Number of commands queued to the OpenGL driver. Resetted on flush.
std::size_t num_queued_commands = 0;
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp
index 0e34a0f20..3428e5e21 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp
@@ -171,12 +171,6 @@ void OGLBuffer::Release() {
handle = 0;
}
-void OGLBuffer::MakeStreamCopy(std::size_t buffer_size) {
- ASSERT_OR_EXECUTE((handle != 0 && buffer_size != 0), { return; });
-
- glNamedBufferData(handle, buffer_size, nullptr, GL_STREAM_COPY);
-}
-
void OGLSync::Create() {
if (handle != 0)
return;
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h
index f48398669..552d79db4 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.h
+++ b/src/video_core/renderer_opengl/gl_resource_manager.h
@@ -234,9 +234,6 @@ public:
/// Deletes the internal OpenGL resource
void Release();
- // Converts the buffer into a stream copy buffer with a fixed size
- void MakeStreamCopy(std::size_t buffer_size);
-
GLuint handle = 0;
};
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index c35b71b6b..ac78d344c 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -64,7 +64,7 @@ using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>
constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32);
constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32);
-constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt
+constexpr std::string_view COMMON_DECLARATIONS = R"(#define ftoi floatBitsToInt
#define ftou floatBitsToUint
#define itof intBitsToFloat
#define utof uintBitsToFloat
@@ -77,10 +77,6 @@ bvec2 HalfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {{
const float fswzadd_modifiers_a[] = float[4](-1.0f, 1.0f, -1.0f, 0.0f );
const float fswzadd_modifiers_b[] = float[4](-1.0f, -1.0f, 1.0f, -1.0f );
-
-layout (std140, binding = {}) uniform vs_config {{
- float y_direction;
-}};
)";
class ShaderWriter final {
@@ -402,13 +398,6 @@ std::string FlowStackTopName(MetaStackClass stack) {
return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
}
-bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) {
- const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size());
- // We waste one UBO for emulation
- const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1;
- return num_ubos > num_available_ubos;
-}
-
struct GenericVaryingDescription {
std::string name;
u8 first_element = 0;
@@ -420,9 +409,8 @@ public:
explicit GLSLDecompiler(const Device& device_, const ShaderIR& ir_, const Registry& registry_,
ShaderType stage_, std::string_view identifier_,
std::string_view suffix_)
- : device{device_}, ir{ir_}, registry{registry_}, stage{stage_}, identifier{identifier_},
- suffix{suffix_}, header{ir.GetHeader()}, use_unified_uniforms{
- UseUnifiedUniforms(device_, ir_, stage_)} {
+ : device{device_}, ir{ir_}, registry{registry_}, stage{stage_},
+ identifier{identifier_}, suffix{suffix_}, header{ir.GetHeader()} {
if (stage != ShaderType::Compute) {
transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
}
@@ -516,7 +504,8 @@ private:
if (!identifier.empty()) {
code.AddLine("// {}", identifier);
}
- code.AddLine("#version 440 {}", ir.UsesLegacyVaryings() ? "compatibility" : "core");
+ const bool use_compatibility = ir.UsesLegacyVaryings() || ir.UsesYNegate();
+ code.AddLine("#version 440 {}", use_compatibility ? "compatibility" : "core");
code.AddLine("#extension GL_ARB_separate_shader_objects : enable");
if (device.HasShaderBallot()) {
code.AddLine("#extension GL_ARB_shader_ballot : require");
@@ -542,7 +531,7 @@ private:
code.AddNewLine();
- code.AddLine(CommonDeclarations, EmulationUniformBlockBinding);
+ code.AddLine(COMMON_DECLARATIONS);
}
void DeclareVertex() {
@@ -865,17 +854,6 @@ private:
}
void DeclareConstantBuffers() {
- if (use_unified_uniforms) {
- const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer +
- static_cast<u32>(ir.GetGlobalMemory().size());
- code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{",
- binding);
- code.AddLine(" uint cbufs[];");
- code.AddLine("}};");
- code.AddNewLine();
- return;
- }
-
u32 binding = device.GetBaseBindings(stage).uniform_buffer;
for (const auto& [index, info] : ir.GetConstantBuffers()) {
const u32 num_elements = Common::DivCeil(info.GetSize(), 4 * sizeof(u32));
@@ -1081,29 +1059,17 @@ private:
if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
const Node offset = cbuf->GetOffset();
- const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS;
if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
// Direct access
const u32 offset_imm = immediate->GetValue();
ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access");
- if (use_unified_uniforms) {
- return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4),
- Type::Uint};
- } else {
- return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
- offset_imm / (4 * 4), (offset_imm / 4) % 4),
- Type::Uint};
- }
- }
-
- // Indirect access
- if (use_unified_uniforms) {
- return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset,
- Visit(offset).AsUint()),
+ return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
+ offset_imm / (4 * 4), (offset_imm / 4) % 4),
Type::Uint};
}
+ // Indirect access
const std::string final_offset = code.GenerateTemporary();
code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
@@ -2293,7 +2259,6 @@ private:
}
}
}
-
if (header.ps.omap.depth) {
// The depth output is always 2 registers after the last color output, and current_reg
// already contains one past the last color register.
@@ -2337,7 +2302,8 @@ private:
}
Expression YNegate(Operation operation) {
- return {"y_direction", Type::Float};
+ // Y_NEGATE is read from the alpha component of the front material's ambient color
+ return {"gl_FrontMaterial.ambient.a", Type::Float}; // `operation` is not used; the same expression is always returned.
}
template <u32 element>
@@ -2787,7 +2753,6 @@ private:
const std::string_view identifier;
const std::string_view suffix;
const Header header;
- const bool use_unified_uniforms;
std::unordered_map<u8, VaryingTFB> transform_feedback;
ShaderWriter code;
@@ -3003,8 +2968,10 @@ ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType s
for (std::size_t i = 0; i < std::size(clip_distances); ++i) {
entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i;
}
+ for (const auto& buffer : entries.const_buffers) {
+ entries.enabled_uniform_buffers |= 1U << buffer.GetIndex();
+ }
entries.shader_length = ir.GetLength();
- entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage);
return entries;
}
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h
index be68994bb..0397a000c 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -55,7 +55,7 @@ struct ShaderEntries {
std::vector<ImageEntry> images;
std::size_t shader_length{};
u32 clip_distances{};
- bool use_unified_uniforms{};
+ u32 enabled_uniform_buffers{};
};
ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
diff --git a/src/video_core/renderer_opengl/gl_state_tracker.cpp b/src/video_core/renderer_opengl/gl_state_tracker.cpp
index 60e6fa39f..dbdf5230f 100644
--- a/src/video_core/renderer_opengl/gl_state_tracker.cpp
+++ b/src/video_core/renderer_opengl/gl_state_tracker.cpp
@@ -36,16 +36,10 @@ void SetupDirtyColorMasks(Tables& tables) {
FillBlock(tables[1], OFF(color_mask), NUM(color_mask), ColorMasks);
}
-void SetupDirtyVertexArrays(Tables& tables) {
- static constexpr std::size_t num_array = 3;
+void SetupDirtyVertexInstances(Tables& tables) {
static constexpr std::size_t instance_base_offset = 3;
for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) {
const std::size_t array_offset = OFF(vertex_array) + i * NUM(vertex_array[0]);
- const std::size_t limit_offset = OFF(vertex_array_limit) + i * NUM(vertex_array_limit[0]);
-
- FillBlock(tables, array_offset, num_array, VertexBuffer0 + i, VertexBuffers);
- FillBlock(tables, limit_offset, NUM(vertex_array_limit), VertexBuffer0 + i, VertexBuffers);
-
const std::size_t instance_array_offset = array_offset + instance_base_offset;
tables[0][instance_array_offset] = static_cast<u8>(VertexInstance0 + i);
tables[1][instance_array_offset] = VertexInstances;
@@ -217,11 +211,11 @@ void SetupDirtyMisc(Tables& tables) {
StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags} {
auto& dirty = gpu.Maxwell3D().dirty;
auto& tables = dirty.tables;
- SetupDirtyRenderTargets(tables);
+ SetupDirtyFlags(tables);
SetupDirtyColorMasks(tables);
SetupDirtyViewports(tables);
SetupDirtyScissors(tables);
- SetupDirtyVertexArrays(tables);
+ SetupDirtyVertexInstances(tables);
SetupDirtyVertexFormat(tables);
SetupDirtyShaders(tables);
SetupDirtyPolygonModes(tables);
@@ -241,19 +235,6 @@ StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags}
SetupDirtyClipControl(tables);
SetupDirtyDepthClampEnabled(tables);
SetupDirtyMisc(tables);
-
- auto& store = dirty.on_write_stores;
- store[VertexBuffers] = true;
- for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) {
- store[VertexBuffer0 + i] = true;
- }
-}
-
-void StateTracker::InvalidateStreamBuffer() {
- flags[Dirty::VertexBuffers] = true;
- for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) {
- flags[index] = true;
- }
}
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_state_tracker.h b/src/video_core/renderer_opengl/gl_state_tracker.h
index 574615d3c..94c905116 100644
--- a/src/video_core/renderer_opengl/gl_state_tracker.h
+++ b/src/video_core/renderer_opengl/gl_state_tracker.h
@@ -28,10 +28,6 @@ enum : u8 {
VertexFormat0,
VertexFormat31 = VertexFormat0 + 31,
- VertexBuffers,
- VertexBuffer0,
- VertexBuffer31 = VertexBuffer0 + 31,
-
VertexInstances,
VertexInstance0,
VertexInstance31 = VertexInstance0 + 31,
@@ -92,8 +88,6 @@ class StateTracker {
public:
explicit StateTracker(Tegra::GPU& gpu);
- void InvalidateStreamBuffer();
-
void BindIndexBuffer(GLuint new_index_buffer) {
if (index_buffer == new_index_buffer) {
return;
@@ -110,13 +104,32 @@ public:
glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer);
}
+ void ClipControl(GLenum new_origin, GLenum new_depth) { // Cached wrapper around glClipControl.
+ if (new_origin == origin && new_depth == depth) {
+ return; // Both values already match the cached members; no GL call is issued.
+ }
+ origin = new_origin; // Update the cache before issuing the call.
+ depth = new_depth;
+ glClipControl(origin, depth);
+ }
+
+ void SetYNegate(bool new_y_negate) { // Cached update of the value that carries Y_NEGATE.
+ if (new_y_negate == y_negate) {
+ return; // Unchanged since the last call; skip the GL call.
+ }
+ // Y_NEGATE is mapped to gl_FrontMaterial.ambient.a
+ y_negate = new_y_negate;
+ const std::array ambient{0.0f, 0.0f, 0.0f, y_negate ? -1.0f : 1.0f}; // Only alpha carries data: -1 when negated, +1 otherwise.
+ glMaterialfv(GL_FRONT, GL_AMBIENT, ambient.data());
+ }
+
void NotifyScreenDrawVertexArray() {
flags[OpenGL::Dirty::VertexFormats] = true;
flags[OpenGL::Dirty::VertexFormat0 + 0] = true;
flags[OpenGL::Dirty::VertexFormat0 + 1] = true;
- flags[OpenGL::Dirty::VertexBuffers] = true;
- flags[OpenGL::Dirty::VertexBuffer0] = true;
+ flags[VideoCommon::Dirty::VertexBuffers] = true;
+ flags[VideoCommon::Dirty::VertexBuffer0] = true;
flags[OpenGL::Dirty::VertexInstances] = true;
flags[OpenGL::Dirty::VertexInstance0 + 0] = true;
@@ -202,6 +215,9 @@ private:
GLuint framebuffer = 0;
GLuint index_buffer = 0;
+ GLenum origin = GL_LOWER_LEFT;
+ GLenum depth = GL_NEGATIVE_ONE_TO_ONE;
+ bool y_negate = false;
};
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
index e0819cdf2..bfb992a79 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -1,70 +1,64 @@
-// Copyright 2018 Citra Emulator Project
+// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
-#include <tuple>
-#include <vector>
+#include <array>
+#include <memory>
+#include <span>
+
+#include <glad/glad.h>
#include "common/alignment.h"
#include "common/assert.h"
-#include "common/microprofile.h"
-#include "video_core/renderer_opengl/gl_device.h"
-#include "video_core/renderer_opengl/gl_state_tracker.h"
#include "video_core/renderer_opengl/gl_stream_buffer.h"
-MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
- MP_RGB(128, 128, 192));
-
namespace OpenGL {
-OGLStreamBuffer::OGLStreamBuffer(const Device& device, StateTracker& state_tracker_)
- : state_tracker{state_tracker_} {
- gl_buffer.Create();
-
- static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT;
- glNamedBufferStorage(gl_buffer.handle, BUFFER_SIZE, nullptr, flags);
- mapped_ptr = static_cast<u8*>(
- glMapNamedBufferRange(gl_buffer.handle, 0, BUFFER_SIZE, flags | GL_MAP_FLUSH_EXPLICIT_BIT));
-
- if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) {
- glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY);
- glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
+StreamBuffer::StreamBuffer() { // Allocates the buffer storage, maps it once, and calls Create() on every fence.
+ static constexpr GLenum flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT; // The same flags are used for both storage and mapping; the coherent bit replaces the old explicit-flush mapping.
+ buffer.Create();
+ glObjectLabel(GL_BUFFER, buffer.handle, -1, "Stream Buffer"); // Debug label only.
+ glNamedBufferStorage(buffer.handle, STREAM_BUFFER_SIZE, nullptr, flags);
+ mapped_pointer = // Whole-buffer mapping kept in a member; no unmap appears in the added lines.
+ static_cast<u8*>(glMapNamedBufferRange(buffer.handle, 0, STREAM_BUFFER_SIZE, flags));
+ for (OGLSync& sync : fences) {
+ sync.Create(); // Every entry of `fences` starts out created, so Request can wait on any of them.
}
}
-OGLStreamBuffer::~OGLStreamBuffer() {
- glUnmapNamedBuffer(gl_buffer.handle);
- gl_buffer.Release();
-}
-
-std::pair<u8*, GLintptr> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) {
- ASSERT(size <= BUFFER_SIZE);
- ASSERT(alignment <= BUFFER_SIZE);
- mapped_size = size;
-
- if (alignment > 0) {
- buffer_pos = Common::AlignUp<std::size_t>(buffer_pos, alignment);
+std::pair<std::span<u8>, size_t> StreamBuffer::Request(size_t size) noexcept { // Returns a writable span of `size` bytes in the mapping plus its byte offset in the buffer.
+ ASSERT(size < REGION_SIZE); // A single request must be smaller than one region.
+ for (size_t region = Region(used_iterator), region_end = Region(iterator); region < region_end;
+ ++region) {
+ fences[region].Create(); // Regions from used_iterator's up to, but excluding, the current one; presumably inserts a fence after their last use.
}
+ used_iterator = iterator;
- if (buffer_pos + size > BUFFER_SIZE) {
- MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
- glInvalidateBufferData(gl_buffer.handle);
- state_tracker.InvalidateStreamBuffer();
-
- buffer_pos = 0;
+ for (size_t region = Region(free_iterator) + 1,
+ region_end = std::min(Region(iterator + size) + 1, NUM_SYNCS); // NOTE(review): std::min lives in <algorithm>, which this file does not include directly; confirm it arrives transitively.
+ region < region_end; ++region) {
+ glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED); // Wait on the region's fence before it is written again.
+ fences[region].Release();
}
-
- return std::make_pair(mapped_ptr + buffer_pos, buffer_pos);
-}
-
-void OGLStreamBuffer::Unmap(GLsizeiptr size) {
- ASSERT(size <= mapped_size);
-
- if (size > 0) {
- glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size);
+ if (iterator + size > free_iterator) {
+ free_iterator = iterator + size; // free_iterator only moves forward here; it is reset in the wrap-around branch below.
}
-
- buffer_pos += size;
+ if (iterator + size > STREAM_BUFFER_SIZE) { // The request does not fit in the remaining space: restart from offset 0.
+ for (size_t region = Region(used_iterator); region < NUM_SYNCS; ++region) {
+ fences[region].Create(); // Covers every region from used_iterator's to the last one.
+ }
+ used_iterator = 0;
+ iterator = 0;
+ free_iterator = size;
+
+ for (size_t region = 0, region_end = Region(size); region <= region_end; ++region) { // Inclusive bound, unlike the loops above.
+ glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED);
+ fences[region].Release();
+ }
+ }
+ const size_t offset = iterator;
+ iterator = Common::AlignUp(iterator + size, MAX_ALIGNMENT); // The next request starts on a MAX_ALIGNMENT boundary.
+ return {std::span(mapped_pointer + offset, size), offset};
}
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h
index dd9cf67eb..6dbb6bfba 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -1,9 +1,12 @@
-// Copyright 2018 Citra Emulator Project
+// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
+#include <array>
+#include <memory>
+#include <span>
#include <utility>
#include <glad/glad.h>
@@ -13,48 +16,35 @@
namespace OpenGL {
-class Device;
-class StateTracker;
+class StreamBuffer {
+ static constexpr size_t STREAM_BUFFER_SIZE = 64 * 1024 * 1024;
+ static constexpr size_t NUM_SYNCS = 16;
+ static constexpr size_t REGION_SIZE = STREAM_BUFFER_SIZE / NUM_SYNCS;
+ static constexpr size_t MAX_ALIGNMENT = 256;
+ static_assert(STREAM_BUFFER_SIZE % MAX_ALIGNMENT == 0);
+ static_assert(STREAM_BUFFER_SIZE % NUM_SYNCS == 0);
+ static_assert(REGION_SIZE % MAX_ALIGNMENT == 0);
-class OGLStreamBuffer : private NonCopyable {
public:
- explicit OGLStreamBuffer(const Device& device, StateTracker& state_tracker_);
- ~OGLStreamBuffer();
-
- /*
- * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes
- * and the optional alignment requirement.
- * If the buffer is full, the whole buffer is reallocated which invalidates old chunks.
- * The return values are the pointer to the new chunk, and the offset within the buffer.
- * The actual used size must be specified on unmapping the chunk.
- */
- std::pair<u8*, GLintptr> Map(GLsizeiptr size, GLintptr alignment = 0);
-
- void Unmap(GLsizeiptr size);
-
- GLuint Handle() const {
- return gl_buffer.handle;
- }
+ explicit StreamBuffer();
- u64 Address() const {
- return gpu_address;
- }
+ [[nodiscard]] std::pair<std::span<u8>, size_t> Request(size_t size) noexcept;
- GLsizeiptr Size() const noexcept {
- return BUFFER_SIZE;
+ [[nodiscard]] GLuint Handle() const noexcept {
+ return buffer.handle;
}
private:
- static constexpr GLsizeiptr BUFFER_SIZE = 256 * 1024 * 1024;
-
- StateTracker& state_tracker;
-
- OGLBuffer gl_buffer;
+ [[nodiscard]] static size_t Region(size_t offset) noexcept {
+ return offset / REGION_SIZE;
+ }
- GLuint64EXT gpu_address = 0;
- GLintptr buffer_pos = 0;
- GLsizeiptr mapped_size = 0;
- u8* mapped_ptr = nullptr;
+ size_t iterator = 0;
+ size_t used_iterator = 0;
+ size_t free_iterator = 0;
+ u8* mapped_pointer = nullptr;
+ OGLBuffer buffer;
+ std::array<OGLSync, NUM_SYNCS> fences;
};
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 546cb6d00..37572ab28 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -398,9 +398,6 @@ void AttachTexture(GLuint fbo, GLenum attachment, const ImageView* image_view) {
} // Anonymous namespace
-ImageBufferMap::ImageBufferMap(GLuint handle_, u8* map, size_t size, OGLSync* sync_)
- : span(map, size), sync{sync_}, handle{handle_} {}
-
ImageBufferMap::~ImageBufferMap() {
if (sync) {
sync->Create();
@@ -487,11 +484,11 @@ void TextureCacheRuntime::Finish() {
glFinish();
}
-ImageBufferMap TextureCacheRuntime::MapUploadBuffer(size_t size) {
+ImageBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size) {
return upload_buffers.RequestMap(size, true);
}
-ImageBufferMap TextureCacheRuntime::MapDownloadBuffer(size_t size) {
+ImageBufferMap TextureCacheRuntime::DownloadStagingBuffer(size_t size) {
return download_buffers.RequestMap(size, false);
}
@@ -596,7 +593,11 @@ ImageBufferMap TextureCacheRuntime::StagingBuffers::RequestMap(size_t requested_
bool insert_fence) {
const size_t index = RequestBuffer(requested_size);
OGLSync* const sync = insert_fence ? &syncs[index] : nullptr;
- return ImageBufferMap(buffers[index].handle, maps[index], requested_size, sync);
+ return ImageBufferMap{
+ .mapped_span = std::span(maps[index], requested_size),
+ .sync = sync,
+ .buffer = buffers[index].handle,
+ };
}
size_t TextureCacheRuntime::StagingBuffers::RequestBuffer(size_t requested_size) {
@@ -711,7 +712,7 @@ Image::Image(TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info_,
void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
std::span<const VideoCommon::BufferImageCopy> copies) {
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.Handle());
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer);
glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, buffer_offset, unswizzled_size_bytes);
glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
@@ -735,7 +736,7 @@ void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
std::span<const VideoCommon::BufferCopy> copies) {
for (const VideoCommon::BufferCopy& copy : copies) {
- glCopyNamedBufferSubData(map.Handle(), buffer.handle, copy.src_offset + buffer_offset,
+ glCopyNamedBufferSubData(map.buffer, buffer.handle, copy.src_offset + buffer_offset,
copy.dst_offset, copy.size);
}
}
@@ -744,7 +745,7 @@ void Image::DownloadMemory(ImageBufferMap& map, size_t buffer_offset,
std::span<const VideoCommon::BufferImageCopy> copies) {
glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API
- glBindBuffer(GL_PIXEL_PACK_BUFFER, map.Handle());
+ glBindBuffer(GL_PIXEL_PACK_BUFFER, map.buffer);
glPixelStorei(GL_PACK_ALIGNMENT, 1);
u32 current_row_length = std::numeric_limits<u32>::max();
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 15b7c3676..60d08d6d6 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -31,23 +31,12 @@ using VideoCommon::NUM_RT;
using VideoCommon::Offset2D;
using VideoCommon::RenderTargets;
-class ImageBufferMap {
-public:
- explicit ImageBufferMap(GLuint handle, u8* map, size_t size, OGLSync* sync);
+struct ImageBufferMap {
~ImageBufferMap();
- GLuint Handle() const noexcept {
- return handle;
- }
-
- std::span<u8> Span() const noexcept {
- return span;
- }
-
-private:
- std::span<u8> span;
+ std::span<u8> mapped_span;
OGLSync* sync;
- GLuint handle;
+ GLuint buffer;
};
struct FormatProperties {
@@ -69,9 +58,9 @@ public:
void Finish();
- ImageBufferMap MapUploadBuffer(size_t size);
+ ImageBufferMap UploadStagingBuffer(size_t size);
- ImageBufferMap MapDownloadBuffer(size_t size);
+ ImageBufferMap DownloadStagingBuffer(size_t size);
void CopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 7eb5ab17a..8fcb86581 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -27,11 +27,14 @@
#include "video_core/renderer_opengl/gl_shader_manager.h"
#include "video_core/renderer_opengl/renderer_opengl.h"
#include "video_core/textures/decoders.h"
+#include "video_core/vulkan_common/vulkan_debug_callback.h"
+#include "video_core/vulkan_common/vulkan_device.h"
+#include "video_core/vulkan_common/vulkan_instance.h"
+#include "video_core/vulkan_common/vulkan_library.h"
+#include "video_core/vulkan_common/vulkan_memory_allocator.h"
namespace OpenGL {
-
namespace {
-
constexpr GLint PositionLocation = 0;
constexpr GLint TexCoordLocation = 1;
constexpr GLint ModelViewMatrixLocation = 0;
@@ -125,25 +128,98 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit
}
}
+// Finds the Vulkan physical device backing the current OpenGL context by comparing the driver
+// UUID and the device UUIDs reported by OpenGL against the ID properties of every physical
+// device enumerated from `instance`.
+// Throws vk::Exception(VK_ERROR_INCOMPATIBLE_DRIVER) when no enumerated device matches.
+Vulkan::vk::PhysicalDevice FindPhysicalDevice(Vulkan::vk::Instance& instance) {
+    using namespace Vulkan;
+    using UUID = std::array<GLubyte, GL_UUID_SIZE_EXT>;
+
+    // Zero-initialize so a failed query yields an empty list instead of an indeterminate size
+    GLint num_device_uuids = 0;
+    glGetIntegerv(GL_NUM_DEVICE_UUIDS_EXT, &num_device_uuids);
+    std::vector<UUID> device_uuids(num_device_uuids);
+    for (GLint index = 0; index < num_device_uuids; ++index) {
+        // Query each device by its own index; asking for index 0 on every iteration would fill
+        // the whole list with the first device's UUID
+        glGetUnsignedBytei_vEXT(GL_DEVICE_UUID_EXT, static_cast<GLuint>(index),
+                                device_uuids[index].data());
+    }
+    UUID driver_uuid{};
+    glGetUnsignedBytevEXT(GL_DRIVER_UUID_EXT, driver_uuid.data());
+
+    for (const VkPhysicalDevice raw_physical_device : instance.EnumeratePhysicalDevices()) {
+        VkPhysicalDeviceIDProperties device_id_properties{};
+        device_id_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES;
+
+        VkPhysicalDeviceProperties2KHR properties{
+            .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR,
+            .pNext = &device_id_properties,
+            .properties{},
+        };
+        vk::PhysicalDevice physical_device(raw_physical_device, instance.Dispatch());
+        physical_device.GetProperties2KHR(properties);
+        // The driver has to match before the device UUIDs are compared
+        if (!std::ranges::equal(device_id_properties.driverUUID, driver_uuid)) {
+            continue;
+        }
+        const auto it = std::ranges::find_if(device_uuids, [&device_id_properties](const UUID& uuid) {
+            return std::ranges::equal(device_id_properties.deviceUUID, uuid);
+        });
+        if (it != device_uuids.end()) {
+            return physical_device;
+        }
+    }
+    throw vk::Exception(VK_ERROR_INCOMPATIBLE_DRIVER);
+}
} // Anonymous namespace
+struct VulkanObjects { // Bundle of the Vulkan objects the OpenGL renderer creates for interop
+ static std::unique_ptr<VulkanObjects> TryCreate() { // Returns nullptr on every path where interop is unavailable, skipped or fails
+ if (!GLAD_GL_EXT_memory_object) {
+ // Interop is not present
+ return nullptr;
+ }
+ const std::string_view vendor{reinterpret_cast<const char*>(glGetString(GL_VENDOR))};
+ if (vendor == "ATI Technologies Inc.") {
+ // Avoid using GL_EXT_memory_object on AMD, as it makes the GL driver crash
+ return nullptr;
+ }
+ if (!Settings::values.use_assembly_shaders.GetValue()) {
+ // We only need interop when assembly shaders are enabled
+ return nullptr;
+ }
+#ifdef __linux__
+ LOG_WARNING(Render_OpenGL, "Interop doesn't work on Linux at the moment");
+ return nullptr;
+#endif
+ try {
+ return std::make_unique<VulkanObjects>(); // Runs the default member initializers declared below
+ } catch (const Vulkan::vk::Exception& exception) { // e.g. thrown by FindPhysicalDevice when no device matches
+ LOG_ERROR(Render_OpenGL, "Failed to initialize Vulkan objects with error: {}",
+ exception.what());
+ return nullptr;
+ }
+ }
+
+ Common::DynamicLibrary library{Vulkan::OpenLibrary()}; // Members initialize in declaration order; each initializer below uses members declared above it
+ Vulkan::vk::InstanceDispatch dld;
+ Vulkan::vk::Instance instance{Vulkan::CreateInstance(library, dld, VK_API_VERSION_1_1)};
+ Vulkan::Device device{*instance, FindPhysicalDevice(instance), nullptr, dld};
+ Vulkan::MemoryAllocator memory_allocator{device, true};
+};
+
RendererOpenGL::RendererOpenGL(Core::TelemetrySession& telemetry_session_,
Core::Frontend::EmuWindow& emu_window_,
Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_,
std::unique_ptr<Core::Frontend::GraphicsContext> context_)
: RendererBase{emu_window_, std::move(context_)}, telemetry_session{telemetry_session_},
- emu_window{emu_window_}, cpu_memory{cpu_memory_}, gpu{gpu_}, program_manager{device},
- rasterizer{emu_window, gpu, cpu_memory, device, screen_info, program_manager, state_tracker} {
+ emu_window{emu_window_}, cpu_memory{cpu_memory_}, gpu{gpu_},
+ vulkan_objects{VulkanObjects::TryCreate()}, device{vulkan_objects != nullptr},
+ state_tracker{gpu}, program_manager{device},
+ rasterizer(emu_window, gpu, cpu_memory, device,
+ vulkan_objects ? &vulkan_objects->device : nullptr,
+ vulkan_objects ? &vulkan_objects->memory_allocator : nullptr, screen_info,
+ program_manager, state_tracker) {
if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) {
glEnable(GL_DEBUG_OUTPUT);
glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS);
glDebugMessageCallback(DebugHandler, nullptr);
}
AddTelemetryFields();
-
- if (!GLAD_GL_VERSION_4_6) {
- throw std::runtime_error{"OpenGL 4.3 is not available"};
- }
InitOpenGLObjects();
}
@@ -280,6 +356,7 @@ void RendererOpenGL::InitOpenGLObjects() {
// Enable unified vertex attributes and query vertex buffer address when the driver supports it
if (device.HasVertexBufferUnifiedMemory()) {
glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
+ glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY);
glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV,
@@ -412,6 +489,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
program_manager.BindHostPipeline(pipeline.handle);
+ state_tracker.ClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE);
glEnable(GL_CULL_FACE);
if (screen_info.display_srgb) {
glEnable(GL_FRAMEBUFFER_SRGB);
@@ -430,7 +508,6 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
glCullFace(GL_BACK);
glFrontFace(GL_CW);
glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
- glClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE);
glViewportIndexedf(0, 0.0f, 0.0f, static_cast<GLfloat>(layout.width),
static_cast<GLfloat>(layout.height));
glDepthRangeIndexed(0, 0.0, 0.0);
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index e043a0ccb..f210190dd 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -38,6 +38,8 @@ class GPU;
namespace OpenGL {
+struct VulkanObjects;
+
/// Structure used for storing information about the textures for the Switch screen
struct TextureInfo {
OGLTexture resource;
@@ -99,8 +101,11 @@ private:
Core::Memory::Memory& cpu_memory;
Tegra::GPU& gpu;
- const Device device;
- StateTracker state_tracker{gpu};
+ std::unique_ptr<VulkanObjects> vulkan_objects;
+ Device device;
+ StateTracker state_tracker;
+ ProgramManager program_manager;
+ RasterizerOpenGL rasterizer;
// OpenGL object IDs
OGLSampler present_sampler;
@@ -116,11 +121,6 @@ private:
/// Display information for Switch screen
ScreenInfo screen_info;
- /// Global dummy shader pipeline
- ProgramManager program_manager;
-
- RasterizerOpenGL rasterizer;
-
/// OpenGL framebuffer data
std::vector<u8> gl_framebuffer_data;
diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp
index eb849cbf2..aeb36551c 100644
--- a/src/video_core/renderer_opengl/util_shaders.cpp
+++ b/src/video_core/renderer_opengl/util_shaders.cpp
@@ -71,7 +71,7 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, s
static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;
program_manager.BindHostCompute(block_linear_unswizzle_2d_program.handle);
- glFlushMappedNamedBufferRange(map.Handle(), buffer_offset, image.guest_size_bytes);
+ glFlushMappedNamedBufferRange(map.buffer, buffer_offset, image.guest_size_bytes);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);
const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format));
@@ -91,8 +91,8 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, s
glUniform1ui(5, params.x_shift);
glUniform1ui(6, params.block_height);
glUniform1ui(7, params.block_height_mask);
- glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.Handle(),
- input_offset, image.guest_size_bytes - swizzle.buffer_offset);
+ glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset,
+ image.guest_size_bytes - swizzle.buffer_offset);
glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), swizzle.level, GL_TRUE, 0,
GL_WRITE_ONLY, store_format);
glDispatchCompute(num_dispatches_x, num_dispatches_y, image.info.resources.layers);
@@ -108,7 +108,7 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, s
static constexpr GLuint BINDING_INPUT_BUFFER = 1;
static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;
- glFlushMappedNamedBufferRange(map.Handle(), buffer_offset, image.guest_size_bytes);
+ glFlushMappedNamedBufferRange(map.buffer, buffer_offset, image.guest_size_bytes);
program_manager.BindHostCompute(block_linear_unswizzle_3d_program.handle);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);
@@ -132,8 +132,8 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, s
glUniform1ui(7, params.block_height_mask);
glUniform1ui(8, params.block_depth);
glUniform1ui(9, params.block_depth_mask);
- glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.Handle(),
- input_offset, image.guest_size_bytes - swizzle.buffer_offset);
+ glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset,
+ image.guest_size_bytes - swizzle.buffer_offset);
glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), swizzle.level, GL_TRUE, 0,
GL_WRITE_ONLY, store_format);
glDispatchCompute(num_dispatches_x, num_dispatches_y, num_dispatches_z);
@@ -159,7 +159,7 @@ void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t bu
"Non-power of two images are not implemented");
program_manager.BindHostCompute(pitch_unswizzle_program.handle);
- glFlushMappedNamedBufferRange(map.Handle(), buffer_offset, image.guest_size_bytes);
+ glFlushMappedNamedBufferRange(map.buffer, buffer_offset, image.guest_size_bytes);
glUniform2ui(LOC_ORIGIN, 0, 0);
glUniform2i(LOC_DESTINATION, 0, 0);
glUniform1ui(LOC_BYTES_PER_BLOCK, bytes_per_block);
@@ -172,8 +172,8 @@ void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t bu
const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width);
const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height);
- glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.Handle(),
- input_offset, image.guest_size_bytes - swizzle.buffer_offset);
+ glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset,
+ image.guest_size_bytes - swizzle.buffer_offset);
glDispatchCompute(num_dispatches_x, num_dispatches_y, 1);
}
program_manager.RestoreGuestCompute();
diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h
index 359997255..bec026bc3 100644
--- a/src/video_core/renderer_opengl/util_shaders.h
+++ b/src/video_core/renderer_opengl/util_shaders.h
@@ -15,9 +15,10 @@
namespace OpenGL {
class Image;
-class ImageBufferMap;
class ProgramManager;
+struct ImageBufferMap;
+
class UtilShaders {
public:
explicit UtilShaders(ProgramManager& program_manager);
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 85121d9fd..19aaf034f 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -531,13 +531,9 @@ VkCompareOp ComparisonOp(Maxwell::ComparisonOp comparison) {
return {};
}
-VkIndexType IndexFormat(const Device& device, Maxwell::IndexFormat index_format) {
+VkIndexType IndexFormat(Maxwell::IndexFormat index_format) {
switch (index_format) {
case Maxwell::IndexFormat::UnsignedByte:
- if (!device.IsExtIndexTypeUint8Supported()) {
- UNIMPLEMENTED_MSG("Native uint8 indices are not supported on this device");
- return VK_INDEX_TYPE_UINT16;
- }
return VK_INDEX_TYPE_UINT8_EXT;
case Maxwell::IndexFormat::UnsignedShort:
return VK_INDEX_TYPE_UINT16;
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.h b/src/video_core/renderer_vulkan/maxwell_to_vk.h
index 7c34b47dc..e3e06ba38 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.h
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.h
@@ -53,7 +53,7 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib
VkCompareOp ComparisonOp(Maxwell::ComparisonOp comparison);
-VkIndexType IndexFormat(const Device& device, Maxwell::IndexFormat index_format);
+VkIndexType IndexFormat(Maxwell::IndexFormat index_format);
VkStencilOp StencilOp(Maxwell::StencilOp stencil_op);
diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
index 6909576cb..1cc720ddd 100644
--- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
@@ -107,7 +107,7 @@ RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_,
debug_callback(Settings::values.renderer_debug ? CreateDebugCallback(instance) : nullptr),
surface(CreateSurface(instance, render_window)),
device(CreateDevice(instance, dld, *surface)),
- memory_allocator(device),
+ memory_allocator(device, false),
state_tracker(gpu),
scheduler(device, state_tracker),
swapchain(*surface, device, scheduler, render_window.GetFramebufferLayout().width,
diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h
index 1efaf3b77..72071316c 100644
--- a/src/video_core/renderer_vulkan/renderer_vulkan.h
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.h
@@ -58,12 +58,11 @@ private:
vk::InstanceDispatch dld;
vk::Instance instance;
-
+ vk::DebugUtilsMessenger debug_callback;
vk::SurfaceKHR surface;
VKScreenInfo screen_info;
- vk::DebugUtilsMessenger debug_callback;
Device device;
MemoryAllocator memory_allocator;
StateTracker state_tracker;
diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp
index df8992528..a1a32aabe 100644
--- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp
+++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp
@@ -148,8 +148,8 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool
SetUniformData(data, framebuffer);
SetVertexData(data, framebuffer);
- const std::span<u8> map = buffer_commit.Map();
- std::memcpy(map.data(), &data, sizeof(data));
+ const std::span<u8> mapped_span = buffer_commit.Map();
+ std::memcpy(mapped_span.data(), &data, sizeof(data));
if (!use_accelerated) {
const u64 image_offset = GetRawImageOffset(framebuffer, image_index);
@@ -162,8 +162,8 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool
constexpr u32 block_height_log2 = 4;
const u32 bytes_per_pixel = GetBytesPerPixel(framebuffer);
Tegra::Texture::UnswizzleTexture(
- map.subspan(image_offset, size_bytes), std::span(host_ptr, size_bytes), bytes_per_pixel,
- framebuffer.width, framebuffer.height, 1, block_height_log2, 0);
+ mapped_span.subspan(image_offset, size_bytes), std::span(host_ptr, size_bytes),
+ bytes_per_pixel, framebuffer.width, framebuffer.height, 1, block_height_log2, 0);
const VkBufferImageCopy copy{
.bufferOffset = image_offset,
@@ -263,7 +263,6 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool
cmdbuf.Draw(4, 1, 0, 0);
cmdbuf.EndRenderPass();
});
-
return *semaphores[image_index];
}
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index d8ad40a0f..48fc5d966 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -3,188 +3,276 @@
// Refer to the license.txt file included.
#include <algorithm>
+#include <array>
#include <cstring>
-#include <memory>
+#include <span>
+#include <vector>
-#include "core/core.h"
#include "video_core/buffer_cache/buffer_cache.h"
+#include "video_core/renderer_vulkan/maxwell_to_vk.h"
#include "video_core/renderer_vulkan/vk_buffer_cache.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
-#include "video_core/renderer_vulkan/vk_stream_buffer.h"
+#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
+#include "video_core/renderer_vulkan/vk_update_descriptor.h"
#include "video_core/vulkan_common/vulkan_device.h"
+#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
namespace Vulkan {
-
namespace {
+VkBufferCopy MakeBufferCopy(const VideoCommon::BufferCopy& copy) { // Field-for-field conversion of a buffer cache copy into a VkBufferCopy
+ return VkBufferCopy{
+ .srcOffset = copy.src_offset,
+ .dstOffset = copy.dst_offset,
+ .size = copy.size,
+ };
+}
-constexpr VkBufferUsageFlags BUFFER_USAGE =
- VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT |
- VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
-
-constexpr VkPipelineStageFlags UPLOAD_PIPELINE_STAGE =
- VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
- VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
- VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
-
-constexpr VkAccessFlags UPLOAD_ACCESS_BARRIERS =
- VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_UNIFORM_READ_BIT |
- VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_INDEX_READ_BIT;
+VkIndexType IndexTypeFromNumElements(const Device& device, u32 num_elements) { // Picks the narrowest index type allowed by the thresholds below
+ if (num_elements <= 0xff && device.IsExtIndexTypeUint8Supported()) { // 8-bit indices are only chosen when the device reports support for them
+ return VK_INDEX_TYPE_UINT8_EXT;
+ }
+ if (num_elements <= 0xffff) {
+ return VK_INDEX_TYPE_UINT16;
+ }
+ return VK_INDEX_TYPE_UINT32;
+}
-constexpr VkAccessFlags TRANSFORM_FEEDBACK_WRITE_ACCESS =
- VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT | VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT;
+size_t BytesPerIndex(VkIndexType index_type) { // Size in bytes of a single index of the given Vulkan index type
+ switch (index_type) {
+ case VK_INDEX_TYPE_UINT8_EXT:
+ return 1;
+ case VK_INDEX_TYPE_UINT16:
+ return 2;
+ case VK_INDEX_TYPE_UINT32:
+ return 4;
+ default:
+ UNREACHABLE_MSG("Invalid index type={}", index_type);
+ return 1; // Fallback value returned after reporting the unexpected type
+ }
+}
+template <typename T>
+std::array<T, 6> MakeQuadIndices(u32 quad, u32 first) { // Builds the six indices for quad number `quad`, offset by `first`
+ std::array<T, 6> indices{0, 1, 2, 0, 2, 3}; // Corner pattern of one quad, later shifted by first + quad * 4
+ std::ranges::transform(indices, indices.begin(),
+ [quad, first](u32 index) { return first + index + quad * 4; }); // Computed as u32 and converted to T when stored back
+ return indices;
+}
} // Anonymous namespace
-Buffer::Buffer(const Device& device_, MemoryAllocator& memory_allocator, VKScheduler& scheduler_,
- StagingBufferPool& staging_pool_, VAddr cpu_addr_, std::size_t size_)
- : BufferBlock{cpu_addr_, size_}, device{device_}, scheduler{scheduler_}, staging_pool{
- staging_pool_} {
- buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{
+Buffer::Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params)
+ : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(null_params) {}
+
+Buffer::Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_,
+ VAddr cpu_addr_, u64 size_bytes_)
+ : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(rasterizer_, cpu_addr_, size_bytes_) {
+ buffer = runtime.device.GetLogical().CreateBuffer(VkBufferCreateInfo{
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext = nullptr,
.flags = 0,
- .size = static_cast<VkDeviceSize>(size_),
- .usage = BUFFER_USAGE | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+ .size = SizeBytes(),
+ .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
+ VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT |
+ VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT |
+ VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT |
+ VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
.queueFamilyIndexCount = 0,
.pQueueFamilyIndices = nullptr,
});
- commit = memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal);
+ if (runtime.device.HasDebuggingToolAttached()) {
+ buffer.SetObjectNameEXT(fmt::format("Buffer 0x{:x}", CpuAddr()).c_str());
+ }
+ commit = runtime.memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal);
}
-Buffer::~Buffer() = default;
+BufferCacheRuntime::BufferCacheRuntime(const Device& device_, MemoryAllocator& memory_allocator_,
+ VKScheduler& scheduler_, StagingBufferPool& staging_pool_,
+ VKUpdateDescriptorQueue& update_descriptor_queue_,
+ VKDescriptorPool& descriptor_pool) // descriptor_pool is only forwarded to the two passes below
+ : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_},
+ staging_pool{staging_pool_}, update_descriptor_queue{update_descriptor_queue_},
+ uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue),
+ quad_index_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue) {} // Both passes are constructed with the same arguments
-void Buffer::Upload(std::size_t offset, std::size_t data_size, const u8* data) {
- const auto& staging = staging_pool.Request(data_size, MemoryUsage::Upload);
- std::memcpy(staging.mapped_span.data(), data, data_size);
+StagingBufferRef BufferCacheRuntime::UploadStagingBuffer(size_t size) { // Requests `size` bytes from the staging pool with MemoryUsage::Upload
+ return staging_pool.Request(size, MemoryUsage::Upload);
+}
- scheduler.RequestOutsideRenderPassOperationContext();
+StagingBufferRef BufferCacheRuntime::DownloadStagingBuffer(size_t size) { // Requests `size` bytes from the staging pool with MemoryUsage::Download
+ return staging_pool.Request(size, MemoryUsage::Download);
+}
- const VkBuffer handle = Handle();
- scheduler.Record([staging = staging.buffer, handle, offset, data_size,
- &device = device](vk::CommandBuffer cmdbuf) {
- const VkBufferMemoryBarrier read_barrier{
- .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
- .pNext = nullptr,
- .srcAccessMask =
- VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_TRANSFER_WRITE_BIT |
- VK_ACCESS_HOST_WRITE_BIT |
- (device.IsExtTransformFeedbackSupported() ? TRANSFORM_FEEDBACK_WRITE_ACCESS : 0),
- .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
- .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
- .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
- .buffer = handle,
- .offset = offset,
- .size = data_size,
- };
- const VkBufferMemoryBarrier write_barrier{
- .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
- .pNext = nullptr,
- .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
- .dstAccessMask = UPLOAD_ACCESS_BARRIERS,
- .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
- .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
- .buffer = handle,
- .offset = offset,
- .size = data_size,
- };
+void BufferCacheRuntime::Finish() { // Forwards to the scheduler's Finish()
+ scheduler.Finish();
+}
+
+void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer,
+ std::span<const VideoCommon::BufferCopy> copies) {
+ static constexpr VkMemoryBarrier READ_BARRIER{
+ .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
+ .pNext = nullptr,
+ .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
+ .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT,
+ };
+ static constexpr VkMemoryBarrier WRITE_BARRIER{
+ .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
+ .pNext = nullptr,
+ .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
+ .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
+ };
+ // Measuring a popular game, this number never exceeds the specified size once data is warmed up
+ boost::container::small_vector<VkBufferCopy, 3> vk_copies(copies.size());
+ std::ranges::transform(copies, vk_copies.begin(), MakeBufferCopy);
+ scheduler.RequestOutsideRenderPassOperationContext();
+ scheduler.Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) {
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
- 0, read_barrier);
- cmdbuf.CopyBuffer(staging, handle, VkBufferCopy{0, offset, data_size});
- cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0,
- write_barrier);
+ 0, READ_BARRIER);
+ cmdbuf.CopyBuffer(src_buffer, dst_buffer, vk_copies);
+ cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+ 0, WRITE_BARRIER);
});
}
-void Buffer::Download(std::size_t offset, std::size_t data_size, u8* data) {
- auto staging = staging_pool.Request(data_size, MemoryUsage::Download);
- scheduler.RequestOutsideRenderPassOperationContext();
+void BufferCacheRuntime::BindIndexBuffer(PrimitiveTopology topology, IndexFormat index_format,
+ u32 base_vertex, u32 num_indices, VkBuffer buffer,
+ u32 offset, [[maybe_unused]] u32 size) { // Records an index buffer bind, substituting an assembled buffer when needed
+ VkIndexType index_type = MaxwellToVK::IndexFormat(index_format);
+ if (topology == PrimitiveTopology::Quads) { // Quads: bind the buffer and offset returned by the quad index pass as 32-bit indices
+ index_type = VK_INDEX_TYPE_UINT32;
+ std::tie(buffer, offset) =
+ quad_index_pass.Assemble(index_format, num_indices, base_vertex, buffer, offset);
+ } else if (index_type == VK_INDEX_TYPE_UINT8_EXT && !device.IsExtIndexTypeUint8Supported()) { // 8-bit indices without device support: bind the uint8 pass output as 16-bit indices
+ index_type = VK_INDEX_TYPE_UINT16;
+ std::tie(buffer, offset) = uint8_pass.Assemble(num_indices, buffer, offset);
+ }
+ scheduler.Record([buffer, offset, index_type](vk::CommandBuffer cmdbuf) { // Captured by value for the recorded command
+ cmdbuf.BindIndexBuffer(buffer, offset, index_type);
+ });
+}
- const VkBuffer handle = Handle();
- scheduler.Record(
- [staging = staging.buffer, handle, offset, data_size](vk::CommandBuffer cmdbuf) {
- const VkBufferMemoryBarrier barrier{
- .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
- .pNext = nullptr,
- .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
- .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
- .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
- .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
- .buffer = handle,
- .offset = offset,
- .size = data_size,
- };
-
- cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
- VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
- VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
- VK_PIPELINE_STAGE_TRANSFER_BIT, 0, {}, barrier, {});
- cmdbuf.CopyBuffer(handle, staging, VkBufferCopy{offset, 0, data_size});
- });
- scheduler.Finish();
// Binds the quad array look-up table (LUT) as the index buffer.
// The LUT is first grown to cover first + count indices; growing it waits for the scheduler
// to finish (wait_for_idle is passed as true). The bind offset is then derived from `first`.
// NOTE(review): first + count is computed in u32 and could wrap for very large draws - confirm
// that callers keep the sum in range.
+void BufferCacheRuntime::BindQuadArrayIndexBuffer(u32 first, u32 count) {
+ ReserveQuadArrayLUT(first + count, true);
- std::memcpy(data, staging.mapped_span.data(), data_size);
+ // The LUT has the indices 0, 1, 2, and 3 copied as an array
+ // To apply these 'first' offsets we can apply an offset based on the modulus.
+ const VkIndexType index_type = quad_array_lut_index_type;
// Number of quads to skip to reach the LUT copy matching first % 4
// (each copy holds current_num_indices / 4 quads)
+ const size_t sub_first_offset = static_cast<size_t>(first % 4) * (current_num_indices / 4);
// Skip first / 4 further quads inside that copy; a quad is 6 indices, converted to bytes here
+ const size_t offset = (sub_first_offset + first / 4) * 6ULL * BytesPerIndex(index_type);
// The raw buffer handle is captured by value for the deferred command
+ scheduler.Record([buffer = *quad_array_lut, index_type, offset](vk::CommandBuffer cmdbuf) {
+ cmdbuf.BindIndexBuffer(buffer, offset, index_type);
+ });
}
-void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
- std::size_t copy_size) {
- scheduler.RequestOutsideRenderPassOperationContext();
// Records a command binding `buffer` to vertex binding slot `index`.
// With VK_EXT_extended_dynamic_state the size and stride are bound dynamically as well;
// without it only the buffer and offset are bound and `size`/`stride` are ignored here.
+void BufferCacheRuntime::BindVertexBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size,
+ u32 stride) {
+ if (device.IsExtExtendedDynamicStateSupported()) {
+ scheduler.Record([index, buffer, offset, size, stride](vk::CommandBuffer cmdbuf) {
// Widen the 32-bit values to VkDeviceSize so they can be passed by pointer
+ const VkDeviceSize vk_offset = offset;
// A null buffer handle is bound with VK_WHOLE_SIZE instead of the requested size
+ const VkDeviceSize vk_size = buffer != VK_NULL_HANDLE ? size : VK_WHOLE_SIZE;
+ const VkDeviceSize vk_stride = stride;
+ cmdbuf.BindVertexBuffers2EXT(index, 1, &buffer, &vk_offset, &vk_size, &vk_stride);
+ });
+ } else {
+ scheduler.Record([index, buffer, offset](vk::CommandBuffer cmdbuf) {
+ cmdbuf.BindVertexBuffer(index, buffer, offset);
+ });
+ }
+}
- const VkBuffer dst_buffer = Handle();
- scheduler.Record([src_buffer = src.Handle(), dst_buffer, src_offset, dst_offset,
- copy_size](vk::CommandBuffer cmdbuf) {
- cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, copy_size});
-
- std::array<VkBufferMemoryBarrier, 2> barriers;
- barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
- barriers[0].pNext = nullptr;
- barriers[0].srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
- barriers[0].dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
- barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
- barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
- barriers[0].buffer = src_buffer;
- barriers[0].offset = src_offset;
- barriers[0].size = copy_size;
- barriers[1].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
- barriers[1].pNext = nullptr;
- barriers[1].srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
- barriers[1].dstAccessMask = UPLOAD_ACCESS_BARRIERS;
- barriers[1].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
- barriers[1].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
- barriers[1].buffer = dst_buffer;
- barriers[1].offset = dst_offset;
- barriers[1].size = copy_size;
- cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0, {},
- barriers, {});
// Records a command binding `size` bytes of `buffer` starting at `offset` to transform
// feedback binding `index`. Does nothing when the device lacks transform feedback support.
+void BufferCacheRuntime::BindTransformFeedbackBuffer(u32 index, VkBuffer buffer, u32 offset,
+ u32 size) {
+ if (!device.IsExtTransformFeedbackSupported()) {
+ // Already logged in the rasterizer
+ return;
+ }
+ scheduler.Record([index, buffer, offset, size](vk::CommandBuffer cmdbuf) {
// Widen the 32-bit values to VkDeviceSize so they can be passed by pointer
+ const VkDeviceSize vk_offset = offset;
+ const VkDeviceSize vk_size = size;
+ cmdbuf.BindTransformFeedbackBuffersEXT(index, 1, &buffer, &vk_offset, &vk_size);
});
}
-VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer_,
- Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
- const Device& device_, MemoryAllocator& memory_allocator_,
- VKScheduler& scheduler_, VKStreamBuffer& stream_buffer_,
- StagingBufferPool& staging_pool_)
- : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer_, gpu_memory_,
- cpu_memory_, stream_buffer_},
- device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_},
- staging_pool{staging_pool_} {}
-
-VKBufferCache::~VKBufferCache() = default;
-
-std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
- return std::make_shared<Buffer>(device, memory_allocator, scheduler, staging_pool, cpu_addr,
- size);
// Appends the buffer range to the update descriptor queue; nothing is recorded on the
// scheduler here.
+void BufferCacheRuntime::BindBuffer(VkBuffer buffer, u32 offset, u32 size) {
+ update_descriptor_queue.AddBuffer(buffer, offset, size);
}
-VKBufferCache::BufferInfo VKBufferCache::GetEmptyBuffer(std::size_t size) {
- size = std::max(size, std::size_t(4));
- const auto& empty = staging_pool.Request(size, MemoryUsage::DeviceLocal);
// Makes sure the quad array look-up table (LUT) covers at least `num_indices` indices.
// Returns immediately when the current LUT is already large enough. Otherwise a new index
// buffer is created, filled through a staging buffer and made visible to index reads.
// When `wait_for_idle` is set, the scheduler is finished before the old LUT is replaced.
//
// Layout: 4 consecutive copies (one per value of first % 4), each holding num_indices / 4
// quads, each quad stored as 6 indices of the selected index type.
// NOTE(review): for num_indices < 4 the computed size_bytes is 0 - confirm callers never ask
// for that, since the buffer is created with that size.
+void BufferCacheRuntime::ReserveQuadArrayLUT(u32 num_indices, bool wait_for_idle) {
+ if (num_indices <= current_num_indices) {
+ return;
+ }
+ if (wait_for_idle) {
// presumably so no in-flight work still references the LUT that is about to be replaced
+ scheduler.Finish();
+ }
+ current_num_indices = num_indices;
+ quad_array_lut_index_type = IndexTypeFromNumElements(device, num_indices);
+
// Integer division: a trailing partial quad is dropped
+ const u32 num_quads = num_indices / 4;
+ const u32 num_triangle_indices = num_quads * 6;
+ const u32 num_first_offset_copies = 4;
+ const size_t bytes_per_index = BytesPerIndex(quad_array_lut_index_type);
+ const size_t size_bytes = num_triangle_indices * bytes_per_index * num_first_offset_copies;
// Assigning here replaces the previously held LUT buffer
+ quad_array_lut = device.GetLogical().CreateBuffer(VkBufferCreateInfo{
+ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+ .pNext = nullptr,
+ .flags = 0,
+ .size = size_bytes,
+ .usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+ .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+ .queueFamilyIndexCount = 0,
+ .pQueueFamilyIndices = nullptr,
+ });
+ if (device.HasDebuggingToolAttached()) {
+ quad_array_lut.SetObjectNameEXT("Quad LUT");
+ }
+ quad_array_lut_commit = memory_allocator.Commit(quad_array_lut, MemoryUsage::DeviceLocal);
+
// Build the table contents in a mapped upload staging buffer
+ const StagingBufferRef staging = staging_pool.Request(size_bytes, MemoryUsage::Upload);
+ u8* staging_data = staging.mapped_span.data();
+ const size_t quad_size = bytes_per_index * 6;
+ for (u32 first = 0; first < num_first_offset_copies; ++first) {
+ for (u32 quad = 0; quad < num_quads; ++quad) {
// Write the 6 indices of this quad using the element width of the chosen index type
+ switch (quad_array_lut_index_type) {
+ case VK_INDEX_TYPE_UINT8_EXT:
+ std::memcpy(staging_data, MakeQuadIndices<u8>(quad, first).data(), quad_size);
+ break;
+ case VK_INDEX_TYPE_UINT16:
+ std::memcpy(staging_data, MakeQuadIndices<u16>(quad, first).data(), quad_size);
+ break;
+ case VK_INDEX_TYPE_UINT32:
+ std::memcpy(staging_data, MakeQuadIndices<u32>(quad, first).data(), quad_size);
+ break;
+ default:
+ UNREACHABLE();
+ break;
+ }
+ staging_data += quad_size;
+ }
+ }
scheduler.RequestOutsideRenderPassOperationContext();
- scheduler.Record([size, buffer = empty.buffer](vk::CommandBuffer cmdbuf) {
- cmdbuf.FillBuffer(buffer, 0, size, 0);
// Deferred: copy the staging contents into the LUT, then order the transfer write before
// index reads at the vertex input stage
+ scheduler.Record([src_buffer = staging.buffer, dst_buffer = *quad_array_lut,
+ size_bytes](vk::CommandBuffer cmdbuf) {
+ const VkBufferCopy copy{
+ .srcOffset = 0,
+ .dstOffset = 0,
+ .size = size_bytes,
+ };
+ const VkBufferMemoryBarrier write_barrier{
+ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
+ .pNext = nullptr,
+ .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
+ .dstAccessMask = VK_ACCESS_INDEX_READ_BIT,
+ .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .buffer = dst_buffer,
+ .offset = 0,
+ .size = size_bytes,
+ };
+ cmdbuf.CopyBuffer(src_buffer, dst_buffer, copy);
+ cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT,
+ 0, write_barrier);
});
- return {empty.buffer, 0, 0};
}
} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h
index 41d577510..d232e1f2d 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -4,69 +4,112 @@
#pragma once
-#include <memory>
-
-#include "common/common_types.h"
#include "video_core/buffer_cache/buffer_cache.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/renderer_vulkan/vk_compute_pass.h"
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
-#include "video_core/renderer_vulkan/vk_stream_buffer.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
namespace Vulkan {
class Device;
+class VKDescriptorPool;
class VKScheduler;
+class VKUpdateDescriptorQueue;
-class Buffer final : public VideoCommon::BufferBlock {
-public:
- explicit Buffer(const Device& device, MemoryAllocator& memory_allocator, VKScheduler& scheduler,
- StagingBufferPool& staging_pool, VAddr cpu_addr_, std::size_t size_);
- ~Buffer();
-
- void Upload(std::size_t offset, std::size_t data_size, const u8* data);
-
- void Download(std::size_t offset, std::size_t data_size, u8* data);
+class BufferCacheRuntime;
- void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
- std::size_t copy_size);
// Vulkan buffer cache entry: a VideoCommon::BufferBase that owns a Vulkan buffer handle and
// its memory commit.
+class Buffer : public VideoCommon::BufferBase<VideoCore::RasterizerInterface> {
+public:
// Constructs the null buffer variant selected by the NullBufferParams tag
+ explicit Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params);
// Constructs a buffer for size_bytes_ bytes at cpu_addr_
+ explicit Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_,
+ VAddr cpu_addr_, u64 size_bytes_);
- VkBuffer Handle() const {
// Returns the raw Vulkan handle of the owned buffer
+ [[nodiscard]] VkBuffer Handle() const noexcept {
return *buffer;
}
- u64 Address() const {
- return 0;
// Implicit conversion yielding the same handle as Handle()
+ operator VkBuffer() const noexcept {
+ return *buffer;
}
private:
- const Device& device;
- VKScheduler& scheduler;
- StagingBufferPool& staging_pool;
-
vk::Buffer buffer;
MemoryCommit commit;
};
-class VKBufferCache final : public VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer> {
+class BufferCacheRuntime {
+ friend Buffer;
+
+ using PrimitiveTopology = Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology;
+ using IndexFormat = Tegra::Engines::Maxwell3D::Regs::IndexFormat;
+
public:
- explicit VKBufferCache(VideoCore::RasterizerInterface& rasterizer,
- Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory,
- const Device& device, MemoryAllocator& memory_allocator,
- VKScheduler& scheduler, VKStreamBuffer& stream_buffer,
- StagingBufferPool& staging_pool);
- ~VKBufferCache();
+ explicit BufferCacheRuntime(const Device& device_, MemoryAllocator& memory_manager_,
+ VKScheduler& scheduler_, StagingBufferPool& staging_pool_,
+ VKUpdateDescriptorQueue& update_descriptor_queue_,
+ VKDescriptorPool& descriptor_pool);
+
+ void Finish();
+
+ [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size);
+
+ [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size);
- BufferInfo GetEmptyBuffer(std::size_t size) override;
+ void CopyBuffer(VkBuffer src_buffer, VkBuffer dst_buffer,
+ std::span<const VideoCommon::BufferCopy> copies);
-protected:
- std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
+ void BindIndexBuffer(PrimitiveTopology topology, IndexFormat index_format, u32 num_indices,
+ u32 base_vertex, VkBuffer buffer, u32 offset, u32 size);
+
+ void BindQuadArrayIndexBuffer(u32 first, u32 count);
+
+ void BindVertexBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size, u32 stride);
+
+ void BindTransformFeedbackBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size);
+
+ void BindUniformBuffer(VkBuffer buffer, u32 offset, u32 size) {
+ BindBuffer(buffer, offset, size);
+ }
+
+ void BindStorageBuffer(VkBuffer buffer, u32 offset, u32 size,
+ [[maybe_unused]] bool is_written) {
+ BindBuffer(buffer, offset, size);
+ }
private:
+ void BindBuffer(VkBuffer buffer, u32 offset, u32 size);
+
+ void ReserveQuadArrayLUT(u32 num_indices, bool wait_for_idle);
+
const Device& device;
MemoryAllocator& memory_allocator;
VKScheduler& scheduler;
StagingBufferPool& staging_pool;
+ VKUpdateDescriptorQueue& update_descriptor_queue;
+
+ vk::Buffer quad_array_lut;
+ MemoryCommit quad_array_lut_commit;
+ VkIndexType quad_array_lut_index_type{};
+ u32 current_num_indices = 0;
+
+ Uint8Pass uint8_pass;
+ QuadIndexedPass quad_index_pass;
};
// Compile-time traits describing the Vulkan backend; this struct is the template argument
// of the VideoCommon::BufferCache instantiation used by this backend.
// NOTE(review): the meaning of each flag is defined by the generic buffer cache, which is not
// visible here - consult video_core/buffer_cache/buffer_cache.h before changing a value.
+struct BufferCacheParams {
+ using Runtime = Vulkan::BufferCacheRuntime;
+ using Buffer = Vulkan::Buffer;
+
+ static constexpr bool IS_OPENGL = false;
+ static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = false;
+ static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = false;
+ static constexpr bool NEEDS_BIND_UNIFORM_INDEX = false;
+ static constexpr bool NEEDS_BIND_STORAGE_INDEX = false;
+ static constexpr bool USE_MEMORY_MAPS = true;
+};
+
+using BufferCache = VideoCommon::BufferCache<BufferCacheParams>;
+
} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
index 5eb6a54be..a4fdcdf81 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@@ -10,7 +10,6 @@
#include "common/alignment.h"
#include "common/assert.h"
#include "common/common_types.h"
-#include "video_core/host_shaders/vulkan_quad_array_comp_spv.h"
#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
#include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
#include "video_core/renderer_vulkan/vk_compute_pass.h"
@@ -22,30 +21,7 @@
#include "video_core/vulkan_common/vulkan_wrapper.h"
namespace Vulkan {
-
namespace {
-
-VkDescriptorSetLayoutBinding BuildQuadArrayPassDescriptorSetLayoutBinding() {
- return {
- .binding = 0,
- .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- .descriptorCount = 1,
- .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
- .pImmutableSamplers = nullptr,
- };
-}
-
-VkDescriptorUpdateTemplateEntryKHR BuildQuadArrayPassDescriptorUpdateTemplateEntry() {
- return {
- .dstBinding = 0,
- .dstArrayElement = 0,
- .descriptorCount = 1,
- .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- .offset = 0,
- .stride = sizeof(DescriptorUpdateEntry),
- };
-}
-
VkPushConstantRange BuildComputePushConstantRange(std::size_t size) {
return {
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
@@ -162,55 +138,6 @@ VkDescriptorSet VKComputePass::CommitDescriptorSet(
return set;
}
-QuadArrayPass::QuadArrayPass(const Device& device_, VKScheduler& scheduler_,
- VKDescriptorPool& descriptor_pool_,
- StagingBufferPool& staging_buffer_pool_,
- VKUpdateDescriptorQueue& update_descriptor_queue_)
- : VKComputePass(device_, descriptor_pool_, BuildQuadArrayPassDescriptorSetLayoutBinding(),
- BuildQuadArrayPassDescriptorUpdateTemplateEntry(),
- BuildComputePushConstantRange(sizeof(u32)), VULKAN_QUAD_ARRAY_COMP_SPV),
- scheduler{scheduler_}, staging_buffer_pool{staging_buffer_pool_},
- update_descriptor_queue{update_descriptor_queue_} {}
-
-QuadArrayPass::~QuadArrayPass() = default;
-
-std::pair<VkBuffer, VkDeviceSize> QuadArrayPass::Assemble(u32 num_vertices, u32 first) {
- const u32 num_triangle_vertices = (num_vertices / 4) * 6;
- const std::size_t staging_size = num_triangle_vertices * sizeof(u32);
- const auto staging_ref = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal);
-
- update_descriptor_queue.Acquire();
- update_descriptor_queue.AddBuffer(staging_ref.buffer, 0, staging_size);
- const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue);
-
- scheduler.RequestOutsideRenderPassOperationContext();
-
- ASSERT(num_vertices % 4 == 0);
- const u32 num_quads = num_vertices / 4;
- scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging_ref.buffer,
- num_quads, first, set](vk::CommandBuffer cmdbuf) {
- constexpr u32 dispatch_size = 1024;
- cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
- cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {});
- cmdbuf.PushConstants(layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(first), &first);
- cmdbuf.Dispatch(Common::AlignUp(num_quads, dispatch_size) / dispatch_size, 1, 1);
-
- VkBufferMemoryBarrier barrier;
- barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
- barrier.pNext = nullptr;
- barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
- barrier.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
- barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
- barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
- barrier.buffer = buffer;
- barrier.offset = 0;
- barrier.size = static_cast<VkDeviceSize>(num_quads) * 6 * sizeof(u32);
- cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
- VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, {barrier}, {});
- });
- return {staging_ref.buffer, 0};
-}
-
Uint8Pass::Uint8Pass(const Device& device, VKScheduler& scheduler_,
VKDescriptorPool& descriptor_pool, StagingBufferPool& staging_buffer_pool_,
VKUpdateDescriptorQueue& update_descriptor_queue_)
@@ -221,18 +148,18 @@ Uint8Pass::Uint8Pass(const Device& device, VKScheduler& scheduler_,
Uint8Pass::~Uint8Pass() = default;
-std::pair<VkBuffer, u64> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer,
- u64 src_offset) {
+std::pair<VkBuffer, u32> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer,
+ u32 src_offset) {
const u32 staging_size = static_cast<u32>(num_vertices * sizeof(u16));
- const auto staging_ref = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal);
+ const auto staging = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal);
update_descriptor_queue.Acquire();
update_descriptor_queue.AddBuffer(src_buffer, src_offset, num_vertices);
- update_descriptor_queue.AddBuffer(staging_ref.buffer, 0, staging_size);
+ update_descriptor_queue.AddBuffer(staging.buffer, 0, staging_size);
const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue);
scheduler.RequestOutsideRenderPassOperationContext();
- scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging_ref.buffer, set,
+ scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set,
num_vertices](vk::CommandBuffer cmdbuf) {
constexpr u32 dispatch_size = 1024;
cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
@@ -252,7 +179,7 @@ std::pair<VkBuffer, u64> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buff
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {});
});
- return {staging_ref.buffer, 0};
+ return {staging.buffer, 0};
}
QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_,
@@ -267,9 +194,9 @@ QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_,
QuadIndexedPass::~QuadIndexedPass() = default;
-std::pair<VkBuffer, u64> QuadIndexedPass::Assemble(
+std::pair<VkBuffer, u32> QuadIndexedPass::Assemble(
Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices, u32 base_vertex,
- VkBuffer src_buffer, u64 src_offset) {
+ VkBuffer src_buffer, u32 src_offset) {
const u32 index_shift = [index_format] {
switch (index_format) {
case Tegra::Engines::Maxwell3D::Regs::IndexFormat::UnsignedByte:
@@ -286,15 +213,15 @@ std::pair<VkBuffer, u64> QuadIndexedPass::Assemble(
const u32 num_tri_vertices = (num_vertices / 4) * 6;
const std::size_t staging_size = num_tri_vertices * sizeof(u32);
- const auto staging_ref = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal);
+ const auto staging = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal);
update_descriptor_queue.Acquire();
update_descriptor_queue.AddBuffer(src_buffer, src_offset, input_size);
- update_descriptor_queue.AddBuffer(staging_ref.buffer, 0, staging_size);
+ update_descriptor_queue.AddBuffer(staging.buffer, 0, staging_size);
const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue);
scheduler.RequestOutsideRenderPassOperationContext();
- scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging_ref.buffer, set,
+ scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set,
num_tri_vertices, base_vertex, index_shift](vk::CommandBuffer cmdbuf) {
static constexpr u32 dispatch_size = 1024;
const std::array push_constants = {base_vertex, index_shift};
@@ -317,7 +244,7 @@ std::pair<VkBuffer, u64> QuadIndexedPass::Assemble(
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {});
});
- return {staging_ref.buffer, 0};
+ return {staging.buffer, 0};
}
} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h
index f5c6f5f17..4904019f5 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.h
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.h
@@ -41,22 +41,6 @@ private:
vk::ShaderModule module;
};
-class QuadArrayPass final : public VKComputePass {
-public:
- explicit QuadArrayPass(const Device& device_, VKScheduler& scheduler_,
- VKDescriptorPool& descriptor_pool_,
- StagingBufferPool& staging_buffer_pool_,
- VKUpdateDescriptorQueue& update_descriptor_queue_);
- ~QuadArrayPass();
-
- std::pair<VkBuffer, VkDeviceSize> Assemble(u32 num_vertices, u32 first);
-
-private:
- VKScheduler& scheduler;
- StagingBufferPool& staging_buffer_pool;
- VKUpdateDescriptorQueue& update_descriptor_queue;
-};
-
class Uint8Pass final : public VKComputePass {
public:
explicit Uint8Pass(const Device& device_, VKScheduler& scheduler_,
@@ -64,7 +48,9 @@ public:
VKUpdateDescriptorQueue& update_descriptor_queue_);
~Uint8Pass();
- std::pair<VkBuffer, u64> Assemble(u32 num_vertices, VkBuffer src_buffer, u64 src_offset);
+ /// Assemble uint8 indices into an uint16 index buffer
+ /// Returns a pair with the staging buffer, and the offset where the assembled data is
+ std::pair<VkBuffer, u32> Assemble(u32 num_vertices, VkBuffer src_buffer, u32 src_offset);
private:
VKScheduler& scheduler;
@@ -80,9 +66,9 @@ public:
VKUpdateDescriptorQueue& update_descriptor_queue_);
~QuadIndexedPass();
- std::pair<VkBuffer, u64> Assemble(Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format,
+ std::pair<VkBuffer, u32> Assemble(Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format,
u32 num_vertices, u32 base_vertex, VkBuffer src_buffer,
- u64 src_offset);
+ u32 src_offset);
private:
VKScheduler& scheduler;
diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.cpp b/src/video_core/renderer_vulkan/vk_fence_manager.cpp
index 6cd00884d..3bec48d14 100644
--- a/src/video_core/renderer_vulkan/vk_fence_manager.cpp
+++ b/src/video_core/renderer_vulkan/vk_fence_manager.cpp
@@ -45,8 +45,8 @@ void InnerFence::Wait() {
}
// Forwards the rasterizer, GPU and caches to GenericFenceManager and keeps a reference to
// the scheduler.
// NOTE(review): device_ is accepted but not used by the initializers below.
VKFenceManager::VKFenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_,
- Tegra::MemoryManager& memory_manager_, TextureCache& texture_cache_,
- VKBufferCache& buffer_cache_, VKQueryCache& query_cache_,
+ TextureCache& texture_cache_, BufferCache& buffer_cache_,
+ VKQueryCache& query_cache_, const Device& device_,
VKScheduler& scheduler_)
: GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_},
scheduler{scheduler_} {}
diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.h b/src/video_core/renderer_vulkan/vk_fence_manager.h
index 9c5e5aa8f..2f8322d29 100644
--- a/src/video_core/renderer_vulkan/vk_fence_manager.h
+++ b/src/video_core/renderer_vulkan/vk_fence_manager.h
@@ -22,7 +22,6 @@ class RasterizerInterface;
namespace Vulkan {
class Device;
-class VKBufferCache;
class VKQueryCache;
class VKScheduler;
@@ -45,14 +44,14 @@ private:
using Fence = std::shared_ptr<InnerFence>;
using GenericFenceManager =
- VideoCommon::FenceManager<Fence, TextureCache, VKBufferCache, VKQueryCache>;
+ VideoCommon::FenceManager<Fence, TextureCache, BufferCache, VKQueryCache>;
class VKFenceManager final : public GenericFenceManager {
public:
- explicit VKFenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_,
- Tegra::MemoryManager& memory_manager_, TextureCache& texture_cache_,
- VKBufferCache& buffer_cache_, VKQueryCache& query_cache_,
- VKScheduler& scheduler_);
+ explicit VKFenceManager(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu,
+ TextureCache& texture_cache, BufferCache& buffer_cache,
+ VKQueryCache& query_cache, const Device& device,
+ VKScheduler& scheduler);
protected:
Fence CreateFence(u32 value, bool is_stubbed) override;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index f0a111829..684d4e3a6 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -8,8 +8,6 @@
#include <mutex>
#include <vector>
-#include <boost/container/static_vector.hpp>
-
#include "common/alignment.h"
#include "common/assert.h"
#include "common/logging/log.h"
@@ -24,7 +22,6 @@
#include "video_core/renderer_vulkan/maxwell_to_vk.h"
#include "video_core/renderer_vulkan/renderer_vulkan.h"
#include "video_core/renderer_vulkan/vk_buffer_cache.h"
-#include "video_core/renderer_vulkan/vk_compute_pass.h"
#include "video_core/renderer_vulkan/vk_compute_pipeline.h"
#include "video_core/renderer_vulkan/vk_descriptor_pool.h"
#include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
@@ -50,15 +47,16 @@ MICROPROFILE_DEFINE(Vulkan_WaitForWorker, "Vulkan", "Wait for worker", MP_RGB(25
MICROPROFILE_DEFINE(Vulkan_Drawing, "Vulkan", "Record drawing", MP_RGB(192, 128, 128));
MICROPROFILE_DEFINE(Vulkan_Compute, "Vulkan", "Record compute", MP_RGB(192, 128, 128));
MICROPROFILE_DEFINE(Vulkan_Clearing, "Vulkan", "Record clearing", MP_RGB(192, 128, 128));
-MICROPROFILE_DEFINE(Vulkan_Geometry, "Vulkan", "Setup geometry", MP_RGB(192, 128, 128));
-MICROPROFILE_DEFINE(Vulkan_ConstBuffers, "Vulkan", "Setup constant buffers", MP_RGB(192, 128, 128));
-MICROPROFILE_DEFINE(Vulkan_GlobalBuffers, "Vulkan", "Setup global buffers", MP_RGB(192, 128, 128));
-MICROPROFILE_DEFINE(Vulkan_RenderTargets, "Vulkan", "Setup render targets", MP_RGB(192, 128, 128));
-MICROPROFILE_DEFINE(Vulkan_Textures, "Vulkan", "Setup textures", MP_RGB(192, 128, 128));
-MICROPROFILE_DEFINE(Vulkan_Images, "Vulkan", "Setup images", MP_RGB(192, 128, 128));
MICROPROFILE_DEFINE(Vulkan_PipelineCache, "Vulkan", "Pipeline cache", MP_RGB(192, 128, 128));
namespace {
// Arguments of a single draw call, filled from the Maxwell registers by MakeDrawParams.
+struct DrawParams {
// First instance of the draw
+ u32 base_instance;
// Number of instances to draw
+ u32 num_instances;
// Vertex base for indexed draws, first vertex otherwise
+ u32 base_vertex;
// Index count for indexed draws, vertex count otherwise
+ u32 num_vertices;
// True when the draw reads an index buffer
+ bool is_indexed;
+};
constexpr auto COMPUTE_SHADER_INDEX = static_cast<size_t>(Tegra::Engines::ShaderType::Compute);
@@ -67,7 +65,6 @@ VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t in
const float width = src.scale_x * 2.0f;
const float height = src.scale_y * 2.0f;
const float reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne ? 1.0f : 0.0f;
-
VkViewport viewport{
.x = src.translate_x - src.scale_x,
.y = src.translate_y - src.scale_y,
@@ -76,12 +73,10 @@ VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t in
.minDepth = src.translate_z - src.scale_z * reduce_z,
.maxDepth = src.translate_z + src.scale_z,
};
-
if (!device.IsExtDepthRangeUnrestrictedSupported()) {
viewport.minDepth = std::clamp(viewport.minDepth, 0.0f, 1.0f);
viewport.maxDepth = std::clamp(viewport.maxDepth, 0.0f, 1.0f);
}
-
return viewport;
}
@@ -146,13 +141,6 @@ TextureHandle GetTextureInfo(const Engine& engine, bool via_header_index, const
return TextureHandle(engine.AccessConstBuffer32(shader_type, buffer, offset), via_header_index);
}
-template <size_t N>
-std::array<VkDeviceSize, N> ExpandStrides(const std::array<u16, N>& strides) {
- std::array<VkDeviceSize, N> expanded;
- std::copy(strides.begin(), strides.end(), expanded.begin());
- return expanded;
-}
-
ImageViewType ImageViewTypeFromEntry(const SamplerEntry& entry) {
if (entry.is_buffer) {
return ImageViewType::e2D;
@@ -221,190 +209,25 @@ void PushImageDescriptors(const ShaderEntries& entries, TextureCache& texture_ca
}
}
-} // Anonymous namespace
-
-class BufferBindings final {
-public:
- void AddVertexBinding(VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size, u32 stride) {
- vertex.buffers[vertex.num_buffers] = buffer;
- vertex.offsets[vertex.num_buffers] = offset;
- vertex.sizes[vertex.num_buffers] = size;
- vertex.strides[vertex.num_buffers] = static_cast<u16>(stride);
- ++vertex.num_buffers;
- }
-
- void SetIndexBinding(VkBuffer buffer, VkDeviceSize offset, VkIndexType type) {
- index.buffer = buffer;
- index.offset = offset;
- index.type = type;
- }
-
- void Bind(const Device& device, VKScheduler& scheduler) const {
- // Use this large switch case to avoid dispatching more memory in the record lambda than
- // what we need. It looks horrible, but it's the best we can do on standard C++.
- switch (vertex.num_buffers) {
- case 0:
- return BindStatic<0>(device, scheduler);
- case 1:
- return BindStatic<1>(device, scheduler);
- case 2:
- return BindStatic<2>(device, scheduler);
- case 3:
- return BindStatic<3>(device, scheduler);
- case 4:
- return BindStatic<4>(device, scheduler);
- case 5:
- return BindStatic<5>(device, scheduler);
- case 6:
- return BindStatic<6>(device, scheduler);
- case 7:
- return BindStatic<7>(device, scheduler);
- case 8:
- return BindStatic<8>(device, scheduler);
- case 9:
- return BindStatic<9>(device, scheduler);
- case 10:
- return BindStatic<10>(device, scheduler);
- case 11:
- return BindStatic<11>(device, scheduler);
- case 12:
- return BindStatic<12>(device, scheduler);
- case 13:
- return BindStatic<13>(device, scheduler);
- case 14:
- return BindStatic<14>(device, scheduler);
- case 15:
- return BindStatic<15>(device, scheduler);
- case 16:
- return BindStatic<16>(device, scheduler);
- case 17:
- return BindStatic<17>(device, scheduler);
- case 18:
- return BindStatic<18>(device, scheduler);
- case 19:
- return BindStatic<19>(device, scheduler);
- case 20:
- return BindStatic<20>(device, scheduler);
- case 21:
- return BindStatic<21>(device, scheduler);
- case 22:
- return BindStatic<22>(device, scheduler);
- case 23:
- return BindStatic<23>(device, scheduler);
- case 24:
- return BindStatic<24>(device, scheduler);
- case 25:
- return BindStatic<25>(device, scheduler);
- case 26:
- return BindStatic<26>(device, scheduler);
- case 27:
- return BindStatic<27>(device, scheduler);
- case 28:
- return BindStatic<28>(device, scheduler);
- case 29:
- return BindStatic<29>(device, scheduler);
- case 30:
- return BindStatic<30>(device, scheduler);
- case 31:
- return BindStatic<31>(device, scheduler);
- case 32:
- return BindStatic<32>(device, scheduler);
- }
- UNREACHABLE();
- }
-
-private:
- // Some of these fields are intentionally left uninitialized to avoid initializing them twice.
- struct {
- size_t num_buffers = 0;
- std::array<VkBuffer, Maxwell::NumVertexArrays> buffers;
- std::array<VkDeviceSize, Maxwell::NumVertexArrays> offsets;
- std::array<VkDeviceSize, Maxwell::NumVertexArrays> sizes;
- std::array<u16, Maxwell::NumVertexArrays> strides;
- } vertex;
-
- struct {
- VkBuffer buffer = nullptr;
- VkDeviceSize offset;
- VkIndexType type;
- } index;
-
- template <size_t N>
- void BindStatic(const Device& device, VKScheduler& scheduler) const {
- if (device.IsExtExtendedDynamicStateSupported()) {
- if (index.buffer) {
- BindStatic<N, true, true>(scheduler);
- } else {
- BindStatic<N, false, true>(scheduler);
- }
- } else {
- if (index.buffer) {
- BindStatic<N, true, false>(scheduler);
- } else {
- BindStatic<N, false, false>(scheduler);
- }
- }
- }
-
- template <size_t N, bool is_indexed, bool has_extended_dynamic_state>
- void BindStatic(VKScheduler& scheduler) const {
- static_assert(N <= Maxwell::NumVertexArrays);
- if constexpr (N == 0) {
- return;
- }
-
- std::array<VkBuffer, N> buffers;
- std::array<VkDeviceSize, N> offsets;
- std::copy(vertex.buffers.begin(), vertex.buffers.begin() + N, buffers.begin());
- std::copy(vertex.offsets.begin(), vertex.offsets.begin() + N, offsets.begin());
-
- if constexpr (has_extended_dynamic_state) {
- // With extended dynamic states we can specify the length and stride of a vertex buffer
- std::array<VkDeviceSize, N> sizes;
- std::array<u16, N> strides;
- std::copy(vertex.sizes.begin(), vertex.sizes.begin() + N, sizes.begin());
- std::copy(vertex.strides.begin(), vertex.strides.begin() + N, strides.begin());
-
- if constexpr (is_indexed) {
- scheduler.Record(
- [buffers, offsets, sizes, strides, index = index](vk::CommandBuffer cmdbuf) {
- cmdbuf.BindIndexBuffer(index.buffer, index.offset, index.type);
- cmdbuf.BindVertexBuffers2EXT(0, static_cast<u32>(N), buffers.data(),
- offsets.data(), sizes.data(),
- ExpandStrides(strides).data());
- });
- } else {
- scheduler.Record([buffers, offsets, sizes, strides](vk::CommandBuffer cmdbuf) {
- cmdbuf.BindVertexBuffers2EXT(0, static_cast<u32>(N), buffers.data(),
- offsets.data(), sizes.data(),
- ExpandStrides(strides).data());
- });
- }
- return;
- }
-
- if constexpr (is_indexed) {
- // Indexed draw
- scheduler.Record([buffers, offsets, index = index](vk::CommandBuffer cmdbuf) {
- cmdbuf.BindIndexBuffer(index.buffer, index.offset, index.type);
- cmdbuf.BindVertexBuffers(0, static_cast<u32>(N), buffers.data(), offsets.data());
- });
- } else {
- // Array draw
- scheduler.Record([buffers, offsets](vk::CommandBuffer cmdbuf) {
- cmdbuf.BindVertexBuffers(0, static_cast<u32>(N), buffers.data(), offsets.data());
- });
- }
- }
-};
-
-void RasterizerVulkan::DrawParameters::Draw(vk::CommandBuffer cmdbuf) const {
- if (is_indexed) {
- cmdbuf.DrawIndexed(num_vertices, num_instances, 0, base_vertex, base_instance);
- } else {
- cmdbuf.Draw(num_vertices, num_instances, base_vertex, base_instance);
// Builds the draw arguments from the current Maxwell register state.
// `num_instances` is only honoured when `is_instanced` is set; otherwise one instance is
// drawn. Indexed draws take their base and count from the index array registers,
// non-indexed draws from the vertex buffer registers.
// Quads are always turned into an indexed draw with 6 vertices per quad and a zero base.
+DrawParams MakeDrawParams(const Maxwell& regs, u32 num_instances, bool is_instanced,
+ bool is_indexed) {
+ DrawParams params{
+ .base_instance = regs.vb_base_instance,
+ .num_instances = is_instanced ? num_instances : 1,
+ .base_vertex = is_indexed ? regs.vb_element_base : regs.vertex_buffer.first,
+ .num_vertices = is_indexed ? regs.index_array.count : regs.vertex_buffer.count,
+ .is_indexed = is_indexed,
+ };
+ if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) {
+ // 6 triangle vertices per quad, base vertex is part of the index
+ // See BindQuadArrayIndexBuffer for more details
// Integer division: a trailing partial quad is dropped
+ params.num_vertices = (params.num_vertices / 4) * 6;
+ params.base_vertex = 0;
+ params.is_indexed = true;
}
+ return params;
}
+} // Anonymous namespace
RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_,
Tegra::MemoryManager& gpu_memory_,
@@ -414,21 +237,19 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra
: RasterizerAccelerated{cpu_memory_}, gpu{gpu_},
gpu_memory{gpu_memory_}, maxwell3d{gpu.Maxwell3D()}, kepler_compute{gpu.KeplerCompute()},
screen_info{screen_info_}, device{device_}, memory_allocator{memory_allocator_},
- state_tracker{state_tracker_}, scheduler{scheduler_}, stream_buffer(device, scheduler),
+ state_tracker{state_tracker_}, scheduler{scheduler_},
staging_pool(device, memory_allocator, scheduler), descriptor_pool(device, scheduler),
update_descriptor_queue(device, scheduler),
blit_image(device, scheduler, state_tracker, descriptor_pool),
- quad_array_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue),
- quad_indexed_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue),
- uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue),
texture_cache_runtime{device, scheduler, memory_allocator, staging_pool, blit_image},
texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory),
+ buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool,
+ update_descriptor_queue, descriptor_pool),
+ buffer_cache(*this, maxwell3d, kepler_compute, gpu_memory, cpu_memory_, buffer_cache_runtime),
pipeline_cache(*this, gpu, maxwell3d, kepler_compute, gpu_memory, device, scheduler,
descriptor_pool, update_descriptor_queue),
- buffer_cache(*this, gpu_memory, cpu_memory_, device, memory_allocator, scheduler,
- stream_buffer, staging_pool),
query_cache{*this, maxwell3d, gpu_memory, device, scheduler},
- fence_manager(*this, gpu, gpu_memory, texture_cache, buffer_cache, query_cache, scheduler),
+ fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler),
wfi_event(device.GetLogical().CreateEvent()), async_shaders(emu_window_) {
scheduler.SetQueryCache(query_cache);
if (device.UseAsynchronousShaders()) {
@@ -449,22 +270,14 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
GraphicsPipelineCacheKey key;
key.fixed_state.Fill(maxwell3d.regs, device.IsExtExtendedDynamicStateSupported());
- buffer_cache.Map(CalculateGraphicsStreamBufferSize(is_indexed));
-
- BufferBindings buffer_bindings;
- const DrawParameters draw_params =
- SetupGeometry(key.fixed_state, buffer_bindings, is_indexed, is_instanced);
+ std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
- auto lock = texture_cache.AcquireLock();
texture_cache.SynchronizeGraphicsDescriptors();
-
texture_cache.UpdateRenderTargets(false);
const auto shaders = pipeline_cache.GetShaders();
key.shaders = GetShaderAddresses(shaders);
- SetupShaderDescriptors(shaders);
-
- buffer_cache.Unmap();
+ SetupShaderDescriptors(shaders, is_indexed);
const Framebuffer* const framebuffer = texture_cache.GetFramebuffer();
key.renderpass = framebuffer->RenderPass();
@@ -476,22 +289,29 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
return;
}
- buffer_bindings.Bind(device, scheduler);
-
BeginTransformFeedback();
scheduler.RequestRenderpass(framebuffer);
scheduler.BindGraphicsPipeline(pipeline->GetHandle());
UpdateDynamicStates();
- const auto pipeline_layout = pipeline->GetLayout();
- const auto descriptor_set = pipeline->CommitDescriptorSet();
+ const auto& regs = maxwell3d.regs;
+ const u32 num_instances = maxwell3d.mme_draw.instance_count;
+ const DrawParams draw_params = MakeDrawParams(regs, num_instances, is_instanced, is_indexed);
+ const VkPipelineLayout pipeline_layout = pipeline->GetLayout();
+ const VkDescriptorSet descriptor_set = pipeline->CommitDescriptorSet();
scheduler.Record([pipeline_layout, descriptor_set, draw_params](vk::CommandBuffer cmdbuf) {
if (descriptor_set) {
cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout,
- DESCRIPTOR_SET, descriptor_set, {});
+ DESCRIPTOR_SET, descriptor_set, nullptr);
+ }
+ if (draw_params.is_indexed) {
+ cmdbuf.DrawIndexed(draw_params.num_vertices, draw_params.num_instances, 0,
+ draw_params.base_vertex, draw_params.base_instance);
+ } else {
+ cmdbuf.Draw(draw_params.num_vertices, draw_params.num_instances,
+ draw_params.base_vertex, draw_params.base_instance);
}
- draw_params.Draw(cmdbuf);
});
EndTransformFeedback();
@@ -515,7 +335,7 @@ void RasterizerVulkan::Clear() {
return;
}
- auto lock = texture_cache.AcquireLock();
+ std::scoped_lock lock{texture_cache.mutex};
texture_cache.UpdateRenderTargets(true);
const Framebuffer* const framebuffer = texture_cache.GetFramebuffer();
const VkExtent2D render_area = framebuffer->RenderArea();
@@ -559,7 +379,6 @@ void RasterizerVulkan::Clear() {
if (use_stencil) {
aspect_flags |= VK_IMAGE_ASPECT_STENCIL_BIT;
}
-
scheduler.Record([clear_depth = regs.clear_depth, clear_stencil = regs.clear_stencil,
clear_rect, aspect_flags](vk::CommandBuffer cmdbuf) {
VkClearAttachment attachment;
@@ -580,12 +399,11 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
auto& pipeline = pipeline_cache.GetComputePipeline({
.shader = code_addr,
.shared_memory_size = launch_desc.shared_alloc,
- .workgroup_size =
- {
- launch_desc.block_dim_x,
- launch_desc.block_dim_y,
- launch_desc.block_dim_z,
- },
+ .workgroup_size{
+ launch_desc.block_dim_x,
+ launch_desc.block_dim_y,
+ launch_desc.block_dim_z,
+ },
});
// Compute dispatches can't be executed inside a renderpass
@@ -594,10 +412,21 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
image_view_indices.clear();
sampler_handles.clear();
- auto lock = texture_cache.AcquireLock();
- texture_cache.SynchronizeComputeDescriptors();
+ std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
const auto& entries = pipeline.GetEntries();
+ buffer_cache.SetEnabledComputeUniformBuffers(entries.enabled_uniform_buffers);
+ buffer_cache.UnbindComputeStorageBuffers();
+ u32 ssbo_index = 0;
+ for (const auto& buffer : entries.global_buffers) {
+ buffer_cache.BindComputeStorageBuffer(ssbo_index, buffer.cbuf_index, buffer.cbuf_offset,
+ buffer.is_written);
+ ++ssbo_index;
+ }
+ buffer_cache.UpdateComputeBuffers();
+
+ texture_cache.SynchronizeComputeDescriptors();
+
SetupComputeUniformTexels(entries);
SetupComputeTextures(entries);
SetupComputeStorageTexels(entries);
@@ -606,20 +435,15 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
const std::span indices_span(image_view_indices.data(), image_view_indices.size());
texture_cache.FillComputeImageViews(indices_span, image_view_ids);
- buffer_cache.Map(CalculateComputeStreamBufferSize());
-
update_descriptor_queue.Acquire();
- SetupComputeConstBuffers(entries);
- SetupComputeGlobalBuffers(entries);
+ buffer_cache.BindHostComputeBuffers();
ImageViewId* image_view_id_ptr = image_view_ids.data();
VkSampler* sampler_ptr = sampler_handles.data();
PushImageDescriptors(entries, texture_cache, update_descriptor_queue, image_view_id_ptr,
sampler_ptr);
- buffer_cache.Unmap();
-
const VkPipeline pipeline_handle = pipeline.GetHandle();
const VkPipelineLayout pipeline_layout = pipeline.GetLayout();
const VkDescriptorSet descriptor_set = pipeline.CommitDescriptorSet();
@@ -644,6 +468,11 @@ void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCore::QueryType type,
query_cache.Query(gpu_addr, type, timestamp);
}
+void RasterizerVulkan::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
+ u32 size) {
+ buffer_cache.BindGraphicsUniformBuffer(stage, index, gpu_addr, size);
+}
+
void RasterizerVulkan::FlushAll() {}
void RasterizerVulkan::FlushRegion(VAddr addr, u64 size) {
@@ -651,19 +480,23 @@ void RasterizerVulkan::FlushRegion(VAddr addr, u64 size) {
return;
}
{
- auto lock = texture_cache.AcquireLock();
+ std::scoped_lock lock{texture_cache.mutex};
texture_cache.DownloadMemory(addr, size);
}
- buffer_cache.FlushRegion(addr, size);
+ {
+ std::scoped_lock lock{buffer_cache.mutex};
+ buffer_cache.DownloadMemory(addr, size);
+ }
query_cache.FlushRegion(addr, size);
}
bool RasterizerVulkan::MustFlushRegion(VAddr addr, u64 size) {
+ std::scoped_lock lock{texture_cache.mutex, buffer_cache.mutex};
if (!Settings::IsGPULevelHigh()) {
- return buffer_cache.MustFlushRegion(addr, size);
+ return buffer_cache.IsRegionGpuModified(addr, size);
}
return texture_cache.IsRegionGpuModified(addr, size) ||
- buffer_cache.MustFlushRegion(addr, size);
+ buffer_cache.IsRegionGpuModified(addr, size);
}
void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size) {
@@ -671,11 +504,14 @@ void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size) {
return;
}
{
- auto lock = texture_cache.AcquireLock();
+ std::scoped_lock lock{texture_cache.mutex};
texture_cache.WriteMemory(addr, size);
}
+ {
+ std::scoped_lock lock{buffer_cache.mutex};
+ buffer_cache.WriteMemory(addr, size);
+ }
pipeline_cache.InvalidateRegion(addr, size);
- buffer_cache.InvalidateRegion(addr, size);
query_cache.InvalidateRegion(addr, size);
}
@@ -683,25 +519,34 @@ void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) {
if (addr == 0 || size == 0) {
return;
}
+ pipeline_cache.OnCPUWrite(addr, size);
{
- auto lock = texture_cache.AcquireLock();
+ std::scoped_lock lock{texture_cache.mutex};
texture_cache.WriteMemory(addr, size);
}
- pipeline_cache.OnCPUWrite(addr, size);
- buffer_cache.OnCPUWrite(addr, size);
+ {
+ std::scoped_lock lock{buffer_cache.mutex};
+ buffer_cache.CachedWriteMemory(addr, size);
+ }
}
void RasterizerVulkan::SyncGuestHost() {
- buffer_cache.SyncGuestHost();
pipeline_cache.SyncGuestHost();
+ {
+ std::scoped_lock lock{buffer_cache.mutex};
+ buffer_cache.FlushCachedWrites();
+ }
}
void RasterizerVulkan::UnmapMemory(VAddr addr, u64 size) {
{
- auto lock = texture_cache.AcquireLock();
+ std::scoped_lock lock{texture_cache.mutex};
texture_cache.UnmapMemory(addr, size);
}
- buffer_cache.OnCPUWrite(addr, size);
+ {
+ std::scoped_lock lock{buffer_cache.mutex};
+ buffer_cache.WriteMemory(addr, size);
+ }
pipeline_cache.OnCPUWrite(addr, size);
}
@@ -774,18 +619,21 @@ void RasterizerVulkan::TickFrame() {
draw_counter = 0;
update_descriptor_queue.TickFrame();
fence_manager.TickFrame();
- buffer_cache.TickFrame();
staging_pool.TickFrame();
{
- auto lock = texture_cache.AcquireLock();
+ std::scoped_lock lock{texture_cache.mutex};
texture_cache.TickFrame();
}
+ {
+ std::scoped_lock lock{buffer_cache.mutex};
+ buffer_cache.TickFrame();
+ }
}
bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src,
const Tegra::Engines::Fermi2D::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) {
- auto lock = texture_cache.AcquireLock();
+ std::scoped_lock lock{texture_cache.mutex};
texture_cache.BlitImage(dst, src, copy_config);
return true;
}
@@ -795,13 +643,11 @@ bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config,
if (!framebuffer_addr) {
return false;
}
-
- auto lock = texture_cache.AcquireLock();
+ std::scoped_lock lock{texture_cache.mutex};
ImageView* const image_view = texture_cache.TryFindFramebufferImageView(framebuffer_addr);
if (!image_view) {
return false;
}
-
screen_info.image_view = image_view->Handle(VideoCommon::ImageViewType::e2D);
screen_info.width = image_view->size.width;
screen_info.height = image_view->size.height;
@@ -830,29 +676,8 @@ void RasterizerVulkan::FlushWork() {
draw_counter = 0;
}
-RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineState& fixed_state,
- BufferBindings& buffer_bindings,
- bool is_indexed,
- bool is_instanced) {
- MICROPROFILE_SCOPE(Vulkan_Geometry);
-
- const auto& regs = maxwell3d.regs;
-
- SetupVertexArrays(buffer_bindings);
-
- const u32 base_instance = regs.vb_base_instance;
- const u32 num_instances = is_instanced ? maxwell3d.mme_draw.instance_count : 1;
- const u32 base_vertex = is_indexed ? regs.vb_element_base : regs.vertex_buffer.first;
- const u32 num_vertices = is_indexed ? regs.index_array.count : regs.vertex_buffer.count;
-
- DrawParameters params{base_instance, num_instances, base_vertex, num_vertices, is_indexed};
- SetupIndexBuffer(buffer_bindings, params, is_indexed);
-
- return params;
-}
-
void RasterizerVulkan::SetupShaderDescriptors(
- const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) {
+ const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders, bool is_indexed) {
image_view_indices.clear();
sampler_handles.clear();
for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) {
@@ -860,15 +685,27 @@ void RasterizerVulkan::SetupShaderDescriptors(
if (!shader) {
continue;
}
- const auto& entries = shader->GetEntries();
+ const ShaderEntries& entries = shader->GetEntries();
SetupGraphicsUniformTexels(entries, stage);
SetupGraphicsTextures(entries, stage);
SetupGraphicsStorageTexels(entries, stage);
SetupGraphicsImages(entries, stage);
+
+ buffer_cache.SetEnabledUniformBuffers(stage, entries.enabled_uniform_buffers);
+ buffer_cache.UnbindGraphicsStorageBuffers(stage);
+ u32 ssbo_index = 0;
+ for (const auto& buffer : entries.global_buffers) {
+ buffer_cache.BindGraphicsStorageBuffer(stage, ssbo_index, buffer.cbuf_index,
+ buffer.cbuf_offset, buffer.is_written);
+ ++ssbo_index;
+ }
}
const std::span indices_span(image_view_indices.data(), image_view_indices.size());
+ buffer_cache.UpdateGraphicsBuffers(is_indexed);
texture_cache.FillGraphicsImageViews(indices_span, image_view_ids);
+ buffer_cache.BindHostGeometryBuffers(is_indexed);
+
update_descriptor_queue.Acquire();
ImageViewId* image_view_id_ptr = image_view_ids.data();
@@ -879,11 +716,9 @@ void RasterizerVulkan::SetupShaderDescriptors(
if (!shader) {
continue;
}
- const auto& entries = shader->GetEntries();
- SetupGraphicsConstBuffers(entries, stage);
- SetupGraphicsGlobalBuffers(entries, stage);
- PushImageDescriptors(entries, texture_cache, update_descriptor_queue, image_view_id_ptr,
- sampler_ptr);
+ buffer_cache.BindHostStageBuffers(stage);
+ PushImageDescriptors(shader->GetEntries(), texture_cache, update_descriptor_queue,
+ image_view_id_ptr, sampler_ptr);
}
}
@@ -916,27 +751,11 @@ void RasterizerVulkan::BeginTransformFeedback() {
LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported");
return;
}
-
UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
-
- UNIMPLEMENTED_IF(regs.tfb_bindings[1].buffer_enable);
- UNIMPLEMENTED_IF(regs.tfb_bindings[2].buffer_enable);
- UNIMPLEMENTED_IF(regs.tfb_bindings[3].buffer_enable);
-
- const auto& binding = regs.tfb_bindings[0];
- UNIMPLEMENTED_IF(binding.buffer_enable == 0);
- UNIMPLEMENTED_IF(binding.buffer_offset != 0);
-
- const GPUVAddr gpu_addr = binding.Address();
- const VkDeviceSize size = static_cast<VkDeviceSize>(binding.buffer_size);
- const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
-
- scheduler.Record([buffer = info.handle, offset = info.offset, size](vk::CommandBuffer cmdbuf) {
- cmdbuf.BindTransformFeedbackBuffersEXT(0, 1, &buffer, &offset, &size);
- cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr);
- });
+ scheduler.Record(
+ [](vk::CommandBuffer cmdbuf) { cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); });
}
void RasterizerVulkan::EndTransformFeedback() {
@@ -947,104 +766,11 @@ void RasterizerVulkan::EndTransformFeedback() {
if (!device.IsExtTransformFeedbackSupported()) {
return;
}
-
scheduler.Record(
[](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); });
}
-void RasterizerVulkan::SetupVertexArrays(BufferBindings& buffer_bindings) {
- const auto& regs = maxwell3d.regs;
-
- for (size_t index = 0; index < Maxwell::NumVertexArrays; ++index) {
- const auto& vertex_array = regs.vertex_array[index];
- if (!vertex_array.IsEnabled()) {
- continue;
- }
- const GPUVAddr start{vertex_array.StartAddress()};
- const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()};
-
- ASSERT(end >= start);
- const size_t size = end - start;
- if (size == 0) {
- buffer_bindings.AddVertexBinding(DefaultBuffer(), 0, DEFAULT_BUFFER_SIZE, 0);
- continue;
- }
- const auto info = buffer_cache.UploadMemory(start, size);
- buffer_bindings.AddVertexBinding(info.handle, info.offset, size, vertex_array.stride);
- }
-}
-
-void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawParameters& params,
- bool is_indexed) {
- if (params.num_vertices == 0) {
- return;
- }
- const auto& regs = maxwell3d.regs;
- switch (regs.draw.topology) {
- case Maxwell::PrimitiveTopology::Quads: {
- if (!params.is_indexed) {
- const auto [buffer, offset] =
- quad_array_pass.Assemble(params.num_vertices, params.base_vertex);
- buffer_bindings.SetIndexBinding(buffer, offset, VK_INDEX_TYPE_UINT32);
- params.base_vertex = 0;
- params.num_vertices = params.num_vertices * 6 / 4;
- params.is_indexed = true;
- break;
- }
- const GPUVAddr gpu_addr = regs.index_array.IndexStart();
- const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
- VkBuffer buffer = info.handle;
- u64 offset = info.offset;
- std::tie(buffer, offset) = quad_indexed_pass.Assemble(
- regs.index_array.format, params.num_vertices, params.base_vertex, buffer, offset);
-
- buffer_bindings.SetIndexBinding(buffer, offset, VK_INDEX_TYPE_UINT32);
- params.num_vertices = (params.num_vertices / 4) * 6;
- params.base_vertex = 0;
- break;
- }
- default: {
- if (!is_indexed) {
- break;
- }
- const GPUVAddr gpu_addr = regs.index_array.IndexStart();
- const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
- VkBuffer buffer = info.handle;
- u64 offset = info.offset;
-
- auto format = regs.index_array.format;
- const bool is_uint8 = format == Maxwell::IndexFormat::UnsignedByte;
- if (is_uint8 && !device.IsExtIndexTypeUint8Supported()) {
- std::tie(buffer, offset) = uint8_pass.Assemble(params.num_vertices, buffer, offset);
- format = Maxwell::IndexFormat::UnsignedShort;
- }
-
- buffer_bindings.SetIndexBinding(buffer, offset, MaxwellToVK::IndexFormat(device, format));
- break;
- }
- }
-}
-
-void RasterizerVulkan::SetupGraphicsConstBuffers(const ShaderEntries& entries, size_t stage) {
- MICROPROFILE_SCOPE(Vulkan_ConstBuffers);
- const auto& shader_stage = maxwell3d.state.shader_stages[stage];
- for (const auto& entry : entries.const_buffers) {
- SetupConstBuffer(entry, shader_stage.const_buffers[entry.GetIndex()]);
- }
-}
-
-void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries, size_t stage) {
- MICROPROFILE_SCOPE(Vulkan_GlobalBuffers);
- const auto& cbufs{maxwell3d.state.shader_stages[stage]};
-
- for (const auto& entry : entries.global_buffers) {
- const auto addr = cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset();
- SetupGlobalBuffer(entry, addr);
- }
-}
-
void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, size_t stage) {
- MICROPROFILE_SCOPE(Vulkan_Textures);
const auto& regs = maxwell3d.regs;
const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
for (const auto& entry : entries.uniform_texels) {
@@ -1054,7 +780,6 @@ void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries,
}
void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, size_t stage) {
- MICROPROFILE_SCOPE(Vulkan_Textures);
const auto& regs = maxwell3d.regs;
const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
for (const auto& entry : entries.samplers) {
@@ -1070,7 +795,6 @@ void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, size_
}
void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, size_t stage) {
- MICROPROFILE_SCOPE(Vulkan_Textures);
const auto& regs = maxwell3d.regs;
const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
for (const auto& entry : entries.storage_texels) {
@@ -1080,7 +804,6 @@ void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries,
}
void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, size_t stage) {
- MICROPROFILE_SCOPE(Vulkan_Images);
const auto& regs = maxwell3d.regs;
const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
for (const auto& entry : entries.images) {
@@ -1089,32 +812,7 @@ void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, size_t
}
}
-void RasterizerVulkan::SetupComputeConstBuffers(const ShaderEntries& entries) {
- MICROPROFILE_SCOPE(Vulkan_ConstBuffers);
- const auto& launch_desc = kepler_compute.launch_description;
- for (const auto& entry : entries.const_buffers) {
- const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
- const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
- const Tegra::Engines::ConstBufferInfo info{
- .address = config.Address(),
- .size = config.size,
- .enabled = mask[entry.GetIndex()],
- };
- SetupConstBuffer(entry, info);
- }
-}
-
-void RasterizerVulkan::SetupComputeGlobalBuffers(const ShaderEntries& entries) {
- MICROPROFILE_SCOPE(Vulkan_GlobalBuffers);
- const auto& cbufs{kepler_compute.launch_description.const_buffer_config};
- for (const auto& entry : entries.global_buffers) {
- const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()};
- SetupGlobalBuffer(entry, addr);
- }
-}
-
void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) {
- MICROPROFILE_SCOPE(Vulkan_Textures);
const bool via_header_index = kepler_compute.launch_description.linked_tsc;
for (const auto& entry : entries.uniform_texels) {
const TextureHandle handle =
@@ -1124,7 +822,6 @@ void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) {
}
void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) {
- MICROPROFILE_SCOPE(Vulkan_Textures);
const bool via_header_index = kepler_compute.launch_description.linked_tsc;
for (const auto& entry : entries.samplers) {
for (size_t index = 0; index < entry.size; ++index) {
@@ -1139,7 +836,6 @@ void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) {
}
void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) {
- MICROPROFILE_SCOPE(Vulkan_Textures);
const bool via_header_index = kepler_compute.launch_description.linked_tsc;
for (const auto& entry : entries.storage_texels) {
const TextureHandle handle =
@@ -1149,7 +845,6 @@ void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) {
}
void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) {
- MICROPROFILE_SCOPE(Vulkan_Images);
const bool via_header_index = kepler_compute.launch_description.linked_tsc;
for (const auto& entry : entries.images) {
const TextureHandle handle =
@@ -1158,42 +853,6 @@ void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) {
}
}
-void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry,
- const Tegra::Engines::ConstBufferInfo& buffer) {
- if (!buffer.enabled) {
- // Set values to zero to unbind buffers
- update_descriptor_queue.AddBuffer(DefaultBuffer(), 0, DEFAULT_BUFFER_SIZE);
- return;
- }
- // Align the size to avoid bad std140 interactions
- const size_t size = Common::AlignUp(CalculateConstBufferSize(entry, buffer), 4 * sizeof(float));
- ASSERT(size <= MaxConstbufferSize);
-
- const u64 alignment = device.GetUniformBufferAlignment();
- const auto info = buffer_cache.UploadMemory(buffer.address, size, alignment);
- update_descriptor_queue.AddBuffer(info.handle, info.offset, size);
-}
-
-void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address) {
- const u64 actual_addr = gpu_memory.Read<u64>(address);
- const u32 size = gpu_memory.Read<u32>(address + 8);
-
- if (size == 0) {
- // Sometimes global memory pointers don't have a proper size. Upload a dummy entry
- // because Vulkan doesn't like empty buffers.
- // Note: Do *not* use DefaultBuffer() here, storage buffers can be written breaking the
- // default buffer.
- static constexpr size_t dummy_size = 4;
- const auto info = buffer_cache.GetEmptyBuffer(dummy_size);
- update_descriptor_queue.AddBuffer(info.handle, info.offset, dummy_size);
- return;
- }
-
- const auto info = buffer_cache.UploadMemory(
- actual_addr, size, device.GetStorageBufferAlignment(), entry.IsWritten());
- update_descriptor_queue.AddBuffer(info.handle, info.offset, size);
-}
-
void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) {
if (!state_tracker.TouchViewports()) {
return;
@@ -1206,7 +865,8 @@ void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& reg
GetViewportState(device, regs, 8), GetViewportState(device, regs, 9),
GetViewportState(device, regs, 10), GetViewportState(device, regs, 11),
GetViewportState(device, regs, 12), GetViewportState(device, regs, 13),
- GetViewportState(device, regs, 14), GetViewportState(device, regs, 15)};
+ GetViewportState(device, regs, 14), GetViewportState(device, regs, 15),
+ };
scheduler.Record([viewports](vk::CommandBuffer cmdbuf) { cmdbuf.SetViewport(0, viewports); });
}
@@ -1214,13 +874,14 @@ void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs
if (!state_tracker.TouchScissors()) {
return;
}
- const std::array scissors = {
+ const std::array scissors{
GetScissorState(regs, 0), GetScissorState(regs, 1), GetScissorState(regs, 2),
GetScissorState(regs, 3), GetScissorState(regs, 4), GetScissorState(regs, 5),
GetScissorState(regs, 6), GetScissorState(regs, 7), GetScissorState(regs, 8),
GetScissorState(regs, 9), GetScissorState(regs, 10), GetScissorState(regs, 11),
GetScissorState(regs, 12), GetScissorState(regs, 13), GetScissorState(regs, 14),
- GetScissorState(regs, 15)};
+ GetScissorState(regs, 15),
+ };
scheduler.Record([scissors](vk::CommandBuffer cmdbuf) { cmdbuf.SetScissor(0, scissors); });
}
@@ -1385,73 +1046,4 @@ void RasterizerVulkan::UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs&
});
}
-size_t RasterizerVulkan::CalculateGraphicsStreamBufferSize(bool is_indexed) const {
- size_t size = CalculateVertexArraysSize();
- if (is_indexed) {
- size = Common::AlignUp(size, 4) + CalculateIndexBufferSize();
- }
- size += Maxwell::MaxConstBuffers * (MaxConstbufferSize + device.GetUniformBufferAlignment());
- return size;
-}
-
-size_t RasterizerVulkan::CalculateComputeStreamBufferSize() const {
- return Tegra::Engines::KeplerCompute::NumConstBuffers *
- (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
-}
-
-size_t RasterizerVulkan::CalculateVertexArraysSize() const {
- const auto& regs = maxwell3d.regs;
-
- size_t size = 0;
- for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
- // This implementation assumes that all attributes are used in the shader.
- const GPUVAddr start{regs.vertex_array[index].StartAddress()};
- const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()};
- DEBUG_ASSERT(end >= start);
-
- size += (end - start) * regs.vertex_array[index].enable;
- }
- return size;
-}
-
-size_t RasterizerVulkan::CalculateIndexBufferSize() const {
- return static_cast<size_t>(maxwell3d.regs.index_array.count) *
- static_cast<size_t>(maxwell3d.regs.index_array.FormatSizeInBytes());
-}
-
-size_t RasterizerVulkan::CalculateConstBufferSize(
- const ConstBufferEntry& entry, const Tegra::Engines::ConstBufferInfo& buffer) const {
- if (entry.IsIndirect()) {
- // Buffer is accessed indirectly, so upload the entire thing
- return buffer.size;
- } else {
- // Buffer is accessed directly, upload just what we use
- return entry.GetSize();
- }
-}
-
-VkBuffer RasterizerVulkan::DefaultBuffer() {
- if (default_buffer) {
- return *default_buffer;
- }
- default_buffer = device.GetLogical().CreateBuffer({
- .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
- .pNext = nullptr,
- .flags = 0,
- .size = DEFAULT_BUFFER_SIZE,
- .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT |
- VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
- .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
- .queueFamilyIndexCount = 0,
- .pQueueFamilyIndices = nullptr,
- });
- default_buffer_commit = memory_allocator.Commit(default_buffer, MemoryUsage::DeviceLocal);
-
- scheduler.RequestOutsideRenderPassOperationContext();
- scheduler.Record([buffer = *default_buffer](vk::CommandBuffer cmdbuf) {
- cmdbuf.FillBuffer(buffer, 0, DEFAULT_BUFFER_SIZE, 0);
- });
- return *default_buffer;
-}
-
} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 8e261b9bd..7fc6741da 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -18,14 +18,12 @@
#include "video_core/renderer_vulkan/blit_image.h"
#include "video_core/renderer_vulkan/fixed_pipeline_state.h"
#include "video_core/renderer_vulkan/vk_buffer_cache.h"
-#include "video_core/renderer_vulkan/vk_compute_pass.h"
#include "video_core/renderer_vulkan/vk_descriptor_pool.h"
#include "video_core/renderer_vulkan/vk_fence_manager.h"
#include "video_core/renderer_vulkan/vk_pipeline_cache.h"
#include "video_core/renderer_vulkan/vk_query_cache.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
-#include "video_core/renderer_vulkan/vk_stream_buffer.h"
#include "video_core/renderer_vulkan/vk_texture_cache.h"
#include "video_core/renderer_vulkan/vk_update_descriptor.h"
#include "video_core/shader/async_shaders.h"
@@ -49,7 +47,6 @@ namespace Vulkan {
struct VKScreenInfo;
class StateTracker;
-class BufferBindings;
class RasterizerVulkan final : public VideoCore::RasterizerAccelerated {
public:
@@ -65,6 +62,7 @@ public:
void DispatchCompute(GPUVAddr code_addr) override;
void ResetCounter(VideoCore::QueryType type) override;
void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override;
+ void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override;
void FlushAll() override;
void FlushRegion(VAddr addr, u64 size) override;
bool MustFlushRegion(VAddr addr, u64 size) override;
@@ -107,24 +105,11 @@ private:
static constexpr VkDeviceSize DEFAULT_BUFFER_SIZE = 4 * sizeof(float);
- struct DrawParameters {
- void Draw(vk::CommandBuffer cmdbuf) const;
-
- u32 base_instance = 0;
- u32 num_instances = 0;
- u32 base_vertex = 0;
- u32 num_vertices = 0;
- bool is_indexed = 0;
- };
-
void FlushWork();
- /// Setups geometry buffers and state.
- DrawParameters SetupGeometry(FixedPipelineState& fixed_state, BufferBindings& buffer_bindings,
- bool is_indexed, bool is_instanced);
-
/// Setup descriptors in the graphics pipeline.
- void SetupShaderDescriptors(const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders);
+ void SetupShaderDescriptors(const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders,
+ bool is_indexed);
void UpdateDynamicStates();
@@ -132,16 +117,6 @@ private:
void EndTransformFeedback();
- void SetupVertexArrays(BufferBindings& buffer_bindings);
-
- void SetupIndexBuffer(BufferBindings& buffer_bindings, DrawParameters& params, bool is_indexed);
-
- /// Setup constant buffers in the graphics pipeline.
- void SetupGraphicsConstBuffers(const ShaderEntries& entries, std::size_t stage);
-
- /// Setup global buffers in the graphics pipeline.
- void SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage);
-
/// Setup uniform texels in the graphics pipeline.
void SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage);
@@ -154,12 +129,6 @@ private:
/// Setup images in the graphics pipeline.
void SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage);
- /// Setup constant buffers in the compute pipeline.
- void SetupComputeConstBuffers(const ShaderEntries& entries);
-
- /// Setup global buffers in the compute pipeline.
- void SetupComputeGlobalBuffers(const ShaderEntries& entries);
-
/// Setup texel buffers in the compute pipeline.
void SetupComputeUniformTexels(const ShaderEntries& entries);
@@ -172,11 +141,6 @@ private:
/// Setup images in the compute pipeline.
void SetupComputeImages(const ShaderEntries& entries);
- void SetupConstBuffer(const ConstBufferEntry& entry,
- const Tegra::Engines::ConstBufferInfo& buffer);
-
- void SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address);
-
void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs);
void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs);
void UpdateDepthBias(Tegra::Engines::Maxwell3D::Regs& regs);
@@ -193,19 +157,6 @@ private:
void UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs);
void UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs& regs);
- size_t CalculateGraphicsStreamBufferSize(bool is_indexed) const;
-
- size_t CalculateComputeStreamBufferSize() const;
-
- size_t CalculateVertexArraysSize() const;
-
- size_t CalculateIndexBufferSize() const;
-
- size_t CalculateConstBufferSize(const ConstBufferEntry& entry,
- const Tegra::Engines::ConstBufferInfo& buffer) const;
-
- VkBuffer DefaultBuffer();
-
Tegra::GPU& gpu;
Tegra::MemoryManager& gpu_memory;
Tegra::Engines::Maxwell3D& maxwell3d;
@@ -217,24 +168,19 @@ private:
StateTracker& state_tracker;
VKScheduler& scheduler;
- VKStreamBuffer stream_buffer;
StagingBufferPool staging_pool;
VKDescriptorPool descriptor_pool;
VKUpdateDescriptorQueue update_descriptor_queue;
BlitImageHelper blit_image;
- QuadArrayPass quad_array_pass;
- QuadIndexedPass quad_indexed_pass;
- Uint8Pass uint8_pass;
TextureCacheRuntime texture_cache_runtime;
TextureCache texture_cache;
+ BufferCacheRuntime buffer_cache_runtime;
+ BufferCache buffer_cache;
VKPipelineCache pipeline_cache;
- VKBufferCache buffer_cache;
VKQueryCache query_cache;
VKFenceManager fence_manager;
- vk::Buffer default_buffer;
- MemoryCommit default_buffer_commit;
vk::Event wfi_event;
VideoCommon::Shader::AsyncShaders async_shaders;
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index 66004f9c0..f35c120b0 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -52,18 +52,6 @@ VKScheduler::~VKScheduler() {
worker_thread.join();
}
-u64 VKScheduler::CurrentTick() const noexcept {
- return master_semaphore->CurrentTick();
-}
-
-bool VKScheduler::IsFree(u64 tick) const noexcept {
- return master_semaphore->IsFree(tick);
-}
-
-void VKScheduler::Wait(u64 tick) {
- master_semaphore->Wait(tick);
-}
-
void VKScheduler::Flush(VkSemaphore semaphore) {
SubmitExecution(semaphore);
AllocateNewContext();
@@ -269,7 +257,7 @@ void VKScheduler::EndRenderPass() {
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
- VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, 0, nullptr, nullptr,
+ VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, nullptr, nullptr,
vk::Span(barriers.data(), num_images));
});
state.renderpass = nullptr;
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h
index 15f2987eb..3ce48e9d2 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -14,6 +14,7 @@
#include "common/alignment.h"
#include "common/common_types.h"
#include "common/threadsafe_queue.h"
+#include "video_core/renderer_vulkan/vk_master_semaphore.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
namespace Vulkan {
@@ -21,7 +22,6 @@ namespace Vulkan {
class CommandPool;
class Device;
class Framebuffer;
-class MasterSemaphore;
class StateTracker;
class VKQueryCache;
@@ -32,15 +32,6 @@ public:
explicit VKScheduler(const Device& device, StateTracker& state_tracker);
~VKScheduler();
- /// Returns the current command buffer tick.
- [[nodiscard]] u64 CurrentTick() const noexcept;
-
- /// Returns true when a tick has been triggered by the GPU.
- [[nodiscard]] bool IsFree(u64 tick) const noexcept;
-
- /// Waits for the given tick to trigger on the GPU.
- void Wait(u64 tick);
-
/// Sends the current execution context to the GPU.
void Flush(VkSemaphore semaphore = nullptr);
@@ -82,6 +73,21 @@ public:
(void)chunk->Record(command);
}
+ /// Returns the current command buffer tick.
+ [[nodiscard]] u64 CurrentTick() const noexcept {
+ return master_semaphore->CurrentTick();
+ }
+
+ /// Returns true when a tick has been triggered by the GPU.
+ [[nodiscard]] bool IsFree(u64 tick) const noexcept {
+ return master_semaphore->IsFree(tick);
+ }
+
+ /// Waits for the given tick to trigger on the GPU.
+ void Wait(u64 tick) {
+ master_semaphore->Wait(tick);
+ }
+
/// Returns the master timeline semaphore.
[[nodiscard]] MasterSemaphore& GetMasterSemaphore() const noexcept {
return *master_semaphore;
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 61d52b961..e165a6987 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -3127,6 +3127,9 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) {
entries.attributes.insert(GetGenericAttributeLocation(attribute));
}
}
+ for (const auto& buffer : entries.const_buffers) {
+ entries.enabled_uniform_buffers |= 1U << buffer.GetIndex();
+ }
entries.clip_distances = ir.GetClipDistances();
entries.shader_length = ir.GetLength();
entries.uses_warps = ir.UsesWarps();
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
index 26381e444..5d94132a5 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
@@ -39,24 +39,7 @@ private:
u32 index{};
};
-class GlobalBufferEntry {
-public:
- constexpr explicit GlobalBufferEntry(u32 cbuf_index_, u32 cbuf_offset_, bool is_written_)
- : cbuf_index{cbuf_index_}, cbuf_offset{cbuf_offset_}, is_written{is_written_} {}
-
- constexpr u32 GetCbufIndex() const {
- return cbuf_index;
- }
-
- constexpr u32 GetCbufOffset() const {
- return cbuf_offset;
- }
-
- constexpr bool IsWritten() const {
- return is_written;
- }
-
-private:
+struct GlobalBufferEntry {
u32 cbuf_index{};
u32 cbuf_offset{};
bool is_written{};
@@ -78,6 +61,7 @@ struct ShaderEntries {
std::set<u32> attributes;
std::array<bool, Maxwell::NumClipDistances> clip_distances{};
std::size_t shader_length{};
+ u32 enabled_uniform_buffers{};
bool uses_warps{};
};
diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.cpp b/src/video_core/renderer_vulkan/vk_state_tracker.cpp
index 1779a2e30..e81fad007 100644
--- a/src/video_core/renderer_vulkan/vk_state_tracker.cpp
+++ b/src/video_core/renderer_vulkan/vk_state_tracker.cpp
@@ -30,15 +30,18 @@ using Table = Maxwell3D::DirtyState::Table;
using Flags = Maxwell3D::DirtyState::Flags;
Flags MakeInvalidationFlags() {
- static constexpr std::array INVALIDATION_FLAGS{
+ static constexpr int INVALIDATION_FLAGS[]{
Viewports, Scissors, DepthBias, BlendConstants, DepthBounds,
StencilProperties, CullMode, DepthBoundsEnable, DepthTestEnable, DepthWriteEnable,
- DepthCompareOp, FrontFace, StencilOp, StencilTestEnable,
+ DepthCompareOp, FrontFace, StencilOp, StencilTestEnable, VertexBuffers,
};
Flags flags{};
for (const int flag : INVALIDATION_FLAGS) {
flags[flag] = true;
}
+ for (int index = VertexBuffer0; index <= VertexBuffer31; ++index) {
+ flags[index] = true;
+ }
return flags;
}
@@ -130,7 +133,7 @@ void SetupDirtyStencilTestEnable(Tables& tables) {
StateTracker::StateTracker(Tegra::GPU& gpu)
: flags{gpu.Maxwell3D().dirty.flags}, invalidation_flags{MakeInvalidationFlags()} {
auto& tables = gpu.Maxwell3D().dirty.tables;
- SetupDirtyRenderTargets(tables);
+ SetupDirtyFlags(tables);
SetupDirtyViewports(tables);
SetupDirtyScissors(tables);
SetupDirtyDepthBias(tables);
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index aa7c5d7c6..1eeb45ca9 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -426,46 +426,47 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
void CopyBufferToImage(vk::CommandBuffer cmdbuf, VkBuffer src_buffer, VkImage image,
VkImageAspectFlags aspect_mask, bool is_initialized,
std::span<const VkBufferImageCopy> copies) {
- static constexpr VkAccessFlags ACCESS_FLAGS = VK_ACCESS_SHADER_WRITE_BIT |
- VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
- VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
+ static constexpr VkAccessFlags WRITE_ACCESS_FLAGS =
+ VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
+ VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
+ static constexpr VkAccessFlags READ_ACCESS_FLAGS = VK_ACCESS_SHADER_READ_BIT |
+ VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
+ VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT;
const VkImageMemoryBarrier read_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
- .srcAccessMask = ACCESS_FLAGS,
+ .srcAccessMask = WRITE_ACCESS_FLAGS,
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.oldLayout = is_initialized ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_UNDEFINED,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = image,
- .subresourceRange =
- {
- .aspectMask = aspect_mask,
- .baseMipLevel = 0,
- .levelCount = VK_REMAINING_MIP_LEVELS,
- .baseArrayLayer = 0,
- .layerCount = VK_REMAINING_ARRAY_LAYERS,
- },
+ .subresourceRange{
+ .aspectMask = aspect_mask,
+ .baseMipLevel = 0,
+ .levelCount = VK_REMAINING_MIP_LEVELS,
+ .baseArrayLayer = 0,
+ .layerCount = VK_REMAINING_ARRAY_LAYERS,
+ },
};
const VkImageMemoryBarrier write_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
- .dstAccessMask = ACCESS_FLAGS,
+ .dstAccessMask = WRITE_ACCESS_FLAGS | READ_ACCESS_FLAGS,
.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_GENERAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = image,
- .subresourceRange =
- {
- .aspectMask = aspect_mask,
- .baseMipLevel = 0,
- .levelCount = VK_REMAINING_MIP_LEVELS,
- .baseArrayLayer = 0,
- .layerCount = VK_REMAINING_ARRAY_LAYERS,
- },
+ .subresourceRange{
+ .aspectMask = aspect_mask,
+ .baseMipLevel = 0,
+ .levelCount = VK_REMAINING_MIP_LEVELS,
+ .baseArrayLayer = 0,
+ .layerCount = VK_REMAINING_ARRAY_LAYERS,
+ },
};
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0,
read_barrier);
@@ -569,20 +570,12 @@ void TextureCacheRuntime::Finish() {
scheduler.Finish();
}
-ImageBufferMap TextureCacheRuntime::MapUploadBuffer(size_t size) {
- const auto staging_ref = staging_buffer_pool.Request(size, MemoryUsage::Upload);
- return {
- .handle = staging_ref.buffer,
- .span = staging_ref.mapped_span,
- };
+StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size) {
+ return staging_buffer_pool.Request(size, MemoryUsage::Upload);
}
-ImageBufferMap TextureCacheRuntime::MapDownloadBuffer(size_t size) {
- const auto staging_ref = staging_buffer_pool.Request(size, MemoryUsage::Download);
- return {
- .handle = staging_ref.buffer,
- .span = staging_ref.mapped_span,
- };
+StagingBufferRef TextureCacheRuntime::DownloadStagingBuffer(size_t size) {
+ return staging_buffer_pool.Request(size, MemoryUsage::Download);
}
void TextureCacheRuntime::BlitImage(Framebuffer* dst_framebuffer, ImageView& dst, ImageView& src,
@@ -754,7 +747,7 @@ void TextureCacheRuntime::CopyImage(Image& dst, Image& src,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
VK_ACCESS_TRANSFER_WRITE_BIT,
- .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
+ .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
.oldLayout = VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_GENERAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
@@ -765,12 +758,9 @@ void TextureCacheRuntime::CopyImage(Image& dst, Image& src,
VkImageMemoryBarrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
- .srcAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT |
- VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
- VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
- VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
+ .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
- VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT,
+ VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.oldLayout = VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
@@ -828,12 +818,12 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_
}
}
-void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
+void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
std::span<const BufferImageCopy> copies) {
// TODO: Move this to another API
scheduler->RequestOutsideRenderPassOperationContext();
std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask);
- const VkBuffer src_buffer = map.handle;
+ const VkBuffer src_buffer = map.buffer;
const VkImage vk_image = *image;
const VkImageAspectFlags vk_aspect_mask = aspect_mask;
const bool is_initialized = std::exchange(initialized, true);
@@ -843,12 +833,12 @@ void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
});
}
-void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
+void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
std::span<const VideoCommon::BufferCopy> copies) {
// TODO: Move this to another API
scheduler->RequestOutsideRenderPassOperationContext();
std::vector vk_copies = TransformBufferCopies(copies, buffer_offset);
- const VkBuffer src_buffer = map.handle;
+ const VkBuffer src_buffer = map.buffer;
const VkBuffer dst_buffer = *buffer;
scheduler->Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) {
// TODO: Barriers
@@ -856,13 +846,58 @@ void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
});
}
-void Image::DownloadMemory(const ImageBufferMap& map, size_t buffer_offset,
+void Image::DownloadMemory(const StagingBufferRef& map, size_t buffer_offset,
std::span<const BufferImageCopy> copies) {
std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask);
- scheduler->Record([buffer = map.handle, image = *image, aspect_mask = aspect_mask,
+ scheduler->Record([buffer = map.buffer, image = *image, aspect_mask = aspect_mask,
vk_copies](vk::CommandBuffer cmdbuf) {
- // TODO: Barriers
- cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_GENERAL, buffer, vk_copies);
+ const VkImageMemoryBarrier read_barrier{
+ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
+ .pNext = nullptr,
+ .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
+ .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
+ .oldLayout = VK_IMAGE_LAYOUT_GENERAL,
+ .newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+ .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .image = image,
+ .subresourceRange{
+ .aspectMask = aspect_mask,
+ .baseMipLevel = 0,
+ .levelCount = VK_REMAINING_MIP_LEVELS,
+ .baseArrayLayer = 0,
+ .layerCount = VK_REMAINING_ARRAY_LAYERS,
+ },
+ };
+ const VkImageMemoryBarrier image_write_barrier{
+ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
+ .pNext = nullptr,
+ .srcAccessMask = 0,
+ .dstAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
+ .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+ .newLayout = VK_IMAGE_LAYOUT_GENERAL,
+ .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .image = image,
+ .subresourceRange{
+ .aspectMask = aspect_mask,
+ .baseMipLevel = 0,
+ .levelCount = VK_REMAINING_MIP_LEVELS,
+ .baseArrayLayer = 0,
+ .layerCount = VK_REMAINING_ARRAY_LAYERS,
+ },
+ };
+ const VkMemoryBarrier memory_write_barrier{
+ .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
+ .pNext = nullptr,
+ .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
+ .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
+ };
+ cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
+ 0, read_barrier);
+ cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, buffer, vk_copies);
+ cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+ 0, memory_write_barrier, nullptr, image_write_barrier);
});
}
@@ -1127,7 +1162,7 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM
.pAttachments = attachments.data(),
.width = key.size.width,
.height = key.size.height,
- .layers = static_cast<u32>(num_layers),
+ .layers = static_cast<u32>(std::max(num_layers, 1)),
});
if (runtime.device.HasDebuggingToolAttached()) {
framebuffer.SetObjectNameEXT(VideoCommon::Name(key).c_str());
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index 8d29361a1..4558c3297 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -7,6 +7,7 @@
#include <compare>
#include <span>
+#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
#include "video_core/texture_cache/texture_cache.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
@@ -53,19 +54,6 @@ struct hash<Vulkan::RenderPassKey> {
namespace Vulkan {
-struct ImageBufferMap {
- [[nodiscard]] VkBuffer Handle() const noexcept {
- return handle;
- }
-
- [[nodiscard]] std::span<u8> Span() const noexcept {
- return span;
- }
-
- VkBuffer handle;
- std::span<u8> span;
-};
-
struct TextureCacheRuntime {
const Device& device;
VKScheduler& scheduler;
@@ -76,9 +64,9 @@ struct TextureCacheRuntime {
void Finish();
- [[nodiscard]] ImageBufferMap MapUploadBuffer(size_t size);
+ [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size);
- [[nodiscard]] ImageBufferMap MapDownloadBuffer(size_t size);
+ [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size);
void BlitImage(Framebuffer* dst_framebuffer, ImageView& dst, ImageView& src,
const std::array<Offset2D, 2>& dst_region,
@@ -94,7 +82,7 @@ struct TextureCacheRuntime {
return false;
}
- void AccelerateImageUpload(Image&, const ImageBufferMap&, size_t,
+ void AccelerateImageUpload(Image&, const StagingBufferRef&, size_t,
std::span<const VideoCommon::SwizzleParameters>) {
UNREACHABLE();
}
@@ -112,13 +100,13 @@ public:
explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr,
VAddr cpu_addr);
- void UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
+ void UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
std::span<const VideoCommon::BufferImageCopy> copies);
- void UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
+ void UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
std::span<const VideoCommon::BufferCopy> copies);
- void DownloadMemory(const ImageBufferMap& map, size_t buffer_offset,
+ void DownloadMemory(const StagingBufferRef& map, size_t buffer_offset,
std::span<const VideoCommon::BufferImageCopy> copies);
[[nodiscard]] VkImage Handle() const noexcept {
diff --git a/src/video_core/shader/async_shaders.h b/src/video_core/shader/async_shaders.h
index 0dbb1a31f..7fdff6e56 100644
--- a/src/video_core/shader/async_shaders.h
+++ b/src/video_core/shader/async_shaders.h
@@ -9,16 +9,7 @@
#include <shared_mutex>
#include <thread>
-// This header includes both Vulkan and OpenGL headers, this has to be fixed
-// Unfortunately, including OpenGL will include Windows.h that defines macros that can cause issues.
-// Forcefully include glad early and undefine macros
#include <glad/glad.h>
-#ifdef CreateEvent
-#undef CreateEvent
-#endif
-#ifdef CreateSemaphore
-#undef CreateSemaphore
-#endif
#include "common/common_types.h"
#include "video_core/renderer_opengl/gl_device.h"
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp
index d3ea07aac..5f88537bc 100644
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -76,6 +76,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
case SystemVariable::InvocationId:
return Operation(OperationCode::InvocationId);
case SystemVariable::Ydirection:
+ uses_y_negate = true;
return Operation(OperationCode::YNegate);
case SystemVariable::InvocationInfo:
LOG_WARNING(HW_GPU, "S2R instruction with InvocationInfo is incomplete");
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 0c6ab0f07..1cd7c14d7 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -139,6 +139,10 @@ public:
return uses_legacy_varyings;
}
+ bool UsesYNegate() const {
+ return uses_y_negate;
+ }
+
bool UsesWarps() const {
return uses_warps;
}
@@ -465,6 +469,7 @@ private:
bool uses_instance_id{};
bool uses_vertex_id{};
bool uses_legacy_varyings{};
+ bool uses_y_negate{};
bool uses_warps{};
bool uses_indexed_samplers{};
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index d1080300f..f336b705f 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -103,9 +103,6 @@ public:
/// Notify the cache that a new frame has been queued
void TickFrame();
- /// Return an unique mutually exclusive lock for the cache
- [[nodiscard]] std::unique_lock<std::mutex> AcquireLock();
-
/// Return a constant reference to the given image view id
[[nodiscard]] const ImageView& GetImageView(ImageViewId id) const noexcept;
@@ -179,6 +176,8 @@ public:
/// Return true when a CPU region is modified from the GPU
[[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
+ std::mutex mutex;
+
private:
/// Iterate over all page indices in a range
template <typename Func>
@@ -212,8 +211,8 @@ private:
void RefreshContents(Image& image);
/// Upload data from guest to an image
- template <typename MapBuffer>
- void UploadImageContents(Image& image, MapBuffer& map, size_t buffer_offset);
+ template <typename StagingBuffer>
+ void UploadImageContents(Image& image, StagingBuffer& staging_buffer, size_t buffer_offset);
/// Find or create an image view from a guest descriptor
[[nodiscard]] ImageViewId FindImageView(const TICEntry& config);
@@ -325,8 +324,6 @@ private:
RenderTargets render_targets;
- std::mutex mutex;
-
std::unordered_map<TICEntry, ImageViewId> image_views;
std::unordered_map<TSCEntry, SamplerId> samplers;
std::unordered_map<RenderTargets, FramebufferId> framebuffers;
@@ -386,11 +383,6 @@ void TextureCache<P>::TickFrame() {
}
template <class P>
-std::unique_lock<std::mutex> TextureCache<P>::AcquireLock() {
- return std::unique_lock{mutex};
-}
-
-template <class P>
const typename P::ImageView& TextureCache<P>::GetImageView(ImageViewId id) const noexcept {
return slot_image_views[id];
}
@@ -598,11 +590,11 @@ void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) {
});
for (const ImageId image_id : images) {
Image& image = slot_images[image_id];
- auto map = runtime.MapDownloadBuffer(image.unswizzled_size_bytes);
+ auto map = runtime.DownloadStagingBuffer(image.unswizzled_size_bytes);
const auto copies = FullDownloadCopies(image.info);
image.DownloadMemory(map, 0, copies);
runtime.Finish();
- SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, map.Span());
+ SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span);
}
}
@@ -757,7 +749,7 @@ void TextureCache<P>::PopAsyncFlushes() {
for (const ImageId image_id : download_ids) {
total_size_bytes += slot_images[image_id].unswizzled_size_bytes;
}
- auto download_map = runtime.MapDownloadBuffer(total_size_bytes);
+ auto download_map = runtime.DownloadStagingBuffer(total_size_bytes);
size_t buffer_offset = 0;
for (const ImageId image_id : download_ids) {
Image& image = slot_images[image_id];
@@ -769,7 +761,7 @@ void TextureCache<P>::PopAsyncFlushes() {
runtime.Finish();
buffer_offset = 0;
- const std::span<u8> download_span = download_map.Span();
+ const std::span<u8> download_span = download_map.mapped_span;
for (const ImageId image_id : download_ids) {
const ImageBase& image = slot_images[image_id];
const auto copies = FullDownloadCopies(image.info);
@@ -806,7 +798,7 @@ void TextureCache<P>::RefreshContents(Image& image) {
LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented");
return;
}
- auto map = runtime.MapUploadBuffer(MapSizeBytes(image));
+ auto map = runtime.UploadStagingBuffer(MapSizeBytes(image));
UploadImageContents(image, map, 0);
runtime.InsertUploadMemoryBarrier();
}
@@ -814,7 +806,7 @@ void TextureCache<P>::RefreshContents(Image& image) {
template <class P>
template <typename MapBuffer>
void TextureCache<P>::UploadImageContents(Image& image, MapBuffer& map, size_t buffer_offset) {
- const std::span<u8> mapped_span = map.Span().subspan(buffer_offset);
+ const std::span<u8> mapped_span = map.mapped_span.subspan(buffer_offset);
const GPUVAddr gpu_addr = image.gpu_addr;
if (True(image.flags & ImageFlagBits::AcceleratedUpload)) {
diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.h b/src/video_core/vulkan_common/vulkan_memory_allocator.h
index 86393310a..d1ce29450 100644
--- a/src/video_core/vulkan_common/vulkan_memory_allocator.h
+++ b/src/video_core/vulkan_common/vulkan_memory_allocator.h
@@ -78,7 +78,7 @@ public:
*
* @throw vk::Exception on failure
*/
- explicit MemoryAllocator(const Device& device_, bool export_allocations_ = false);
+ explicit MemoryAllocator(const Device& device_, bool export_allocations_);
~MemoryAllocator();
MemoryAllocator& operator=(const MemoryAllocator&) = delete;