diff options
Diffstat (limited to 'src')
27 files changed, 1182 insertions, 168 deletions
diff --git a/src/core/hle/service/bcat/backend/boxcat.cpp b/src/core/hle/service/bcat/backend/boxcat.cpp index 67e39a5c4..f589864ee 100644 --- a/src/core/hle/service/bcat/backend/boxcat.cpp +++ b/src/core/hle/service/bcat/backend/boxcat.cpp @@ -200,7 +200,8 @@ private: DownloadResult DownloadInternal(const std::string& resolved_path, u32 timeout_seconds, const std::string& content_type_name) { if (client == nullptr) { - client = std::make_unique<httplib::SSLClient>(BOXCAT_HOSTNAME, PORT, timeout_seconds); + client = std::make_unique<httplib::SSLClient>(BOXCAT_HOSTNAME, PORT); + client->set_timeout_sec(timeout_seconds); } httplib::Headers headers{ @@ -448,8 +449,8 @@ std::optional<std::vector<u8>> Boxcat::GetLaunchParameter(TitleIDVersion title) Boxcat::StatusResult Boxcat::GetStatus(std::optional<std::string>& global, std::map<std::string, EventStatus>& games) { - httplib::SSLClient client{BOXCAT_HOSTNAME, static_cast<int>(PORT), - static_cast<int>(TIMEOUT_SECONDS)}; + httplib::SSLClient client{BOXCAT_HOSTNAME, static_cast<int>(PORT)}; + client.set_timeout_sec(static_cast<int>(TIMEOUT_SECONDS)); httplib::Headers headers{ {std::string("Game-Assets-API-Version"), std::string(BOXCAT_API_VERSION)}, diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index db9332d00..4b0c6346f 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -37,6 +37,7 @@ add_library(video_core STATIC memory_manager.h morton.cpp morton.h + query_cache.h rasterizer_accelerated.cpp rasterizer_accelerated.h rasterizer_cache.cpp @@ -74,6 +75,8 @@ add_library(video_core STATIC renderer_opengl/gl_stream_buffer.h renderer_opengl/gl_texture_cache.cpp renderer_opengl/gl_texture_cache.h + renderer_opengl/gl_query_cache.cpp + renderer_opengl/gl_query_cache.h renderer_opengl/maxwell_to_gl.h renderer_opengl/renderer_opengl.cpp renderer_opengl/renderer_opengl.h @@ -177,6 +180,8 @@ if (ENABLE_VULKAN) renderer_vulkan/vk_memory_manager.h renderer_vulkan/vk_pipeline_cache.cpp renderer_vulkan/vk_pipeline_cache.h + renderer_vulkan/vk_query_cache.cpp + renderer_vulkan/vk_query_cache.h renderer_vulkan/vk_rasterizer.cpp renderer_vulkan/vk_rasterizer.h renderer_vulkan/vk_renderpass_cache.cpp diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 0b3e8749b..b28de1092 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -4,6 +4,7 @@ #include <cinttypes> #include <cstring> +#include <optional> #include "common/assert.h" #include "core/core.h" #include "core/core_timing.h" @@ -16,6 +17,8 @@ namespace Tegra::Engines { +using VideoCore::QueryType; + /// First register id that is actually a Macro call. constexpr u32 MacroRegistersStart = 0xE00; @@ -400,6 +403,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { ProcessQueryCondition(); break; } + case MAXWELL3D_REG_INDEX(counter_reset): { + ProcessCounterReset(); + break; + } case MAXWELL3D_REG_INDEX(sync_info): { ProcessSyncPoint(); break; @@ -482,7 +489,7 @@ void Maxwell3D::FlushMMEInlineDraw() { const bool is_indexed = mme_draw.current_mode == MMEDrawMode::Indexed; if (ShouldExecute()) { - rasterizer.DrawMultiBatch(is_indexed); + rasterizer.Draw(is_indexed, true); } // TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if @@ -544,40 +551,28 @@ void Maxwell3D::ProcessQueryGet() { "Units other than CROP are unimplemented"); switch (regs.query.query_get.operation) { - case Regs::QueryOperation::Release: { - const u64 result = regs.query.query_sequence; - StampQueryResult(result, regs.query.query_get.short_query == 0); + case Regs::QueryOperation::Release: + StampQueryResult(regs.query.query_sequence, regs.query.query_get.short_query == 0); break; - } - case Regs::QueryOperation::Acquire: { - // Todo(Blinkhawk): Under this operation, the GPU waits for the CPU - // to write a value that matches the current payload. + case Regs::QueryOperation::Acquire: + // TODO(Blinkhawk): Under this operation, the GPU waits for the CPU to write a value that + // matches the current payload. UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE"); break; - } - case Regs::QueryOperation::Counter: { - u64 result{}; - switch (regs.query.query_get.select) { - case Regs::QuerySelect::Zero: - result = 0; - break; - default: - result = 1; - UNIMPLEMENTED_MSG("Unimplemented query select type {}", - static_cast<u32>(regs.query.query_get.select.Value())); + case Regs::QueryOperation::Counter: + if (const std::optional<u64> result = GetQueryResult()) { + // If the query returns an empty optional it means it's cached and deferred. + // In this case we have a non-empty result, so we stamp it immediately. + StampQueryResult(*result, regs.query.query_get.short_query == 0); } - StampQueryResult(result, regs.query.query_get.short_query == 0); break; - } - case Regs::QueryOperation::Trap: { + case Regs::QueryOperation::Trap: UNIMPLEMENTED_MSG("Unimplemented query operation TRAP"); break; - } - default: { + default: UNIMPLEMENTED_MSG("Unknown query operation"); break; } - } } void Maxwell3D::ProcessQueryCondition() { @@ -593,20 +588,20 @@ void Maxwell3D::ProcessQueryCondition() { } case Regs::ConditionMode::ResNonZero: { Regs::QueryCompare cmp; - memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp)); + memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp)); execute_on = cmp.initial_sequence != 0U && cmp.initial_mode != 0U; break; } case Regs::ConditionMode::Equal: { Regs::QueryCompare cmp; - memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp)); + memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp)); execute_on = cmp.initial_sequence == cmp.current_sequence && cmp.initial_mode == cmp.current_mode; break; } case Regs::ConditionMode::NotEqual: { Regs::QueryCompare cmp; - memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp)); + memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp)); execute_on = cmp.initial_sequence != cmp.current_sequence || cmp.initial_mode != cmp.current_mode; break; @@ -619,6 +614,18 @@ void Maxwell3D::ProcessQueryCondition() { } } +void Maxwell3D::ProcessCounterReset() { + switch (regs.counter_reset) { + case Regs::CounterReset::SampleCnt: + rasterizer.ResetCounter(QueryType::SamplesPassed); + break; + default: + LOG_WARNING(Render_OpenGL, "Unimplemented counter reset={}", + static_cast<int>(regs.counter_reset)); + break; + } +} + void Maxwell3D::ProcessSyncPoint() { const u32 sync_point = regs.sync_info.sync_point.Value(); const u32 increment = regs.sync_info.increment.Value(); @@ -647,7 +654,7 @@ void Maxwell3D::DrawArrays() { const bool is_indexed{regs.index_array.count && !regs.vertex_buffer.count}; if (ShouldExecute()) { - rasterizer.DrawBatch(is_indexed); + rasterizer.Draw(is_indexed, false); } // TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if @@ -661,6 +668,22 @@ void Maxwell3D::DrawArrays() { } } +std::optional<u64> Maxwell3D::GetQueryResult() { + switch (regs.query.query_get.select) { + case Regs::QuerySelect::Zero: + return 0; + case Regs::QuerySelect::SamplesPassed: + // Deferred. + rasterizer.Query(regs.query.QueryAddress(), VideoCore::QueryType::SamplesPassed, + system.GPU().GetTicks()); + return {}; + default: + UNIMPLEMENTED_MSG("Unimplemented query select type {}", + static_cast<u32>(regs.query.query_get.select.Value())); + return 1; + } +} + void Maxwell3D::ProcessCBBind(std::size_t stage_index) { // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage. auto& shader = state.shader_stages[stage_index]; diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 0a2af54e5..26939be3f 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -6,6 +6,7 @@ #include <array> #include <bitset> +#include <optional> #include <type_traits> #include <unordered_map> #include <vector> @@ -409,6 +410,27 @@ public: Linear = 1, }; + enum class CounterReset : u32 { + SampleCnt = 0x01, + Unk02 = 0x02, + Unk03 = 0x03, + Unk04 = 0x04, + EmittedPrimitives = 0x10, // Not tested + Unk11 = 0x11, + Unk12 = 0x12, + Unk13 = 0x13, + Unk15 = 0x15, + Unk16 = 0x16, + Unk17 = 0x17, + Unk18 = 0x18, + Unk1A = 0x1A, + Unk1B = 0x1B, + Unk1C = 0x1C, + Unk1D = 0x1D, + Unk1E = 0x1E, + GeneratedPrimitives = 0x1F, + }; + struct Cull { enum class FrontFace : u32 { ClockWise = 0x0900, @@ -857,7 +879,7 @@ public: BitField<7, 1, u32> c7; } clip_distance_enabled; - INSERT_UNION_PADDING_WORDS(0x1); + u32 samplecnt_enable; float point_size; @@ -865,7 +887,11 @@ public: u32 point_sprite_enable; - INSERT_UNION_PADDING_WORDS(0x5); + INSERT_UNION_PADDING_WORDS(0x3); + + CounterReset counter_reset; + + INSERT_UNION_PADDING_WORDS(0x1); u32 zeta_enable; @@ -1412,12 +1438,15 @@ private: /// Handles a write to the QUERY_GET register. void ProcessQueryGet(); - // Writes the query result accordingly + /// Writes the query result accordingly. void StampQueryResult(u64 payload, bool long_query); - // Handles Conditional Rendering + /// Handles conditional rendering. void ProcessQueryCondition(); + /// Handles counter resets. + void ProcessCounterReset(); + /// Handles writes to syncing register. void ProcessSyncPoint(); @@ -1434,6 +1463,9 @@ private: // Handles a instance drawcall from MME void StepInstance(MMEDrawMode expected_mode, u32 count); + + /// Returns a query's value or an empty object if the value will be deferred through a cache. + std::optional<u64> GetQueryResult(); }; #define ASSERT_REG_POSITION(field_name, position) \ @@ -1499,8 +1531,10 @@ ASSERT_REG_POSITION(screen_y_control, 0x4EB); ASSERT_REG_POSITION(vb_element_base, 0x50D); ASSERT_REG_POSITION(vb_base_instance, 0x50E); ASSERT_REG_POSITION(clip_distance_enabled, 0x544); +ASSERT_REG_POSITION(samplecnt_enable, 0x545); ASSERT_REG_POSITION(point_size, 0x546); ASSERT_REG_POSITION(point_sprite_enable, 0x548); +ASSERT_REG_POSITION(counter_reset, 0x54C); ASSERT_REG_POSITION(zeta_enable, 0x54E); ASSERT_REG_POSITION(multisample_control, 0x54F); ASSERT_REG_POSITION(condition, 0x554); diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 11848fbce..f5d33f27a 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -9,6 +9,7 @@ #include "core/hle/kernel/process.h" #include "core/hle/kernel/vm_manager.h" #include "core/memory.h" +#include "video_core/gpu.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" @@ -84,7 +85,9 @@ GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) { const auto cpu_addr = GpuToCpuAddress(gpu_addr); ASSERT(cpu_addr); - rasterizer.FlushAndInvalidateRegion(cache_addr, aligned_size); + // Flush and invalidate through the GPU interface, to be asynchronous if possible. + system.GPU().FlushAndInvalidateRegion(cache_addr, aligned_size); + UnmapRange(gpu_addr, aligned_size); ASSERT(system.CurrentProcess() ->VMManager() @@ -242,6 +245,8 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::s switch (page_table.attributes[page_index]) { case Common::PageType::Memory: { const u8* src_ptr{page_table.pointers[page_index] + page_offset}; + // Flush must happen on the rasterizer interface, such that memory is always synchronous + // when it is read (even when in asynchronous GPU mode). Fixes Dead Cells title menu. rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount); std::memcpy(dest_buffer, src_ptr, copy_amount); break; @@ -292,6 +297,8 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const switch (page_table.attributes[page_index]) { case Common::PageType::Memory: { u8* dest_ptr{page_table.pointers[page_index] + page_offset}; + // Invalidate must happen on the rasterizer interface, such that memory is always + // synchronous when it is written (even when in asynchronous GPU mode). rasterizer.InvalidateRegion(ToCacheAddr(dest_ptr), copy_amount); std::memcpy(dest_ptr, src_buffer, copy_amount); break; @@ -339,6 +346,8 @@ void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std:: switch (page_table.attributes[page_index]) { case Common::PageType::Memory: { + // Flush must happen on the rasterizer interface, such that memory is always synchronous + // when it is copied (even when in asynchronous GPU mode). const u8* src_ptr{page_table.pointers[page_index] + page_offset}; rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount); WriteBlock(dest_addr, src_ptr, copy_amount); diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h new file mode 100644 index 000000000..e66054ed0 --- /dev/null +++ b/src/video_core/query_cache.h @@ -0,0 +1,359 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <algorithm> +#include <array> +#include <cstring> +#include <iterator> +#include <memory> +#include <mutex> +#include <optional> +#include <unordered_map> +#include <vector> + +#include "common/assert.h" +#include "core/core.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" +#include "video_core/rasterizer_interface.h" + +namespace VideoCommon { + +template <class QueryCache, class HostCounter> +class CounterStreamBase { +public: + explicit CounterStreamBase(QueryCache& cache, VideoCore::QueryType type) + : cache{cache}, type{type} {} + + /// Updates the state of the stream, enabling or disabling as needed. + void Update(bool enabled) { + if (enabled) { + Enable(); + } else { + Disable(); + } + } + + /// Resets the stream to zero. It doesn't disable the query after resetting. + void Reset() { + if (current) { + current->EndQuery(); + + // Immediately start a new query to avoid disabling its state. + current = cache.Counter(nullptr, type); + } + last = nullptr; + } + + /// Returns the current counter slicing as needed. + std::shared_ptr<HostCounter> Current() { + if (!current) { + return nullptr; + } + current->EndQuery(); + last = std::move(current); + current = cache.Counter(last, type); + return last; + } + + /// Returns true when the counter stream is enabled. + bool IsEnabled() const { + return current != nullptr; + } + +private: + /// Enables the stream. + void Enable() { + if (current) { + return; + } + current = cache.Counter(last, type); + } + + // Disables the stream. + void Disable() { + if (current) { + current->EndQuery(); + } + last = std::exchange(current, nullptr); + } + + QueryCache& cache; + const VideoCore::QueryType type; + + std::shared_ptr<HostCounter> current; + std::shared_ptr<HostCounter> last; +}; + +template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter, + class QueryPool> +class QueryCacheBase { +public: + explicit QueryCacheBase(Core::System& system, VideoCore::RasterizerInterface& rasterizer) + : system{system}, rasterizer{rasterizer}, streams{{CounterStream{ + static_cast<QueryCache&>(*this), + VideoCore::QueryType::SamplesPassed}}} {} + + void InvalidateRegion(CacheAddr addr, std::size_t size) { + std::unique_lock lock{mutex}; + FlushAndRemoveRegion(addr, size); + } + + void FlushRegion(CacheAddr addr, std::size_t size) { + std::unique_lock lock{mutex}; + FlushAndRemoveRegion(addr, size); + } + + /** + * Records a query in GPU mapped memory, potentially marked with a timestamp. + * @param gpu_addr GPU address to flush to when the mapped memory is read. + * @param type Query type, e.g. SamplesPassed. + * @param timestamp Timestamp, when empty the flushed query is assumed to be short. + */ + void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) { + std::unique_lock lock{mutex}; + auto& memory_manager = system.GPU().MemoryManager(); + const auto host_ptr = memory_manager.GetPointer(gpu_addr); + + CachedQuery* query = TryGet(ToCacheAddr(host_ptr)); + if (!query) { + const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr); + ASSERT_OR_EXECUTE(cpu_addr, return;); + + query = Register(type, *cpu_addr, host_ptr, timestamp.has_value()); + } + + query->BindCounter(Stream(type).Current(), timestamp); + } + + /// Updates counters from GPU state. Expected to be called once per draw, clear or dispatch. + void UpdateCounters() { + std::unique_lock lock{mutex}; + const auto& regs = system.GPU().Maxwell3D().regs; + Stream(VideoCore::QueryType::SamplesPassed).Update(regs.samplecnt_enable); + } + + /// Resets a counter to zero. It doesn't disable the query after resetting. + void ResetCounter(VideoCore::QueryType type) { + std::unique_lock lock{mutex}; + Stream(type).Reset(); + } + + /// Disable all active streams. Expected to be called at the end of a command buffer. + void DisableStreams() { + std::unique_lock lock{mutex}; + for (auto& stream : streams) { + stream.Update(false); + } + } + + /// Returns a new host counter. + std::shared_ptr<HostCounter> Counter(std::shared_ptr<HostCounter> dependency, + VideoCore::QueryType type) { + return std::make_shared<HostCounter>(static_cast<QueryCache&>(*this), std::move(dependency), + type); + } + + /// Returns the counter stream of the specified type. + CounterStream& Stream(VideoCore::QueryType type) { + return streams[static_cast<std::size_t>(type)]; + } + + /// Returns the counter stream of the specified type. + const CounterStream& Stream(VideoCore::QueryType type) const { + return streams[static_cast<std::size_t>(type)]; + } + +protected: + std::array<QueryPool, VideoCore::NumQueryTypes> query_pools; + +private: + /// Flushes a memory range to guest memory and removes it from the cache. + void FlushAndRemoveRegion(CacheAddr addr, std::size_t size) { + const u64 addr_begin = static_cast<u64>(addr); + const u64 addr_end = addr_begin + static_cast<u64>(size); + const auto in_range = [addr_begin, addr_end](CachedQuery& query) { + const u64 cache_begin = query.GetCacheAddr(); + const u64 cache_end = cache_begin + query.SizeInBytes(); + return cache_begin < addr_end && addr_begin < cache_end; + }; + + const u64 page_end = addr_end >> PAGE_SHIFT; + for (u64 page = addr_begin >> PAGE_SHIFT; page <= page_end; ++page) { + const auto& it = cached_queries.find(page); + if (it == std::end(cached_queries)) { + continue; + } + auto& contents = it->second; + for (auto& query : contents) { + if (!in_range(query)) { + continue; + } + rasterizer.UpdatePagesCachedCount(query.CpuAddr(), query.SizeInBytes(), -1); + query.Flush(); + } + contents.erase(std::remove_if(std::begin(contents), std::end(contents), in_range), + std::end(contents)); + } + } + + /// Registers the passed parameters as cached and returns a pointer to the stored cached query. + CachedQuery* Register(VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr, bool timestamp) { + rasterizer.UpdatePagesCachedCount(cpu_addr, CachedQuery::SizeInBytes(timestamp), 1); + const u64 page = static_cast<u64>(ToCacheAddr(host_ptr)) >> PAGE_SHIFT; + return &cached_queries[page].emplace_back(static_cast<QueryCache&>(*this), type, cpu_addr, + host_ptr); + } + + /// Tries to a get a cached query. Returns nullptr on failure. + CachedQuery* TryGet(CacheAddr addr) { + const u64 page = static_cast<u64>(addr) >> PAGE_SHIFT; + const auto it = cached_queries.find(page); + if (it == std::end(cached_queries)) { + return nullptr; + } + auto& contents = it->second; + const auto found = + std::find_if(std::begin(contents), std::end(contents), + [addr](auto& query) { return query.GetCacheAddr() == addr; }); + return found != std::end(contents) ? &*found : nullptr; + } + + static constexpr std::uintptr_t PAGE_SIZE = 4096; + static constexpr unsigned PAGE_SHIFT = 12; + + Core::System& system; + VideoCore::RasterizerInterface& rasterizer; + + std::recursive_mutex mutex; + + std::unordered_map<u64, std::vector<CachedQuery>> cached_queries; + + std::array<CounterStream, VideoCore::NumQueryTypes> streams; +}; + +template <class QueryCache, class HostCounter> +class HostCounterBase { +public: + explicit HostCounterBase(std::shared_ptr<HostCounter> dependency_) + : dependency{std::move(dependency_)}, depth{dependency ? (dependency->Depth() + 1) : 0} { + // Avoid nesting too many dependencies to avoid a stack overflow when these are deleted. + constexpr u64 depth_threshold = 96; + if (depth > depth_threshold) { + depth = 0; + base_result = dependency->Query(); + dependency = nullptr; + } + } + virtual ~HostCounterBase() = default; + + /// Returns the current value of the query. + u64 Query() { + if (result) { + return *result; + } + + u64 value = BlockingQuery() + base_result; + if (dependency) { + value += dependency->Query(); + dependency = nullptr; + } + + result = value; + return *result; + } + + /// Returns true when flushing this query will potentially wait. + bool WaitPending() const noexcept { + return result.has_value(); + } + + u64 Depth() const noexcept { + return depth; + } + +protected: + /// Returns the value of query from the backend API blocking as needed. + virtual u64 BlockingQuery() const = 0; + +private: + std::shared_ptr<HostCounter> dependency; ///< Counter to add to this value. + std::optional<u64> result; ///< Filled with the already returned value. + u64 depth; ///< Number of nested dependencies. + u64 base_result = 0; ///< Equivalent to nested dependencies value. +}; + +template <class HostCounter> +class CachedQueryBase { +public: + explicit CachedQueryBase(VAddr cpu_addr, u8* host_ptr) + : cpu_addr{cpu_addr}, host_ptr{host_ptr} {} + virtual ~CachedQueryBase() = default; + + CachedQueryBase(CachedQueryBase&&) noexcept = default; + CachedQueryBase(const CachedQueryBase&) = delete; + + CachedQueryBase& operator=(CachedQueryBase&&) noexcept = default; + CachedQueryBase& operator=(const CachedQueryBase&) = delete; + + /// Flushes the query to guest memory. + virtual void Flush() { + // When counter is nullptr it means that it's just been reseted. We are supposed to write a + // zero in these cases. + const u64 value = counter ? counter->Query() : 0; + std::memcpy(host_ptr, &value, sizeof(u64)); + + if (timestamp) { + std::memcpy(host_ptr + TIMESTAMP_OFFSET, &*timestamp, sizeof(u64)); + } + } + + /// Binds a counter to this query. + void BindCounter(std::shared_ptr<HostCounter> counter_, std::optional<u64> timestamp_) { + if (counter) { + // If there's an old counter set it means the query is being rewritten by the game. + // To avoid losing the data forever, flush here. + Flush(); + } + counter = std::move(counter_); + timestamp = timestamp_; + } + + VAddr CpuAddr() const noexcept { + return cpu_addr; + } + + CacheAddr GetCacheAddr() const noexcept { + return ToCacheAddr(host_ptr); + } + + u64 SizeInBytes() const noexcept { + return SizeInBytes(timestamp.has_value()); + } + + static constexpr u64 SizeInBytes(bool with_timestamp) noexcept { + return with_timestamp ? LARGE_QUERY_SIZE : SMALL_QUERY_SIZE; + } + +protected: + /// Returns true when querying the counter may potentially block. + bool WaitPending() const noexcept { + return counter && counter->WaitPending(); + } + +private: + static constexpr std::size_t SMALL_QUERY_SIZE = 8; // Query size without timestamp. + static constexpr std::size_t LARGE_QUERY_SIZE = 16; // Query size with timestamp. + static constexpr std::intptr_t TIMESTAMP_OFFSET = 8; // Timestamp offset in a large query. + + VAddr cpu_addr; ///< Guest CPU address. + u8* host_ptr; ///< Writable host pointer. + std::shared_ptr<HostCounter> counter; ///< Host counter to query, owns the dependency tree. + std::optional<u64> timestamp; ///< Timestamp to flush to guest memory. +}; + +} // namespace VideoCommon diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index c586cd6fe..f18eaf4bc 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -6,6 +6,7 @@ #include <atomic> #include <functional> +#include <optional> #include "common/common_types.h" #include "video_core/engines/fermi_2d.h" #include "video_core/gpu.h" @@ -17,6 +18,11 @@ class MemoryManager; namespace VideoCore { +enum class QueryType { + SamplesPassed, +}; +constexpr std::size_t NumQueryTypes = 1; + enum class LoadCallbackStage { Prepare, Decompile, @@ -29,11 +35,8 @@ class RasterizerInterface { public: virtual ~RasterizerInterface() {} - /// Draw the current batch of vertex arrays - virtual bool DrawBatch(bool is_indexed) = 0; - - /// Draw the current batch of multiple instances of vertex arrays - virtual bool DrawMultiBatch(bool is_indexed) = 0; + /// Dispatches a draw invocation + virtual void Draw(bool is_indexed, bool is_instanced) = 0; /// Clear the current framebuffer virtual void Clear() = 0; @@ -41,6 +44,12 @@ public: /// Dispatches a compute shader invocation virtual void DispatchCompute(GPUVAddr code_addr) = 0; + /// Resets the counter of a query + virtual void ResetCounter(QueryType type) = 0; + + /// Records a GPU query and caches it + virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0; + /// Notify rasterizer that all caches should be flushed to Switch memory virtual void FlushAll() = 0; diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp new file mode 100644 index 000000000..f12e9f55f --- /dev/null +++ b/src/video_core/renderer_opengl/gl_query_cache.cpp @@ -0,0 +1,120 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <cstring> +#include <memory> +#include <unordered_map> +#include <utility> +#include <vector> + +#include <glad/glad.h> + +#include "common/assert.h" +#include "core/core.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/memory_manager.h" +#include "video_core/renderer_opengl/gl_query_cache.h" +#include "video_core/renderer_opengl/gl_rasterizer.h" + +namespace OpenGL { + +namespace { + +constexpr std::array<GLenum, VideoCore::NumQueryTypes> QueryTargets = {GL_SAMPLES_PASSED}; + +constexpr GLenum GetTarget(VideoCore::QueryType type) { + return QueryTargets[static_cast<std::size_t>(type)]; +} + +} // Anonymous namespace + +QueryCache::QueryCache(Core::System& system, RasterizerOpenGL& gl_rasterizer) + : VideoCommon::QueryCacheBase< + QueryCache, CachedQuery, CounterStream, HostCounter, + std::vector<OGLQuery>>{system, + static_cast<VideoCore::RasterizerInterface&>(gl_rasterizer)}, + gl_rasterizer{gl_rasterizer} {} + +QueryCache::~QueryCache() = default; + +OGLQuery QueryCache::AllocateQuery(VideoCore::QueryType type) { + auto& reserve = query_pools[static_cast<std::size_t>(type)]; + OGLQuery query; + if (reserve.empty()) { + query.Create(GetTarget(type)); + return query; + } + + query = std::move(reserve.back()); + reserve.pop_back(); + return query; +} + +void QueryCache::Reserve(VideoCore::QueryType type, OGLQuery&& query) { + query_pools[static_cast<std::size_t>(type)].push_back(std::move(query)); +} + +bool QueryCache::AnyCommandQueued() const noexcept { + return gl_rasterizer.AnyCommandQueued(); +} + +HostCounter::HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency, + VideoCore::QueryType type) + : VideoCommon::HostCounterBase<QueryCache, HostCounter>{std::move(dependency)}, cache{cache}, + type{type}, query{cache.AllocateQuery(type)} { + glBeginQuery(GetTarget(type), query.handle); +} + +HostCounter::~HostCounter() { + cache.Reserve(type, std::move(query)); +} + +void HostCounter::EndQuery() { + if (!cache.AnyCommandQueued()) { + // There are chances a query waited on without commands (glDraw, glClear, glDispatch). Not + // having any of these causes a lock. glFlush is considered a command, so we can safely wait + // for this. Insert to the OpenGL command stream a flush. + glFlush(); + } + glEndQuery(GetTarget(type)); +} + +u64 HostCounter::BlockingQuery() const { + GLint64 value; + glGetQueryObjecti64v(query.handle, GL_QUERY_RESULT, &value); + return static_cast<u64>(value); +} + +CachedQuery::CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr) + : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr}, cache{&cache}, type{type} {} + +CachedQuery::CachedQuery(CachedQuery&& rhs) noexcept + : VideoCommon::CachedQueryBase<HostCounter>(std::move(rhs)), cache{rhs.cache}, type{rhs.type} {} + +CachedQuery& CachedQuery::operator=(CachedQuery&& rhs) noexcept { + VideoCommon::CachedQueryBase<HostCounter>::operator=(std::move(rhs)); + cache = rhs.cache; + type = rhs.type; + return *this; +} + +void CachedQuery::Flush() { + // Waiting for a query while another query of the same target is enabled locks Nvidia's driver. + // To avoid this disable and re-enable keeping the dependency stream. + // But we only have to do this if we have pending waits to be done. + auto& stream = cache->Stream(type); + const bool slice_counter = WaitPending() && stream.IsEnabled(); + if (slice_counter) { + stream.Update(false); + } + + VideoCommon::CachedQueryBase<HostCounter>::Flush(); + + if (slice_counter) { + stream.Update(true); + } +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h new file mode 100644 index 000000000..d8e7052a1 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_query_cache.h @@ -0,0 +1,78 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <memory> +#include <vector> + +#include "common/common_types.h" +#include "video_core/query_cache.h" +#include "video_core/rasterizer_interface.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" + +namespace Core { +class System; +} + +namespace OpenGL { + +class CachedQuery; +class HostCounter; +class QueryCache; +class RasterizerOpenGL; + +using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; + +class QueryCache final : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, + HostCounter, std::vector<OGLQuery>> { +public: + explicit QueryCache(Core::System& system, RasterizerOpenGL& rasterizer); + ~QueryCache(); + + OGLQuery AllocateQuery(VideoCore::QueryType type); + + void Reserve(VideoCore::QueryType type, OGLQuery&& query); + + bool AnyCommandQueued() const noexcept; + +private: + RasterizerOpenGL& gl_rasterizer; +}; + +class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> { +public: + explicit HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency, + VideoCore::QueryType type); + ~HostCounter(); + + void EndQuery(); + +private: + u64 BlockingQuery() const override; + + QueryCache& cache; + const VideoCore::QueryType type; + OGLQuery query; +}; + +class CachedQuery final : public VideoCommon::CachedQueryBase<HostCounter> { +public: + explicit CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, + u8* host_ptr); + CachedQuery(CachedQuery&& rhs) noexcept; + CachedQuery(const CachedQuery&) = delete; + + CachedQuery& operator=(CachedQuery&& rhs) noexcept; + CachedQuery& operator=(const CachedQuery&) = delete; + + void Flush() override; + +private: + QueryCache* cache; + VideoCore::QueryType type; +}; + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index b0eb14c8b..e1965fb21 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -25,6 +25,7 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/shader_type.h" #include "video_core/memory_manager.h" +#include "video_core/renderer_opengl/gl_query_cache.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_gen.h" @@ -92,8 +93,8 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer, RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, ScreenInfo& info) : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device}, - shader_cache{*this, system, emu_window, device}, system{system}, screen_info{info}, - buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} { + shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, system{system}, + screen_info{info}, buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} { shader_program_manager = std::make_unique<GLShader::ProgramManager>(); state.draw.shader_program = 0; state.Apply(); @@ -541,11 +542,16 @@ void RasterizerOpenGL::Clear() { } else if (use_stencil) { glClearBufferiv(GL_STENCIL, 0, ®s.clear_stencil); } + + ++num_queued_commands; } void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { MICROPROFILE_SCOPE(OpenGL_Drawing); auto& gpu = system.GPU().Maxwell3D(); + const auto& regs = gpu.regs; + + query_cache.UpdateCounters(); SyncRasterizeEnable(state); SyncColorMask(); @@ -611,7 +617,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { // Setup shaders and their used resources. texture_cache.GuardSamplers(true); - const auto primitive_mode = MaxwellToGL::PrimitiveTopology(gpu.regs.draw.topology); + const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(gpu.regs.draw.topology); SetupShaders(primitive_mode); texture_cache.GuardSamplers(false); @@ -638,35 +644,47 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { glTextureBarrier(); } + ++num_queued_commands; + const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance); const GLsizei num_instances = static_cast<GLsizei>(is_instanced ? gpu.mme_draw.instance_count : 1); if (is_indexed) { - const GLenum index_format = MaxwellToGL::IndexFormat(gpu.regs.index_array.format); const GLint base_vertex = static_cast<GLint>(gpu.regs.vb_element_base); const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.index_array.count); - glDrawElementsInstancedBaseVertexBaseInstance( - primitive_mode, num_vertices, index_format, - reinterpret_cast<const void*>(index_buffer_offset), num_instances, base_vertex, - base_instance); + const GLvoid* offset = reinterpret_cast<const GLvoid*>(index_buffer_offset); + const GLenum format = MaxwellToGL::IndexFormat(gpu.regs.index_array.format); + if (num_instances == 1 && base_instance == 0 && base_vertex == 0) { + glDrawElements(primitive_mode, num_vertices, format, offset); + } else if (num_instances == 1 && base_instance == 0) { + glDrawElementsBaseVertex(primitive_mode, num_vertices, format, offset, base_vertex); + } else if (base_vertex == 0 && base_instance == 0) { + glDrawElementsInstanced(primitive_mode, num_vertices, format, offset, num_instances); + } else if (base_vertex == 0) { + glDrawElementsInstancedBaseInstance(primitive_mode, num_vertices, format, offset, + num_instances, base_instance); + } else if (base_instance == 0) { + glDrawElementsInstancedBaseVertex(primitive_mode, num_vertices, format, offset, + num_instances, base_vertex); + } else { + glDrawElementsInstancedBaseVertexBaseInstance(primitive_mode, num_vertices, format, + offset, num_instances, base_vertex, + base_instance); + } } else { const GLint base_vertex = static_cast<GLint>(gpu.regs.vertex_buffer.first); const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.vertex_buffer.count); - glDrawArraysInstancedBaseInstance(primitive_mode, base_vertex, num_vertices, num_instances, - base_instance); + if (num_instances == 1 && base_instance == 0) { + glDrawArrays(primitive_mode, base_vertex, num_vertices); + } else if (base_instance == 0) { + glDrawArraysInstanced(primitive_mode, base_vertex, num_vertices, num_instances); + } else { + glDrawArraysInstancedBaseInstance(primitive_mode, base_vertex, num_vertices, + num_instances, base_instance); + } } } -bool RasterizerOpenGL::DrawBatch(bool is_indexed) { - Draw(is_indexed, false); - return true; -} - -bool RasterizerOpenGL::DrawMultiBatch(bool is_indexed) { - Draw(is_indexed, true); - return true; -} - void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { if (device.HasBrokenCompute()) { return; @@ -707,6 +725,16 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { state.ApplyProgramPipeline(); glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); + ++num_queued_commands; +} + +void RasterizerOpenGL::ResetCounter(VideoCore::QueryType type) { + query_cache.ResetCounter(type); +} + +void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, + std::optional<u64> timestamp) { + query_cache.Query(gpu_addr, type, timestamp); } void RasterizerOpenGL::FlushAll() {} @@ -718,6 +746,7 @@ void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) { } texture_cache.FlushRegion(addr, size); buffer_cache.FlushRegion(addr, size); + query_cache.FlushRegion(addr, size); } void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) { @@ -728,6 +757,7 @@ void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) { texture_cache.InvalidateRegion(addr, size); shader_cache.InvalidateRegion(addr, size); buffer_cache.InvalidateRegion(addr, size); + query_cache.InvalidateRegion(addr, size); } void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { @@ -738,10 +768,18 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { } void RasterizerOpenGL::FlushCommands() { + // Only flush when we have commands queued to OpenGL. + if (num_queued_commands == 0) { + return; + } + num_queued_commands = 0; glFlush(); } void RasterizerOpenGL::TickFrame() { + // Ticking a frame means that buffers will be swapped, calling glFlush implicitly. + num_queued_commands = 0; + buffer_cache.TickFrame(); } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 0501f3828..68abe9a21 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -24,6 +24,7 @@ #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_framebuffer_cache.h" +#include "video_core/renderer_opengl/gl_query_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_sampler_cache.h" #include "video_core/renderer_opengl/gl_shader_cache.h" @@ -57,10 +58,11 @@ public: ScreenInfo& info); ~RasterizerOpenGL() override; - bool DrawBatch(bool is_indexed) override; - bool DrawMultiBatch(bool is_indexed) override; + void Draw(bool is_indexed, bool is_instanced) override; void Clear() override; void DispatchCompute(GPUVAddr code_addr) override; + void ResetCounter(VideoCore::QueryType type) override; + void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; void FlushAll() override; void FlushRegion(CacheAddr addr, u64 size) override; void InvalidateRegion(CacheAddr addr, u64 size) override; @@ -75,6 +77,11 @@ public: void LoadDiskResources(const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) override; + /// Returns true when there are commands queued to the OpenGL server. + bool AnyCommandQueued() const { + return num_queued_commands > 0; + } + private: /// Configures the color and depth framebuffer states. void ConfigureFramebuffers(); @@ -102,9 +109,6 @@ private: void SetupGlobalMemory(u32 binding, const GLShader::GlobalMemoryEntry& entry, GPUVAddr gpu_addr, std::size_t size); - /// Syncs all the state, shaders, render targets and textures setting before a draw call. - void Draw(bool is_indexed, bool is_instanced); - /// Configures the current textures to use for the draw command. void SetupDrawTextures(std::size_t stage_index, const Shader& shader); @@ -180,10 +184,23 @@ private: /// Syncs the alpha test state to match the guest state void SyncAlphaTest(); - /// Check for extension that are not strictly required - /// but are needed for correct emulation + /// Check for extension that are not strictly required but are needed for correct emulation void CheckExtensions(); + std::size_t CalculateVertexArraysSize() const; + + std::size_t CalculateIndexBufferSize() const; + + /// Updates and returns a vertex array object representing current vertex format + GLuint SetupVertexFormat(); + + void SetupVertexBuffer(GLuint vao); + void SetupVertexInstances(GLuint vao); + + GLintptr SetupIndexBuffer(); + + void SetupShaders(GLenum primitive_mode); + const Device device; OpenGLState state; @@ -191,6 +208,7 @@ private: ShaderCacheOpenGL shader_cache; SamplerCacheOpenGL sampler_cache; FramebufferCacheOpenGL framebuffer_cache; + QueryCache query_cache; Core::System& system; ScreenInfo& screen_info; @@ -208,19 +226,8 @@ private: BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER}; BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER}; - std::size_t CalculateVertexArraysSize() const; - - std::size_t CalculateIndexBufferSize() const; - - /// Updates and returns a vertex array object representing current vertex format - GLuint SetupVertexFormat(); - - void SetupVertexBuffer(GLuint vao); - void SetupVertexInstances(GLuint vao); - - GLintptr SetupIndexBuffer(); - - void SetupShaders(GLenum primitive_mode); + /// Number of commands queued to the OpenGL driver. Reseted on flush. + std::size_t num_queued_commands = 0; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index 5c96c1d46..f0ddfb276 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -207,4 +207,21 @@ void OGLFramebuffer::Release() { handle = 0; } +void OGLQuery::Create(GLenum target) { + if (handle != 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceCreation); + glCreateQueries(target, 1, &handle); +} + +void OGLQuery::Release() { + if (handle == 0) + return; + + MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); + glDeleteQueries(1, &handle); + handle = 0; +} + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index 3a85a1d4c..514d1d165 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -266,4 +266,29 @@ public: GLuint handle = 0; }; +class OGLQuery : private NonCopyable { +public: + OGLQuery() = default; + + OGLQuery(OGLQuery&& o) noexcept : handle(std::exchange(o.handle, 0)) {} + + ~OGLQuery() { + Release(); + } + + OGLQuery& operator=(OGLQuery&& o) noexcept { + Release(); + handle = std::exchange(o.handle, 0); + return *this; + } + + /// Creates a new internal OpenGL resource and stores the handle + void Create(GLenum target); + + /// Deletes the internal OpenGL resource + void Release(); + + GLuint handle = 0; +}; + } // namespace OpenGL diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 331808113..5403c3ab7 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -164,7 +164,7 @@ struct FormatTuple { {vk::Format::eUndefined, {}}, // ASTC_2D_5X4 {vk::Format::eUndefined, {}}, // BGRA8_SRGB {vk::Format::eBc1RgbaSrgbBlock, {}}, // DXT1_SRGB - {vk::Format::eUndefined, {}}, // DXT23_SRGB + {vk::Format::eBc2SrgbBlock, {}}, // DXT23_SRGB {vk::Format::eBc3SrgbBlock, {}}, // DXT45_SRGB {vk::Format::eBc7SrgbBlock, {}}, // BC7U_SRGB {vk::Format::eR4G4B4A4UnormPack16, Attachable}, // R4G4B4A4U @@ -363,6 +363,8 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr return vk::Format::eR8G8B8A8Uint; case Maxwell::VertexAttribute::Size::Size_32: return vk::Format::eR32Uint; + case Maxwell::VertexAttribute::Size::Size_32_32_32_32: + return vk::Format::eR32G32B32A32Uint; default: break; } diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp index 9840f26e5..d1da4f9d3 100644 --- a/src/video_core/renderer_vulkan/vk_device.cpp +++ b/src/video_core/renderer_vulkan/vk_device.cpp @@ -104,8 +104,11 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan features.depthBiasClamp = true; features.geometryShader = true; features.tessellationShader = true; + features.occlusionQueryPrecise = true; features.fragmentStoresAndAtomics = true; features.shaderImageGatherExtended = true; + features.shaderStorageImageReadWithoutFormat = + is_shader_storage_img_read_without_format_supported; features.shaderStorageImageWriteWithoutFormat = true; features.textureCompressionASTC_LDR = is_optimal_astc_supported; @@ -117,6 +120,10 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan bit8_storage.uniformAndStorageBuffer8BitAccess = true; SetNext(next, bit8_storage); + vk::PhysicalDeviceHostQueryResetFeaturesEXT host_query_reset; + host_query_reset.hostQueryReset = true; + SetNext(next, host_query_reset); + vk::PhysicalDeviceFloat16Int8FeaturesKHR float16_int8; if (is_float16_supported) { float16_int8.shaderFloat16 = true; @@ -273,6 +280,7 @@ bool VKDevice::IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDev VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME, VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME, VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME, + VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME, }; std::bitset<required_extensions.size()> available_extensions{}; @@ -340,6 +348,7 @@ bool VKDevice::IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDev std::make_pair(features.depthBiasClamp, "depthBiasClamp"), std::make_pair(features.geometryShader, "geometryShader"), std::make_pair(features.tessellationShader, "tessellationShader"), + std::make_pair(features.occlusionQueryPrecise, "occlusionQueryPrecise"), std::make_pair(features.fragmentStoresAndAtomics, "fragmentStoresAndAtomics"), std::make_pair(features.shaderImageGatherExtended, "shaderImageGatherExtended"), std::make_pair(features.shaderStorageImageWriteWithoutFormat, @@ -376,7 +385,7 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami } }; - extensions.reserve(13); + extensions.reserve(14); extensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); extensions.push_back(VK_KHR_16BIT_STORAGE_EXTENSION_NAME); extensions.push_back(VK_KHR_8BIT_STORAGE_EXTENSION_NAME); @@ -384,6 +393,7 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami extensions.push_back(VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME); extensions.push_back(VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME); extensions.push_back(VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME); + extensions.push_back(VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME); [[maybe_unused]] const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); @@ -457,6 +467,8 @@ void VKDevice::SetupFamilies(const vk::DispatchLoaderDynamic& dldi, vk::SurfaceK void VKDevice::SetupFeatures(const vk::DispatchLoaderDynamic& dldi) { const auto supported_features{physical.getFeatures(dldi)}; + is_shader_storage_img_read_without_format_supported = + supported_features.shaderStorageImageReadWithoutFormat; is_optimal_astc_supported = IsOptimalAstcSupported(supported_features, dldi); } @@ -530,6 +542,7 @@ std::unordered_map<vk::Format, vk::FormatProperties> VKDevice::GetFormatProperti vk::Format::eBc6HUfloatBlock, vk::Format::eBc6HSfloatBlock, vk::Format::eBc1RgbaSrgbBlock, + vk::Format::eBc2SrgbBlock, vk::Format::eBc3SrgbBlock, vk::Format::eBc7SrgbBlock, vk::Format::eAstc4x4SrgbBlock, diff --git a/src/video_core/renderer_vulkan/vk_device.h b/src/video_core/renderer_vulkan/vk_device.h index 72603f9f6..2c27ad730 100644 --- a/src/video_core/renderer_vulkan/vk_device.h +++ b/src/video_core/renderer_vulkan/vk_device.h @@ -122,6 +122,11 @@ public: return properties.limits.maxPushConstantsSize; } + /// Returns true if Shader storage Image Read Without Format supported. + bool IsShaderStorageImageReadWithoutFormatSupported() const { + return is_shader_storage_img_read_without_format_supported; + } + /// Returns true if ASTC is natively supported. bool IsOptimalAstcSupported() const { return is_optimal_astc_supported; @@ -227,6 +232,8 @@ private: bool ext_depth_range_unrestricted{}; ///< Support for VK_EXT_depth_range_unrestricted. bool ext_shader_viewport_index_layer{}; ///< Support for VK_EXT_shader_viewport_index_layer. bool nv_device_diagnostic_checkpoints{}; ///< Support for VK_NV_device_diagnostic_checkpoints. + bool is_shader_storage_img_read_without_format_supported{}; ///< Support for shader storage + ///< image read without format // Telemetry parameters std::string vendor_name; ///< Device's driver name. diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp new file mode 100644 index 000000000..ffbf60dda --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -0,0 +1,122 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <cstddef> +#include <cstdint> +#include <utility> +#include <vector> + +#include "video_core/renderer_vulkan/declarations.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_query_cache.h" +#include "video_core/renderer_vulkan/vk_resource_manager.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" + +namespace Vulkan { + +namespace { + +constexpr std::array QUERY_TARGETS = {vk::QueryType::eOcclusion}; + +constexpr vk::QueryType GetTarget(VideoCore::QueryType type) { + return QUERY_TARGETS[static_cast<std::size_t>(type)]; +} + +} // Anonymous namespace + +QueryPool::QueryPool() : VKFencedPool{GROW_STEP} {} + +QueryPool::~QueryPool() = default; + +void QueryPool::Initialize(const VKDevice& device_, VideoCore::QueryType type_) { + device = &device_; + type = type_; +} + +std::pair<vk::QueryPool, std::uint32_t> QueryPool::Commit(VKFence& fence) { + std::size_t index; + do { + index = CommitResource(fence); + } while (usage[index]); + usage[index] = true; + + return {*pools[index / GROW_STEP], static_cast<std::uint32_t>(index % GROW_STEP)}; +} + +void QueryPool::Allocate(std::size_t begin, std::size_t end) { + usage.resize(end); + + const auto dev = device->GetLogical(); + const u32 size = static_cast<u32>(end - begin); + const vk::QueryPoolCreateInfo query_pool_ci({}, GetTarget(type), size, {}); + pools.push_back(dev.createQueryPoolUnique(query_pool_ci, nullptr, device->GetDispatchLoader())); +} + +void QueryPool::Reserve(std::pair<vk::QueryPool, std::uint32_t> query) { + const auto it = + std::find_if(std::begin(pools), std::end(pools), + [query_pool = query.first](auto& pool) { return query_pool == *pool; }); + ASSERT(it != std::end(pools)); + + const std::ptrdiff_t pool_index = std::distance(std::begin(pools), it); + usage[pool_index * GROW_STEP + static_cast<std::ptrdiff_t>(query.second)] = false; +} + +VKQueryCache::VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, + const VKDevice& device, VKScheduler& scheduler) + : VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter, + QueryPool>{system, rasterizer}, + device{device}, scheduler{scheduler} { + for (std::size_t i = 0; i < static_cast<std::size_t>(VideoCore::NumQueryTypes); ++i) { + query_pools[i].Initialize(device, static_cast<VideoCore::QueryType>(i)); + } +} + +VKQueryCache::~VKQueryCache() = default; + +std::pair<vk::QueryPool, std::uint32_t> VKQueryCache::AllocateQuery(VideoCore::QueryType type) { + return query_pools[static_cast<std::size_t>(type)].Commit(scheduler.GetFence()); +} + +void VKQueryCache::Reserve(VideoCore::QueryType type, + std::pair<vk::QueryPool, std::uint32_t> query) { + query_pools[static_cast<std::size_t>(type)].Reserve(query); +} + +HostCounter::HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency, + VideoCore::QueryType type) + : VideoCommon::HostCounterBase<VKQueryCache, HostCounter>{std::move(dependency)}, cache{cache}, + type{type}, query{cache.AllocateQuery(type)}, ticks{cache.Scheduler().Ticks()} { + const auto dev = cache.Device().GetLogical(); + cache.Scheduler().Record([dev, query = query](vk::CommandBuffer cmdbuf, auto& dld) { + dev.resetQueryPoolEXT(query.first, query.second, 1, dld); + cmdbuf.beginQuery(query.first, query.second, vk::QueryControlFlagBits::ePrecise, dld); + }); +} + +HostCounter::~HostCounter() { + cache.Reserve(type, query); +} + +void HostCounter::EndQuery() { + cache.Scheduler().Record([query = query](auto cmdbuf, auto& dld) { + cmdbuf.endQuery(query.first, query.second, dld); + }); +} + +u64 HostCounter::BlockingQuery() const { + if (ticks >= cache.Scheduler().Ticks()) { + cache.Scheduler().Flush(); + } + + const auto dev = cache.Device().GetLogical(); + const auto& dld = cache.Device().GetDispatchLoader(); + u64 value; + dev.getQueryPoolResults(query.first, query.second, 1, sizeof(value), &value, sizeof(value), + vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait, dld); + return value; +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_query_cache.h b/src/video_core/renderer_vulkan/vk_query_cache.h new file mode 100644 index 000000000..c3092ee96 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_query_cache.h @@ -0,0 +1,104 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <cstddef> +#include <cstdint> +#include <memory> +#include <utility> +#include <vector> + +#include "common/common_types.h" +#include "video_core/query_cache.h" +#include "video_core/renderer_vulkan/declarations.h" +#include "video_core/renderer_vulkan/vk_resource_manager.h" + +namespace VideoCore { +class RasterizerInterface; +} + +namespace Vulkan { + +class CachedQuery; +class HostCounter; +class VKDevice; +class VKQueryCache; +class VKScheduler; + +using CounterStream = VideoCommon::CounterStreamBase<VKQueryCache, HostCounter>; + +class QueryPool final : public VKFencedPool { +public: + explicit QueryPool(); + ~QueryPool() override; + + void Initialize(const VKDevice& device, VideoCore::QueryType type); + + std::pair<vk::QueryPool, std::uint32_t> Commit(VKFence& fence); + + void Reserve(std::pair<vk::QueryPool, std::uint32_t> query); + +protected: + void Allocate(std::size_t begin, std::size_t end) override; + +private: + static constexpr std::size_t GROW_STEP = 512; + + const VKDevice* device = nullptr; + VideoCore::QueryType type = {}; + + std::vector<UniqueQueryPool> pools; + std::vector<bool> usage; +}; + +class VKQueryCache final + : public VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter, + QueryPool> { +public: + explicit VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, + const VKDevice& device, VKScheduler& scheduler); + ~VKQueryCache(); + + std::pair<vk::QueryPool, std::uint32_t> AllocateQuery(VideoCore::QueryType type); + + void Reserve(VideoCore::QueryType type, std::pair<vk::QueryPool, std::uint32_t> query); + + const VKDevice& Device() const noexcept { + return device; + } + + VKScheduler& Scheduler() const noexcept { + return scheduler; + } + +private: + const VKDevice& device; + VKScheduler& scheduler; +}; + +class HostCounter final : public VideoCommon::HostCounterBase<VKQueryCache, HostCounter> { +public: + explicit HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency, + VideoCore::QueryType type); + ~HostCounter(); + + void EndQuery(); + +private: + u64 BlockingQuery() const override; + + VKQueryCache& cache; + const VideoCore::QueryType type; + const std::pair<vk::QueryPool, std::uint32_t> query; + const u64 ticks; +}; + +class CachedQuery : public VideoCommon::CachedQueryBase<HostCounter> { +public: + explicit CachedQuery(VKQueryCache&, VideoCore::QueryType, VAddr cpu_addr, u8* host_ptr) + : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr} {} +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index aada38702..31c078f6a 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -289,25 +289,19 @@ RasterizerVulkan::RasterizerVulkan(Core::System& system, Core::Frontend::EmuWind staging_pool), pipeline_cache(system, *this, device, scheduler, descriptor_pool, update_descriptor_queue), buffer_cache(*this, system, device, memory_manager, scheduler, staging_pool), - sampler_cache(device) {} - -RasterizerVulkan::~RasterizerVulkan() = default; - -bool RasterizerVulkan::DrawBatch(bool is_indexed) { - Draw(is_indexed, false); - return true; + sampler_cache(device), query_cache(system, *this, device, scheduler) { + scheduler.SetQueryCache(query_cache); } -bool RasterizerVulkan::DrawMultiBatch(bool is_indexed) { - Draw(is_indexed, true); - return true; -} +RasterizerVulkan::~RasterizerVulkan() = default; void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { MICROPROFILE_SCOPE(Vulkan_Drawing); FlushWork(); + query_cache.UpdateCounters(); + const auto& gpu = system.GPU().Maxwell3D(); GraphicsPipelineCacheKey key{GetFixedPipelineState(gpu.regs)}; @@ -362,6 +356,8 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { void RasterizerVulkan::Clear() { MICROPROFILE_SCOPE(Vulkan_Clearing); + query_cache.UpdateCounters(); + const auto& gpu = system.GPU().Maxwell3D(); if (!system.GPU().Maxwell3D().ShouldExecute()) { return; @@ -429,6 +425,8 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { sampled_views.clear(); image_views.clear(); + query_cache.UpdateCounters(); + const auto& launch_desc = system.GPU().KeplerCompute().launch_description; const ComputePipelineCacheKey key{ code_addr, @@ -471,17 +469,28 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { }); } +void RasterizerVulkan::ResetCounter(VideoCore::QueryType type) { + query_cache.ResetCounter(type); +} + +void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, + std::optional<u64> timestamp) { + query_cache.Query(gpu_addr, type, timestamp); +} + void RasterizerVulkan::FlushAll() {} void RasterizerVulkan::FlushRegion(CacheAddr addr, u64 size) { texture_cache.FlushRegion(addr, size); buffer_cache.FlushRegion(addr, size); + query_cache.FlushRegion(addr, size); } void RasterizerVulkan::InvalidateRegion(CacheAddr addr, u64 size) { texture_cache.InvalidateRegion(addr, size); pipeline_cache.InvalidateRegion(addr, size); buffer_cache.InvalidateRegion(addr, size); + query_cache.InvalidateRegion(addr, size); } void RasterizerVulkan::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 7be71e734..138903d60 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -24,6 +24,7 @@ #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" +#include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_renderpass_cache.h" #include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_sampler_cache.h" @@ -96,7 +97,7 @@ struct ImageView { vk::ImageLayout* layout = nullptr; }; -class RasterizerVulkan : public VideoCore::RasterizerAccelerated { +class RasterizerVulkan final : public VideoCore::RasterizerAccelerated { public: explicit RasterizerVulkan(Core::System& system, Core::Frontend::EmuWindow& render_window, VKScreenInfo& screen_info, const VKDevice& device, @@ -104,10 +105,11 @@ public: VKScheduler& scheduler); ~RasterizerVulkan() override; - bool DrawBatch(bool is_indexed) override; - bool DrawMultiBatch(bool is_indexed) override; + void Draw(bool is_indexed, bool is_instanced) override; void Clear() override; void DispatchCompute(GPUVAddr code_addr) override; + void ResetCounter(VideoCore::QueryType type) override; + void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; void FlushAll() override; void FlushRegion(CacheAddr addr, u64 size) override; void InvalidateRegion(CacheAddr addr, u64 size) override; @@ -140,8 +142,6 @@ private: static constexpr std::size_t ZETA_TEXCEPTION_INDEX = 8; - void Draw(bool is_indexed, bool is_instanced); - void FlushWork(); Texceptions UpdateAttachments(); @@ -247,6 +247,7 @@ private: VKPipelineCache pipeline_cache; VKBufferCache buffer_cache; VKSamplerCache sampler_cache; + VKQueryCache query_cache; std::array<View, Maxwell::NumRenderTargets> color_attachments; View zeta_attachment; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index d66133ad1..92bd6c344 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -6,6 +6,7 @@ #include "common/microprofile.h" #include "video_core/renderer_vulkan/declarations.h" #include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_resource_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" @@ -139,6 +140,8 @@ void VKScheduler::SubmitExecution(vk::Semaphore semaphore) { } void VKScheduler::AllocateNewContext() { + ++ticks; + std::unique_lock lock{mutex}; current_fence = next_fence; next_fence = &resource_manager.CommitFence(); @@ -146,6 +149,10 @@ void VKScheduler::AllocateNewContext() { current_cmdbuf = resource_manager.CommitCommandBuffer(*current_fence); current_cmdbuf.begin({vk::CommandBufferUsageFlagBits::eOneTimeSubmit}, device.GetDispatchLoader()); + // Enable counters once again. These are disabled when a command buffer is finished. + if (query_cache) { + query_cache->UpdateCounters(); + } } void VKScheduler::InvalidateState() { @@ -159,6 +166,7 @@ void VKScheduler::InvalidateState() { } void VKScheduler::EndPendingOperations() { + query_cache->DisableStreams(); EndRenderPass(); } diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index bcdffbba0..62fd7858b 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -4,6 +4,7 @@ #pragma once +#include <atomic> #include <condition_variable> #include <memory> #include <optional> @@ -18,6 +19,7 @@ namespace Vulkan { class VKDevice; class VKFence; +class VKQueryCache; class VKResourceManager; class VKFenceView { @@ -67,6 +69,11 @@ public: /// Binds a pipeline to the current execution context. void BindGraphicsPipeline(vk::Pipeline pipeline); + /// Assigns the query cache. + void SetQueryCache(VKQueryCache& query_cache_) { + query_cache = &query_cache_; + } + /// Returns true when viewports have been set in the current command buffer. bool TouchViewports() { return std::exchange(state.viewports, true); @@ -112,6 +119,11 @@ public: return current_fence; } + /// Returns the current command buffer tick. + u64 Ticks() const { + return ticks; + } + private: class Command { public: @@ -205,6 +217,8 @@ private: const VKDevice& device; VKResourceManager& resource_manager; + VKQueryCache* query_cache = nullptr; + vk::CommandBuffer current_cmdbuf; VKFence* current_fence = nullptr; VKFence* next_fence = nullptr; @@ -227,6 +241,7 @@ private: Common::SPSCQueue<std::unique_ptr<CommandChunk>> chunk_reserve; std::mutex mutex; std::condition_variable cv; + std::atomic<u64> ticks = 0; bool quit = false; }; diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index 24a658dce..6d0bf6aa1 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -275,12 +275,14 @@ public: AddCapability(spv::Capability::ImageGatherExtended); AddCapability(spv::Capability::SampledBuffer); AddCapability(spv::Capability::StorageImageWriteWithoutFormat); + AddCapability(spv::Capability::DrawParameters); AddCapability(spv::Capability::SubgroupBallotKHR); AddCapability(spv::Capability::SubgroupVoteKHR); AddExtension("SPV_KHR_shader_ballot"); AddExtension("SPV_KHR_subgroup_vote"); AddExtension("SPV_KHR_storage_buffer_storage_class"); AddExtension("SPV_KHR_variable_pointers"); + AddExtension("SPV_KHR_shader_draw_parameters"); if (ir.UsesViewportIndex()) { AddCapability(spv::Capability::MultiViewport); @@ -290,6 +292,10 @@ public: } } + if (device.IsShaderStorageImageReadWithoutFormatSupported()) { + AddCapability(spv::Capability::StorageImageReadWithoutFormat); + } + if (device.IsFloat16Supported()) { AddCapability(spv::Capability::Float16); } @@ -492,9 +498,11 @@ private: interfaces.push_back(AddGlobalVariable(Name(out_vertex, "out_vertex"))); // Declare input attributes - vertex_index = DeclareInputBuiltIn(spv::BuiltIn::VertexIndex, t_in_uint, "vertex_index"); + vertex_index = DeclareInputBuiltIn(spv::BuiltIn::VertexIndex, t_in_int, "vertex_index"); instance_index = - DeclareInputBuiltIn(spv::BuiltIn::InstanceIndex, t_in_uint, "instance_index"); + DeclareInputBuiltIn(spv::BuiltIn::InstanceIndex, t_in_int, "instance_index"); + base_vertex = DeclareInputBuiltIn(spv::BuiltIn::BaseVertex, t_in_int, "base_vertex"); + base_instance = DeclareInputBuiltIn(spv::BuiltIn::BaseInstance, t_in_int, "base_instance"); } void DeclareTessControl() { @@ -1068,9 +1076,12 @@ private: return {OpLoad(t_float, AccessElement(t_in_float, tess_coord, element)), Type::Float}; case 2: - return {OpLoad(t_uint, instance_index), Type::Uint}; + return { + OpISub(t_int, OpLoad(t_int, instance_index), OpLoad(t_int, base_instance)), + Type::Int}; case 3: - return {OpLoad(t_uint, vertex_index), Type::Uint}; + return {OpISub(t_int, OpLoad(t_int, vertex_index), OpLoad(t_int, base_vertex)), + Type::Int}; } UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element); return {Constant(t_uint, 0U), Type::Uint}; @@ -1748,8 +1759,16 @@ private: } Expression ImageLoad(Operation operation) { - UNIMPLEMENTED(); - return {}; + if (!device.IsShaderStorageImageReadWithoutFormatSupported()) { + return {v_float_zero, Type::Float}; + } + + const auto& meta{std::get<MetaImage>(operation.GetMeta())}; + + const Id coords = GetCoordinates(operation, Type::Int); + const Id texel = OpImageRead(t_uint4, GetImage(operation), coords); + + return {OpCompositeExtract(t_uint, texel, meta.element), Type::Uint}; } Expression ImageStore(Operation operation) { @@ -2542,6 +2561,8 @@ private: Id instance_index{}; Id vertex_index{}; + Id base_instance{}; + Id base_vertex{}; std::array<Id, Maxwell::NumRenderTargets> frag_colors{}; Id frag_depth{}; Id frag_coord{}; diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp index 0eeb75559..6ead42070 100644 --- a/src/video_core/shader/decode/conversion.cpp +++ b/src/video_core/shader/decode/conversion.cpp @@ -83,14 +83,14 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { const bool input_signed = instr.conversion.is_input_signed; - if (instr.conversion.src_size == Register::Size::Byte) { - const u32 offset = static_cast<u32>(instr.conversion.int_src.selector) * 8; - if (offset > 0) { - value = SignedOperation(OperationCode::ILogicalShiftRight, input_signed, - std::move(value), Immediate(offset)); + if (const u32 offset = static_cast<u32>(instr.conversion.int_src.selector); offset > 0) { + ASSERT(instr.conversion.src_size == Register::Size::Byte || + instr.conversion.src_size == Register::Size::Short); + if (instr.conversion.src_size == Register::Size::Short) { + ASSERT(offset == 0 || offset == 2); } - } else { - UNIMPLEMENTED_IF(instr.conversion.int_src.selector != 0); + value = SignedOperation(OperationCode::ILogicalShiftRight, input_signed, + std::move(value), Immediate(offset * 8)); } value = ConvertIntegerSize(value, instr.conversion.src_size, input_signed); diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index 351c8c2f1..542636430 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -522,68 +522,53 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, Node array, Node depth_compare, u32 bias_offset, std::vector<Node> aoffi, std::optional<Tegra::Shader::Register> bindless_reg) { - const auto is_array = static_cast<bool>(array); - const auto is_shadow = static_cast<bool>(depth_compare); + const bool is_array = array != nullptr; + const bool is_shadow = depth_compare != nullptr; const bool is_bindless = bindless_reg.has_value(); - UNIMPLEMENTED_IF_MSG((texture_type == TextureType::Texture3D && (is_array || is_shadow)) || - (texture_type == TextureType::TextureCube && is_array && is_shadow), - "This method is not supported."); + UNIMPLEMENTED_IF(texture_type == TextureType::TextureCube && is_array && is_shadow); + ASSERT_MSG(texture_type != TextureType::Texture3D || is_array || is_shadow, + "Illegal texture type"); const SamplerInfo info{texture_type, is_array, is_shadow, false}; - Node index_var{}; + Node index_var; const Sampler* sampler = is_bindless ? GetBindlessSampler(*bindless_reg, index_var, info) : GetSampler(instr.sampler, info); - Node4 values; - if (sampler == nullptr) { - for (u32 element = 0; element < values.size(); ++element) { - values[element] = Immediate(0); - } - return values; + if (!sampler) { + return {Immediate(0), Immediate(0), Immediate(0), Immediate(0)}; } const bool lod_needed = process_mode == TextureProcessMode::LZ || process_mode == TextureProcessMode::LL || process_mode == TextureProcessMode::LLA; - - // LOD selection (either via bias or explicit textureLod) not supported in GL for - // sampler2DArrayShadow and samplerCubeArrayShadow. - const bool gl_lod_supported = - !((texture_type == Tegra::Shader::TextureType::Texture2D && is_array && is_shadow) || - (texture_type == Tegra::Shader::TextureType::TextureCube && is_array && is_shadow)); - - const OperationCode read_method = - (lod_needed && gl_lod_supported) ? OperationCode::TextureLod : OperationCode::Texture; - - UNIMPLEMENTED_IF(process_mode != TextureProcessMode::None && !gl_lod_supported); + const OperationCode opcode = lod_needed ? OperationCode::TextureLod : OperationCode::Texture; Node bias; Node lod; - if (process_mode != TextureProcessMode::None && gl_lod_supported) { - switch (process_mode) { - case TextureProcessMode::LZ: - lod = Immediate(0.0f); - break; - case TextureProcessMode::LB: - // If present, lod or bias are always stored in the register - // indexed by the gpr20 field with an offset depending on the - // usage of the other registers - bias = GetRegister(instr.gpr20.Value() + bias_offset); - break; - case TextureProcessMode::LL: - lod = GetRegister(instr.gpr20.Value() + bias_offset); - break; - default: - UNIMPLEMENTED_MSG("Unimplemented process mode={}", static_cast<u32>(process_mode)); - break; - } + switch (process_mode) { + case TextureProcessMode::None: + break; + case TextureProcessMode::LZ: + lod = Immediate(0.0f); + break; + case TextureProcessMode::LB: + // If present, lod or bias are always stored in the register indexed by the gpr20 field with + // an offset depending on the usage of the other registers. + bias = GetRegister(instr.gpr20.Value() + bias_offset); + break; + case TextureProcessMode::LL: + lod = GetRegister(instr.gpr20.Value() + bias_offset); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented process mode={}", static_cast<u32>(process_mode)); + break; } + Node4 values; for (u32 element = 0; element < values.size(); ++element) { - auto copy_coords = coords; MetaTexture meta{*sampler, array, depth_compare, aoffi, {}, {}, bias, lod, {}, element, index_var}; - values[element] = Operation(read_method, meta, std::move(copy_coords)); + values[element] = Operation(opcode, meta, coords); } return values; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index f4c015635..0d105d386 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -721,7 +721,6 @@ private: std::pair<TSurface, TView> GetSurface(const GPUVAddr gpu_addr, const CacheAddr cache_addr, const SurfaceParams& params, bool preserve_contents, bool is_render) { - // Step 1 // Check Level 1 Cache for a fast structural match. If candidate surface // matches at certain level we are pretty much done. @@ -733,14 +732,18 @@ private: return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, topological_result); } + const auto struct_result = current_surface->MatchesStructure(params); - if (struct_result != MatchStructureResult::None && - (params.target != SurfaceTarget::Texture3D || - current_surface->MatchTarget(params.target))) { - if (struct_result == MatchStructureResult::FullMatch) { - return ManageStructuralMatch(current_surface, params, is_render); - } else { - return RebuildSurface(current_surface, params, is_render); + if (struct_result != MatchStructureResult::None) { + const auto& old_params = current_surface->GetSurfaceParams(); + const bool not_3d = params.target != SurfaceTarget::Texture3D && + old_params.target != SurfaceTarget::Texture3D; + if (not_3d || current_surface->MatchTarget(params.target)) { + if (struct_result == MatchStructureResult::FullMatch) { + return ManageStructuralMatch(current_surface, params, is_render); + } else { + return RebuildSurface(current_surface, params, is_render); + } } } } diff --git a/src/web_service/web_backend.cpp b/src/web_service/web_backend.cpp index 6683f459f..737ffe409 100644 --- a/src/web_service/web_backend.cpp +++ b/src/web_service/web_backend.cpp @@ -73,14 +73,12 @@ struct Client::Impl { if (!parsedUrl.GetPort(&port)) { port = HTTP_PORT; } - cli = std::make_unique<httplib::Client>(parsedUrl.m_Host.c_str(), port, - TIMEOUT_SECONDS); + cli = std::make_unique<httplib::Client>(parsedUrl.m_Host.c_str(), port); } else if (parsedUrl.m_Scheme == "https") { if (!parsedUrl.GetPort(&port)) { port = HTTPS_PORT; } - cli = std::make_unique<httplib::SSLClient>(parsedUrl.m_Host.c_str(), port, - TIMEOUT_SECONDS); + cli = std::make_unique<httplib::SSLClient>(parsedUrl.m_Host.c_str(), port); } else { LOG_ERROR(WebService, "Bad URL scheme {}", parsedUrl.m_Scheme); return Common::WebResult{Common::WebResult::Code::InvalidURL, "Bad URL scheme"}; @@ -90,6 +88,7 @@ struct Client::Impl { LOG_ERROR(WebService, "Invalid URL {}", host + path); return Common::WebResult{Common::WebResult::Code::InvalidURL, "Invalid URL"}; } + cli->set_timeout_sec(TIMEOUT_SECONDS); httplib::Headers params; if (!jwt.empty()) { |