From 73d2d3342dc8867d32f08f89b2ca36ff071598dc Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Thu, 28 Nov 2019 02:15:34 -0300 Subject: gl_query_cache: Optimize query cache Use a custom cache instead of relying on a ranged cache. --- src/video_core/engines/maxwell_3d.cpp | 11 +- src/video_core/rasterizer_interface.h | 3 +- src/video_core/renderer_opengl/gl_query_cache.cpp | 214 ++++++++++++++++------ src/video_core/renderer_opengl/gl_query_cache.h | 61 ++++-- src/video_core/renderer_opengl/gl_rasterizer.cpp | 5 +- src/video_core/renderer_opengl/gl_rasterizer.h | 2 +- 6 files changed, 217 insertions(+), 79 deletions(-) diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 9add2bc94..842cdcbcf 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -4,6 +4,7 @@ #include #include +#include #include "common/assert.h" #include "core/core.h" #include "core/core_timing.h" @@ -16,6 +17,8 @@ namespace Tegra::Engines { +using VideoCore::QueryType; + /// First register id that is actually a Macro call. constexpr u32 MacroRegistersStart = 0xE00; @@ -614,10 +617,11 @@ void Maxwell3D::ProcessQueryCondition() { void Maxwell3D::ProcessCounterReset() { switch (regs.counter_reset) { case Regs::CounterReset::SampleCnt: - rasterizer.ResetCounter(VideoCore::QueryType::SamplesPassed); + rasterizer.ResetCounter(QueryType::SamplesPassed); break; default: - UNIMPLEMENTED_MSG("counter_reset={}", static_cast(regs.counter_reset)); + LOG_WARNING(Render_OpenGL, "Unimplemented counter reset={}", + static_cast(regs.counter_reset)); break; } } @@ -670,7 +674,8 @@ std::optional Maxwell3D::GetQueryResult() { return 0; case Regs::QuerySelect::SamplesPassed: // Deferred. - rasterizer.Query(regs.query.QueryAddress(), VideoCore::QueryType::SamplesPassed); + rasterizer.Query(regs.query.QueryAddress(), VideoCore::QueryType::SamplesPassed, + system.GPU().GetTicks()); return {}; default: UNIMPLEMENTED_MSG("Unimplemented query select type {}", diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index a394f2d3e..e9f1436f0 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -6,6 +6,7 @@ #include #include +#include #include "common/common_types.h" #include "video_core/engines/fermi_2d.h" #include "video_core/gpu.h" @@ -50,7 +51,7 @@ public: virtual void ResetCounter(QueryType type) = 0; /// Records a GPU query and caches it - virtual void Query(GPUVAddr gpu_addr, QueryType type) = 0; + virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional timestamp) = 0; /// Notify rasterizer that all caches should be flushed to Switch memory virtual void FlushAll() = 0; diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp index 8f0e8241d..74cb73209 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.cpp +++ b/src/video_core/renderer_opengl/gl_query_cache.cpp @@ -2,8 +2,10 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include #include #include +#include #include #include @@ -22,6 +24,13 @@ using VideoCore::QueryType; namespace { +constexpr std::uintptr_t PAGE_SIZE = 4096; +constexpr int PAGE_SHIFT = 12; + +constexpr std::size_t SMALL_QUERY_SIZE = 8; // Query size without timestamp +constexpr std::size_t LARGE_QUERY_SIZE = 16; // Query size with timestamp +constexpr std::ptrdiff_t TIMESTAMP_OFFSET = 8; + constexpr std::array QueryTargets = {GL_SAMPLES_PASSED}; constexpr GLenum GetTarget(QueryType type) { @@ -37,23 +46,19 @@ CounterStream::~CounterStream() = default; void CounterStream::Update(bool enabled, bool any_command_queued) { if (enabled) { - if (!current) { - current = cache.GetHostCounter(last, type); - } - return; - } - - if (current) { - EndQuery(any_command_queued); + Enable(); + } else { + Disable(any_command_queued); } - last = std::exchange(current, nullptr); } void CounterStream::Reset(bool any_command_queued) { if (current) { EndQuery(any_command_queued); + + // Immediately start a new query to avoid disabling its state. + current = cache.GetHostCounter(nullptr, type); } - current = nullptr; last = nullptr; } @@ -67,6 +72,20 @@ std::shared_ptr CounterStream::GetCurrent(bool any_command_queued) return last; } +void CounterStream::Enable() { + if (current) { + return; + } + current = cache.GetHostCounter(last, type); +} + +void CounterStream::Disable(bool any_command_queued) { + if (current) { + EndQuery(any_command_queued); + } + last = std::exchange(current, nullptr); +} + void CounterStream::EndQuery(bool any_command_queued) { if (!any_command_queued) { // There are chances a query waited on without commands (glDraw, glClear, glDispatch). Not @@ -78,26 +97,57 @@ void CounterStream::EndQuery(bool any_command_queued) { } QueryCache::QueryCache(Core::System& system, RasterizerOpenGL& rasterizer) - : RasterizerCache{rasterizer}, system{system}, - rasterizer{rasterizer}, streams{{CounterStream{*this, QueryType::SamplesPassed}}} {} + : system{system}, rasterizer{rasterizer}, streams{{CounterStream{*this, + QueryType::SamplesPassed}}} {} QueryCache::~QueryCache() = default; -void QueryCache::Query(GPUVAddr gpu_addr, QueryType type) { +void QueryCache::InvalidateRegion(CacheAddr addr, std::size_t size) { + const u64 addr_begin = static_cast(addr); + const u64 addr_end = addr_begin + static_cast(size); + const auto in_range = [addr_begin, addr_end](CachedQuery& query) { + const u64 cache_begin = query.GetCacheAddr(); + const u64 cache_end = cache_begin + query.GetSizeInBytes(); + return cache_begin < addr_end && addr_begin < cache_end; + }; + + const u64 page_end = addr_end >> PAGE_SHIFT; + for (u64 page = addr_begin >> PAGE_SHIFT; page <= page_end; ++page) { + const auto& it = cached_queries.find(page); + if (it == std::end(cached_queries)) { + continue; + } + auto& contents = it->second; + for (auto& query : contents) { + if (!in_range(query)) { + continue; + } + rasterizer.UpdatePagesCachedCount(query.GetCpuAddr(), query.GetSizeInBytes(), -1); + Flush(query); + } + contents.erase(std::remove_if(std::begin(contents), std::end(contents), in_range), + std::end(contents)); + } +} + +void QueryCache::FlushRegion(CacheAddr addr, std::size_t size) { + // We can handle flushes in the same way as invalidations. + InvalidateRegion(addr, size); +} + +void QueryCache::Query(GPUVAddr gpu_addr, QueryType type, std::optional timestamp) { auto& memory_manager = system.GPU().MemoryManager(); const auto host_ptr = memory_manager.GetPointer(gpu_addr); - auto query = TryGet(host_ptr); + CachedQuery* query = TryGet(ToCacheAddr(host_ptr)); if (!query) { const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr); ASSERT_OR_EXECUTE(cpu_addr, return;); - query = std::make_shared(type, *cpu_addr, host_ptr); - Register(query); + query = &Register(CachedQuery(type, *cpu_addr, host_ptr)); } - query->SetCounter(GetStream(type).GetCurrent(rasterizer.AnyCommandQueued())); - query->MarkAsModified(true, *this); + query->SetCounter(GetStream(type).GetCurrent(rasterizer.AnyCommandQueued()), timestamp); } void QueryCache::UpdateCounters() { @@ -117,34 +167,54 @@ void QueryCache::Reserve(QueryType type, OGLQuery&& query) { std::shared_ptr QueryCache::GetHostCounter(std::shared_ptr dependency, QueryType type) { - const auto type_index = static_cast(type); - auto& reserve = reserved_queries[type_index]; - + auto& reserve = reserved_queries[static_cast(type)]; + OGLQuery query; if (reserve.empty()) { - return std::make_shared(*this, std::move(dependency), type); + query.Create(GetTarget(type)); + } else { + query = std::move(reserve.back()); + reserve.pop_back(); } - auto counter = std::make_shared(*this, std::move(dependency), type, - std::move(reserve.back())); - reserve.pop_back(); - return counter; + return std::make_shared(*this, std::move(dependency), type, std::move(query)); +} + +CachedQuery& QueryCache::Register(CachedQuery&& cached_query) { + const u64 page = static_cast(cached_query.GetCacheAddr()) >> PAGE_SHIFT; + auto& stored_ref = cached_queries[page].emplace_back(std::move(cached_query)); + rasterizer.UpdatePagesCachedCount(stored_ref.GetCpuAddr(), stored_ref.GetSizeInBytes(), 1); + return stored_ref; +} + +CachedQuery* QueryCache::TryGet(CacheAddr addr) { + const u64 page = static_cast(addr) >> PAGE_SHIFT; + const auto it = cached_queries.find(page); + if (it == std::end(cached_queries)) { + return nullptr; + } + auto& contents = it->second; + const auto found = + std::find_if(std::begin(contents), std::end(contents), + [addr](const auto& query) { return query.GetCacheAddr() == addr; }); + return found != std::end(contents) ? &*found : nullptr; } -void QueryCache::FlushObjectInner(const std::shared_ptr& counter_) { - auto& counter = *counter_; - auto& stream = GetStream(counter.GetType()); +void QueryCache::Flush(CachedQuery& cached_query) { + auto& stream = GetStream(cached_query.GetType()); // Waiting for a query while another query of the same target is enabled locks Nvidia's driver. // To avoid this disable and re-enable keeping the dependency stream. - const bool is_enabled = stream.IsEnabled(); - if (is_enabled) { - stream.Update(false, false); + // But we only have to do this if we have pending waits to be done. + const bool slice_counter = stream.IsEnabled() && cached_query.WaitPending(); + const bool any_command_queued = rasterizer.AnyCommandQueued(); + if (slice_counter) { + stream.Update(false, any_command_queued); } - counter.Flush(); + cached_query.Flush(); - if (is_enabled) { - stream.Update(true, false); + if (slice_counter) { + stream.Update(true, any_command_queued); } } @@ -152,13 +222,6 @@ CounterStream& QueryCache::GetStream(QueryType type) { return streams[static_cast(type)]; } -HostCounter::HostCounter(QueryCache& cache, std::shared_ptr dependency, QueryType type) - : cache{cache}, type{type}, dependency{std::move(dependency)} { - const GLenum target = GetTarget(type); - query.Create(target); - glBeginQuery(target, query.handle); -} - HostCounter::HostCounter(QueryCache& cache, std::shared_ptr dependency, QueryType type, OGLQuery&& query_) : cache{cache}, type{type}, dependency{std::move(dependency)}, query{std::move(query_)} { @@ -170,35 +233,80 @@ HostCounter::~HostCounter() { } u64 HostCounter::Query() { - if (query.handle == 0) { - return result; + if (result) { + return *result; } - glGetQueryObjectui64v(query.handle, GL_QUERY_RESULT, &result); - + u64 value; + glGetQueryObjectui64v(query.handle, GL_QUERY_RESULT, &value); if (dependency) { - result += dependency->Query(); + value += dependency->Query(); } - return result; + return *(result = value); +} + +bool HostCounter::WaitPending() const noexcept { + return result.has_value(); } CachedQuery::CachedQuery(QueryType type, VAddr cpu_addr, u8* host_ptr) - : RasterizerCacheObject{host_ptr}, type{type}, cpu_addr{cpu_addr}, host_ptr{host_ptr} {} + : type{type}, cpu_addr{cpu_addr}, host_ptr{host_ptr} {} + +CachedQuery::CachedQuery(CachedQuery&& rhs) noexcept + : type{rhs.type}, cpu_addr{rhs.cpu_addr}, host_ptr{rhs.host_ptr}, + counter{std::move(rhs.counter)}, timestamp{rhs.timestamp} {} CachedQuery::~CachedQuery() = default; +CachedQuery& CachedQuery::operator=(CachedQuery&& rhs) noexcept { + type = rhs.type; + cpu_addr = rhs.cpu_addr; + host_ptr = rhs.host_ptr; + counter = std::move(rhs.counter); + timestamp = rhs.timestamp; + return *this; +} + void CachedQuery::Flush() { - const u64 value = counter->Query(); - std::memcpy(host_ptr, &value, sizeof(value)); + // When counter is nullptr it means that it's just been reseted. We are supposed to write a zero + // in these cases. + const u64 value = counter ? counter->Query() : 0; + std::memcpy(host_ptr, &value, sizeof(u64)); + + if (timestamp) { + std::memcpy(host_ptr + TIMESTAMP_OFFSET, &*timestamp, sizeof(u64)); + } } -void CachedQuery::SetCounter(std::shared_ptr counter_) { +void CachedQuery::SetCounter(std::shared_ptr counter_, std::optional timestamp_) { + if (counter) { + // If there's an old counter set it means the query is being rewritten by the game. + // To avoid losing the data forever, flush here. + Flush(); + } counter = std::move(counter_); + timestamp = timestamp_; +} + +bool CachedQuery::WaitPending() const noexcept { + return counter && counter->WaitPending(); } -QueryType CachedQuery::GetType() const { +QueryType CachedQuery::GetType() const noexcept { return type; } +VAddr CachedQuery::GetCpuAddr() const noexcept { + return cpu_addr; +} + +CacheAddr CachedQuery::GetCacheAddr() const noexcept { + return ToCacheAddr(host_ptr); +} + +u64 CachedQuery::GetSizeInBytes() const noexcept { + return timestamp ? LARGE_QUERY_SIZE : SMALL_QUERY_SIZE; +} + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h index 91594b120..d9f22b44d 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.h +++ b/src/video_core/renderer_opengl/gl_query_cache.h @@ -7,12 +7,12 @@ #include #include #include +#include #include #include #include "common/common_types.h" -#include "video_core/rasterizer_cache.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_resource_manager.h" @@ -43,6 +43,10 @@ public: } private: + void Enable(); + + void Disable(bool any_command_queued); + void EndQuery(bool any_command_queued); QueryCache& cache; @@ -53,12 +57,16 @@ private: GLenum target; }; -class QueryCache final : public RasterizerCache> { +class QueryCache final { public: explicit QueryCache(Core::System& system, RasterizerOpenGL& rasterizer); ~QueryCache(); - void Query(GPUVAddr gpu_addr, VideoCore::QueryType type); + void InvalidateRegion(CacheAddr addr, std::size_t size); + + void FlushRegion(CacheAddr addr, std::size_t size); + + void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional timestamp); void UpdateCounters(); @@ -69,23 +77,26 @@ public: std::shared_ptr GetHostCounter(std::shared_ptr dependency, VideoCore::QueryType type); -protected: - void FlushObjectInner(const std::shared_ptr& counter) override; - private: + CachedQuery& Register(CachedQuery&& cached_query); + + CachedQuery* TryGet(CacheAddr addr); + + void Flush(CachedQuery& cached_query); + CounterStream& GetStream(VideoCore::QueryType type); Core::System& system; RasterizerOpenGL& rasterizer; + std::unordered_map> cached_queries; + std::array streams; std::array, VideoCore::NumQueryTypes> reserved_queries; }; class HostCounter final { public: - explicit HostCounter(QueryCache& cache, std::shared_ptr dependency, - VideoCore::QueryType type); explicit HostCounter(QueryCache& cache, std::shared_ptr dependency, VideoCore::QueryType type, OGLQuery&& query); ~HostCounter(); @@ -93,42 +104,54 @@ public: /// Returns the current value of the query. u64 Query(); + /// Returns true when querying this counter will potentially wait for OpenGL. + bool WaitPending() const noexcept; + private: QueryCache& cache; VideoCore::QueryType type; std::shared_ptr dependency; ///< Counter queued before this one. OGLQuery query; ///< OpenGL query. - u64 result; ///< Added values of the counter. + std::optional result; ///< Added values of the counter. }; -class CachedQuery final : public RasterizerCacheObject { +class CachedQuery final { public: explicit CachedQuery(VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr); + CachedQuery(CachedQuery&&) noexcept; + CachedQuery(const CachedQuery&) = delete; ~CachedQuery(); + CachedQuery& operator=(CachedQuery&&) noexcept; + /// Writes the counter value to host memory. void Flush(); /// Updates the counter this cached query registered in guest memory will write when requested. - void SetCounter(std::shared_ptr counter); + void SetCounter(std::shared_ptr counter, std::optional timestamp); + + /// Returns true when a flushing this query will potentially wait for OpenGL. + bool WaitPending() const noexcept; /// Returns the query type. - VideoCore::QueryType GetType() const; + VideoCore::QueryType GetType() const noexcept; - VAddr GetCpuAddr() const override { - return cpu_addr; - } + /// Returns the guest CPU address for this query. + VAddr GetCpuAddr() const noexcept; - std::size_t GetSizeInBytes() const override { - return sizeof(u64); - } + /// Returns the cache address for this query. + CacheAddr GetCacheAddr() const noexcept; + + /// Returns the number of cached bytes. + u64 GetSizeInBytes() const noexcept; private: - VideoCore::QueryType type; + VideoCore::QueryType type; ///< Abstracted query type (e.g. samples passed). VAddr cpu_addr; ///< Guest CPU address. u8* host_ptr; ///< Writable host pointer. std::shared_ptr counter; ///< Host counter to query, owns the dependency tree. + std::optional timestamp; ///< Timestamp to flush to guest memory. }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 827f85884..4bdc8db85 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -722,8 +722,9 @@ void RasterizerOpenGL::ResetCounter(VideoCore::QueryType type) { query_cache.ResetCounter(type); } -void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type) { - query_cache.Query(gpu_addr, type); +void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, + std::optional timestamp) { + query_cache.Query(gpu_addr, type, timestamp); } void RasterizerOpenGL::FlushAll() {} diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 4fb6811a7..c772fd4ba 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -63,7 +63,7 @@ public: void Clear() override; void DispatchCompute(GPUVAddr code_addr) override; void ResetCounter(VideoCore::QueryType type) override; - void Query(GPUVAddr gpu_addr, VideoCore::QueryType type) override; + void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional timestamp) override; void FlushAll() override; void FlushRegion(CacheAddr addr, u64 size) override; void InvalidateRegion(CacheAddr addr, u64 size) override; -- cgit v1.2.3