diff options
Diffstat (limited to '')
36 files changed, 1106 insertions, 393 deletions
diff --git a/src/video_core/renderer_opengl/blit_image.cpp b/src/video_core/renderer_opengl/blit_image.cpp index 9a560a73b..3b03e8d5a 100644 --- a/src/video_core/renderer_opengl/blit_image.cpp +++ b/src/video_core/renderer_opengl/blit_image.cpp @@ -22,7 +22,7 @@ BlitImageHelper::~BlitImageHelper() = default; void BlitImageHelper::BlitColor(GLuint dst_framebuffer, GLuint src_image_view, GLuint src_sampler, const Region2D& dst_region, const Region2D& src_region, const Extent3D& src_size) { - glEnable(GL_CULL_FACE); + glDisable(GL_CULL_FACE); glDisable(GL_COLOR_LOGIC_OP); glDisable(GL_DEPTH_TEST); glDisable(GL_STENCIL_TEST); @@ -31,7 +31,6 @@ void BlitImageHelper::BlitColor(GLuint dst_framebuffer, GLuint src_image_view, G glDisable(GL_ALPHA_TEST); glDisablei(GL_BLEND, 0); glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); - glCullFace(GL_BACK); glFrontFace(GL_CW); glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); glDepthRangeIndexed(0, 0.0, 0.0); diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index 6af4ae793..38d553d3c 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -106,8 +106,10 @@ GLuint Buffer::View(u32 offset, u32 size, PixelFormat format) { return views.back().texture.handle; } -BufferCacheRuntime::BufferCacheRuntime(const Device& device_) - : device{device_}, has_fast_buffer_sub_data{device.HasFastBufferSubData()}, +BufferCacheRuntime::BufferCacheRuntime(const Device& device_, + StagingBufferPool& staging_buffer_pool_) + : device{device_}, staging_buffer_pool{staging_buffer_pool_}, + has_fast_buffer_sub_data{device.HasFastBufferSubData()}, use_assembly_shaders{device.UseAssemblyShaders()}, has_unified_vertex_buffers{device.HasVertexBufferUnifiedMemory()}, stream_buffer{has_fast_buffer_sub_data ? std::nullopt : std::make_optional<StreamBuffer>()} { @@ -117,7 +119,7 @@ BufferCacheRuntime::BufferCacheRuntime(const Device& device_) for (auto& stage_uniforms : fast_uniforms) { for (OGLBuffer& buffer : stage_uniforms) { buffer.Create(); - glNamedBufferData(buffer.handle, BufferCache::DEFAULT_SKIP_CACHE_SIZE, nullptr, + glNamedBufferData(buffer.handle, VideoCommon::DEFAULT_SKIP_CACHE_SIZE, nullptr, GL_STREAM_DRAW); } } @@ -140,6 +142,14 @@ BufferCacheRuntime::BufferCacheRuntime(const Device& device_) }(); } +StagingBufferMap BufferCacheRuntime::UploadStagingBuffer(size_t size) { + return staging_buffer_pool.RequestUploadBuffer(size); +} + +StagingBufferMap BufferCacheRuntime::DownloadStagingBuffer(size_t size) { + return staging_buffer_pool.RequestDownloadBuffer(size); +} + u64 BufferCacheRuntime::GetDeviceMemoryUsage() const { if (device.CanReportMemoryUsage()) { return device_access_memory - device.GetCurrentDedicatedVideoMemory(); @@ -147,13 +157,47 @@ u64 BufferCacheRuntime::GetDeviceMemoryUsage() const { return 2_GiB; } -void BufferCacheRuntime::CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer, - std::span<const VideoCommon::BufferCopy> copies) { +void BufferCacheRuntime::CopyBuffer(GLuint dst_buffer, GLuint src_buffer, + std::span<const VideoCommon::BufferCopy> copies, bool barrier) { + if (barrier) { + PreCopyBarrier(); + } for (const VideoCommon::BufferCopy& copy : copies) { - glCopyNamedBufferSubData( - src_buffer.Handle(), dst_buffer.Handle(), static_cast<GLintptr>(copy.src_offset), - static_cast<GLintptr>(copy.dst_offset), static_cast<GLsizeiptr>(copy.size)); + glCopyNamedBufferSubData(src_buffer, dst_buffer, static_cast<GLintptr>(copy.src_offset), + static_cast<GLintptr>(copy.dst_offset), + static_cast<GLsizeiptr>(copy.size)); } + if (barrier) { + PostCopyBarrier(); + } +} + +void BufferCacheRuntime::CopyBuffer(GLuint dst_buffer, Buffer& src_buffer, + std::span<const VideoCommon::BufferCopy> copies, bool barrier) { + CopyBuffer(dst_buffer, src_buffer.Handle(), copies, barrier); +} + +void BufferCacheRuntime::CopyBuffer(Buffer& dst_buffer, GLuint src_buffer, + std::span<const VideoCommon::BufferCopy> copies, bool barrier) { + CopyBuffer(dst_buffer.Handle(), src_buffer, copies, barrier); +} + +void BufferCacheRuntime::CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer, + std::span<const VideoCommon::BufferCopy> copies) { + CopyBuffer(dst_buffer.Handle(), src_buffer.Handle(), copies); +} + +void BufferCacheRuntime::PreCopyBarrier() { + // TODO: finer grained barrier? + glMemoryBarrier(GL_ALL_BARRIER_BITS); +} + +void BufferCacheRuntime::PostCopyBarrier() { + glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT | GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT); +} + +void BufferCacheRuntime::Finish() { + glFinish(); } void BufferCacheRuntime::ClearBuffer(Buffer& dest_buffer, u32 offset, size_t size, u32 value) { @@ -188,6 +232,15 @@ void BufferCacheRuntime::BindVertexBuffer(u32 index, Buffer& buffer, u32 offset, } } +void BufferCacheRuntime::BindVertexBuffers(VideoCommon::HostBindings<Buffer>& bindings) { + for (u32 index = 0; index < bindings.buffers.size(); ++index) { + BindVertexBuffer(bindings.min_index + index, *bindings.buffers[index], + static_cast<u32>(bindings.offsets[index]), + static_cast<u32>(bindings.sizes[index]), + static_cast<u32>(bindings.strides[index])); + } +} + void BufferCacheRuntime::BindUniformBuffer(size_t stage, u32 binding_index, Buffer& buffer, u32 offset, u32 size) { if (use_assembly_shaders) { @@ -276,6 +329,14 @@ void BufferCacheRuntime::BindTransformFeedbackBuffer(u32 index, Buffer& buffer, static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size)); } +void BufferCacheRuntime::BindTransformFeedbackBuffers(VideoCommon::HostBindings<Buffer>& bindings) { + for (u32 index = 0; index < bindings.buffers.size(); ++index) { + glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, index, bindings.buffers[index]->Handle(), + static_cast<GLintptr>(bindings.offsets[index]), + static_cast<GLsizeiptr>(bindings.sizes[index])); + } +} + void BufferCacheRuntime::BindTextureBuffer(Buffer& buffer, u32 offset, u32 size, PixelFormat format) { *texture_handles++ = buffer.View(offset, size, format); diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index bb1962073..41b746f3b 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -7,11 +7,12 @@ #include <span> #include "common/common_types.h" -#include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/buffer_cache/buffer_cache_base.h" +#include "video_core/buffer_cache/memory_tracker_base.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_resource_manager.h" -#include "video_core/renderer_opengl/gl_stream_buffer.h" +#include "video_core/renderer_opengl/gl_staging_buffer_pool.h" namespace OpenGL { @@ -59,17 +60,36 @@ class BufferCacheRuntime { public: static constexpr u8 INVALID_BINDING = std::numeric_limits<u8>::max(); - explicit BufferCacheRuntime(const Device& device_); + explicit BufferCacheRuntime(const Device& device_, StagingBufferPool& staging_buffer_pool_); + + [[nodiscard]] StagingBufferMap UploadStagingBuffer(size_t size); + + [[nodiscard]] StagingBufferMap DownloadStagingBuffer(size_t size); + + void CopyBuffer(GLuint dst_buffer, GLuint src_buffer, + std::span<const VideoCommon::BufferCopy> copies, bool barrier = true); + + void CopyBuffer(GLuint dst_buffer, Buffer& src_buffer, + std::span<const VideoCommon::BufferCopy> copies, bool barrier = true); + + void CopyBuffer(Buffer& dst_buffer, GLuint src_buffer, + std::span<const VideoCommon::BufferCopy> copies, bool barrier = true); void CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer, std::span<const VideoCommon::BufferCopy> copies); + void PreCopyBarrier(); + void PostCopyBarrier(); + void Finish(); + void ClearBuffer(Buffer& dest_buffer, u32 offset, size_t size, u32 value); void BindIndexBuffer(Buffer& buffer, u32 offset, u32 size); void BindVertexBuffer(u32 index, Buffer& buffer, u32 offset, u32 size, u32 stride); + void BindVertexBuffers(VideoCommon::HostBindings<Buffer>& bindings); + void BindUniformBuffer(size_t stage, u32 binding_index, Buffer& buffer, u32 offset, u32 size); void BindComputeUniformBuffer(u32 binding_index, Buffer& buffer, u32 offset, u32 size); @@ -82,6 +102,8 @@ public: void BindTransformFeedbackBuffer(u32 index, Buffer& buffer, u32 offset, u32 size); + void BindTransformFeedbackBuffers(VideoCommon::HostBindings<Buffer>& bindings); + void BindTextureBuffer(Buffer& buffer, u32 offset, u32 size, VideoCore::Surface::PixelFormat format); @@ -160,10 +182,6 @@ public: return device.CanReportMemoryUsage(); } - u32 GetStorageBufferAlignment() const { - return static_cast<u32>(device.GetShaderStorageBufferAlignment()); - } - private: static constexpr std::array PABO_LUT{ GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV, @@ -172,6 +190,7 @@ private: }; const Device& device; + StagingBufferPool& staging_buffer_pool; bool has_fast_buffer_sub_data = false; bool use_assembly_shaders = false; @@ -204,14 +223,20 @@ private: struct BufferCacheParams { using Runtime = OpenGL::BufferCacheRuntime; using Buffer = OpenGL::Buffer; + using Async_Buffer = OpenGL::StagingBufferMap; + using MemoryTracker = VideoCommon::MemoryTrackerBase<VideoCore::RasterizerInterface>; static constexpr bool IS_OPENGL = true; static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = true; static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = true; static constexpr bool NEEDS_BIND_UNIFORM_INDEX = true; static constexpr bool NEEDS_BIND_STORAGE_INDEX = true; - static constexpr bool USE_MEMORY_MAPS = false; + static constexpr bool USE_MEMORY_MAPS = true; static constexpr bool SEPARATE_IMAGE_BUFFER_BINDINGS = true; + static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = false; + + // TODO: Investigate why OpenGL seems to perform worse with persistently mapped buffer uploads + static constexpr bool USE_MEMORY_MAPS_FOR_UPLOADS = false; }; using BufferCache = VideoCommon::BufferCache<BufferCacheParams>; diff --git a/src/video_core/renderer_opengl/gl_buffer_cache_base.cpp b/src/video_core/renderer_opengl/gl_buffer_cache_base.cpp new file mode 100644 index 000000000..f15ae8e25 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_buffer_cache_base.cpp @@ -0,0 +1,9 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/renderer_opengl/gl_buffer_cache.h" + +namespace VideoCommon { +template class VideoCommon::BufferCache<OpenGL::BufferCacheParams>; +} diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp index 26d066004..f9ca55c36 100644 --- a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp @@ -30,7 +30,7 @@ bool ComputePipelineKey::operator==(const ComputePipelineKey& rhs) const noexcep ComputePipeline::ComputePipeline(const Device& device, TextureCache& texture_cache_, BufferCache& buffer_cache_, ProgramManager& program_manager_, const Shader::Info& info_, std::string code, - std::vector<u32> code_v) + std::vector<u32> code_v, bool force_context_flush) : texture_cache{texture_cache_}, buffer_cache{buffer_cache_}, program_manager{program_manager_}, info{info_} { switch (device.GetShaderBackend()) { @@ -63,6 +63,16 @@ ComputePipeline::ComputePipeline(const Device& device, TextureCache& texture_cac writes_global_memory = !use_storage_buffers && std::ranges::any_of(info.storage_buffers_descriptors, [](const auto& desc) { return desc.is_written; }); + uses_local_memory = info.uses_local_memory; + if (force_context_flush) { + std::scoped_lock lock{built_mutex}; + built_fence.Create(); + // Flush this context to ensure compilation commands and fence are in the GPU pipe. + glFlush(); + built_condvar.notify_one(); + } else { + is_built = true; + } } void ComputePipeline::Configure() { @@ -78,7 +88,8 @@ void ComputePipeline::Configure() { texture_cache.SynchronizeComputeDescriptors(); boost::container::static_vector<VideoCommon::ImageViewInOut, MAX_TEXTURES + MAX_IMAGES> views; - std::array<GLuint, MAX_TEXTURES> samplers; + boost::container::static_vector<VideoCommon::SamplerId, MAX_TEXTURES> samplers; + std::array<GLuint, MAX_TEXTURES> gl_samplers; std::array<GLuint, MAX_TEXTURES> textures; std::array<GLuint, MAX_IMAGES> images; GLsizei sampler_binding{}; @@ -122,7 +133,6 @@ void ComputePipeline::Configure() { for (u32 index = 0; index < desc.count; ++index) { const auto handle{read_handle(desc, index)}; views.push_back({handle.first}); - samplers[sampler_binding++] = 0; } } for (const auto& desc : info.image_buffer_descriptors) { @@ -133,8 +143,8 @@ void ComputePipeline::Configure() { const auto handle{read_handle(desc, index)}; views.push_back({handle.first}); - Sampler* const sampler = texture_cache.GetComputeSampler(handle.second); - samplers[sampler_binding++] = sampler->Handle(); + VideoCommon::SamplerId sampler = texture_cache.GetComputeSamplerId(handle.second); + samplers.push_back(sampler); } } for (const auto& desc : info.image_descriptors) { @@ -142,6 +152,9 @@ void ComputePipeline::Configure() { } texture_cache.FillComputeImageViews(std::span(views.data(), views.size())); + if (!is_built) { + WaitForBuild(); + } if (assembly_program.handle != 0) { program_manager.BindComputeAssemblyProgram(assembly_program.handle); } else { @@ -174,10 +187,17 @@ void ComputePipeline::Configure() { const VideoCommon::ImageViewInOut* views_it{views.data() + num_texture_buffers + num_image_buffers}; + const VideoCommon::SamplerId* samplers_it{samplers.data()}; texture_binding += num_texture_buffers; image_binding += num_image_buffers; u32 texture_scaling_mask{}; + + for (const auto& desc : info.texture_buffer_descriptors) { + for (u32 index = 0; index < desc.count; ++index) { + gl_samplers[sampler_binding++] = 0; + } + } for (const auto& desc : info.texture_descriptors) { for (u32 index = 0; index < desc.count; ++index) { ImageView& image_view{texture_cache.GetImageView((views_it++)->id)}; @@ -186,6 +206,12 @@ void ComputePipeline::Configure() { texture_scaling_mask |= 1u << texture_binding; } ++texture_binding; + + const Sampler& sampler{texture_cache.GetSampler(*(samplers_it++))}; + const bool use_fallback_sampler{sampler.HasAddedAnisotropy() && + !image_view.SupportsAnisotropy()}; + gl_samplers[sampler_binding++] = + use_fallback_sampler ? sampler.HandleWithDefaultAnisotropy() : sampler.Handle(); } } u32 image_scaling_mask{}; @@ -216,11 +242,20 @@ void ComputePipeline::Configure() { if (texture_binding != 0) { ASSERT(texture_binding == sampler_binding); glBindTextures(0, texture_binding, textures.data()); - glBindSamplers(0, sampler_binding, samplers.data()); + glBindSamplers(0, sampler_binding, gl_samplers.data()); } if (image_binding != 0) { glBindImageTextures(0, image_binding, images.data()); } } +void ComputePipeline::WaitForBuild() { + if (built_fence.handle == 0) { + std::unique_lock lock{built_mutex}; + built_condvar.wait(lock, [this] { return built_fence.handle != 0; }); + } + ASSERT(glClientWaitSync(built_fence.handle, 0, GL_TIMEOUT_IGNORED) != GL_WAIT_FAILED); + is_built = true; +} + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.h b/src/video_core/renderer_opengl/gl_compute_pipeline.h index 6534dec32..c26b4fa5e 100644 --- a/src/video_core/renderer_opengl/gl_compute_pipeline.h +++ b/src/video_core/renderer_opengl/gl_compute_pipeline.h @@ -50,7 +50,8 @@ class ComputePipeline { public: explicit ComputePipeline(const Device& device, TextureCache& texture_cache_, BufferCache& buffer_cache_, ProgramManager& program_manager_, - const Shader::Info& info_, std::string code, std::vector<u32> code_v); + const Shader::Info& info_, std::string code, std::vector<u32> code_v, + bool force_context_flush = false); void Configure(); @@ -58,6 +59,10 @@ public: return writes_global_memory; } + [[nodiscard]] bool UsesLocalMemory() const noexcept { + return uses_local_memory; + } + void SetEngine(Tegra::Engines::KeplerCompute* kepler_compute_, Tegra::MemoryManager* gpu_memory_) { kepler_compute = kepler_compute_; @@ -65,6 +70,8 @@ public: } private: + void WaitForBuild(); + TextureCache& texture_cache; BufferCache& buffer_cache; Tegra::MemoryManager* gpu_memory; @@ -81,6 +88,12 @@ private: bool use_storage_buffers{}; bool writes_global_memory{}; + bool uses_local_memory{}; + + std::mutex built_mutex; + std::condition_variable built_condvar; + OGLSync built_fence{}; + bool is_built{false}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 22ed16ebf..33e63c17d 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -108,7 +108,8 @@ bool IsASTCSupported() { [[nodiscard]] bool IsDebugToolAttached(std::span<const std::string_view> extensions) { const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); - return nsight || HasExtension(extensions, "GL_EXT_debug_tool"); + return nsight || HasExtension(extensions, "GL_EXT_debug_tool") || + Settings::values.renderer_debug.GetValue(); } } // Anonymous namespace @@ -193,6 +194,7 @@ Device::Device(Core::Frontend::EmuWindow& emu_window) { has_bool_ref_bug = true; } } + has_lmem_perf_bug = is_nvidia; strict_context_required = emu_window.StrictContextRequired(); // Blocks AMD and Intel OpenGL drivers on Windows from using asynchronous shader compilation. @@ -200,6 +202,7 @@ Device::Device(Core::Frontend::EmuWindow& emu_window) { use_asynchronous_shaders = Settings::values.use_asynchronous_shaders.GetValue() && !(is_amd || (is_intel && !is_linux)) && !strict_context_required; use_driver_cache = is_nvidia; + supports_conditional_barriers = !is_intel; LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi); LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug); diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index 3ff8cad83..a5a6bbbba 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -176,6 +176,10 @@ public: return vendor_name == "ATI Technologies Inc."; } + bool IsIntel() const { + return vendor_name == "Intel"; + } + bool CanReportMemoryUsage() const { return can_report_memory; } @@ -184,6 +188,14 @@ public: return strict_context_required; } + bool SupportsConditionalBarriers() const { + return supports_conditional_barriers; + } + + bool HasLmemPerfBug() const { + return has_lmem_perf_bug; + } + private: static bool TestVariableAoffi(); static bool TestPreciseBug(); @@ -229,6 +241,8 @@ private: bool has_bool_ref_bug{}; bool can_report_memory{}; bool strict_context_required{}; + bool supports_conditional_barriers{}; + bool has_lmem_perf_bug{}; std::string vendor_name; }; diff --git a/src/video_core/renderer_opengl/gl_fence_manager.cpp b/src/video_core/renderer_opengl/gl_fence_manager.cpp index 91463f854..5326172af 100644 --- a/src/video_core/renderer_opengl/gl_fence_manager.cpp +++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp @@ -27,9 +27,7 @@ bool GLInnerFence::IsSignaled() const { return true; } ASSERT(sync_object.handle != 0); - GLint sync_status; - glGetSynciv(sync_object.handle, GL_SYNC_STATUS, 1, nullptr, &sync_status); - return sync_status == GL_SIGNALED; + return sync_object.IsSignaled(); } void GLInnerFence::Wait() { diff --git a/src/video_core/renderer_opengl/gl_fence_manager.h b/src/video_core/renderer_opengl/gl_fence_manager.h index f1446e732..e21b19dcc 100644 --- a/src/video_core/renderer_opengl/gl_fence_manager.h +++ b/src/video_core/renderer_opengl/gl_fence_manager.h @@ -30,7 +30,17 @@ private: }; using Fence = std::shared_ptr<GLInnerFence>; -using GenericFenceManager = VideoCommon::FenceManager<Fence, TextureCache, BufferCache, QueryCache>; + +struct FenceManagerParams { + using FenceType = Fence; + using BufferCacheType = BufferCache; + using TextureCacheType = TextureCache; + using QueryCacheType = QueryCache; + + static constexpr bool HAS_ASYNC_CHECK = false; +}; + +using GenericFenceManager = VideoCommon::FenceManager<FenceManagerParams>; class FenceManagerOpenGL final : public GenericFenceManager { public: diff --git a/src/video_core/renderer_opengl/gl_fsr.cpp b/src/video_core/renderer_opengl/gl_fsr.cpp new file mode 100644 index 000000000..77262dcf1 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_fsr.cpp @@ -0,0 +1,101 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "common/settings.h" +#include "video_core/fsr.h" +#include "video_core/renderer_opengl/gl_fsr.h" +#include "video_core/renderer_opengl/gl_shader_manager.h" +#include "video_core/renderer_opengl/gl_shader_util.h" + +namespace OpenGL { +using namespace FSR; + +using FsrConstants = std::array<u32, 4 * 4>; + +FSR::FSR(std::string_view fsr_vertex_source, std::string_view fsr_easu_source, + std::string_view fsr_rcas_source) + : fsr_vertex{CreateProgram(fsr_vertex_source, GL_VERTEX_SHADER)}, + fsr_easu_frag{CreateProgram(fsr_easu_source, GL_FRAGMENT_SHADER)}, + fsr_rcas_frag{CreateProgram(fsr_rcas_source, GL_FRAGMENT_SHADER)} { + glProgramUniform2f(fsr_vertex.handle, 0, 1.0f, 1.0f); + glProgramUniform2f(fsr_vertex.handle, 1, 0.0f, 0.0f); +} + +FSR::~FSR() = default; + +void FSR::Draw(ProgramManager& program_manager, const Common::Rectangle<u32>& screen, + u32 input_image_width, u32 input_image_height, + const Common::Rectangle<int>& crop_rect) { + + const auto output_image_width = screen.GetWidth(); + const auto output_image_height = screen.GetHeight(); + + if (fsr_intermediate_tex.handle) { + GLint fsr_tex_width, fsr_tex_height; + glGetTextureLevelParameteriv(fsr_intermediate_tex.handle, 0, GL_TEXTURE_WIDTH, + &fsr_tex_width); + glGetTextureLevelParameteriv(fsr_intermediate_tex.handle, 0, GL_TEXTURE_HEIGHT, + &fsr_tex_height); + if (static_cast<u32>(fsr_tex_width) != output_image_width || + static_cast<u32>(fsr_tex_height) != output_image_height) { + fsr_intermediate_tex.Release(); + } + } + if (!fsr_intermediate_tex.handle) { + fsr_intermediate_tex.Create(GL_TEXTURE_2D); + glTextureStorage2D(fsr_intermediate_tex.handle, 1, GL_RGB16F, output_image_width, + output_image_height); + glNamedFramebufferTexture(fsr_framebuffer.handle, GL_COLOR_ATTACHMENT0, + fsr_intermediate_tex.handle, 0); + } + + GLint old_draw_fb; + glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &old_draw_fb); + + glFrontFace(GL_CW); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, fsr_framebuffer.handle); + glViewportIndexedf(0, 0.0f, 0.0f, static_cast<GLfloat>(output_image_width), + static_cast<GLfloat>(output_image_height)); + + FsrConstants constants; + FsrEasuConOffset( + constants.data() + 0, constants.data() + 4, constants.data() + 8, constants.data() + 12, + + static_cast<f32>(crop_rect.GetWidth()), static_cast<f32>(crop_rect.GetHeight()), + static_cast<f32>(input_image_width), static_cast<f32>(input_image_height), + static_cast<f32>(output_image_width), static_cast<f32>(output_image_height), + static_cast<f32>(crop_rect.left), static_cast<f32>(crop_rect.top)); + + glProgramUniform4uiv(fsr_easu_frag.handle, 0, sizeof(constants), std::data(constants)); + + program_manager.BindPresentPrograms(fsr_vertex.handle, fsr_easu_frag.handle); + glDrawArrays(GL_TRIANGLES, 0, 3); + + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, old_draw_fb); + glBindTextureUnit(0, fsr_intermediate_tex.handle); + + const float sharpening = + static_cast<float>(Settings::values.fsr_sharpening_slider.GetValue()) / 100.0f; + + FsrRcasCon(constants.data(), sharpening); + glProgramUniform4uiv(fsr_rcas_frag.handle, 0, sizeof(constants), std::data(constants)); +} + +void FSR::InitBuffers() { + fsr_framebuffer.Create(); +} + +void FSR::ReleaseBuffers() { + fsr_framebuffer.Release(); + fsr_intermediate_tex.Release(); +} + +const OGLProgram& FSR::GetPresentFragmentProgram() const noexcept { + return fsr_rcas_frag; +} + +bool FSR::AreBuffersInitialized() const noexcept { + return fsr_framebuffer.handle; +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_fsr.h b/src/video_core/renderer_opengl/gl_fsr.h new file mode 100644 index 000000000..1f6ae3115 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_fsr.h @@ -0,0 +1,43 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include <string_view> + +#include "common/common_types.h" +#include "common/math_util.h" +#include "video_core/fsr.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" + +namespace OpenGL { + +class ProgramManager; + +class FSR { +public: + explicit FSR(std::string_view fsr_vertex_source, std::string_view fsr_easu_source, + std::string_view fsr_rcas_source); + ~FSR(); + + void Draw(ProgramManager& program_manager, const Common::Rectangle<u32>& screen, + u32 input_image_width, u32 input_image_height, + const Common::Rectangle<int>& crop_rect); + + void InitBuffers(); + + void ReleaseBuffers(); + + [[nodiscard]] const OGLProgram& GetPresentFragmentProgram() const noexcept; + + [[nodiscard]] bool AreBuffersInitialized() const noexcept; + +private: + OGLFramebuffer fsr_framebuffer; + OGLProgram fsr_vertex; + OGLProgram fsr_easu_frag; + OGLProgram fsr_rcas_frag; + OGLTexture fsr_intermediate_tex; +}; + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp index c115dabe1..23a48c6fe 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp @@ -176,7 +176,7 @@ GraphicsPipeline::GraphicsPipeline(const Device& device, TextureCache& texture_c std::array<std::string, 5> sources, std::array<std::vector<u32>, 5> sources_spirv, const std::array<const Shader::Info*, 5>& infos, - const GraphicsPipelineKey& key_) + const GraphicsPipelineKey& key_, bool force_context_flush) : texture_cache{texture_cache_}, buffer_cache{buffer_cache_}, program_manager{program_manager_}, state_tracker{state_tracker_}, key{key_} { if (shader_notify) { @@ -215,6 +215,7 @@ GraphicsPipeline::GraphicsPipeline(const Device& device, TextureCache& texture_c writes_global_memory |= std::ranges::any_of( info.storage_buffers_descriptors, [](const auto& desc) { return desc.is_written; }); + uses_local_memory |= info.uses_local_memory; } ASSERT(num_textures <= MAX_TEXTURES); ASSERT(num_images <= MAX_IMAGES); @@ -231,7 +232,8 @@ GraphicsPipeline::GraphicsPipeline(const Device& device, TextureCache& texture_c const bool in_parallel = thread_worker != nullptr; const auto backend = device.GetShaderBackend(); auto func{[this, sources = std::move(sources), sources_spirv = std::move(sources_spirv), - shader_notify, backend, in_parallel](ShaderContext::Context*) mutable { + shader_notify, backend, in_parallel, + force_context_flush](ShaderContext::Context*) mutable { for (size_t stage = 0; stage < 5; ++stage) { switch (backend) { case Settings::ShaderBackend::GLSL: @@ -251,7 +253,7 @@ GraphicsPipeline::GraphicsPipeline(const Device& device, TextureCache& texture_c break; } } - if (in_parallel) { + if (force_context_flush || in_parallel) { std::scoped_lock lock{built_mutex}; built_fence.Create(); // Flush this context to ensure compilation commands and fence are in the GPU pipe. @@ -274,9 +276,9 @@ GraphicsPipeline::GraphicsPipeline(const Device& device, TextureCache& texture_c template <typename Spec> void GraphicsPipeline::ConfigureImpl(bool is_indexed) { std::array<VideoCommon::ImageViewInOut, MAX_TEXTURES + MAX_IMAGES> views; - std::array<GLuint, MAX_TEXTURES> samplers; + std::array<VideoCommon::SamplerId, MAX_TEXTURES> samplers; size_t views_index{}; - GLsizei sampler_binding{}; + size_t samplers_index{}; texture_cache.SynchronizeGraphicsDescriptors(); @@ -336,7 +338,6 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { for (u32 index = 0; index < desc.count; ++index) { const auto handle{read_handle(desc, index)}; views[views_index++] = {handle.first}; - samplers[sampler_binding++] = 0; } } } @@ -350,8 +351,8 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { const auto handle{read_handle(desc, index)}; views[views_index++] = {handle.first}; - Sampler* const sampler{texture_cache.GetGraphicsSampler(handle.second)}; - samplers[sampler_binding++] = sampler->Handle(); + VideoCommon::SamplerId sampler{texture_cache.GetGraphicsSamplerId(handle.second)}; + samplers[samplers_index++] = sampler; } } if constexpr (Spec::has_images) { @@ -444,10 +445,13 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { program_manager.BindSourcePrograms(source_programs); } const VideoCommon::ImageViewInOut* views_it{views.data()}; + const VideoCommon::SamplerId* samplers_it{samplers.data()}; GLsizei texture_binding = 0; GLsizei image_binding = 0; + GLsizei sampler_binding{}; std::array<GLuint, MAX_TEXTURES> textures; std::array<GLuint, MAX_IMAGES> images; + std::array<GLuint, MAX_TEXTURES> gl_samplers; const auto prepare_stage{[&](size_t stage) { buffer_cache.runtime.SetImagePointers(&textures[texture_binding], &images[image_binding]); buffer_cache.BindHostStageBuffers(stage); @@ -464,6 +468,13 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { u32 stage_image_binding{}; const auto& info{stage_infos[stage]}; + if constexpr (Spec::has_texture_buffers) { + for (const auto& desc : info.texture_buffer_descriptors) { + for (u32 index = 0; index < desc.count; ++index) { + gl_samplers[sampler_binding++] = 0; + } + } + } for (const auto& desc : info.texture_descriptors) { for (u32 index = 0; index < desc.count; ++index) { ImageView& image_view{texture_cache.GetImageView((views_it++)->id)}; @@ -473,6 +484,12 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { } ++texture_binding; ++stage_texture_binding; + + const Sampler& sampler{texture_cache.GetSampler(*(samplers_it++))}; + const bool use_fallback_sampler{sampler.HasAddedAnisotropy() && + !image_view.SupportsAnisotropy()}; + gl_samplers[sampler_binding++] = + use_fallback_sampler ? sampler.HandleWithDefaultAnisotropy() : sampler.Handle(); } } for (const auto& desc : info.image_descriptors) { @@ -533,7 +550,7 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) { if (texture_binding != 0) { ASSERT(texture_binding == sampler_binding); glBindTextures(0, texture_binding, textures.data()); - glBindSamplers(0, sampler_binding, samplers.data()); + glBindSamplers(0, sampler_binding, gl_samplers.data()); } if (image_binding != 0) { glBindImageTextures(0, image_binding, images.data()); @@ -620,10 +637,7 @@ bool GraphicsPipeline::IsBuilt() noexcept { if (built_fence.handle == 0) { return false; } - // Timeout of zero means this is non-blocking - const auto sync_status = glClientWaitSync(built_fence.handle, 0, 0); - ASSERT(sync_status != GL_WAIT_FAILED); - is_built = sync_status != GL_TIMEOUT_EXPIRED; + is_built = built_fence.IsSignaled(); return is_built; } diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.h b/src/video_core/renderer_opengl/gl_graphics_pipeline.h index 1c06b3655..7b3d7eae8 100644 --- a/src/video_core/renderer_opengl/gl_graphics_pipeline.h +++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.h @@ -78,7 +78,7 @@ public: std::array<std::string, 5> sources, std::array<std::vector<u32>, 5> sources_spirv, const std::array<const Shader::Info*, 5>& infos, - const GraphicsPipelineKey& key_); + const GraphicsPipelineKey& key_, bool force_context_flush = false); void Configure(bool is_indexed) { configure_func(this, is_indexed); @@ -98,6 +98,10 @@ public: return writes_global_memory; } + [[nodiscard]] bool UsesLocalMemory() const noexcept { + return uses_local_memory; + } + [[nodiscard]] bool IsBuilt() noexcept; template <typename Spec> @@ -146,6 +150,7 @@ private: bool use_storage_buffers{}; bool writes_global_memory{}; + bool uses_local_memory{}; static constexpr std::size_t XFB_ENTRY_STRIDE = 3; GLsizei num_xfb_attribs{}; diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp index 5070db441..99d7347f5 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.cpp +++ b/src/video_core/renderer_opengl/gl_query_cache.cpp @@ -26,8 +26,8 @@ constexpr GLenum GetTarget(VideoCore::QueryType type) { } // Anonymous namespace -QueryCache::QueryCache(RasterizerOpenGL& rasterizer_) - : QueryCacheBase(rasterizer_), gl_rasterizer{rasterizer_} {} +QueryCache::QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_) + : QueryCacheBase(rasterizer_, cpu_memory_), gl_rasterizer{rasterizer_} {} QueryCache::~QueryCache() = default; @@ -74,7 +74,7 @@ void HostCounter::EndQuery() { glEndQuery(GetTarget(type)); } -u64 HostCounter::BlockingQuery() const { +u64 HostCounter::BlockingQuery([[maybe_unused]] bool async) const { GLint64 value; glGetQueryObjecti64v(query.handle, GL_QUERY_RESULT, &value); return static_cast<u64>(value); @@ -96,7 +96,7 @@ CachedQuery& CachedQuery::operator=(CachedQuery&& rhs) noexcept { return *this; } -void CachedQuery::Flush() { +u64 CachedQuery::Flush([[maybe_unused]] bool async) { // Waiting for a query while another query of the same target is enabled locks Nvidia's driver. // To avoid this disable and re-enable keeping the dependency stream. // But we only have to do this if we have pending waits to be done. @@ -106,11 +106,13 @@ void CachedQuery::Flush() { stream.Update(false); } - VideoCommon::CachedQueryBase<HostCounter>::Flush(); + auto result = VideoCommon::CachedQueryBase<HostCounter>::Flush(); if (slice_counter) { stream.Update(true); } + + return result; } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h index 14ce59990..872513f22 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.h +++ b/src/video_core/renderer_opengl/gl_query_cache.h @@ -28,7 +28,7 @@ using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; class QueryCache final : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { public: - explicit QueryCache(RasterizerOpenGL& rasterizer_); + explicit QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_); ~QueryCache(); OGLQuery AllocateQuery(VideoCore::QueryType type); @@ -51,7 +51,7 @@ public: void EndQuery(); private: - u64 BlockingQuery() const override; + u64 BlockingQuery(bool async = false) const override; QueryCache& cache; const VideoCore::QueryType type; @@ -70,7 +70,7 @@ public: CachedQuery(const CachedQuery&) = delete; CachedQuery& operator=(const CachedQuery&) = delete; - void Flush() override; + u64 Flush(bool async = false) override; private: QueryCache* cache; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 7bced675c..edf527f2d 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -24,6 +24,7 @@ #include "video_core/renderer_opengl/gl_query_cache.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_cache.h" +#include "video_core/renderer_opengl/gl_staging_buffer_pool.h" #include "video_core/renderer_opengl/gl_texture_cache.h" #include "video_core/renderer_opengl/maxwell_to_gl.h" #include "video_core/renderer_opengl/renderer_opengl.h" @@ -58,12 +59,13 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra StateTracker& state_tracker_) : RasterizerAccelerated(cpu_memory_), gpu(gpu_), device(device_), screen_info(screen_info_), program_manager(program_manager_), state_tracker(state_tracker_), - texture_cache_runtime(device, program_manager, state_tracker), - texture_cache(texture_cache_runtime, *this), buffer_cache_runtime(device), + texture_cache_runtime(device, program_manager, state_tracker, staging_buffer_pool), + texture_cache(texture_cache_runtime, *this), + buffer_cache_runtime(device, staging_buffer_pool), buffer_cache(*this, cpu_memory_, buffer_cache_runtime), shader_cache(*this, emu_window_, device, texture_cache, buffer_cache, program_manager, state_tracker, gpu.ShaderNotify()), - query_cache(*this), accelerate_dma(buffer_cache), + query_cache(*this, cpu_memory_), accelerate_dma(buffer_cache, texture_cache), fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache), blit_image(program_manager_) {} @@ -220,6 +222,9 @@ void RasterizerOpenGL::PrepareDraw(bool is_indexed, Func&& draw_func) { gpu.TickWork(); std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; + if (pipeline->UsesLocalMemory()) { + program_manager.LocalMemoryWarmup(); + } pipeline->SetEngine(maxwell3d, gpu_memory); pipeline->Configure(is_indexed); @@ -357,6 +362,7 @@ void RasterizerOpenGL::DrawTexture() { .y = static_cast<s32>(draw_texture_state.src_y1)}}; blit_image.BlitColor(texture_cache.GetFramebuffer()->Handle(), texture.DefaultHandle(), sampler->Handle(), dst_region, src_region, texture.size); + state_tracker.InvalidateState(); } ++num_queued_commands; @@ -368,6 +374,9 @@ void RasterizerOpenGL::DispatchCompute() { if (!pipeline) { return; } + if (pipeline->UsesLocalMemory()) { + program_manager.LocalMemoryWarmup(); + } pipeline->SetEngine(kepler_compute, gpu_memory); pipeline->Configure(); const auto& qmd{kepler_compute->launch_description}; @@ -432,6 +441,29 @@ bool RasterizerOpenGL::MustFlushRegion(VAddr addr, u64 size, VideoCommon::CacheT return false; } +VideoCore::RasterizerDownloadArea RasterizerOpenGL::GetFlushArea(VAddr addr, u64 size) { + { + std::scoped_lock lock{texture_cache.mutex}; + auto area = texture_cache.GetFlushArea(addr, size); + if (area) { + return *area; + } + } + { + std::scoped_lock lock{buffer_cache.mutex}; + auto area = buffer_cache.GetFlushArea(addr, size); + if (area) { + return *area; + } + } + VideoCore::RasterizerDownloadArea new_area{ + .start_address = Common::AlignDown(addr, Core::Memory::YUZU_PAGESIZE), + .end_address = Common::AlignUp(addr + size, Core::Memory::YUZU_PAGESIZE), + .preemtive = true, + }; + return new_area; +} + void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType which) { MICROPROFILE_SCOPE(OpenGL_CacheManagement); if (addr == 0 || size == 0) { @@ -576,7 +608,7 @@ bool RasterizerOpenGL::AccelerateConditionalRendering() { // Reimplement Host conditional rendering. return false; } - // Medium / Low Hack: stub any checks on queries writen into the buffer cache. + // Medium / Low Hack: stub any checks on queries written into the buffer cache. const GPUVAddr condition_address{maxwell3d->regs.render_enable.Address()}; Maxwell::ReportSemaphore::Compare cmp; if (gpu_memory->IsMemoryDirty(condition_address, sizeof(cmp), @@ -1262,7 +1294,8 @@ void RasterizerOpenGL::ReleaseChannel(s32 channel_id) { query_cache.EraseChannel(channel_id); } -AccelerateDMA::AccelerateDMA(BufferCache& buffer_cache_) : buffer_cache{buffer_cache_} {} +AccelerateDMA::AccelerateDMA(BufferCache& buffer_cache_, TextureCache& texture_cache_) + : buffer_cache{buffer_cache_}, texture_cache{texture_cache_} {} bool AccelerateDMA::BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) { std::scoped_lock lock{buffer_cache.mutex}; @@ -1274,4 +1307,44 @@ bool AccelerateDMA::BufferClear(GPUVAddr src_address, u64 amount, u32 value) { return buffer_cache.DMAClear(src_address, amount, value); } +template <bool IS_IMAGE_UPLOAD> +bool AccelerateDMA::DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, + const Tegra::DMA::BufferOperand& buffer_operand, + const Tegra::DMA::ImageOperand& image_operand) { + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; + const auto image_id = texture_cache.DmaImageId(image_operand, IS_IMAGE_UPLOAD); + if (image_id == VideoCommon::NULL_IMAGE_ID) { + return false; + } + const u32 buffer_size = static_cast<u32>(buffer_operand.pitch * buffer_operand.height); + static constexpr auto sync_info = VideoCommon::ObtainBufferSynchronize::FullSynchronize; + const auto post_op = VideoCommon::ObtainBufferOperation::DoNothing; + const auto [buffer, offset] = + buffer_cache.ObtainBuffer(buffer_operand.address, buffer_size, sync_info, post_op); + + const auto [image, copy] = texture_cache.DmaBufferImageCopy( + copy_info, buffer_operand, image_operand, image_id, IS_IMAGE_UPLOAD); + const std::span copy_span{©, 1}; + + if constexpr (IS_IMAGE_UPLOAD) { + image->UploadMemory(buffer->Handle(), offset, copy_span); + } else { + texture_cache.DownloadImageIntoBuffer(image, buffer->Handle(), offset, copy_span, + buffer_operand.address, buffer_size); + } + return true; +} + +bool AccelerateDMA::ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info, + const Tegra::DMA::ImageOperand& image_operand, + const Tegra::DMA::BufferOperand& buffer_operand) { + return DmaBufferImageCopy<false>(copy_info, buffer_operand, image_operand); +} + +bool AccelerateDMA::BufferToImage(const Tegra::DMA::ImageCopy& copy_info, + const Tegra::DMA::BufferOperand& buffer_operand, + const Tegra::DMA::ImageOperand& image_operand) { + return DmaBufferImageCopy<true>(copy_info, buffer_operand, image_operand); +} + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 0c45832ae..a73ad15c1 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -50,14 +50,26 @@ static_assert(sizeof(BindlessSSBO) * CHAR_BIT == 128); class AccelerateDMA : public Tegra::Engines::AccelerateDMAInterface { public: - explicit AccelerateDMA(BufferCache& buffer_cache); + explicit AccelerateDMA(BufferCache& buffer_cache, TextureCache& texture_cache); bool BufferCopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) override; bool BufferClear(GPUVAddr src_address, u64 amount, u32 value) override; + bool ImageToBuffer(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::ImageOperand& src, + const Tegra::DMA::BufferOperand& dst) override; + + bool BufferToImage(const Tegra::DMA::ImageCopy& copy_info, const Tegra::DMA::BufferOperand& src, + const Tegra::DMA::ImageOperand& dst) override; + private: + template <bool IS_IMAGE_UPLOAD> + bool DmaBufferImageCopy(const Tegra::DMA::ImageCopy& copy_info, + const Tegra::DMA::BufferOperand& src, + const Tegra::DMA::ImageOperand& dst); + BufferCache& buffer_cache; + TextureCache& texture_cache; }; class RasterizerOpenGL : public VideoCore::RasterizerAccelerated, @@ -83,6 +95,7 @@ public: VideoCommon::CacheType which = VideoCommon::CacheType::All) override; bool MustFlushRegion(VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; + VideoCore::RasterizerDownloadArea GetFlushArea(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; void OnCPUWrite(VAddr addr, u64 size) override; @@ -150,7 +163,7 @@ private: /// Syncs the cull mode to match the guest state void SyncCullMode(); - /// Syncs the primitve restart to match the guest state + /// Syncs the primitive restart to match the guest state void SyncPrimitiveRestart(); /// Syncs the depth test state to match the guest state @@ -217,6 +230,7 @@ private: ProgramManager& program_manager; StateTracker& state_tracker; + StagingBufferPool staging_buffer_pool; TextureCacheRuntime texture_cache_runtime; TextureCache texture_cache; BufferCacheRuntime buffer_cache_runtime; @@ -234,7 +248,7 @@ private: std::array<GLuint, MAX_TEXTURES> texture_handles{}; std::array<GLuint, MAX_IMAGES> image_handles{}; - /// Number of commands queued to the OpenGL driver. Resetted on flush. + /// Number of commands queued to the OpenGL driver. Reset on flush. size_t num_queued_commands = 0; bool has_written_global_memory = false; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index 3a664fdec..eae8fd110 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -3,6 +3,7 @@ #include <string_view> #include <glad/glad.h> +#include "common/assert.h" #include "common/microprofile.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_util.h" @@ -158,6 +159,15 @@ void OGLSync::Release() { handle = 0; } +bool OGLSync::IsSignaled() const noexcept { + // At least on Nvidia, glClientWaitSync with a timeout of 0 + // is faster than glGetSynciv of GL_SYNC_STATUS. + // Timeout of 0 means this check is non-blocking. + const auto sync_status = glClientWaitSync(handle, 0, 0); + ASSERT(sync_status != GL_WAIT_FAILED); + return sync_status != GL_TIMEOUT_EXPIRED; +} + void OGLFramebuffer::Create() { if (handle != 0) return; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index bc05ba4bd..77362acd2 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -263,6 +263,9 @@ public: /// Deletes the internal OpenGL resource void Release(); + /// Checks if the sync has been signaled + bool IsSignaled() const noexcept; + GLsync handle = 0; }; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 7dd854e0f..0329ed820 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -85,7 +85,9 @@ Shader::RuntimeInfo MakeRuntimeInfo(const GraphicsPipelineKey& key, case Shader::Stage::VertexB: case Shader::Stage::Geometry: if (!use_assembly_shaders && key.xfb_enabled != 0) { - info.xfb_varyings = VideoCommon::MakeTransformFeedbackVaryings(key.xfb_state); + auto [varyings, count] = VideoCommon::MakeTransformFeedbackVaryings(key.xfb_state); + info.xfb_varyings = varyings; + info.xfb_count = count; } break; case Shader::Stage::TessellationEval: @@ -218,6 +220,7 @@ ShaderCache::ShaderCache(RasterizerOpenGL& rasterizer_, Core::Frontend::EmuWindo .lower_left_origin_mode = true, .need_declared_frag_colors = true, .need_fastmath_off = device.NeedsFastmathOff(), + .need_gather_subpixel_offset = device.IsAmd() || device.IsIntel(), .has_broken_spirv_clamp = true, .has_broken_unsigned_image_offsets = true, @@ -231,13 +234,14 @@ ShaderCache::ShaderCache(RasterizerOpenGL& rasterizer_, Core::Frontend::EmuWindo .gl_max_compute_smem_size = device.GetMaxComputeSharedMemorySize(), }, host_info{ + .support_float64 = true, .support_float16 = false, .support_int64 = device.HasShaderInt64(), .needs_demote_reorder = device.IsAmd(), .support_snorm_render_buffer = false, .support_viewport_index_layer = device.HasVertexViewportLayer(), - .min_ssbo_alignment = static_cast<u32>(device.GetShaderStorageBufferAlignment()), .support_geometry_shader_passthrough = device.HasGeometryShaderPassthrough(), + .support_conditional_barrier = device.SupportsConditionalBarriers(), } { if (use_asynchronous_shaders) { workers = CreateWorkers(); @@ -286,7 +290,7 @@ void ShaderCache::LoadDiskResources(u64 title_id, std::stop_token stop_loading, file.read(reinterpret_cast<char*>(&key), sizeof(key)); queue_work([this, key, env = std::move(env), &state, &callback](Context* ctx) mutable { ctx->pools.ReleaseContents(); - auto pipeline{CreateComputePipeline(ctx->pools, key, env)}; + auto pipeline{CreateComputePipeline(ctx->pools, key, env, true)}; std::scoped_lock lock{state.mutex}; if (pipeline) { compute_cache.emplace(key, std::move(pipeline)); @@ -307,7 +311,7 @@ void ShaderCache::LoadDiskResources(u64 title_id, std::stop_token stop_loading, env_ptrs.push_back(&env); } ctx->pools.ReleaseContents(); - auto pipeline{CreateGraphicsPipeline(ctx->pools, key, MakeSpan(env_ptrs), false)}; + auto pipeline{CreateGraphicsPipeline(ctx->pools, key, MakeSpan(env_ptrs), false, true)}; std::scoped_lock lock{state.mutex}; if (pipeline) { graphics_cache.emplace(key, std::move(pipeline)); @@ -439,7 +443,8 @@ std::unique_ptr<GraphicsPipeline> ShaderCache::CreateGraphicsPipeline() { std::unique_ptr<GraphicsPipeline> ShaderCache::CreateGraphicsPipeline( ShaderContext::ShaderPools& pools, const GraphicsPipelineKey& key, - std::span<Shader::Environment* const> envs, bool build_in_parallel) try { + std::span<Shader::Environment* const> envs, bool use_shader_workers, + bool force_context_flush) try { LOG_INFO(Render_OpenGL, "0x{:016x}", key.Hash()); size_t env_index{}; u32 total_storage_buffers{}; @@ -531,10 +536,10 @@ std::unique_ptr<GraphicsPipeline> ShaderCache::CreateGraphicsPipeline( } previous_program = &program; } - auto* const thread_worker{build_in_parallel ? workers.get() : nullptr}; + auto* const thread_worker{use_shader_workers ? workers.get() : nullptr}; return std::make_unique<GraphicsPipeline>(device, texture_cache, buffer_cache, program_manager, state_tracker, thread_worker, &shader_notify, sources, - sources_spirv, infos, key); + sources_spirv, infos, key, force_context_flush); } catch (Shader::Exception& exception) { LOG_ERROR(Render_OpenGL, "{}", exception.what()); @@ -559,8 +564,8 @@ std::unique_ptr<ComputePipeline> ShaderCache::CreateComputePipeline( } std::unique_ptr<ComputePipeline> ShaderCache::CreateComputePipeline( - ShaderContext::ShaderPools& pools, const ComputePipelineKey& key, - Shader::Environment& env) try { + ShaderContext::ShaderPools& pools, const ComputePipelineKey& key, Shader::Environment& env, + bool force_context_flush) try { LOG_INFO(Render_OpenGL, "0x{:016x}", key.Hash()); Shader::Maxwell::Flow::CFG cfg{env, pools.flow_block, env.StartAddress()}; @@ -589,7 +594,7 @@ std::unique_ptr<ComputePipeline> ShaderCache::CreateComputePipeline( } return std::make_unique<ComputePipeline>(device, texture_cache, buffer_cache, program_manager, - program.info, code, code_spirv); + program.info, code, code_spirv, force_context_flush); } catch (Shader::Exception& exception) { LOG_ERROR(Render_OpenGL, "{}", exception.what()); return nullptr; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index f82420592..6b9732fca 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -50,14 +50,16 @@ private: std::unique_ptr<GraphicsPipeline> CreateGraphicsPipeline( ShaderContext::ShaderPools& pools, const GraphicsPipelineKey& key, - std::span<Shader::Environment* const> envs, bool build_in_parallel); + std::span<Shader::Environment* const> envs, bool use_shader_workers, + bool force_context_flush = false); std::unique_ptr<ComputePipeline> CreateComputePipeline(const ComputePipelineKey& key, const VideoCommon::ShaderInfo* shader); std::unique_ptr<ComputePipeline> CreateComputePipeline(ShaderContext::ShaderPools& pools, const ComputePipelineKey& key, - Shader::Environment& env); + Shader::Environment& env, + bool force_context_flush = false); std::unique_ptr<ShaderWorker> CreateWorkers() const; diff --git a/src/video_core/renderer_opengl/gl_shader_context.h b/src/video_core/renderer_opengl/gl_shader_context.h index ca2bd8e8e..d12cd06fa 100644 --- a/src/video_core/renderer_opengl/gl_shader_context.h +++ b/src/video_core/renderer_opengl/gl_shader_context.h @@ -4,6 +4,7 @@ #pragma once #include "core/frontend/emu_window.h" +#include "core/frontend/graphics_context.h" #include "shader_recompiler/frontend/ir/basic_block.h" #include "shader_recompiler/frontend/maxwell/control_flow.h" @@ -15,9 +16,9 @@ struct ShaderPools { inst.ReleaseContents(); } - Shader::ObjectPool<Shader::IR::Inst> inst; - Shader::ObjectPool<Shader::IR::Block> block; - Shader::ObjectPool<Shader::Maxwell::Flow::Block> flow_block; + Shader::ObjectPool<Shader::IR::Inst> inst{8192}; + Shader::ObjectPool<Shader::IR::Block> block{32}; + Shader::ObjectPool<Shader::Maxwell::Flow::Block> flow_block{32}; }; struct Context { diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp index 98841ae65..03d4b9d06 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.cpp +++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp @@ -3,7 +3,9 @@ #include <glad/glad.h> +#include "video_core/host_shaders/opengl_lmem_warmup_comp.h" #include "video_core/renderer_opengl/gl_shader_manager.h" +#include "video_core/renderer_opengl/gl_shader_util.h" namespace OpenGL { @@ -17,6 +19,10 @@ ProgramManager::ProgramManager(const Device& device) { if (device.UseAssemblyShaders()) { glEnable(GL_COMPUTE_PROGRAM_NV); } + if (device.HasLmemPerfBug()) { + lmem_warmup_program = + CreateProgram(HostShaders::OPENGL_LMEM_WARMUP_COMP, GL_COMPUTE_SHADER); + } } void ProgramManager::BindComputeProgram(GLuint program) { @@ -98,6 +104,13 @@ void ProgramManager::BindAssemblyPrograms(std::span<const OGLAssemblyProgram, NU void ProgramManager::RestoreGuestCompute() {} +void ProgramManager::LocalMemoryWarmup() { + if (lmem_warmup_program.handle != 0) { + BindComputeProgram(lmem_warmup_program.handle); + glDispatchCompute(1, 1, 1); + } +} + void ProgramManager::BindPipeline() { if (!is_pipeline_bound) { is_pipeline_bound = true; diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h index 07ffab77f..852d8c88e 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.h +++ b/src/video_core/renderer_opengl/gl_shader_manager.h @@ -30,6 +30,8 @@ public: void RestoreGuestCompute(); + void LocalMemoryWarmup(); + private: void BindPipeline(); @@ -44,6 +46,7 @@ private: u32 current_stage_mask = 0; std::array<GLuint, NUM_STAGES> current_programs{}; GLuint current_assembly_compute_program = 0; + OGLProgram lmem_warmup_program; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_staging_buffer_pool.cpp b/src/video_core/renderer_opengl/gl_staging_buffer_pool.cpp new file mode 100644 index 000000000..bbb06e51f --- /dev/null +++ b/src/video_core/renderer_opengl/gl_staging_buffer_pool.cpp @@ -0,0 +1,150 @@ +// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <array> +#include <memory> +#include <span> + +#include <glad/glad.h> + +#include "common/alignment.h" +#include "common/assert.h" +#include "common/bit_util.h" +#include "common/microprofile.h" +#include "video_core/renderer_opengl/gl_staging_buffer_pool.h" + +MICROPROFILE_DEFINE(OpenGL_BufferRequest, "OpenGL", "BufferRequest", MP_RGB(128, 128, 192)); + +namespace OpenGL { + +StagingBufferMap::~StagingBufferMap() { + if (sync) { + sync->Create(); + } +} + +StagingBuffers::StagingBuffers(GLenum storage_flags_, GLenum map_flags_) + : storage_flags{storage_flags_}, map_flags{map_flags_} {} + +StagingBuffers::~StagingBuffers() = default; + +StagingBufferMap StagingBuffers::RequestMap(size_t requested_size, bool insert_fence) { + MICROPROFILE_SCOPE(OpenGL_BufferRequest); + + const size_t index = RequestBuffer(requested_size); + OGLSync* const sync = insert_fence ? &syncs[index] : nullptr; + sync_indices[index] = insert_fence ? ++current_sync_index : 0; + return StagingBufferMap{ + .mapped_span = std::span(maps[index], requested_size), + .sync = sync, + .buffer = buffers[index].handle, + }; +} + +size_t StagingBuffers::RequestBuffer(size_t requested_size) { + if (const std::optional<size_t> index = FindBuffer(requested_size); index) { + return *index; + } + + OGLBuffer& buffer = buffers.emplace_back(); + buffer.Create(); + const auto next_pow2_size = Common::NextPow2(requested_size); + glNamedBufferStorage(buffer.handle, next_pow2_size, nullptr, + storage_flags | GL_MAP_PERSISTENT_BIT); + maps.push_back(static_cast<u8*>(glMapNamedBufferRange(buffer.handle, 0, next_pow2_size, + map_flags | GL_MAP_PERSISTENT_BIT))); + syncs.emplace_back(); + sync_indices.emplace_back(); + sizes.push_back(next_pow2_size); + + ASSERT(syncs.size() == buffers.size() && buffers.size() == maps.size() && + maps.size() == sizes.size()); + + return buffers.size() - 1; +} + +std::optional<size_t> StagingBuffers::FindBuffer(size_t requested_size) { + size_t known_unsignaled_index = current_sync_index + 1; + size_t smallest_buffer = std::numeric_limits<size_t>::max(); + std::optional<size_t> found; + const size_t num_buffers = sizes.size(); + for (size_t index = 0; index < num_buffers; ++index) { + const size_t buffer_size = sizes[index]; + if (buffer_size < requested_size || buffer_size >= smallest_buffer) { + continue; + } + if (syncs[index].handle != 0) { + if (sync_indices[index] >= known_unsignaled_index) { + // This fence is later than a fence that is known to not be signaled + continue; + } + if (!syncs[index].IsSignaled()) { + // Since this fence hasn't been signaled, it's safe to assume all later + // fences haven't been signaled either + known_unsignaled_index = std::min(known_unsignaled_index, sync_indices[index]); + continue; + } + syncs[index].Release(); + } + smallest_buffer = buffer_size; + found = index; + } + return found; +} + +StreamBuffer::StreamBuffer() { + static constexpr GLenum flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT; + buffer.Create(); + glObjectLabel(GL_BUFFER, buffer.handle, -1, "Stream Buffer"); + glNamedBufferStorage(buffer.handle, STREAM_BUFFER_SIZE, nullptr, flags); + mapped_pointer = + static_cast<u8*>(glMapNamedBufferRange(buffer.handle, 0, STREAM_BUFFER_SIZE, flags)); + for (OGLSync& sync : fences) { + sync.Create(); + } +} + +std::pair<std::span<u8>, size_t> StreamBuffer::Request(size_t size) noexcept { + ASSERT(size < REGION_SIZE); + for (size_t region = Region(used_iterator), region_end = Region(iterator); region < region_end; + ++region) { + fences[region].Create(); + } + used_iterator = iterator; + + for (size_t region = Region(free_iterator) + 1, + region_end = std::min(Region(iterator + size) + 1, NUM_SYNCS); + region < region_end; ++region) { + glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED); + fences[region].Release(); + } + if (iterator + size >= free_iterator) { + free_iterator = iterator + size; + } + if (iterator + size > STREAM_BUFFER_SIZE) { + for (size_t region = Region(used_iterator); region < NUM_SYNCS; ++region) { + fences[region].Create(); + } + used_iterator = 0; + iterator = 0; + free_iterator = size; + + for (size_t region = 0, region_end = Region(size); region <= region_end; ++region) { + glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED); + fences[region].Release(); + } + } + const size_t offset = iterator; + iterator = Common::AlignUp(iterator + size, MAX_ALIGNMENT); + return {std::span(mapped_pointer + offset, size), offset}; +} + +StagingBufferMap StagingBufferPool::RequestUploadBuffer(size_t size) { + return upload_buffers.RequestMap(size, true); +} + +StagingBufferMap StagingBufferPool::RequestDownloadBuffer(size_t size) { + return download_buffers.RequestMap(size, false); +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_staging_buffer_pool.h b/src/video_core/renderer_opengl/gl_staging_buffer_pool.h new file mode 100644 index 000000000..60f72d3a0 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_staging_buffer_pool.h @@ -0,0 +1,95 @@ +// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include <array> +#include <optional> +#include <span> +#include <utility> +#include <vector> + +#include <glad/glad.h> + +#include "common/common_types.h" +#include "common/literals.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" + +namespace OpenGL { + +using namespace Common::Literals; + +struct StagingBufferMap { + ~StagingBufferMap(); + + std::span<u8> mapped_span; + size_t offset = 0; + OGLSync* sync; + GLuint buffer; +}; + +struct StagingBuffers { + explicit StagingBuffers(GLenum storage_flags_, GLenum map_flags_); + ~StagingBuffers(); + + StagingBufferMap RequestMap(size_t requested_size, bool insert_fence); + + size_t RequestBuffer(size_t requested_size); + + std::optional<size_t> FindBuffer(size_t requested_size); + + std::vector<OGLSync> syncs; + std::vector<OGLBuffer> buffers; + std::vector<u8*> maps; + std::vector<size_t> sizes; + std::vector<size_t> sync_indices; + GLenum storage_flags; + GLenum map_flags; + size_t current_sync_index = 0; +}; + +class StreamBuffer { + static constexpr size_t STREAM_BUFFER_SIZE = 64_MiB; + static constexpr size_t NUM_SYNCS = 16; + static constexpr size_t REGION_SIZE = STREAM_BUFFER_SIZE / NUM_SYNCS; + static constexpr size_t MAX_ALIGNMENT = 256; + static_assert(STREAM_BUFFER_SIZE % MAX_ALIGNMENT == 0); + static_assert(STREAM_BUFFER_SIZE % NUM_SYNCS == 0); + static_assert(REGION_SIZE % MAX_ALIGNMENT == 0); + +public: + explicit StreamBuffer(); + + [[nodiscard]] std::pair<std::span<u8>, size_t> Request(size_t size) noexcept; + + [[nodiscard]] GLuint Handle() const noexcept { + return buffer.handle; + } + +private: + [[nodiscard]] static size_t Region(size_t offset) noexcept { + return offset / REGION_SIZE; + } + + size_t iterator = 0; + size_t used_iterator = 0; + size_t free_iterator = 0; + u8* mapped_pointer = nullptr; + OGLBuffer buffer; + std::array<OGLSync, NUM_SYNCS> fences; +}; + +class StagingBufferPool { +public: + StagingBufferPool() = default; + ~StagingBufferPool() = default; + + StagingBufferMap RequestUploadBuffer(size_t size); + StagingBufferMap RequestDownloadBuffer(size_t size); + +private: + StagingBuffers upload_buffers{GL_MAP_WRITE_BIT, GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT}; + StagingBuffers download_buffers{GL_MAP_READ_BIT | GL_CLIENT_STORAGE_BIT, GL_MAP_READ_BIT}; +}; + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp deleted file mode 100644 index 2005c8993..000000000 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ /dev/null @@ -1,63 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include <array> -#include <memory> -#include <span> - -#include <glad/glad.h> - -#include "common/alignment.h" -#include "common/assert.h" -#include "video_core/renderer_opengl/gl_stream_buffer.h" - -namespace OpenGL { - -StreamBuffer::StreamBuffer() { - static constexpr GLenum flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT; - buffer.Create(); - glObjectLabel(GL_BUFFER, buffer.handle, -1, "Stream Buffer"); - glNamedBufferStorage(buffer.handle, STREAM_BUFFER_SIZE, nullptr, flags); - mapped_pointer = - static_cast<u8*>(glMapNamedBufferRange(buffer.handle, 0, STREAM_BUFFER_SIZE, flags)); - for (OGLSync& sync : fences) { - sync.Create(); - } -} - -std::pair<std::span<u8>, size_t> StreamBuffer::Request(size_t size) noexcept { - ASSERT(size < REGION_SIZE); - for (size_t region = Region(used_iterator), region_end = Region(iterator); region < region_end; - ++region) { - fences[region].Create(); - } - used_iterator = iterator; - - for (size_t region = Region(free_iterator) + 1, - region_end = std::min(Region(iterator + size) + 1, NUM_SYNCS); - region < region_end; ++region) { - glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED); - fences[region].Release(); - } - if (iterator + size >= free_iterator) { - free_iterator = iterator + size; - } - if (iterator + size > STREAM_BUFFER_SIZE) { - for (size_t region = Region(used_iterator); region < NUM_SYNCS; ++region) { - fences[region].Create(); - } - used_iterator = 0; - iterator = 0; - free_iterator = size; - - for (size_t region = 0, region_end = Region(size); region <= region_end; ++region) { - glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED); - fences[region].Release(); - } - } - const size_t offset = iterator; - iterator = Common::AlignUp(iterator + size, MAX_ALIGNMENT); - return {std::span(mapped_pointer + offset, size), offset}; -} - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h deleted file mode 100644 index 8fe927aaf..000000000 --- a/src/video_core/renderer_opengl/gl_stream_buffer.h +++ /dev/null @@ -1,51 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include <array> -#include <span> -#include <utility> - -#include <glad/glad.h> - -#include "common/common_types.h" -#include "common/literals.h" -#include "video_core/renderer_opengl/gl_resource_manager.h" - -namespace OpenGL { - -using namespace Common::Literals; - -class StreamBuffer { - static constexpr size_t STREAM_BUFFER_SIZE = 64_MiB; - static constexpr size_t NUM_SYNCS = 16; - static constexpr size_t REGION_SIZE = STREAM_BUFFER_SIZE / NUM_SYNCS; - static constexpr size_t MAX_ALIGNMENT = 256; - static_assert(STREAM_BUFFER_SIZE % MAX_ALIGNMENT == 0); - static_assert(STREAM_BUFFER_SIZE % NUM_SYNCS == 0); - static_assert(REGION_SIZE % MAX_ALIGNMENT == 0); - -public: - explicit StreamBuffer(); - - [[nodiscard]] std::pair<std::span<u8>, size_t> Request(size_t size) noexcept; - - [[nodiscard]] GLuint Handle() const noexcept { - return buffer.handle; - } - -private: - [[nodiscard]] static size_t Region(size_t offset) noexcept { - return offset / REGION_SIZE; - } - - size_t iterator = 0; - size_t used_iterator = 0; - size_t free_iterator = 0; - u8* mapped_pointer = nullptr; - OGLBuffer buffer; - std::array<OGLSync, NUM_SYNCS> fences; -}; - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 9f7ce7414..3b446be07 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -112,13 +112,17 @@ GLenum ImageTarget(Shader::TextureType type, int num_samples = 1) { return GL_NONE; } -GLenum TextureMode(PixelFormat format, bool is_first) { +GLenum TextureMode(PixelFormat format, std::array<SwizzleSource, 4> swizzle) { + bool any_r = + std::ranges::any_of(swizzle, [](SwizzleSource s) { return s == SwizzleSource::R; }); switch (format) { case PixelFormat::D24_UNORM_S8_UINT: case PixelFormat::D32_FLOAT_S8_UINT: - return is_first ? GL_DEPTH_COMPONENT : GL_STENCIL_INDEX; + // R = depth, G = stencil + return any_r ? GL_DEPTH_COMPONENT : GL_STENCIL_INDEX; case PixelFormat::S8_UINT_D24_UNORM: - return is_first ? GL_STENCIL_INDEX : GL_DEPTH_COMPONENT; + // R = stencil, G = depth + return any_r ? GL_STENCIL_INDEX : GL_DEPTH_COMPONENT; default: ASSERT(false); return GL_DEPTH_COMPONENT; @@ -208,8 +212,7 @@ void ApplySwizzle(GLuint handle, PixelFormat format, std::array<SwizzleSource, 4 case PixelFormat::D32_FLOAT_S8_UINT: case PixelFormat::S8_UINT_D24_UNORM: UNIMPLEMENTED_IF(swizzle[0] != SwizzleSource::R && swizzle[0] != SwizzleSource::G); - glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE, - TextureMode(format, swizzle[0] == SwizzleSource::R)); + glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE, TextureMode(format, swizzle)); std::ranges::transform(swizzle, swizzle.begin(), ConvertGreenRed); break; case PixelFormat::A5B5G5R1_UNORM: { @@ -228,8 +231,11 @@ void ApplySwizzle(GLuint handle, PixelFormat format, std::array<SwizzleSource, 4 [[nodiscard]] bool CanBeAccelerated(const TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info) { - if (IsPixelFormatASTC(info.format)) { - return !runtime.HasNativeASTC() && Settings::values.accelerate_astc.GetValue(); + if (IsPixelFormatASTC(info.format) && info.size.depth == 1 && !runtime.HasNativeASTC()) { + return Settings::values.accelerate_astc.GetValue() && + Settings::values.astc_recompression.GetValue() == + Settings::AstcRecompression::Uncompressed && + !Settings::values.async_astc.GetValue(); } // Disable other accelerated uploads for now as they don't implement swizzled uploads return false; @@ -258,6 +264,14 @@ void ApplySwizzle(GLuint handle, PixelFormat format, std::array<SwizzleSource, 4 return format_info.compatibility_class == store_class; } +[[nodiscard]] bool CanBeDecodedAsync(const TextureCacheRuntime& runtime, + const VideoCommon::ImageInfo& info) { + if (IsPixelFormatASTC(info.format) && !runtime.HasNativeASTC()) { + return Settings::values.async_astc.GetValue(); + } + return false; +} + [[nodiscard]] CopyOrigin MakeCopyOrigin(VideoCommon::Offset3D offset, VideoCommon::SubresourceLayers subresource, GLenum target) { switch (target) { @@ -425,18 +439,31 @@ OGLTexture MakeImage(const VideoCommon::ImageInfo& info, GLenum gl_internal_form return GL_R32UI; } -} // Anonymous namespace +[[nodiscard]] bool IsAstcRecompressionEnabled() { + return Settings::values.astc_recompression.GetValue() != + Settings::AstcRecompression::Uncompressed; +} -ImageBufferMap::~ImageBufferMap() { - if (sync) { - sync->Create(); +[[nodiscard]] GLenum SelectAstcFormat(PixelFormat format, bool is_srgb) { + switch (Settings::values.astc_recompression.GetValue()) { + case Settings::AstcRecompression::Bc1: + return is_srgb ? GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT : GL_COMPRESSED_RGBA_S3TC_DXT1_EXT; + break; + case Settings::AstcRecompression::Bc3: + return is_srgb ? GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT : GL_COMPRESSED_RGBA_S3TC_DXT5_EXT; + break; + default: + return is_srgb ? GL_SRGB8_ALPHA8 : GL_RGBA8; } } +} // Anonymous namespace TextureCacheRuntime::TextureCacheRuntime(const Device& device_, ProgramManager& program_manager, - StateTracker& state_tracker_) - : device{device_}, state_tracker{state_tracker_}, util_shaders(program_manager), - format_conversion_pass{util_shaders}, resolution{Settings::values.resolution_info} { + StateTracker& state_tracker_, + StagingBufferPool& staging_buffer_pool_) + : device{device_}, state_tracker{state_tracker_}, staging_buffer_pool{staging_buffer_pool_}, + util_shaders(program_manager), format_conversion_pass{util_shaders}, + resolution{Settings::values.resolution_info} { static constexpr std::array TARGETS{GL_TEXTURE_1D_ARRAY, GL_TEXTURE_2D_ARRAY, GL_TEXTURE_3D}; for (size_t i = 0; i < TARGETS.size(); ++i) { const GLenum target = TARGETS[i]; @@ -526,12 +553,12 @@ void TextureCacheRuntime::Finish() { glFinish(); } -ImageBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size) { - return upload_buffers.RequestMap(size, true); +StagingBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size) { + return staging_buffer_pool.RequestUploadBuffer(size); } -ImageBufferMap TextureCacheRuntime::DownloadStagingBuffer(size_t size) { - return download_buffers.RequestMap(size, false); +StagingBufferMap TextureCacheRuntime::DownloadStagingBuffer(size_t size) { + return staging_buffer_pool.RequestDownloadBuffer(size); } u64 TextureCacheRuntime::GetDeviceMemoryUsage() const { @@ -557,6 +584,14 @@ void TextureCacheRuntime::CopyImage(Image& dst_image, Image& src_image, } } +void TextureCacheRuntime::CopyImageMSAA(Image& dst_image, Image& src_image, + std::span<const VideoCommon::ImageCopy> copies) { + LOG_DEBUG(Render_OpenGL, "Copying from {} samples to {} samples", src_image.info.num_samples, + dst_image.info.num_samples); + // TODO: Leverage the format conversion pass if possible/accurate. + util_shaders.CopyMSAA(dst_image, src_image, copies); +} + void TextureCacheRuntime::ReinterpretImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies) { LOG_DEBUG(Render_OpenGL, "Converting {} to {}", src.info.format, dst.info.format); @@ -608,7 +643,7 @@ void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src, is_linear ? GL_LINEAR : GL_NEAREST); } -void TextureCacheRuntime::AccelerateImageUpload(Image& image, const ImageBufferMap& map, +void TextureCacheRuntime::AccelerateImageUpload(Image& image, const StagingBufferMap& map, std::span<const SwizzleParameters> swizzles) { switch (image.info.type) { case ImageType::e2D: @@ -650,78 +685,27 @@ bool TextureCacheRuntime::HasNativeASTC() const noexcept { return device.HasASTC(); } -TextureCacheRuntime::StagingBuffers::StagingBuffers(GLenum storage_flags_, GLenum map_flags_) - : storage_flags{storage_flags_}, map_flags{map_flags_} {} - -TextureCacheRuntime::StagingBuffers::~StagingBuffers() = default; - -ImageBufferMap TextureCacheRuntime::StagingBuffers::RequestMap(size_t requested_size, - bool insert_fence) { - const size_t index = RequestBuffer(requested_size); - OGLSync* const sync = insert_fence ? &syncs[index] : nullptr; - return ImageBufferMap{ - .mapped_span = std::span(maps[index], requested_size), - .sync = sync, - .buffer = buffers[index].handle, - }; -} - -size_t TextureCacheRuntime::StagingBuffers::RequestBuffer(size_t requested_size) { - if (const std::optional<size_t> index = FindBuffer(requested_size); index) { - return *index; - } - - OGLBuffer& buffer = buffers.emplace_back(); - buffer.Create(); - glNamedBufferStorage(buffer.handle, requested_size, nullptr, - storage_flags | GL_MAP_PERSISTENT_BIT); - maps.push_back(static_cast<u8*>(glMapNamedBufferRange(buffer.handle, 0, requested_size, - map_flags | GL_MAP_PERSISTENT_BIT))); - - syncs.emplace_back(); - sizes.push_back(requested_size); - - ASSERT(syncs.size() == buffers.size() && buffers.size() == maps.size() && - maps.size() == sizes.size()); - - return buffers.size() - 1; -} - -std::optional<size_t> TextureCacheRuntime::StagingBuffers::FindBuffer(size_t requested_size) { - size_t smallest_buffer = std::numeric_limits<size_t>::max(); - std::optional<size_t> found; - const size_t num_buffers = sizes.size(); - for (size_t index = 0; index < num_buffers; ++index) { - const size_t buffer_size = sizes[index]; - if (buffer_size < requested_size || buffer_size >= smallest_buffer) { - continue; - } - if (syncs[index].handle != 0) { - GLint status; - glGetSynciv(syncs[index].handle, GL_SYNC_STATUS, 1, nullptr, &status); - if (status != GL_SIGNALED) { - continue; - } - syncs[index].Release(); - } - smallest_buffer = buffer_size; - found = index; - } - return found; -} - Image::Image(TextureCacheRuntime& runtime_, const VideoCommon::ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_) : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), runtime{&runtime_} { - if (CanBeAccelerated(*runtime, info)) { + if (CanBeDecodedAsync(*runtime, info)) { + flags |= ImageFlagBits::AsynchronousDecode; + } else if (CanBeAccelerated(*runtime, info)) { flags |= ImageFlagBits::AcceleratedUpload; } if (IsConverted(runtime->device, info.format, info.type)) { flags |= ImageFlagBits::Converted; flags |= ImageFlagBits::CostlyLoad; - gl_internal_format = IsPixelFormatSRGB(info.format) ? GL_SRGB8_ALPHA8 : GL_RGBA8; + + const bool is_srgb = IsPixelFormatSRGB(info.format); + gl_internal_format = is_srgb ? GL_SRGB8_ALPHA8 : GL_RGBA8; gl_format = GL_RGBA; gl_type = GL_UNSIGNED_INT_8_8_8_8_REV; + + if (IsPixelFormatASTC(info.format) && IsAstcRecompressionEnabled()) { + gl_internal_format = SelectAstcFormat(info.format, is_srgb); + gl_format = GL_NONE; + } } else { const auto& tuple = MaxwellToGL::GetFormatTuple(info.format); gl_internal_format = tuple.internal_format; @@ -743,14 +727,14 @@ Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBas Image::~Image() = default; -void Image::UploadMemory(const ImageBufferMap& map, +void Image::UploadMemory(GLuint buffer_handle, size_t buffer_offset, std::span<const VideoCommon::BufferImageCopy> copies) { const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); if (is_rescaled) { ScaleDown(true); } - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer); - glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, map.offset, unswizzled_size_bytes); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, buffer_handle); + glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, buffer_offset, unswizzled_size_bytes); glPixelStorei(GL_UNPACK_ALIGNMENT, 1); @@ -769,45 +753,65 @@ void Image::UploadMemory(const ImageBufferMap& map, current_image_height = copy.buffer_image_height; glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, current_image_height); } - CopyBufferToImage(copy, map.offset); + CopyBufferToImage(copy, buffer_offset); } if (is_rescaled) { ScaleUp(); } } -void Image::DownloadMemory(ImageBufferMap& map, +void Image::UploadMemory(const StagingBufferMap& map, + std::span<const VideoCommon::BufferImageCopy> copies) { + UploadMemory(map.buffer, map.offset, copies); +} + +void Image::DownloadMemory(GLuint buffer_handle, size_t buffer_offset, + std::span<const VideoCommon::BufferImageCopy> copies) { + std::array buffer_handles{buffer_handle}; + std::array buffer_offsets{buffer_offset}; + DownloadMemory(buffer_handles, buffer_offsets, copies); +} + +void Image::DownloadMemory(std::span<GLuint> buffer_handles, std::span<size_t> buffer_offsets, std::span<const VideoCommon::BufferImageCopy> copies) { const bool is_rescaled = True(flags & ImageFlagBits::Rescaled); if (is_rescaled) { ScaleDown(); } glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API - glBindBuffer(GL_PIXEL_PACK_BUFFER, map.buffer); - glPixelStorei(GL_PACK_ALIGNMENT, 1); + for (size_t i = 0; i < buffer_handles.size(); i++) { + auto& buffer_handle = buffer_handles[i]; + glBindBuffer(GL_PIXEL_PACK_BUFFER, buffer_handle); + glPixelStorei(GL_PACK_ALIGNMENT, 1); - u32 current_row_length = std::numeric_limits<u32>::max(); - u32 current_image_height = std::numeric_limits<u32>::max(); + u32 current_row_length = std::numeric_limits<u32>::max(); + u32 current_image_height = std::numeric_limits<u32>::max(); - for (const VideoCommon::BufferImageCopy& copy : copies) { - if (copy.image_subresource.base_level >= gl_num_levels) { - continue; - } - if (current_row_length != copy.buffer_row_length) { - current_row_length = copy.buffer_row_length; - glPixelStorei(GL_PACK_ROW_LENGTH, current_row_length); - } - if (current_image_height != copy.buffer_image_height) { - current_image_height = copy.buffer_image_height; - glPixelStorei(GL_PACK_IMAGE_HEIGHT, current_image_height); + for (const VideoCommon::BufferImageCopy& copy : copies) { + if (copy.image_subresource.base_level >= gl_num_levels) { + continue; + } + if (current_row_length != copy.buffer_row_length) { + current_row_length = copy.buffer_row_length; + glPixelStorei(GL_PACK_ROW_LENGTH, current_row_length); + } + if (current_image_height != copy.buffer_image_height) { + current_image_height = copy.buffer_image_height; + glPixelStorei(GL_PACK_IMAGE_HEIGHT, current_image_height); + } + CopyImageToBuffer(copy, buffer_offsets[i]); } - CopyImageToBuffer(copy, map.offset); } if (is_rescaled) { ScaleUp(true); } } +void Image::DownloadMemory(StagingBufferMap& map, + std::span<const VideoCommon::BufferImageCopy> copies) { + DownloadMemory(map.buffer, map.offset, copies); +} + GLuint Image::StorageHandle() noexcept { switch (info.format) { case PixelFormat::A8B8G8R8_SRGB: @@ -821,9 +825,12 @@ GLuint Image::StorageHandle() noexcept { case PixelFormat::ASTC_2D_8X5_SRGB: case PixelFormat::ASTC_2D_5X4_SRGB: case PixelFormat::ASTC_2D_5X5_SRGB: + case PixelFormat::ASTC_2D_10X5_SRGB: + case PixelFormat::ASTC_2D_10X6_SRGB: case PixelFormat::ASTC_2D_10X8_SRGB: case PixelFormat::ASTC_2D_6X6_SRGB: case PixelFormat::ASTC_2D_10X10_SRGB: + case PixelFormat::ASTC_2D_12X10_SRGB: case PixelFormat::ASTC_2D_12X12_SRGB: case PixelFormat::ASTC_2D_8X6_SRGB: case PixelFormat::ASTC_2D_6X5_SRGB: @@ -1083,10 +1090,16 @@ bool Image::ScaleDown(bool ignore) { ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewInfo& info, ImageId image_id_, Image& image, const SlotVector<Image>&) - : VideoCommon::ImageViewBase{info, image.info, image_id_}, views{runtime.null_image_views} { + : VideoCommon::ImageViewBase{info, image.info, image_id_, image.gpu_addr}, + views{runtime.null_image_views} { const Device& device = runtime.device; if (True(image.flags & ImageFlagBits::Converted)) { - internal_format = IsPixelFormatSRGB(info.format) ? GL_SRGB8_ALPHA8 : GL_RGBA8; + const bool is_srgb = IsPixelFormatSRGB(info.format); + internal_format = is_srgb ? GL_SRGB8_ALPHA8 : GL_RGBA8; + + if (IsPixelFormatASTC(info.format) && IsAstcRecompressionEnabled()) { + internal_format = SelectAstcFormat(info.format, is_srgb); + } } else { internal_format = MaxwellToGL::GetFormatTuple(format).internal_format; } @@ -1174,12 +1187,12 @@ ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewI ImageView::ImageView(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, const VideoCommon::ImageViewInfo& view_info, GPUVAddr gpu_addr_) - : VideoCommon::ImageViewBase{info, view_info}, gpu_addr{gpu_addr_}, + : VideoCommon::ImageViewBase{info, view_info, gpu_addr_}, buffer_size{VideoCommon::CalculateGuestSizeInBytes(info)} {} ImageView::ImageView(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, const VideoCommon::ImageViewInfo& view_info) - : VideoCommon::ImageViewBase{info, view_info} {} + : VideoCommon::ImageViewBase{info, view_info, 0} {} ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::NullImageViewParams& params) : VideoCommon::ImageViewBase{params}, views{runtime.null_image_views} {} @@ -1239,7 +1252,7 @@ GLuint ImageView::MakeView(Shader::TextureType view_type, GLenum view_format) { ApplySwizzle(view.handle, format, casted_swizzle); } if (set_object_label) { - const std::string name = VideoCommon::Name(*this); + const std::string name = VideoCommon::Name(*this, gpu_addr); glObjectLabel(GL_TEXTURE, view.handle, static_cast<GLsizei>(name.size()), name.data()); } return view.handle; @@ -1255,36 +1268,48 @@ Sampler::Sampler(TextureCacheRuntime& runtime, const TSCEntry& config) { UNIMPLEMENTED_IF(config.cubemap_anisotropy != 1); - sampler.Create(); - const GLuint handle = sampler.handle; - glSamplerParameteri(handle, GL_TEXTURE_WRAP_S, MaxwellToGL::WrapMode(config.wrap_u)); - glSamplerParameteri(handle, GL_TEXTURE_WRAP_T, MaxwellToGL::WrapMode(config.wrap_v)); - glSamplerParameteri(handle, GL_TEXTURE_WRAP_R, MaxwellToGL::WrapMode(config.wrap_p)); - glSamplerParameteri(handle, GL_TEXTURE_COMPARE_MODE, compare_mode); - glSamplerParameteri(handle, GL_TEXTURE_COMPARE_FUNC, compare_func); - glSamplerParameteri(handle, GL_TEXTURE_MAG_FILTER, mag); - glSamplerParameteri(handle, GL_TEXTURE_MIN_FILTER, min); - glSamplerParameterf(handle, GL_TEXTURE_LOD_BIAS, config.LodBias()); - glSamplerParameterf(handle, GL_TEXTURE_MIN_LOD, config.MinLod()); - glSamplerParameterf(handle, GL_TEXTURE_MAX_LOD, config.MaxLod()); - glSamplerParameterfv(handle, GL_TEXTURE_BORDER_COLOR, config.BorderColor().data()); - - if (GLAD_GL_ARB_texture_filter_anisotropic || GLAD_GL_EXT_texture_filter_anisotropic) { - const f32 max_anisotropy = std::clamp(config.MaxAnisotropy(), 1.0f, 16.0f); - glSamplerParameterf(handle, GL_TEXTURE_MAX_ANISOTROPY, max_anisotropy); - } else { - LOG_WARNING(Render_OpenGL, "GL_ARB_texture_filter_anisotropic is required"); - } - if (GLAD_GL_ARB_texture_filter_minmax || GLAD_GL_EXT_texture_filter_minmax) { - glSamplerParameteri(handle, GL_TEXTURE_REDUCTION_MODE_ARB, reduction_filter); - } else if (reduction_filter != GL_WEIGHTED_AVERAGE_ARB) { - LOG_WARNING(Render_OpenGL, "GL_ARB_texture_filter_minmax is required"); - } - if (GLAD_GL_ARB_seamless_cubemap_per_texture || GLAD_GL_AMD_seamless_cubemap_per_texture) { - glSamplerParameteri(handle, GL_TEXTURE_CUBE_MAP_SEAMLESS, seamless); - } else if (seamless == GL_FALSE) { - // We default to false because it's more common - LOG_WARNING(Render_OpenGL, "GL_ARB_seamless_cubemap_per_texture is required"); + const f32 max_anisotropy = std::clamp(config.MaxAnisotropy(), 1.0f, 16.0f); + + const auto create_sampler = [&](const f32 anisotropy) { + OGLSampler new_sampler; + new_sampler.Create(); + const GLuint handle = new_sampler.handle; + glSamplerParameteri(handle, GL_TEXTURE_WRAP_S, MaxwellToGL::WrapMode(config.wrap_u)); + glSamplerParameteri(handle, GL_TEXTURE_WRAP_T, MaxwellToGL::WrapMode(config.wrap_v)); + glSamplerParameteri(handle, GL_TEXTURE_WRAP_R, MaxwellToGL::WrapMode(config.wrap_p)); + glSamplerParameteri(handle, GL_TEXTURE_COMPARE_MODE, compare_mode); + glSamplerParameteri(handle, GL_TEXTURE_COMPARE_FUNC, compare_func); + glSamplerParameteri(handle, GL_TEXTURE_MAG_FILTER, mag); + glSamplerParameteri(handle, GL_TEXTURE_MIN_FILTER, min); + glSamplerParameterf(handle, GL_TEXTURE_LOD_BIAS, config.LodBias()); + glSamplerParameterf(handle, GL_TEXTURE_MIN_LOD, config.MinLod()); + glSamplerParameterf(handle, GL_TEXTURE_MAX_LOD, config.MaxLod()); + glSamplerParameterfv(handle, GL_TEXTURE_BORDER_COLOR, config.BorderColor().data()); + + if (GLAD_GL_ARB_texture_filter_anisotropic || GLAD_GL_EXT_texture_filter_anisotropic) { + glSamplerParameterf(handle, GL_TEXTURE_MAX_ANISOTROPY, anisotropy); + } else { + LOG_WARNING(Render_OpenGL, "GL_ARB_texture_filter_anisotropic is required"); + } + if (GLAD_GL_ARB_texture_filter_minmax || GLAD_GL_EXT_texture_filter_minmax) { + glSamplerParameteri(handle, GL_TEXTURE_REDUCTION_MODE_ARB, reduction_filter); + } else if (reduction_filter != GL_WEIGHTED_AVERAGE_ARB) { + LOG_WARNING(Render_OpenGL, "GL_ARB_texture_filter_minmax is required"); + } + if (GLAD_GL_ARB_seamless_cubemap_per_texture || GLAD_GL_AMD_seamless_cubemap_per_texture) { + glSamplerParameteri(handle, GL_TEXTURE_CUBE_MAP_SEAMLESS, seamless); + } else if (seamless == GL_FALSE) { + // We default to false because it's more common + LOG_WARNING(Render_OpenGL, "GL_ARB_seamless_cubemap_per_texture is required"); + } + return new_sampler; + }; + + sampler = create_sampler(max_anisotropy); + + const f32 max_anisotropy_default = static_cast<f32>(1U << config.max_anisotropy); + if (max_anisotropy > max_anisotropy_default) { + sampler_default_anisotropy = create_sampler(max_anisotropy_default); } } diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 5d9d370f2..3676eaaa9 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -11,6 +11,7 @@ #include "shader_recompiler/shader_info.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_resource_manager.h" +#include "video_core/renderer_opengl/gl_staging_buffer_pool.h" #include "video_core/renderer_opengl/util_shaders.h" #include "video_core/texture_cache/image_view_base.h" #include "video_core/texture_cache/texture_cache_base.h" @@ -37,15 +38,6 @@ using VideoCommon::Region2D; using VideoCommon::RenderTargets; using VideoCommon::SlotVector; -struct ImageBufferMap { - ~ImageBufferMap(); - - std::span<u8> mapped_span; - size_t offset = 0; - OGLSync* sync; - GLuint buffer; -}; - struct FormatProperties { GLenum compatibility_class; bool compatibility_by_size; @@ -74,14 +66,15 @@ class TextureCacheRuntime { public: explicit TextureCacheRuntime(const Device& device, ProgramManager& program_manager, - StateTracker& state_tracker); + StateTracker& state_tracker, + StagingBufferPool& staging_buffer_pool); ~TextureCacheRuntime(); void Finish(); - ImageBufferMap UploadStagingBuffer(size_t size); + StagingBufferMap UploadStagingBuffer(size_t size); - ImageBufferMap DownloadStagingBuffer(size_t size); + StagingBufferMap DownloadStagingBuffer(size_t size); u64 GetDeviceLocalMemory() const { return device_access_memory; @@ -93,12 +86,19 @@ public: return device.CanReportMemoryUsage(); } - bool ShouldReinterpret([[maybe_unused]] Image& dst, [[maybe_unused]] Image& src) { + bool ShouldReinterpret([[maybe_unused]] Image& dst, + [[maybe_unused]] Image& src) const noexcept { + return true; + } + + bool CanUploadMSAA() const noexcept { return true; } void CopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies); + void CopyImageMSAA(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies); + void ReinterpretImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies); void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view) { @@ -113,7 +113,7 @@ public: const Region2D& src_region, Tegra::Engines::Fermi2D::Filter filter, Tegra::Engines::Fermi2D::Operation operation); - void AccelerateImageUpload(Image& image, const ImageBufferMap& map, + void AccelerateImageUpload(Image& image, const StagingBufferMap& map, std::span<const VideoCommon::SwizzleParameters> swizzles); void InsertUploadMemoryBarrier(); @@ -137,36 +137,21 @@ public: return state_tracker; } -private: - struct StagingBuffers { - explicit StagingBuffers(GLenum storage_flags_, GLenum map_flags_); - ~StagingBuffers(); - - ImageBufferMap RequestMap(size_t requested_size, bool insert_fence); - - size_t RequestBuffer(size_t requested_size); - - std::optional<size_t> FindBuffer(size_t requested_size); - - std::vector<OGLSync> syncs; - std::vector<OGLBuffer> buffers; - std::vector<u8*> maps; - std::vector<size_t> sizes; - GLenum storage_flags; - GLenum map_flags; - }; + void BarrierFeedbackLoop() const noexcept { + // OpenGL does not require a barrier for attachment feedback loops. + } +private: const Device& device; StateTracker& state_tracker; + StagingBufferPool& staging_buffer_pool; + UtilShaders util_shaders; FormatConversionPass format_conversion_pass; std::array<std::unordered_map<GLenum, FormatProperties>, 3> format_properties; bool has_broken_texture_view_formats = false; - StagingBuffers upload_buffers{GL_MAP_WRITE_BIT, GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT}; - StagingBuffers download_buffers{GL_MAP_READ_BIT | GL_CLIENT_STORAGE_BIT, GL_MAP_READ_BIT}; - OGLTexture null_image_1d_array; OGLTexture null_image_cube_array; OGLTexture null_image_3d; @@ -199,10 +184,20 @@ public: Image(Image&&) = default; Image& operator=(Image&&) = default; - void UploadMemory(const ImageBufferMap& map, + void UploadMemory(GLuint buffer_handle, size_t buffer_offset, std::span<const VideoCommon::BufferImageCopy> copies); - void DownloadMemory(ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies); + void UploadMemory(const StagingBufferMap& map, + std::span<const VideoCommon::BufferImageCopy> copies); + + void DownloadMemory(GLuint buffer_handle, size_t buffer_offset, + std::span<const VideoCommon::BufferImageCopy> copies); + + void DownloadMemory(std::span<GLuint> buffer_handle, std::span<size_t> buffer_offset, + std::span<const VideoCommon::BufferImageCopy> copies); + + void DownloadMemory(StagingBufferMap& map, + std::span<const VideoCommon::BufferImageCopy> copies); GLuint StorageHandle() noexcept; @@ -298,7 +293,6 @@ private: std::unique_ptr<StorageViews> storage_views; GLenum internal_format = GL_NONE; GLuint default_handle = 0; - GPUVAddr gpu_addr = 0; u32 buffer_size = 0; GLuint original_texture = 0; int num_samples = 0; @@ -315,12 +309,21 @@ class Sampler { public: explicit Sampler(TextureCacheRuntime&, const Tegra::Texture::TSCEntry&); - GLuint Handle() const noexcept { + [[nodiscard]] GLuint Handle() const noexcept { return sampler.handle; } + [[nodiscard]] GLuint HandleWithDefaultAnisotropy() const noexcept { + return sampler_default_anisotropy.handle; + } + + [[nodiscard]] bool HasAddedAnisotropy() const noexcept { + return static_cast<bool>(sampler_default_anisotropy.handle); + } + private: OGLSampler sampler; + OGLSampler sampler_default_anisotropy; }; class Framebuffer { @@ -363,6 +366,7 @@ struct TextureCacheParams { using Sampler = OpenGL::Sampler; using Framebuffer = OpenGL::Framebuffer; using AsyncBuffer = u32; + using BufferType = GLuint; }; using TextureCache = VideoCommon::TextureCache<TextureCacheParams>; diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index ef1190e1f..c7dc7e0a1 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -100,10 +100,13 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> FORMAT_TAB {GL_COMPRESSED_RGBA_ASTC_6x6_KHR}, // ASTC_2D_6X6_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR}, // ASTC_2D_6X6_SRGB {GL_COMPRESSED_RGBA_ASTC_10x6_KHR}, // ASTC_2D_10X6_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x6_KHR}, // ASTC_2D_10X6_SRGB {GL_COMPRESSED_RGBA_ASTC_10x5_KHR}, // ASTC_2D_10X5_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x5_KHR}, // ASTC_2D_10X5_SRGB {GL_COMPRESSED_RGBA_ASTC_10x10_KHR}, // ASTC_2D_10X10_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR}, // ASTC_2D_10X10_SRGB + {GL_COMPRESSED_RGBA_ASTC_12x10_KHR}, // ASTC_2D_12X10_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR}, // ASTC_2D_12X10_SRGB {GL_COMPRESSED_RGBA_ASTC_12x12_KHR}, // ASTC_2D_12X12_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR}, // ASTC_2D_12X12_SRGB {GL_COMPRESSED_RGBA_ASTC_8x6_KHR}, // ASTC_2D_8X6_UNORM diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index de95f2634..2a74c1d05 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -17,8 +17,14 @@ #include "core/frontend/emu_window.h" #include "core/memory.h" #include "core/telemetry_session.h" +#include "video_core/host_shaders/ffx_a_h.h" +#include "video_core/host_shaders/ffx_fsr1_h.h" +#include "video_core/host_shaders/full_screen_triangle_vert.h" #include "video_core/host_shaders/fxaa_frag.h" #include "video_core/host_shaders/fxaa_vert.h" +#include "video_core/host_shaders/opengl_fidelityfx_fsr_easu_frag.h" +#include "video_core/host_shaders/opengl_fidelityfx_fsr_frag.h" +#include "video_core/host_shaders/opengl_fidelityfx_fsr_rcas_frag.h" #include "video_core/host_shaders/opengl_present_frag.h" #include "video_core/host_shaders/opengl_present_scaleforce_frag.h" #include "video_core/host_shaders/opengl_present_vert.h" @@ -31,6 +37,7 @@ #include "video_core/host_shaders/smaa_edge_detection_vert.h" #include "video_core/host_shaders/smaa_neighborhood_blending_frag.h" #include "video_core/host_shaders/smaa_neighborhood_blending_vert.h" +#include "video_core/renderer_opengl/gl_fsr.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_shader_util.h" @@ -268,12 +275,17 @@ void RendererOpenGL::InitOpenGLObjects() { fxaa_vertex = CreateProgram(HostShaders::FXAA_VERT, GL_VERTEX_SHADER); fxaa_fragment = CreateProgram(HostShaders::FXAA_FRAG, GL_FRAGMENT_SHADER); - const auto SmaaShader = [](std::string_view specialized_source, GLenum stage) { - std::string shader_source{specialized_source}; - constexpr std::string_view include_string = "#include \"opengl_smaa.glsl\""; + const auto replace_include = [](std::string& shader_source, std::string_view include_name, + std::string_view include_content) { + const std::string include_string = fmt::format("#include \"{}\"", include_name); const std::size_t pos = shader_source.find(include_string); ASSERT(pos != std::string::npos); - shader_source.replace(pos, include_string.size(), HostShaders::OPENGL_SMAA_GLSL); + shader_source.replace(pos, include_string.size(), include_content); + }; + + const auto SmaaShader = [&](std::string_view specialized_source, GLenum stage) { + std::string shader_source{specialized_source}; + replace_include(shader_source, "opengl_smaa.glsl", HostShaders::OPENGL_SMAA_GLSL); return CreateProgram(shader_source, stage); }; @@ -298,14 +310,32 @@ void RendererOpenGL::InitOpenGLObjects() { CreateProgram(fmt::format("#version 460\n{}", HostShaders::OPENGL_PRESENT_SCALEFORCE_FRAG), GL_FRAGMENT_SHADER); + std::string fsr_source{HostShaders::OPENGL_FIDELITYFX_FSR_FRAG}; + replace_include(fsr_source, "ffx_a.h", HostShaders::FFX_A_H); + replace_include(fsr_source, "ffx_fsr1.h", HostShaders::FFX_FSR1_H); + + std::string fsr_easu_frag_source{HostShaders::OPENGL_FIDELITYFX_FSR_EASU_FRAG}; + std::string fsr_rcas_frag_source{HostShaders::OPENGL_FIDELITYFX_FSR_RCAS_FRAG}; + replace_include(fsr_easu_frag_source, "opengl_fidelityfx_fsr.frag", fsr_source); + replace_include(fsr_rcas_frag_source, "opengl_fidelityfx_fsr.frag", fsr_source); + + fsr = std::make_unique<FSR>(HostShaders::FULL_SCREEN_TRIANGLE_VERT, fsr_easu_frag_source, + fsr_rcas_frag_source); + // Generate presentation sampler present_sampler.Create(); glSamplerParameteri(present_sampler.handle, GL_TEXTURE_MIN_FILTER, GL_LINEAR); glSamplerParameteri(present_sampler.handle, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + glSamplerParameteri(present_sampler.handle, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + glSamplerParameteri(present_sampler.handle, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + glSamplerParameteri(present_sampler.handle, GL_TEXTURE_WRAP_R, GL_CLAMP_TO_EDGE); present_sampler_nn.Create(); glSamplerParameteri(present_sampler_nn.handle, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glSamplerParameteri(present_sampler_nn.handle, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glSamplerParameteri(present_sampler_nn.handle, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + glSamplerParameteri(present_sampler_nn.handle, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + glSamplerParameteri(present_sampler_nn.handle, GL_TEXTURE_WRAP_R, GL_CLAMP_TO_EDGE); // Generate VBO handle for drawing vertex_buffer.Create(); @@ -525,6 +555,31 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { glBindTextureUnit(0, aa_texture.handle); } + glDisablei(GL_SCISSOR_TEST, 0); + + if (Settings::values.scaling_filter.GetValue() == Settings::ScalingFilter::Fsr) { + if (!fsr->AreBuffersInitialized()) { + fsr->InitBuffers(); + } + + auto crop_rect = framebuffer_crop_rect; + if (crop_rect.GetWidth() == 0) { + crop_rect.right = framebuffer_width; + } + if (crop_rect.GetHeight() == 0) { + crop_rect.bottom = framebuffer_height; + } + crop_rect = crop_rect.Scale(Settings::values.resolution_info.up_factor); + const auto fsr_input_width = Settings::values.resolution_info.ScaleUp(framebuffer_width); + const auto fsr_input_height = Settings::values.resolution_info.ScaleUp(framebuffer_height); + glBindSampler(0, present_sampler.handle); + fsr->Draw(program_manager, layout.screen, fsr_input_width, fsr_input_height, crop_rect); + } else { + if (fsr->AreBuffersInitialized()) { + fsr->ReleaseBuffers(); + } + } + const std::array ortho_matrix = MakeOrthographicMatrix(static_cast<float>(layout.width), static_cast<float>(layout.height)); @@ -540,10 +595,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { case Settings::ScalingFilter::ScaleForce: return present_scaleforce_fragment.handle; case Settings::ScalingFilter::Fsr: - LOG_WARNING( - Render_OpenGL, - "FidelityFX Super Resolution is not supported in OpenGL, changing to ScaleForce"); - return present_scaleforce_fragment.handle; + return fsr->GetPresentFragmentProgram().handle; default: return present_bilinear_fragment.handle; } @@ -578,15 +630,18 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { f32 scale_u = static_cast<f32>(framebuffer_width) / static_cast<f32>(screen_info.texture.width); f32 scale_v = static_cast<f32>(framebuffer_height) / static_cast<f32>(screen_info.texture.height); - // Scale the output by the crop width/height. This is commonly used with 1280x720 rendering - // (e.g. handheld mode) on a 1920x1080 framebuffer. - if (framebuffer_crop_rect.GetWidth() > 0) { - scale_u = static_cast<f32>(framebuffer_crop_rect.GetWidth()) / - static_cast<f32>(screen_info.texture.width); - } - if (framebuffer_crop_rect.GetHeight() > 0) { - scale_v = static_cast<f32>(framebuffer_crop_rect.GetHeight()) / - static_cast<f32>(screen_info.texture.height); + + if (Settings::values.scaling_filter.GetValue() != Settings::ScalingFilter::Fsr) { + // Scale the output by the crop width/height. This is commonly used with 1280x720 rendering + // (e.g. handheld mode) on a 1920x1080 framebuffer. + if (framebuffer_crop_rect.GetWidth() > 0) { + scale_u = static_cast<f32>(framebuffer_crop_rect.GetWidth()) / + static_cast<f32>(screen_info.texture.width); + } + if (framebuffer_crop_rect.GetHeight() > 0) { + scale_v = static_cast<f32>(framebuffer_crop_rect.GetHeight()) / + static_cast<f32>(screen_info.texture.height); + } } if (Settings::values.anti_aliasing.GetValue() == Settings::AntiAliasing::Fxaa && !screen_info.was_accelerated) { @@ -612,7 +667,6 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { } else { glDisable(GL_FRAMEBUFFER_SRGB); } - glDisablei(GL_SCISSOR_TEST, 0); glViewportIndexedf(0, 0.0f, 0.0f, static_cast<GLfloat>(layout.width), static_cast<GLfloat>(layout.height)); diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index cc97d7b26..f1d5fd954 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -10,6 +10,7 @@ #include "video_core/renderer_base.h" #include "video_core/renderer_opengl/gl_device.h" +#include "video_core/renderer_opengl/gl_fsr.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_manager.h" @@ -141,6 +142,8 @@ private: OGLTexture smaa_edges_tex; OGLTexture smaa_blend_tex; + std::unique_ptr<FSR> fsr; + /// OpenGL framebuffer data std::vector<u8> gl_framebuffer_data; diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index 404def62e..544982d18 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -12,11 +12,14 @@ #include "video_core/host_shaders/astc_decoder_comp.h" #include "video_core/host_shaders/block_linear_unswizzle_2d_comp.h" #include "video_core/host_shaders/block_linear_unswizzle_3d_comp.h" +#include "video_core/host_shaders/convert_msaa_to_non_msaa_comp.h" +#include "video_core/host_shaders/convert_non_msaa_to_msaa_comp.h" #include "video_core/host_shaders/opengl_convert_s8d24_comp.h" #include "video_core/host_shaders/opengl_copy_bc4_comp.h" #include "video_core/host_shaders/pitch_unswizzle_comp.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_shader_util.h" +#include "video_core/renderer_opengl/gl_staging_buffer_pool.h" #include "video_core/renderer_opengl/gl_texture_cache.h" #include "video_core/renderer_opengl/util_shaders.h" #include "video_core/texture_cache/accelerated_swizzle.h" @@ -51,7 +54,9 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_) block_linear_unswizzle_3d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_3D_COMP)), pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)), copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)), - convert_s8d24_program(MakeProgram(OPENGL_CONVERT_S8D24_COMP)) { + convert_s8d24_program(MakeProgram(OPENGL_CONVERT_S8D24_COMP)), + convert_ms_to_nonms_program(MakeProgram(CONVERT_MSAA_TO_NON_MSAA_COMP)), + convert_nonms_to_ms_program(MakeProgram(CONVERT_NON_MSAA_TO_MSAA_COMP)) { const auto swizzle_table = Tegra::Texture::MakeSwizzleTable(); swizzle_table_buffer.Create(); glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0); @@ -59,7 +64,7 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_) UtilShaders::~UtilShaders() = default; -void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, +void UtilShaders::ASTCDecode(Image& image, const StagingBufferMap& map, std::span<const VideoCommon::SwizzleParameters> swizzles) { static constexpr GLuint BINDING_INPUT_BUFFER = 0; static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; @@ -107,7 +112,7 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, program_manager.RestoreGuestCompute(); } -void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, +void UtilShaders::BlockLinearUpload2D(Image& image, const StagingBufferMap& map, std::span<const SwizzleParameters> swizzles) { static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1}; static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0; @@ -144,7 +149,7 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, program_manager.RestoreGuestCompute(); } -void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, +void UtilShaders::BlockLinearUpload3D(Image& image, const StagingBufferMap& map, std::span<const SwizzleParameters> swizzles) { static constexpr Extent3D WORKGROUP_SIZE{16, 8, 8}; @@ -185,7 +190,7 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, program_manager.RestoreGuestCompute(); } -void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, +void UtilShaders::PitchUpload(Image& image, const StagingBufferMap& map, std::span<const SwizzleParameters> swizzles) { static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1}; static constexpr GLuint BINDING_INPUT_BUFFER = 0; @@ -269,6 +274,33 @@ void UtilShaders::ConvertS8D24(Image& dst_image, std::span<const ImageCopy> copi program_manager.RestoreGuestCompute(); } +void UtilShaders::CopyMSAA(Image& dst_image, Image& src_image, + std::span<const VideoCommon::ImageCopy> copies) { + const bool is_ms_to_non_ms = src_image.info.num_samples > 1 && dst_image.info.num_samples == 1; + const auto program_handle = + is_ms_to_non_ms ? convert_ms_to_nonms_program.handle : convert_nonms_to_ms_program.handle; + program_manager.BindComputeProgram(program_handle); + + for (const ImageCopy& copy : copies) { + ASSERT(copy.src_subresource.base_layer == 0); + ASSERT(copy.src_subresource.num_layers == 1); + ASSERT(copy.dst_subresource.base_layer == 0); + ASSERT(copy.dst_subresource.num_layers == 1); + + glBindImageTexture(0, src_image.StorageHandle(), copy.src_subresource.base_level, GL_TRUE, + 0, GL_READ_ONLY, GL_RGBA8); + glBindImageTexture(1, dst_image.StorageHandle(), copy.dst_subresource.base_level, GL_TRUE, + 0, GL_WRITE_ONLY, GL_RGBA8); + + const u32 num_dispatches_x = Common::DivCeil(copy.extent.width, 8U); + const u32 num_dispatches_y = Common::DivCeil(copy.extent.height, 8U); + const u32 num_dispatches_z = copy.extent.depth; + + glDispatchCompute(num_dispatches_x, num_dispatches_y, num_dispatches_z); + } + program_manager.RestoreGuestCompute(); +} + GLenum StoreFormat(u32 bytes_per_block) { switch (bytes_per_block) { case 1: diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h index 44efb6ecf..feecd404c 100644 --- a/src/video_core/renderer_opengl/util_shaders.h +++ b/src/video_core/renderer_opengl/util_shaders.h @@ -16,23 +16,23 @@ namespace OpenGL { class Image; class ProgramManager; -struct ImageBufferMap; +struct StagingBufferMap; class UtilShaders { public: explicit UtilShaders(ProgramManager& program_manager); ~UtilShaders(); - void ASTCDecode(Image& image, const ImageBufferMap& map, + void ASTCDecode(Image& image, const StagingBufferMap& map, std::span<const VideoCommon::SwizzleParameters> swizzles); - void BlockLinearUpload2D(Image& image, const ImageBufferMap& map, + void BlockLinearUpload2D(Image& image, const StagingBufferMap& map, std::span<const VideoCommon::SwizzleParameters> swizzles); - void BlockLinearUpload3D(Image& image, const ImageBufferMap& map, + void BlockLinearUpload3D(Image& image, const StagingBufferMap& map, std::span<const VideoCommon::SwizzleParameters> swizzles); - void PitchUpload(Image& image, const ImageBufferMap& map, + void PitchUpload(Image& image, const StagingBufferMap& map, std::span<const VideoCommon::SwizzleParameters> swizzles); void CopyBC4(Image& dst_image, Image& src_image, @@ -40,6 +40,9 @@ public: void ConvertS8D24(Image& dst_image, std::span<const VideoCommon::ImageCopy> copies); + void CopyMSAA(Image& dst_image, Image& src_image, + std::span<const VideoCommon::ImageCopy> copies); + private: ProgramManager& program_manager; @@ -51,6 +54,8 @@ private: OGLProgram pitch_unswizzle_program; OGLProgram copy_bc4_program; OGLProgram convert_s8d24_program; + OGLProgram convert_ms_to_nonms_program; + OGLProgram convert_nonms_to_ms_program; }; GLenum StoreFormat(u32 bytes_per_block); |