diff options
Diffstat (limited to 'src/video_core/renderer_opengl')
22 files changed, 624 insertions, 904 deletions
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index 5772cad87..6da3906a4 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -2,98 +2,208 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <memory> +#include <span> -#include <glad/glad.h> - -#include "common/assert.h" -#include "common/microprofile.h" #include "video_core/buffer_cache/buffer_cache.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_device.h" -#include "video_core/renderer_opengl/gl_rasterizer.h" -#include "video_core/renderer_opengl/gl_resource_manager.h" namespace OpenGL { +namespace { +struct BindlessSSBO { + GLuint64EXT address; + GLsizei length; + GLsizei padding; +}; +static_assert(sizeof(BindlessSSBO) == sizeof(GLuint) * 4); + +constexpr std::array PROGRAM_LUT{ + GL_VERTEX_PROGRAM_NV, GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV, + GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV, +}; +} // Anonymous namespace + +Buffer::Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params) + : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(null_params) {} + +Buffer::Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_, + VAddr cpu_addr_, u64 size_bytes_) + : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(rasterizer_, cpu_addr_, size_bytes_) { + buffer.Create(); + const std::string name = fmt::format("Buffer 0x{:x}", CpuAddr()); + glObjectLabel(GL_BUFFER, buffer.handle, static_cast<GLsizei>(name.size()), name.data()); + glNamedBufferData(buffer.handle, SizeBytes(), nullptr, GL_DYNAMIC_DRAW); + + if (runtime.has_unified_vertex_buffers) { + glGetNamedBufferParameterui64vNV(buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &address); + } +} -using Maxwell = Tegra::Engines::Maxwell3D::Regs; +void Buffer::ImmediateUpload(size_t offset, std::span<const u8> data) noexcept { + glNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset), + static_cast<GLsizeiptr>(data.size_bytes()), data.data()); +} -MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128)); +void Buffer::ImmediateDownload(size_t offset, std::span<u8> data) noexcept { + glGetNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset), + static_cast<GLsizeiptr>(data.size_bytes()), data.data()); +} -Buffer::Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_) - : BufferBlock{cpu_addr_, size_} { - gl_buffer.Create(); - glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size_), nullptr, GL_DYNAMIC_DRAW); - if (device_.UseAssemblyShaders() || device_.HasVertexBufferUnifiedMemory()) { - glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE); - glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); +void Buffer::MakeResident(GLenum access) noexcept { + // Abuse GLenum's order to exit early + // GL_NONE (default) < GL_READ_ONLY < GL_READ_WRITE + if (access <= current_residency_access || buffer.handle == 0) { + return; + } + if (std::exchange(current_residency_access, access) != GL_NONE) { + // If the buffer is already resident, remove its residency before promoting it + glMakeNamedBufferNonResidentNV(buffer.handle); } + glMakeNamedBufferResidentNV(buffer.handle, access); } -Buffer::~Buffer() = default; - -void Buffer::Upload(std::size_t offset, std::size_t data_size, const u8* data) { - glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), - static_cast<GLsizeiptr>(data_size), data); +BufferCacheRuntime::BufferCacheRuntime(const Device& device_) + : device{device_}, has_fast_buffer_sub_data{device.HasFastBufferSubData()}, + use_assembly_shaders{device.UseAssemblyShaders()}, + has_unified_vertex_buffers{device.HasVertexBufferUnifiedMemory()}, + stream_buffer{has_fast_buffer_sub_data ? std::nullopt : std::make_optional<StreamBuffer>()} { + GLint gl_max_attributes; + glGetIntegerv(GL_MAX_VERTEX_ATTRIBS, &gl_max_attributes); + max_attributes = static_cast<u32>(gl_max_attributes); + for (auto& stage_uniforms : fast_uniforms) { + for (OGLBuffer& buffer : stage_uniforms) { + buffer.Create(); + glNamedBufferData(buffer.handle, BufferCache::SKIP_CACHE_SIZE, nullptr, GL_STREAM_DRAW); + } + } + for (auto& stage_uniforms : copy_uniforms) { + for (OGLBuffer& buffer : stage_uniforms) { + buffer.Create(); + glNamedBufferData(buffer.handle, 0x10'000, nullptr, GL_STREAM_COPY); + } + } + for (OGLBuffer& buffer : copy_compute_uniforms) { + buffer.Create(); + glNamedBufferData(buffer.handle, 0x10'000, nullptr, GL_STREAM_COPY); + } } -void Buffer::Download(std::size_t offset, std::size_t data_size, u8* data) { - MICROPROFILE_SCOPE(OpenGL_Buffer_Download); - const GLsizeiptr gl_size = static_cast<GLsizeiptr>(data_size); - const GLintptr gl_offset = static_cast<GLintptr>(offset); - if (read_buffer.handle == 0) { - read_buffer.Create(); - glNamedBufferData(read_buffer.handle, static_cast<GLsizeiptr>(Size()), nullptr, - GL_STREAM_READ); +void BufferCacheRuntime::CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer, + std::span<const VideoCommon::BufferCopy> copies) { + for (const VideoCommon::BufferCopy& copy : copies) { + glCopyNamedBufferSubData( + src_buffer.Handle(), dst_buffer.Handle(), static_cast<GLintptr>(copy.src_offset), + static_cast<GLintptr>(copy.dst_offset), static_cast<GLsizeiptr>(copy.size)); } - glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); - glCopyNamedBufferSubData(gl_buffer.handle, read_buffer.handle, gl_offset, gl_offset, gl_size); - glGetNamedBufferSubData(read_buffer.handle, gl_offset, gl_size, data); } -void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, - std::size_t copy_size) { - glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset), - static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(copy_size)); +void BufferCacheRuntime::BindIndexBuffer(Buffer& buffer, u32 offset, u32 size) { + if (has_unified_vertex_buffers) { + buffer.MakeResident(GL_READ_ONLY); + glBufferAddressRangeNV(GL_ELEMENT_ARRAY_ADDRESS_NV, 0, buffer.HostGpuAddr() + offset, + static_cast<GLsizeiptr>(size)); + } else { + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer.Handle()); + index_buffer_offset = offset; + } } -OGLBufferCache::OGLBufferCache(VideoCore::RasterizerInterface& rasterizer_, - Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, - const Device& device_, OGLStreamBuffer& stream_buffer_, - StateTracker& state_tracker) - : GenericBufferCache{rasterizer_, gpu_memory_, cpu_memory_, stream_buffer_}, device{device_} { - if (!device.HasFastBufferSubData()) { +void BufferCacheRuntime::BindVertexBuffer(u32 index, Buffer& buffer, u32 offset, u32 size, + u32 stride) { + if (index >= max_attributes) { return; } - - static constexpr GLsizeiptr size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize); - glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs)); - for (const GLuint cbuf : cbufs) { - glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW); + if (has_unified_vertex_buffers) { + buffer.MakeResident(GL_READ_ONLY); + glBindVertexBuffer(index, 0, 0, static_cast<GLsizei>(stride)); + glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, index, + buffer.HostGpuAddr() + offset, static_cast<GLsizeiptr>(size)); + } else { + glBindVertexBuffer(index, buffer.Handle(), static_cast<GLintptr>(offset), + static_cast<GLsizei>(stride)); } } -OGLBufferCache::~OGLBufferCache() { - glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs)); +void BufferCacheRuntime::BindUniformBuffer(size_t stage, u32 binding_index, Buffer& buffer, + u32 offset, u32 size) { + if (use_assembly_shaders) { + GLuint handle; + if (offset != 0) { + handle = copy_uniforms[stage][binding_index].handle; + glCopyNamedBufferSubData(buffer.Handle(), handle, offset, 0, size); + } else { + handle = buffer.Handle(); + } + glBindBufferRangeNV(PABO_LUT[stage], binding_index, handle, 0, + static_cast<GLsizeiptr>(size)); + } else { + const GLuint base_binding = device.GetBaseBindings(stage).uniform_buffer; + const GLuint binding = base_binding + binding_index; + glBindBufferRange(GL_UNIFORM_BUFFER, binding, buffer.Handle(), + static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size)); + } } -std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { - return std::make_shared<Buffer>(device, cpu_addr, size); +void BufferCacheRuntime::BindComputeUniformBuffer(u32 binding_index, Buffer& buffer, u32 offset, + u32 size) { + if (use_assembly_shaders) { + GLuint handle; + if (offset != 0) { + handle = copy_compute_uniforms[binding_index].handle; + glCopyNamedBufferSubData(buffer.Handle(), handle, offset, 0, size); + } else { + handle = buffer.Handle(); + } + glBindBufferRangeNV(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding_index, handle, 0, + static_cast<GLsizeiptr>(size)); + } else { + glBindBufferRange(GL_UNIFORM_BUFFER, binding_index, buffer.Handle(), + static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size)); + } } -OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) { - return {0, 0, 0}; +void BufferCacheRuntime::BindStorageBuffer(size_t stage, u32 binding_index, Buffer& buffer, + u32 offset, u32 size, bool is_written) { + if (use_assembly_shaders) { + const BindlessSSBO ssbo{ + .address = buffer.HostGpuAddr() + offset, + .length = static_cast<GLsizei>(size), + .padding = 0, + }; + buffer.MakeResident(is_written ? GL_READ_WRITE : GL_READ_ONLY); + glProgramLocalParametersI4uivNV(PROGRAM_LUT[stage], binding_index, 1, + reinterpret_cast<const GLuint*>(&ssbo)); + } else { + const GLuint base_binding = device.GetBaseBindings(stage).shader_storage_buffer; + const GLuint binding = base_binding + binding_index; + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, buffer.Handle(), + static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size)); + } } -OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer, - std::size_t size) { - DEBUG_ASSERT(cbuf_cursor < std::size(cbufs)); - const GLuint cbuf = cbufs[cbuf_cursor++]; +void BufferCacheRuntime::BindComputeStorageBuffer(u32 binding_index, Buffer& buffer, u32 offset, + u32 size, bool is_written) { + if (use_assembly_shaders) { + const BindlessSSBO ssbo{ + .address = buffer.HostGpuAddr() + offset, + .length = static_cast<GLsizei>(size), + .padding = 0, + }; + buffer.MakeResident(is_written ? GL_READ_WRITE : GL_READ_ONLY); + glProgramLocalParametersI4uivNV(GL_COMPUTE_PROGRAM_NV, binding_index, 1, + reinterpret_cast<const GLuint*>(&ssbo)); + } else if (size == 0) { + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding_index, 0, 0, 0); + } else { + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding_index, buffer.Handle(), + static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size)); + } +} - glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer); - return {cbuf, 0, 0}; +void BufferCacheRuntime::BindTransformFeedbackBuffer(u32 index, Buffer& buffer, u32 offset, + u32 size) { + glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, index, buffer.Handle(), + static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size)); } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index 17ee90316..d8b20a9af 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -5,79 +5,157 @@ #pragma once #include <array> -#include <memory> +#include <span> +#include "common/alignment.h" #include "common/common_types.h" +#include "common/dynamic_library.h" #include "video_core/buffer_cache/buffer_cache.h" -#include "video_core/engines/maxwell_3d.h" +#include "video_core/rasterizer_interface.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" -namespace Core { -class System; -} - namespace OpenGL { -class Device; -class OGLStreamBuffer; -class RasterizerOpenGL; -class StateTracker; +class BufferCacheRuntime; -class Buffer : public VideoCommon::BufferBlock { +class Buffer : public VideoCommon::BufferBase<VideoCore::RasterizerInterface> { public: - explicit Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_); - ~Buffer(); + explicit Buffer(BufferCacheRuntime&, VideoCore::RasterizerInterface& rasterizer, VAddr cpu_addr, + u64 size_bytes); + explicit Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams); - void Upload(std::size_t offset, std::size_t data_size, const u8* data); + void ImmediateUpload(size_t offset, std::span<const u8> data) noexcept; - void Download(std::size_t offset, std::size_t data_size, u8* data); + void ImmediateDownload(size_t offset, std::span<u8> data) noexcept; - void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, - std::size_t copy_size); + void MakeResident(GLenum access) noexcept; - GLuint Handle() const noexcept { - return gl_buffer.handle; + [[nodiscard]] GLuint64EXT HostGpuAddr() const noexcept { + return address; } - u64 Address() const noexcept { - return gpu_address; + [[nodiscard]] GLuint Handle() const noexcept { + return buffer.handle; } private: - OGLBuffer gl_buffer; - OGLBuffer read_buffer; - u64 gpu_address = 0; + GLuint64EXT address = 0; + OGLBuffer buffer; + GLenum current_residency_access = GL_NONE; }; -using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>; -class OGLBufferCache final : public GenericBufferCache { +class BufferCacheRuntime { + friend Buffer; + public: - explicit OGLBufferCache(VideoCore::RasterizerInterface& rasterizer, - Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, - const Device& device, OGLStreamBuffer& stream_buffer, - StateTracker& state_tracker); - ~OGLBufferCache(); + static constexpr u8 INVALID_BINDING = std::numeric_limits<u8>::max(); + + explicit BufferCacheRuntime(const Device& device_); + + void CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer, + std::span<const VideoCommon::BufferCopy> copies); + + void BindIndexBuffer(Buffer& buffer, u32 offset, u32 size); + + void BindVertexBuffer(u32 index, Buffer& buffer, u32 offset, u32 size, u32 stride); + + void BindUniformBuffer(size_t stage, u32 binding_index, Buffer& buffer, u32 offset, u32 size); + + void BindComputeUniformBuffer(u32 binding_index, Buffer& buffer, u32 offset, u32 size); + + void BindStorageBuffer(size_t stage, u32 binding_index, Buffer& buffer, u32 offset, u32 size, + bool is_written); + + void BindComputeStorageBuffer(u32 binding_index, Buffer& buffer, u32 offset, u32 size, + bool is_written); + + void BindTransformFeedbackBuffer(u32 index, Buffer& buffer, u32 offset, u32 size); + + void BindFastUniformBuffer(size_t stage, u32 binding_index, u32 size) { + if (use_assembly_shaders) { + const GLuint handle = fast_uniforms[stage][binding_index].handle; + const GLsizeiptr gl_size = static_cast<GLsizeiptr>(size); + glBindBufferRangeNV(PABO_LUT[stage], binding_index, handle, 0, gl_size); + } else { + const GLuint base_binding = device.GetBaseBindings(stage).uniform_buffer; + const GLuint binding = base_binding + binding_index; + glBindBufferRange(GL_UNIFORM_BUFFER, binding, + fast_uniforms[stage][binding_index].handle, 0, + static_cast<GLsizeiptr>(size)); + } + } - BufferInfo GetEmptyBuffer(std::size_t) override; + void PushFastUniformBuffer(size_t stage, u32 binding_index, std::span<const u8> data) { + if (use_assembly_shaders) { + glProgramBufferParametersIuivNV( + PABO_LUT[stage], binding_index, 0, + static_cast<GLsizei>(data.size_bytes() / sizeof(GLuint)), + reinterpret_cast<const GLuint*>(data.data())); + } else { + glNamedBufferSubData(fast_uniforms[stage][binding_index].handle, 0, + static_cast<GLsizeiptr>(data.size_bytes()), data.data()); + } + } - void Acquire() noexcept { - cbuf_cursor = 0; + std::span<u8> BindMappedUniformBuffer(size_t stage, u32 binding_index, u32 size) noexcept { + const auto [mapped_span, offset] = stream_buffer->Request(static_cast<size_t>(size)); + const GLuint base_binding = device.GetBaseBindings(stage).uniform_buffer; + const GLuint binding = base_binding + binding_index; + glBindBufferRange(GL_UNIFORM_BUFFER, binding, stream_buffer->Handle(), + static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size)); + return mapped_span; } -protected: - std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override; + [[nodiscard]] const GLvoid* IndexOffset() const noexcept { + return reinterpret_cast<const GLvoid*>(static_cast<uintptr_t>(index_buffer_offset)); + } - BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override; + [[nodiscard]] bool HasFastBufferSubData() const noexcept { + return has_fast_buffer_sub_data; + } private: - static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * - Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; + static constexpr std::array PABO_LUT{ + GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV, + GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV, + GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV, + }; const Device& device; - std::size_t cbuf_cursor = 0; - std::array<GLuint, NUM_CBUFS> cbufs{}; + bool has_fast_buffer_sub_data = false; + bool use_assembly_shaders = false; + bool has_unified_vertex_buffers = false; + + u32 max_attributes = 0; + + std::optional<StreamBuffer> stream_buffer; + + std::array<std::array<OGLBuffer, VideoCommon::NUM_GRAPHICS_UNIFORM_BUFFERS>, + VideoCommon::NUM_STAGES> + fast_uniforms; + std::array<std::array<OGLBuffer, VideoCommon::NUM_GRAPHICS_UNIFORM_BUFFERS>, + VideoCommon::NUM_STAGES> + copy_uniforms; + std::array<OGLBuffer, VideoCommon::NUM_COMPUTE_UNIFORM_BUFFERS> copy_compute_uniforms; + + u32 index_buffer_offset = 0; +}; + +struct BufferCacheParams { + using Runtime = OpenGL::BufferCacheRuntime; + using Buffer = OpenGL::Buffer; + + static constexpr bool IS_OPENGL = true; + static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = true; + static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = true; + static constexpr bool NEEDS_BIND_UNIFORM_INDEX = true; + static constexpr bool NEEDS_BIND_STORAGE_INDEX = true; + static constexpr bool USE_MEMORY_MAPS = false; }; +using BufferCache = VideoCommon::BufferCache<BufferCacheParams>; + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 04c267ee4..48d5c4a5e 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -21,9 +21,7 @@ #include "video_core/renderer_opengl/gl_resource_manager.h" namespace OpenGL { - namespace { - // One uniform block is reserved for emulation purposes constexpr u32 ReservedUniformBlocks = 1; @@ -197,11 +195,13 @@ bool IsASTCSupported() { const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); return nsight || HasExtension(extensions, "GL_EXT_debug_tool"); } - } // Anonymous namespace -Device::Device() - : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} { +Device::Device() { + if (!GLAD_GL_VERSION_4_6) { + LOG_ERROR(Render_OpenGL, "OpenGL 4.6 is not available"); + throw std::runtime_error{"Insufficient version"}; + } const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION)); const std::vector extensions = GetExtensions(); @@ -217,6 +217,9 @@ Device::Device() "Beta driver 443.24 is known to have issues. There might be performance issues."); disable_fast_buffer_sub_data = true; } + + max_uniform_buffers = BuildMaxUniformBuffers(); + base_bindings = BuildBaseBindings(); uniform_buffer_alignment = GetInteger<size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); shader_storage_alignment = GetInteger<size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS); diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index 9141de635..ee053776d 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -10,11 +10,9 @@ namespace OpenGL { -static constexpr u32 EmulationUniformBlockBinding = 0; - -class Device final { +class Device { public: - struct BaseBindings final { + struct BaseBindings { u32 uniform_buffer{}; u32 shader_storage_buffer{}; u32 sampler{}; diff --git a/src/video_core/renderer_opengl/gl_fence_manager.cpp b/src/video_core/renderer_opengl/gl_fence_manager.cpp index 3e9c922f5..151290101 100644 --- a/src/video_core/renderer_opengl/gl_fence_manager.cpp +++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp @@ -47,7 +47,7 @@ void GLInnerFence::Wait() { FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, TextureCache& texture_cache_, - OGLBufferCache& buffer_cache_, QueryCache& query_cache_) + BufferCache& buffer_cache_, QueryCache& query_cache_) : GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_} {} Fence FenceManagerOpenGL::CreateFence(u32 value, bool is_stubbed) { diff --git a/src/video_core/renderer_opengl/gl_fence_manager.h b/src/video_core/renderer_opengl/gl_fence_manager.h index 30dbee613..e714aa115 100644 --- a/src/video_core/renderer_opengl/gl_fence_manager.h +++ b/src/video_core/renderer_opengl/gl_fence_manager.h @@ -32,14 +32,13 @@ private: }; using Fence = std::shared_ptr<GLInnerFence>; -using GenericFenceManager = - VideoCommon::FenceManager<Fence, TextureCache, OGLBufferCache, QueryCache>; +using GenericFenceManager = VideoCommon::FenceManager<Fence, TextureCache, BufferCache, QueryCache>; class FenceManagerOpenGL final : public GenericFenceManager { public: - explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, - TextureCache& texture_cache_, OGLBufferCache& buffer_cache_, - QueryCache& query_cache_); + explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, + TextureCache& texture_cache, BufferCache& buffer_cache, + QueryCache& query_cache); protected: Fence CreateFence(u32 value, bool is_stubbed) override; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index ea4ca9a82..ecffc6abf 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -44,28 +44,14 @@ using VideoCore::Surface::PixelFormat; using VideoCore::Surface::SurfaceTarget; using VideoCore::Surface::SurfaceType; -MICROPROFILE_DEFINE(OpenGL_VAO, "OpenGL", "Vertex Format Setup", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_VB, "OpenGL", "Vertex Buffer Setup", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_Shader, "OpenGL", "Shader Setup", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_UBO, "OpenGL", "Const Buffer Setup", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_Index, "OpenGL", "Index Buffer Setup", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_Texture, "OpenGL", "Texture Setup", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_Framebuffer, "OpenGL", "Framebuffer Setup", MP_RGB(128, 128, 192)); MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192)); +MICROPROFILE_DEFINE(OpenGL_Clears, "OpenGL", "Clears", MP_RGB(128, 128, 192)); MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100)); -MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255, 100, 100)); +MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Management", MP_RGB(100, 255, 100)); namespace { -constexpr size_t NUM_CONST_BUFFERS_PER_STAGE = 18; -constexpr size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE = - NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize; -constexpr size_t TOTAL_CONST_BUFFER_BYTES = - NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage; - constexpr size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16; -constexpr size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16; struct TextureHandle { constexpr TextureHandle(u32 data, bool via_header_index) { @@ -101,20 +87,6 @@ TextureHandle GetTextureInfo(const Engine& engine, bool via_header_index, const return TextureHandle(engine.AccessConstBuffer32(shader_type, buffer, offset), via_header_index); } -std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer, - const ConstBufferEntry& entry) { - if (!entry.IsIndirect()) { - return entry.GetSize(); - } - if (buffer.size > Maxwell::MaxConstBufferSize) { - LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", buffer.size, - Maxwell::MaxConstBufferSize); - return Maxwell::MaxConstBufferSize; - } - - return buffer.size; -} - /// Translates hardware transform feedback indices /// @param location Hardware location /// @return Pair of ARB_transform_feedback3 token stream first and third arguments @@ -147,14 +119,6 @@ void oglEnable(GLenum cap, bool state) { (state ? glEnable : glDisable)(cap); } -void UpdateBindlessSSBOs(GLenum target, const BindlessSSBO* ssbos, size_t num_ssbos) { - if (num_ssbos == 0) { - return; - } - glProgramLocalParametersI4uivNV(target, 0, static_cast<GLsizei>(num_ssbos), - reinterpret_cast<const GLuint*>(ssbos)); -} - ImageViewType ImageViewTypeFromEntry(const SamplerEntry& entry) { if (entry.is_buffer) { return ImageViewType::Buffer; @@ -201,44 +165,28 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra : RasterizerAccelerated(cpu_memory_), gpu(gpu_), maxwell3d(gpu.Maxwell3D()), kepler_compute(gpu.KeplerCompute()), gpu_memory(gpu.MemoryManager()), device(device_), screen_info(screen_info_), program_manager(program_manager_), state_tracker(state_tracker_), - stream_buffer(device, state_tracker), texture_cache_runtime(device, program_manager, state_tracker), texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory), + buffer_cache_runtime(device), + buffer_cache(*this, maxwell3d, kepler_compute, gpu_memory, cpu_memory_, buffer_cache_runtime), shader_cache(*this, emu_window_, gpu, maxwell3d, kepler_compute, gpu_memory, device), query_cache(*this, maxwell3d, gpu_memory), - buffer_cache(*this, gpu_memory, cpu_memory_, device, stream_buffer, state_tracker), fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache), async_shaders(emu_window_) { - unified_uniform_buffer.Create(); - glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0); - - if (device.UseAssemblyShaders()) { - glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data()); - for (const GLuint cbuf : staging_cbufs) { - glNamedBufferStorage(cbuf, static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize), - nullptr, 0); - } - } if (device.UseAsynchronousShaders()) { async_shaders.AllocateWorkers(); } } -RasterizerOpenGL::~RasterizerOpenGL() { - if (device.UseAssemblyShaders()) { - glDeleteBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data()); - } -} +RasterizerOpenGL::~RasterizerOpenGL() = default; -void RasterizerOpenGL::SetupVertexFormat() { +void RasterizerOpenGL::SyncVertexFormats() { auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::VertexFormats]) { return; } flags[Dirty::VertexFormats] = false; - MICROPROFILE_SCOPE(OpenGL_VAO); - // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL. Enables // the first 16 vertex attributes always, as we don't know which ones are actually used until // shader time. Note, Tegra technically supports 32, but we're capping this to 16 for now to @@ -274,55 +222,7 @@ void RasterizerOpenGL::SetupVertexFormat() { } } -void RasterizerOpenGL::SetupVertexBuffer() { - auto& flags = maxwell3d.dirty.flags; - if (!flags[Dirty::VertexBuffers]) { - return; - } - flags[Dirty::VertexBuffers] = false; - - MICROPROFILE_SCOPE(OpenGL_VB); - - const bool use_unified_memory = device.HasVertexBufferUnifiedMemory(); - - // Upload all guest vertex arrays sequentially to our buffer - const auto& regs = maxwell3d.regs; - for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) { - if (!flags[Dirty::VertexBuffer0 + index]) { - continue; - } - flags[Dirty::VertexBuffer0 + index] = false; - - const auto& vertex_array = regs.vertex_array[index]; - if (!vertex_array.IsEnabled()) { - continue; - } - - const GPUVAddr start = vertex_array.StartAddress(); - const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); - ASSERT(end >= start); - - const GLuint gl_index = static_cast<GLuint>(index); - const u64 size = end - start; - if (size == 0) { - glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride); - if (use_unified_memory) { - glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0); - } - continue; - } - const auto info = buffer_cache.UploadMemory(start, size); - if (use_unified_memory) { - glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride); - glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, - info.address + info.offset, size); - } else { - glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride); - } - } -} - -void RasterizerOpenGL::SetupVertexInstances() { +void RasterizerOpenGL::SyncVertexInstances() { auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::VertexInstances]) { return; @@ -343,17 +243,7 @@ void RasterizerOpenGL::SetupVertexInstances() { } } -GLintptr RasterizerOpenGL::SetupIndexBuffer() { - MICROPROFILE_SCOPE(OpenGL_Index); - const auto& regs = maxwell3d.regs; - const std::size_t size = CalculateIndexBufferSize(); - const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size); - glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle); - return info.offset; -} - -void RasterizerOpenGL::SetupShaders() { - MICROPROFILE_SCOPE(OpenGL_Shader); +void RasterizerOpenGL::SetupShaders(bool is_indexed) { u32 clip_distances = 0; std::array<Shader*, Maxwell::MaxShaderStage> shaders{}; @@ -410,11 +300,19 @@ void RasterizerOpenGL::SetupShaders() { const size_t stage = index == 0 ? 0 : index - 1; shaders[stage] = shader; - SetupDrawConstBuffers(stage, shader); - SetupDrawGlobalMemory(stage, shader); SetupDrawTextures(shader, stage); SetupDrawImages(shader, stage); + buffer_cache.SetEnabledUniformBuffers(stage, shader->GetEntries().enabled_uniform_buffers); + + buffer_cache.UnbindGraphicsStorageBuffers(stage); + u32 ssbo_index = 0; + for (const auto& buffer : shader->GetEntries().global_memory_entries) { + buffer_cache.BindGraphicsStorageBuffer(stage, ssbo_index, buffer.cbuf_index, + buffer.cbuf_offset, buffer.is_written); + ++ssbo_index; + } + // Workaround for Intel drivers. // When a clip distance is enabled but not set in the shader it crops parts of the screen // (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the @@ -430,43 +328,26 @@ void RasterizerOpenGL::SetupShaders() { SyncClipEnabled(clip_distances); maxwell3d.dirty.flags[Dirty::Shaders] = false; + buffer_cache.UpdateGraphicsBuffers(is_indexed); + const std::span indices_span(image_view_indices.data(), image_view_indices.size()); texture_cache.FillGraphicsImageViews(indices_span, image_view_ids); + buffer_cache.BindHostGeometryBuffers(is_indexed); + size_t image_view_index = 0; size_t texture_index = 0; size_t image_index = 0; for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) { const Shader* const shader = shaders[stage]; - if (shader) { - const auto base = device.GetBaseBindings(stage); - BindTextures(shader->GetEntries(), base.sampler, base.image, image_view_index, - texture_index, image_index); - } - } -} - -std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { - const auto& regs = maxwell3d.regs; - - std::size_t size = 0; - for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { - if (!regs.vertex_array[index].IsEnabled()) + if (!shader) { continue; - - const GPUVAddr start = regs.vertex_array[index].StartAddress(); - const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); - - size += end - start; - ASSERT(end >= start); + } + buffer_cache.BindHostStageBuffers(stage); + const auto& base = device.GetBaseBindings(stage); + BindTextures(shader->GetEntries(), base.sampler, base.image, image_view_index, + texture_index, image_index); } - - return size; -} - -std::size_t RasterizerOpenGL::CalculateIndexBufferSize() const { - return static_cast<std::size_t>(maxwell3d.regs.index_array.count) * - static_cast<std::size_t>(maxwell3d.regs.index_array.FormatSizeInBytes()); } void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, @@ -475,6 +356,7 @@ void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& s } void RasterizerOpenGL::Clear() { + MICROPROFILE_SCOPE(OpenGL_Clears); if (!maxwell3d.ShouldExecute()) { return; } @@ -525,11 +407,9 @@ void RasterizerOpenGL::Clear() { } UNIMPLEMENTED_IF(regs.clear_flags.viewport); - { - auto lock = texture_cache.AcquireLock(); - texture_cache.UpdateRenderTargets(true); - state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle()); - } + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.UpdateRenderTargets(true); + state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle()); if (use_color) { glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color); @@ -541,7 +421,6 @@ void RasterizerOpenGL::Clear() { } else if (use_stencil) { glClearBufferiv(GL_STENCIL, 0, ®s.clear_stencil); } - ++num_queued_commands; } @@ -550,75 +429,12 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { query_cache.UpdateCounters(); - SyncViewport(); - SyncRasterizeEnable(); - SyncPolygonModes(); - SyncColorMask(); - SyncFragmentColorClampState(); - SyncMultiSampleState(); - SyncDepthTestState(); - SyncDepthClamp(); - SyncStencilTestState(); - SyncBlendState(); - SyncLogicOpState(); - SyncCullMode(); - SyncPrimitiveRestart(); - SyncScissorTest(); - SyncPointState(); - SyncLineState(); - SyncPolygonOffset(); - SyncAlphaTest(); - SyncFramebufferSRGB(); - - buffer_cache.Acquire(); - current_cbuf = 0; - - std::size_t buffer_size = CalculateVertexArraysSize(); - - // Add space for index buffer - if (is_indexed) { - buffer_size = Common::AlignUp(buffer_size, 4) + CalculateIndexBufferSize(); - } - - // Uniform space for the 5 shader stages - buffer_size = - Common::AlignUp<std::size_t>(buffer_size, 4) + - (sizeof(MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage; - - // Add space for at least 18 constant buffers - buffer_size += Maxwell::MaxConstBuffers * - (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); - - // Prepare the vertex array. - buffer_cache.Map(buffer_size); - - // Prepare vertex array format. - SetupVertexFormat(); - - // Upload vertex and index data. - SetupVertexBuffer(); - SetupVertexInstances(); - GLintptr index_buffer_offset = 0; - if (is_indexed) { - index_buffer_offset = SetupIndexBuffer(); - } - - // Setup emulation uniform buffer. - if (!device.UseAssemblyShaders()) { - MaxwellUniformData ubo; - ubo.SetFromRegs(maxwell3d); - const auto info = - buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); - glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset, - static_cast<GLsizeiptr>(sizeof(ubo))); - } + SyncState(); // Setup shaders and their used resources. - auto lock = texture_cache.AcquireLock(); - SetupShaders(); + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; + SetupShaders(is_indexed); - // Signal the buffer cache that we are not going to upload more things. - buffer_cache.Unmap(); texture_cache.UpdateRenderTargets(false); state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle()); program_manager.BindGraphicsPipeline(); @@ -632,7 +448,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { if (is_indexed) { const GLint base_vertex = static_cast<GLint>(maxwell3d.regs.vb_element_base); const GLsizei num_vertices = static_cast<GLsizei>(maxwell3d.regs.index_array.count); - const GLvoid* offset = reinterpret_cast<const GLvoid*>(index_buffer_offset); + const GLvoid* const offset = buffer_cache_runtime.IndexOffset(); const GLenum format = MaxwellToGL::IndexFormat(maxwell3d.regs.index_array.format); if (num_instances == 1 && base_instance == 0 && base_vertex == 0) { glDrawElements(primitive_mode, num_vertices, format, offset); @@ -672,22 +488,22 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { } void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { - buffer_cache.Acquire(); - current_cbuf = 0; - Shader* const kernel = shader_cache.GetComputeKernel(code_addr); - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; BindComputeTextures(kernel); - const size_t buffer_size = Tegra::Engines::KeplerCompute::NumConstBuffers * - (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); - buffer_cache.Map(buffer_size); - - SetupComputeConstBuffers(kernel); - SetupComputeGlobalMemory(kernel); - - buffer_cache.Unmap(); + const auto& entries = kernel->GetEntries(); + buffer_cache.SetEnabledComputeUniformBuffers(entries.enabled_uniform_buffers); + buffer_cache.UnbindComputeStorageBuffers(); + u32 ssbo_index = 0; + for (const auto& buffer : entries.global_memory_entries) { + buffer_cache.BindComputeStorageBuffer(ssbo_index, buffer.cbuf_index, buffer.cbuf_offset, + buffer.is_written); + ++ssbo_index; + } + buffer_cache.UpdateComputeBuffers(); + buffer_cache.BindHostComputeBuffers(); const auto& launch_desc = kepler_compute.launch_description; glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); @@ -703,6 +519,12 @@ void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, query_cache.Query(gpu_addr, type, timestamp); } +void RasterizerOpenGL::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, + u32 size) { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.BindGraphicsUniformBuffer(stage, index, gpu_addr, size); +} + void RasterizerOpenGL::FlushAll() {} void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) { @@ -711,19 +533,23 @@ void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) { return; } { - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; texture_cache.DownloadMemory(addr, size); } - buffer_cache.FlushRegion(addr, size); + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.DownloadMemory(addr, size); + } query_cache.FlushRegion(addr, size); } bool RasterizerOpenGL::MustFlushRegion(VAddr addr, u64 size) { + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; if (!Settings::IsGPULevelHigh()) { - return buffer_cache.MustFlushRegion(addr, size); + return buffer_cache.IsRegionGpuModified(addr, size); } return texture_cache.IsRegionGpuModified(addr, size) || - buffer_cache.MustFlushRegion(addr, size); + buffer_cache.IsRegionGpuModified(addr, size); } void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { @@ -732,11 +558,14 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { return; } { - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; texture_cache.WriteMemory(addr, size); } + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.WriteMemory(addr, size); + } shader_cache.InvalidateRegion(addr, size); - buffer_cache.InvalidateRegion(addr, size); query_cache.InvalidateRegion(addr, size); } @@ -745,26 +574,35 @@ void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) { if (addr == 0 || size == 0) { return; } + shader_cache.OnCPUWrite(addr, size); { - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; texture_cache.WriteMemory(addr, size); } - shader_cache.OnCPUWrite(addr, size); - buffer_cache.OnCPUWrite(addr, size); + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.CachedWriteMemory(addr, size); + } } void RasterizerOpenGL::SyncGuestHost() { MICROPROFILE_SCOPE(OpenGL_CacheManagement); - buffer_cache.SyncGuestHost(); shader_cache.SyncGuestHost(); + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.FlushCachedWrites(); + } } void RasterizerOpenGL::UnmapMemory(VAddr addr, u64 size) { { - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; texture_cache.UnmapMemory(addr, size); } - buffer_cache.OnCPUWrite(addr, size); + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.WriteMemory(addr, size); + } shader_cache.OnCPUWrite(addr, size); } @@ -799,14 +637,7 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) { } void RasterizerOpenGL::WaitForIdle() { - // Place a barrier on everything that is not framebuffer related. - // This is related to another flag that is not currently implemented. - glMemoryBarrier(GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT | GL_ELEMENT_ARRAY_BARRIER_BIT | - GL_UNIFORM_BARRIER_BIT | GL_TEXTURE_FETCH_BARRIER_BIT | - GL_SHADER_IMAGE_ACCESS_BARRIER_BIT | GL_COMMAND_BARRIER_BIT | - GL_PIXEL_BUFFER_BARRIER_BIT | GL_TEXTURE_UPDATE_BARRIER_BIT | - GL_BUFFER_UPDATE_BARRIER_BIT | GL_TRANSFORM_FEEDBACK_BARRIER_BIT | - GL_SHADER_STORAGE_BARRIER_BIT | GL_QUERY_BUFFER_BARRIER_BIT); + glMemoryBarrier(GL_ALL_BARRIER_BITS); } void RasterizerOpenGL::FragmentBarrier() { @@ -831,18 +662,21 @@ void RasterizerOpenGL::TickFrame() { num_queued_commands = 0; fence_manager.TickFrame(); - buffer_cache.TickFrame(); { - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; texture_cache.TickFrame(); } + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.TickFrame(); + } } bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) { MICROPROFILE_SCOPE(OpenGL_Blits); - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; texture_cache.BlitImage(dst, src, copy_config); return true; } @@ -854,7 +688,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, } MICROPROFILE_SCOPE(OpenGL_CacheManagement); - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; ImageView* const image_view{texture_cache.TryFindFramebufferImageView(framebuffer_addr)}; if (!image_view) { return false; @@ -921,166 +755,6 @@ void RasterizerOpenGL::BindTextures(const ShaderEntries& entries, GLuint base_te } } -void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) { - static constexpr std::array PARAMETER_LUT{ - GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV, - GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV, - GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV, - }; - MICROPROFILE_SCOPE(OpenGL_UBO); - const auto& stages = maxwell3d.state.shader_stages; - const auto& shader_stage = stages[stage_index]; - const auto& entries = shader->GetEntries(); - const bool use_unified = entries.use_unified_uniforms; - const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE; - - const auto base_bindings = device.GetBaseBindings(stage_index); - u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer; - for (const auto& entry : entries.const_buffers) { - const u32 index = entry.GetIndex(); - const auto& buffer = shader_stage.const_buffers[index]; - SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified, - base_unified_offset + index * Maxwell::MaxConstBufferSize); - ++binding; - } - if (use_unified) { - const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer + - entries.global_memory_entries.size()); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, - base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE); - } -} - -void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) { - MICROPROFILE_SCOPE(OpenGL_UBO); - const auto& launch_desc = kepler_compute.launch_description; - const auto& entries = kernel->GetEntries(); - const bool use_unified = entries.use_unified_uniforms; - - u32 binding = 0; - for (const auto& entry : entries.const_buffers) { - const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; - const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value(); - Tegra::Engines::ConstBufferInfo buffer; - buffer.address = config.Address(); - buffer.size = config.size; - buffer.enabled = mask[entry.GetIndex()]; - SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry, - use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize); - ++binding; - } - if (use_unified) { - const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size()); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0, - NUM_CONST_BUFFERS_BYTES_PER_STAGE); - } -} - -void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, - const Tegra::Engines::ConstBufferInfo& buffer, - const ConstBufferEntry& entry, bool use_unified, - std::size_t unified_offset) { - if (!buffer.enabled) { - // Set values to zero to unbind buffers - if (device.UseAssemblyShaders()) { - glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0); - } else { - glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float)); - } - return; - } - - // Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140 - // UBO alignment requirements. - const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4)); - - const bool fast_upload = !use_unified && device.HasFastBufferSubData(); - - const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment(); - const GPUVAddr gpu_addr = buffer.address; - auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload); - - if (device.UseAssemblyShaders()) { - UNIMPLEMENTED_IF(use_unified); - if (info.offset != 0) { - const GLuint staging_cbuf = staging_cbufs[current_cbuf++]; - glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size); - info.handle = staging_cbuf; - info.offset = 0; - } - glBindBufferRangeNV(stage, binding, info.handle, info.offset, size); - return; - } - - if (use_unified) { - glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset, - unified_offset, size); - } else { - glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size); - } -} - -void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) { - static constexpr std::array TARGET_LUT = { - GL_VERTEX_PROGRAM_NV, GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV, - GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV, - }; - const auto& cbufs{maxwell3d.state.shader_stages[stage_index]}; - const auto& entries{shader->GetEntries().global_memory_entries}; - - std::array<BindlessSSBO, 32> ssbos; - ASSERT(entries.size() < ssbos.size()); - - const bool assembly_shaders = device.UseAssemblyShaders(); - u32 binding = assembly_shaders ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer; - for (const auto& entry : entries) { - const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset}; - const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)}; - const u32 size{gpu_memory.Read<u32>(addr + 8)}; - SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]); - ++binding; - } - if (assembly_shaders) { - UpdateBindlessSSBOs(TARGET_LUT[stage_index], ssbos.data(), entries.size()); - } -} - -void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) { - const auto& cbufs{kepler_compute.launch_description.const_buffer_config}; - const auto& entries{kernel->GetEntries().global_memory_entries}; - - std::array<BindlessSSBO, 32> ssbos; - ASSERT(entries.size() < ssbos.size()); - - u32 binding = 0; - for (const auto& entry : entries) { - const GPUVAddr addr{cbufs[entry.cbuf_index].Address() + entry.cbuf_offset}; - const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)}; - const u32 size{gpu_memory.Read<u32>(addr + 8)}; - SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]); - ++binding; - } - if (device.UseAssemblyShaders()) { - UpdateBindlessSSBOs(GL_COMPUTE_PROGRAM_NV, ssbos.data(), ssbos.size()); - } -} - -void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, - GPUVAddr gpu_addr, size_t size, BindlessSSBO* ssbo) { - const size_t alignment{device.GetShaderStorageBufferAlignment()}; - const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written); - if (device.UseAssemblyShaders()) { - *ssbo = BindlessSSBO{ - .address = static_cast<GLuint64EXT>(info.address + info.offset), - .length = static_cast<GLsizei>(size), - .padding = 0, - }; - } else { - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset, - static_cast<GLsizeiptr>(size)); - } -} - void RasterizerOpenGL::SetupDrawTextures(const Shader* shader, size_t stage_index) { const bool via_header_index = maxwell3d.regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; @@ -1128,6 +802,30 @@ void RasterizerOpenGL::SetupComputeImages(const Shader* shader) { } } +void RasterizerOpenGL::SyncState() { + SyncViewport(); + SyncRasterizeEnable(); + SyncPolygonModes(); + SyncColorMask(); + SyncFragmentColorClampState(); + SyncMultiSampleState(); + SyncDepthTestState(); + SyncDepthClamp(); + SyncStencilTestState(); + SyncBlendState(); + SyncLogicOpState(); + SyncCullMode(); + SyncPrimitiveRestart(); + SyncScissorTest(); + SyncPointState(); + SyncLineState(); + SyncPolygonOffset(); + SyncAlphaTest(); + SyncFramebufferSRGB(); + SyncVertexFormats(); + SyncVertexInstances(); +} + void RasterizerOpenGL::SyncViewport() { auto& flags = maxwell3d.dirty.flags; const auto& regs = maxwell3d.regs; @@ -1163,9 +861,11 @@ void RasterizerOpenGL::SyncViewport() { if (regs.screen_y_control.y_negate != 0) { flip_y = !flip_y; } - glClipControl(flip_y ? GL_UPPER_LEFT : GL_LOWER_LEFT, - regs.depth_mode == Maxwell::DepthMode::ZeroToOne ? GL_ZERO_TO_ONE - : GL_NEGATIVE_ONE_TO_ONE); + const bool is_zero_to_one = regs.depth_mode == Maxwell::DepthMode::ZeroToOne; + const GLenum origin = flip_y ? GL_UPPER_LEFT : GL_LOWER_LEFT; + const GLenum depth = is_zero_to_one ? GL_ZERO_TO_ONE : GL_NEGATIVE_ONE_TO_ONE; + state_tracker.ClipControl(origin, depth); + state_tracker.SetYNegate(regs.screen_y_control.y_negate != 0); } if (dirty_viewport) { @@ -1649,36 +1349,13 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) { if (regs.tfb_enabled == 0) { return; } - if (device.UseAssemblyShaders()) { SyncTransformFeedback(); } - UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry)); - - for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) { - const auto& binding = regs.tfb_bindings[index]; - if (!binding.buffer_enable) { - if (enabled_transform_feedback_buffers[index]) { - glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), 0, 0, - 0); - } - enabled_transform_feedback_buffers[index] = false; - continue; - } - enabled_transform_feedback_buffers[index] = true; - - auto& tfb_buffer = transform_feedback_buffers[index]; - tfb_buffer.Create(); - - const GLuint handle = tfb_buffer.handle; - const std::size_t size = binding.buffer_size; - glNamedBufferData(handle, static_cast<GLsizeiptr>(size), nullptr, GL_STREAM_COPY); - glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), handle, 0, - static_cast<GLsizeiptr>(size)); - } + UNIMPLEMENTED_IF(primitive_mode != GL_POINTS); // We may have to call BeginTransformFeedbackNV here since they seem to call different // implementations on Nvidia's driver (the pointer is different) but we are using @@ -1692,23 +1369,7 @@ void RasterizerOpenGL::EndTransformFeedback() { if (regs.tfb_enabled == 0) { return; } - glEndTransformFeedback(); - - for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) { - const auto& binding = regs.tfb_bindings[index]; - if (!binding.buffer_enable) { - continue; - } - UNIMPLEMENTED_IF(binding.buffer_offset != 0); - - const GLuint handle = transform_feedback_buffers[index].handle; - const GPUVAddr gpu_addr = binding.Address(); - const std::size_t size = binding.buffer_size; - const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true); - glCopyNamedBufferSubData(handle, info.handle, 0, info.offset, - static_cast<GLsizeiptr>(size)); - } } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 82e03e677..3745cf637 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -30,7 +30,6 @@ #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_state_tracker.h" -#include "video_core/renderer_opengl/gl_stream_buffer.h" #include "video_core/renderer_opengl/gl_texture_cache.h" #include "video_core/shader/async_shaders.h" #include "video_core/textures/texture.h" @@ -72,6 +71,7 @@ public: void DispatchCompute(GPUVAddr code_addr) override; void ResetCounter(VideoCore::QueryType type) override; void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; + void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; void FlushAll() override; void FlushRegion(VAddr addr, u64 size) override; bool MustFlushRegion(VAddr addr, u64 size) override; @@ -119,27 +119,6 @@ private: void BindTextures(const ShaderEntries& entries, GLuint base_texture, GLuint base_image, size_t& image_view_index, size_t& texture_index, size_t& image_index); - /// Configures the current constbuffers to use for the draw command. - void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader); - - /// Configures the current constbuffers to use for the kernel invocation. - void SetupComputeConstBuffers(Shader* kernel); - - /// Configures a constant buffer. - void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, - const ConstBufferEntry& entry, bool use_unified, - std::size_t unified_offset); - - /// Configures the current global memory entries to use for the draw command. - void SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader); - - /// Configures the current global memory entries to use for the kernel invocation. - void SetupComputeGlobalMemory(Shader* kernel); - - /// Configures a global memory buffer. - void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr, - size_t size, BindlessSSBO* ssbo); - /// Configures the current textures to use for the draw command. void SetupDrawTextures(const Shader* shader, size_t stage_index); @@ -152,6 +131,9 @@ private: /// Configures images in a compute shader. void SetupComputeImages(const Shader* shader); + /// Syncs state to match guest's + void SyncState(); + /// Syncs the viewport and depth range to match the guest state void SyncViewport(); @@ -215,6 +197,12 @@ private: /// Syncs the framebuffer sRGB state to match the guest state void SyncFramebufferSRGB(); + /// Syncs vertex formats to match the guest state + void SyncVertexFormats(); + + /// Syncs vertex instances to match the guest state + void SyncVertexInstances(); + /// Syncs transform feedback state to match guest state /// @note Only valid on assembly shaders void SyncTransformFeedback(); @@ -225,19 +213,7 @@ private: /// End a transform feedback void EndTransformFeedback(); - std::size_t CalculateVertexArraysSize() const; - - std::size_t CalculateIndexBufferSize() const; - - /// Updates the current vertex format - void SetupVertexFormat(); - - void SetupVertexBuffer(); - void SetupVertexInstances(); - - GLintptr SetupIndexBuffer(); - - void SetupShaders(); + void SetupShaders(bool is_indexed); Tegra::GPU& gpu; Tegra::Engines::Maxwell3D& maxwell3d; @@ -249,12 +225,12 @@ private: ProgramManager& program_manager; StateTracker& state_tracker; - OGLStreamBuffer stream_buffer; TextureCacheRuntime texture_cache_runtime; TextureCache texture_cache; + BufferCacheRuntime buffer_cache_runtime; + BufferCache buffer_cache; ShaderCacheOpenGL shader_cache; QueryCache query_cache; - OGLBufferCache buffer_cache; FenceManagerOpenGL fence_manager; VideoCommon::Shader::AsyncShaders async_shaders; @@ -262,20 +238,8 @@ private: boost::container::static_vector<u32, MAX_IMAGE_VIEWS> image_view_indices; std::array<ImageViewId, MAX_IMAGE_VIEWS> image_view_ids; boost::container::static_vector<GLuint, MAX_TEXTURES> sampler_handles; - std::array<GLuint, MAX_TEXTURES> texture_handles; - std::array<GLuint, MAX_IMAGES> image_handles; - - std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers> - transform_feedback_buffers; - std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers> - enabled_transform_feedback_buffers; - - static constexpr std::size_t NUM_CONSTANT_BUFFERS = - Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * - Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; - std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{}; - std::size_t current_cbuf = 0; - OGLBuffer unified_uniform_buffer; + std::array<GLuint, MAX_TEXTURES> texture_handles{}; + std::array<GLuint, MAX_IMAGES> image_handles{}; /// Number of commands queued to the OpenGL driver. Resetted on flush. std::size_t num_queued_commands = 0; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index 0e34a0f20..3428e5e21 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -171,12 +171,6 @@ void OGLBuffer::Release() { handle = 0; } -void OGLBuffer::MakeStreamCopy(std::size_t buffer_size) { - ASSERT_OR_EXECUTE((handle != 0 && buffer_size != 0), { return; }); - - glNamedBufferData(handle, buffer_size, nullptr, GL_STREAM_COPY); -} - void OGLSync::Create() { if (handle != 0) return; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index f48398669..552d79db4 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -234,9 +234,6 @@ public: /// Deletes the internal OpenGL resource void Release(); - // Converts the buffer into a stream copy buffer with a fixed size - void MakeStreamCopy(std::size_t buffer_size); - GLuint handle = 0; }; diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index c35b71b6b..ac78d344c 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -64,7 +64,7 @@ using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument> constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32); constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32); -constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt +constexpr std::string_view COMMON_DECLARATIONS = R"(#define ftoi floatBitsToInt #define ftou floatBitsToUint #define itof intBitsToFloat #define utof uintBitsToFloat @@ -77,10 +77,6 @@ bvec2 HalfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {{ const float fswzadd_modifiers_a[] = float[4](-1.0f, 1.0f, -1.0f, 0.0f ); const float fswzadd_modifiers_b[] = float[4](-1.0f, -1.0f, 1.0f, -1.0f ); - -layout (std140, binding = {}) uniform vs_config {{ - float y_direction; -}}; )"; class ShaderWriter final { @@ -402,13 +398,6 @@ std::string FlowStackTopName(MetaStackClass stack) { return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack)); } -bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) { - const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size()); - // We waste one UBO for emulation - const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1; - return num_ubos > num_available_ubos; -} - struct GenericVaryingDescription { std::string name; u8 first_element = 0; @@ -420,9 +409,8 @@ public: explicit GLSLDecompiler(const Device& device_, const ShaderIR& ir_, const Registry& registry_, ShaderType stage_, std::string_view identifier_, std::string_view suffix_) - : device{device_}, ir{ir_}, registry{registry_}, stage{stage_}, identifier{identifier_}, - suffix{suffix_}, header{ir.GetHeader()}, use_unified_uniforms{ - UseUnifiedUniforms(device_, ir_, stage_)} { + : device{device_}, ir{ir_}, registry{registry_}, stage{stage_}, + identifier{identifier_}, suffix{suffix_}, header{ir.GetHeader()} { if (stage != ShaderType::Compute) { transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); } @@ -516,7 +504,8 @@ private: if (!identifier.empty()) { code.AddLine("// {}", identifier); } - code.AddLine("#version 440 {}", ir.UsesLegacyVaryings() ? "compatibility" : "core"); + const bool use_compatibility = ir.UsesLegacyVaryings() || ir.UsesYNegate(); + code.AddLine("#version 440 {}", use_compatibility ? "compatibility" : "core"); code.AddLine("#extension GL_ARB_separate_shader_objects : enable"); if (device.HasShaderBallot()) { code.AddLine("#extension GL_ARB_shader_ballot : require"); @@ -542,7 +531,7 @@ private: code.AddNewLine(); - code.AddLine(CommonDeclarations, EmulationUniformBlockBinding); + code.AddLine(COMMON_DECLARATIONS); } void DeclareVertex() { @@ -865,17 +854,6 @@ private: } void DeclareConstantBuffers() { - if (use_unified_uniforms) { - const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer + - static_cast<u32>(ir.GetGlobalMemory().size()); - code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{", - binding); - code.AddLine(" uint cbufs[];"); - code.AddLine("}};"); - code.AddNewLine(); - return; - } - u32 binding = device.GetBaseBindings(stage).uniform_buffer; for (const auto& [index, info] : ir.GetConstantBuffers()) { const u32 num_elements = Common::DivCeil(info.GetSize(), 4 * sizeof(u32)); @@ -1081,29 +1059,17 @@ private: if (const auto cbuf = std::get_if<CbufNode>(&*node)) { const Node offset = cbuf->GetOffset(); - const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS; if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) { // Direct access const u32 offset_imm = immediate->GetValue(); ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access"); - if (use_unified_uniforms) { - return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4), - Type::Uint}; - } else { - return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), - offset_imm / (4 * 4), (offset_imm / 4) % 4), - Type::Uint}; - } - } - - // Indirect access - if (use_unified_uniforms) { - return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset, - Visit(offset).AsUint()), + return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), + offset_imm / (4 * 4), (offset_imm / 4) % 4), Type::Uint}; } + // Indirect access const std::string final_offset = code.GenerateTemporary(); code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint()); @@ -2293,7 +2259,6 @@ private: } } } - if (header.ps.omap.depth) { // The depth output is always 2 registers after the last color output, and current_reg // already contains one past the last color register. @@ -2337,7 +2302,8 @@ private: } Expression YNegate(Operation operation) { - return {"y_direction", Type::Float}; + // Y_NEGATE is mapped to this uniform value + return {"gl_FrontMaterial.ambient.a", Type::Float}; } template <u32 element> @@ -2787,7 +2753,6 @@ private: const std::string_view identifier; const std::string_view suffix; const Header header; - const bool use_unified_uniforms; std::unordered_map<u8, VaryingTFB> transform_feedback; ShaderWriter code; @@ -3003,8 +2968,10 @@ ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType s for (std::size_t i = 0; i < std::size(clip_distances); ++i) { entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i; } + for (const auto& buffer : entries.const_buffers) { + entries.enabled_uniform_buffers |= 1U << buffer.GetIndex(); + } entries.shader_length = ir.GetLength(); - entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage); return entries; } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index be68994bb..0397a000c 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h @@ -55,7 +55,7 @@ struct ShaderEntries { std::vector<ImageEntry> images; std::size_t shader_length{}; u32 clip_distances{}; - bool use_unified_uniforms{}; + u32 enabled_uniform_buffers{}; }; ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir, diff --git a/src/video_core/renderer_opengl/gl_state_tracker.cpp b/src/video_core/renderer_opengl/gl_state_tracker.cpp index 60e6fa39f..dbdf5230f 100644 --- a/src/video_core/renderer_opengl/gl_state_tracker.cpp +++ b/src/video_core/renderer_opengl/gl_state_tracker.cpp @@ -36,16 +36,10 @@ void SetupDirtyColorMasks(Tables& tables) { FillBlock(tables[1], OFF(color_mask), NUM(color_mask), ColorMasks); } -void SetupDirtyVertexArrays(Tables& tables) { - static constexpr std::size_t num_array = 3; +void SetupDirtyVertexInstances(Tables& tables) { static constexpr std::size_t instance_base_offset = 3; for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) { const std::size_t array_offset = OFF(vertex_array) + i * NUM(vertex_array[0]); - const std::size_t limit_offset = OFF(vertex_array_limit) + i * NUM(vertex_array_limit[0]); - - FillBlock(tables, array_offset, num_array, VertexBuffer0 + i, VertexBuffers); - FillBlock(tables, limit_offset, NUM(vertex_array_limit), VertexBuffer0 + i, VertexBuffers); - const std::size_t instance_array_offset = array_offset + instance_base_offset; tables[0][instance_array_offset] = static_cast<u8>(VertexInstance0 + i); tables[1][instance_array_offset] = VertexInstances; @@ -217,11 +211,11 @@ void SetupDirtyMisc(Tables& tables) { StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags} { auto& dirty = gpu.Maxwell3D().dirty; auto& tables = dirty.tables; - SetupDirtyRenderTargets(tables); + SetupDirtyFlags(tables); SetupDirtyColorMasks(tables); SetupDirtyViewports(tables); SetupDirtyScissors(tables); - SetupDirtyVertexArrays(tables); + SetupDirtyVertexInstances(tables); SetupDirtyVertexFormat(tables); SetupDirtyShaders(tables); SetupDirtyPolygonModes(tables); @@ -241,19 +235,6 @@ StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags} SetupDirtyClipControl(tables); SetupDirtyDepthClampEnabled(tables); SetupDirtyMisc(tables); - - auto& store = dirty.on_write_stores; - store[VertexBuffers] = true; - for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) { - store[VertexBuffer0 + i] = true; - } -} - -void StateTracker::InvalidateStreamBuffer() { - flags[Dirty::VertexBuffers] = true; - for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) { - flags[index] = true; - } } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_state_tracker.h b/src/video_core/renderer_opengl/gl_state_tracker.h index 574615d3c..94c905116 100644 --- a/src/video_core/renderer_opengl/gl_state_tracker.h +++ b/src/video_core/renderer_opengl/gl_state_tracker.h @@ -28,10 +28,6 @@ enum : u8 { VertexFormat0, VertexFormat31 = VertexFormat0 + 31, - VertexBuffers, - VertexBuffer0, - VertexBuffer31 = VertexBuffer0 + 31, - VertexInstances, VertexInstance0, VertexInstance31 = VertexInstance0 + 31, @@ -92,8 +88,6 @@ class StateTracker { public: explicit StateTracker(Tegra::GPU& gpu); - void InvalidateStreamBuffer(); - void BindIndexBuffer(GLuint new_index_buffer) { if (index_buffer == new_index_buffer) { return; @@ -110,13 +104,32 @@ public: glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer); } + void ClipControl(GLenum new_origin, GLenum new_depth) { + if (new_origin == origin && new_depth == depth) { + return; + } + origin = new_origin; + depth = new_depth; + glClipControl(origin, depth); + } + + void SetYNegate(bool new_y_negate) { + if (new_y_negate == y_negate) { + return; + } + // Y_NEGATE is mapped to gl_FrontMaterial.ambient.a + y_negate = new_y_negate; + const std::array ambient{0.0f, 0.0f, 0.0f, y_negate ? -1.0f : 1.0f}; + glMaterialfv(GL_FRONT, GL_AMBIENT, ambient.data()); + } + void NotifyScreenDrawVertexArray() { flags[OpenGL::Dirty::VertexFormats] = true; flags[OpenGL::Dirty::VertexFormat0 + 0] = true; flags[OpenGL::Dirty::VertexFormat0 + 1] = true; - flags[OpenGL::Dirty::VertexBuffers] = true; - flags[OpenGL::Dirty::VertexBuffer0] = true; + flags[VideoCommon::Dirty::VertexBuffers] = true; + flags[VideoCommon::Dirty::VertexBuffer0] = true; flags[OpenGL::Dirty::VertexInstances] = true; flags[OpenGL::Dirty::VertexInstance0 + 0] = true; @@ -202,6 +215,9 @@ private: GLuint framebuffer = 0; GLuint index_buffer = 0; + GLenum origin = GL_LOWER_LEFT; + GLenum depth = GL_NEGATIVE_ONE_TO_ONE; + bool y_negate = false; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index e0819cdf2..bfb992a79 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -1,70 +1,64 @@ -// Copyright 2018 Citra Emulator Project +// Copyright 2021 yuzu Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <tuple> -#include <vector> +#include <array> +#include <memory> +#include <span> + +#include <glad/glad.h> #include "common/alignment.h" #include "common/assert.h" -#include "common/microprofile.h" -#include "video_core/renderer_opengl/gl_device.h" -#include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" -MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", - MP_RGB(128, 128, 192)); - namespace OpenGL { -OGLStreamBuffer::OGLStreamBuffer(const Device& device, StateTracker& state_tracker_) - : state_tracker{state_tracker_} { - gl_buffer.Create(); - - static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT; - glNamedBufferStorage(gl_buffer.handle, BUFFER_SIZE, nullptr, flags); - mapped_ptr = static_cast<u8*>( - glMapNamedBufferRange(gl_buffer.handle, 0, BUFFER_SIZE, flags | GL_MAP_FLUSH_EXPLICIT_BIT)); - - if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) { - glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY); - glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); +StreamBuffer::StreamBuffer() { + static constexpr GLenum flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT; + buffer.Create(); + glObjectLabel(GL_BUFFER, buffer.handle, -1, "Stream Buffer"); + glNamedBufferStorage(buffer.handle, STREAM_BUFFER_SIZE, nullptr, flags); + mapped_pointer = + static_cast<u8*>(glMapNamedBufferRange(buffer.handle, 0, STREAM_BUFFER_SIZE, flags)); + for (OGLSync& sync : fences) { + sync.Create(); } } -OGLStreamBuffer::~OGLStreamBuffer() { - glUnmapNamedBuffer(gl_buffer.handle); - gl_buffer.Release(); -} - -std::pair<u8*, GLintptr> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) { - ASSERT(size <= BUFFER_SIZE); - ASSERT(alignment <= BUFFER_SIZE); - mapped_size = size; - - if (alignment > 0) { - buffer_pos = Common::AlignUp<std::size_t>(buffer_pos, alignment); +std::pair<std::span<u8>, size_t> StreamBuffer::Request(size_t size) noexcept { + ASSERT(size < REGION_SIZE); + for (size_t region = Region(used_iterator), region_end = Region(iterator); region < region_end; + ++region) { + fences[region].Create(); } + used_iterator = iterator; - if (buffer_pos + size > BUFFER_SIZE) { - MICROPROFILE_SCOPE(OpenGL_StreamBuffer); - glInvalidateBufferData(gl_buffer.handle); - state_tracker.InvalidateStreamBuffer(); - - buffer_pos = 0; + for (size_t region = Region(free_iterator) + 1, + region_end = std::min(Region(iterator + size) + 1, NUM_SYNCS); + region < region_end; ++region) { + glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED); + fences[region].Release(); } - - return std::make_pair(mapped_ptr + buffer_pos, buffer_pos); -} - -void OGLStreamBuffer::Unmap(GLsizeiptr size) { - ASSERT(size <= mapped_size); - - if (size > 0) { - glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size); + if (iterator + size > free_iterator) { + free_iterator = iterator + size; } - - buffer_pos += size; + if (iterator + size > STREAM_BUFFER_SIZE) { + for (size_t region = Region(used_iterator); region < NUM_SYNCS; ++region) { + fences[region].Create(); + } + used_iterator = 0; + iterator = 0; + free_iterator = size; + + for (size_t region = 0, region_end = Region(size); region <= region_end; ++region) { + glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED); + fences[region].Release(); + } + } + const size_t offset = iterator; + iterator = Common::AlignUp(iterator + size, MAX_ALIGNMENT); + return {std::span(mapped_pointer + offset, size), offset}; } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h index dd9cf67eb..6dbb6bfba 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.h +++ b/src/video_core/renderer_opengl/gl_stream_buffer.h @@ -1,9 +1,12 @@ -// Copyright 2018 Citra Emulator Project +// Copyright 2021 yuzu Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. #pragma once +#include <array> +#include <memory> +#include <span> #include <utility> #include <glad/glad.h> @@ -13,48 +16,35 @@ namespace OpenGL { -class Device; -class StateTracker; +class StreamBuffer { + static constexpr size_t STREAM_BUFFER_SIZE = 64 * 1024 * 1024; + static constexpr size_t NUM_SYNCS = 16; + static constexpr size_t REGION_SIZE = STREAM_BUFFER_SIZE / NUM_SYNCS; + static constexpr size_t MAX_ALIGNMENT = 256; + static_assert(STREAM_BUFFER_SIZE % MAX_ALIGNMENT == 0); + static_assert(STREAM_BUFFER_SIZE % NUM_SYNCS == 0); + static_assert(REGION_SIZE % MAX_ALIGNMENT == 0); -class OGLStreamBuffer : private NonCopyable { public: - explicit OGLStreamBuffer(const Device& device, StateTracker& state_tracker_); - ~OGLStreamBuffer(); - - /* - * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes - * and the optional alignment requirement. - * If the buffer is full, the whole buffer is reallocated which invalidates old chunks. - * The return values are the pointer to the new chunk, and the offset within the buffer. - * The actual used size must be specified on unmapping the chunk. - */ - std::pair<u8*, GLintptr> Map(GLsizeiptr size, GLintptr alignment = 0); - - void Unmap(GLsizeiptr size); - - GLuint Handle() const { - return gl_buffer.handle; - } + explicit StreamBuffer(); - u64 Address() const { - return gpu_address; - } + [[nodiscard]] std::pair<std::span<u8>, size_t> Request(size_t size) noexcept; - GLsizeiptr Size() const noexcept { - return BUFFER_SIZE; + [[nodiscard]] GLuint Handle() const noexcept { + return buffer.handle; } private: - static constexpr GLsizeiptr BUFFER_SIZE = 256 * 1024 * 1024; - - StateTracker& state_tracker; - - OGLBuffer gl_buffer; + [[nodiscard]] static size_t Region(size_t offset) noexcept { + return offset / REGION_SIZE; + } - GLuint64EXT gpu_address = 0; - GLintptr buffer_pos = 0; - GLsizeiptr mapped_size = 0; - u8* mapped_ptr = nullptr; + size_t iterator = 0; + size_t used_iterator = 0; + size_t free_iterator = 0; + u8* mapped_pointer = nullptr; + OGLBuffer buffer; + std::array<OGLSync, NUM_SYNCS> fences; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 546cb6d00..31eb54123 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -398,9 +398,6 @@ void AttachTexture(GLuint fbo, GLenum attachment, const ImageView* image_view) { } // Anonymous namespace -ImageBufferMap::ImageBufferMap(GLuint handle_, u8* map, size_t size, OGLSync* sync_) - : span(map, size), sync{sync_}, handle{handle_} {} - ImageBufferMap::~ImageBufferMap() { if (sync) { sync->Create(); @@ -487,11 +484,11 @@ void TextureCacheRuntime::Finish() { glFinish(); } -ImageBufferMap TextureCacheRuntime::MapUploadBuffer(size_t size) { +ImageBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size) { return upload_buffers.RequestMap(size, true); } -ImageBufferMap TextureCacheRuntime::MapDownloadBuffer(size_t size) { +ImageBufferMap TextureCacheRuntime::DownloadStagingBuffer(size_t size) { return download_buffers.RequestMap(size, false); } @@ -553,15 +550,14 @@ void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src, } void TextureCacheRuntime::AccelerateImageUpload(Image& image, const ImageBufferMap& map, - size_t buffer_offset, std::span<const SwizzleParameters> swizzles) { switch (image.info.type) { case ImageType::e2D: - return util_shaders.BlockLinearUpload2D(image, map, buffer_offset, swizzles); + return util_shaders.BlockLinearUpload2D(image, map, swizzles); case ImageType::e3D: - return util_shaders.BlockLinearUpload3D(image, map, buffer_offset, swizzles); + return util_shaders.BlockLinearUpload3D(image, map, swizzles); case ImageType::Linear: - return util_shaders.PitchUpload(image, map, buffer_offset, swizzles); + return util_shaders.PitchUpload(image, map, swizzles); default: UNREACHABLE(); break; @@ -596,7 +592,11 @@ ImageBufferMap TextureCacheRuntime::StagingBuffers::RequestMap(size_t requested_ bool insert_fence) { const size_t index = RequestBuffer(requested_size); OGLSync* const sync = insert_fence ? &syncs[index] : nullptr; - return ImageBufferMap(buffers[index].handle, maps[index], requested_size, sync); + return ImageBufferMap{ + .mapped_span = std::span(maps[index], requested_size), + .sync = sync, + .buffer = buffers[index].handle, + }; } size_t TextureCacheRuntime::StagingBuffers::RequestBuffer(size_t requested_size) { @@ -709,10 +709,10 @@ Image::Image(TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info_, } } -void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, +void Image::UploadMemory(const ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies) { - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.Handle()); - glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, buffer_offset, unswizzled_size_bytes); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer); + glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, map.offset, unswizzled_size_bytes); glPixelStorei(GL_UNPACK_ALIGNMENT, 1); @@ -728,23 +728,23 @@ void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, current_image_height = copy.buffer_image_height; glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, current_image_height); } - CopyBufferToImage(copy, buffer_offset); + CopyBufferToImage(copy, map.offset); } } -void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, +void Image::UploadMemory(const ImageBufferMap& map, std::span<const VideoCommon::BufferCopy> copies) { for (const VideoCommon::BufferCopy& copy : copies) { - glCopyNamedBufferSubData(map.Handle(), buffer.handle, copy.src_offset + buffer_offset, + glCopyNamedBufferSubData(map.buffer, buffer.handle, copy.src_offset + map.offset, copy.dst_offset, copy.size); } } -void Image::DownloadMemory(ImageBufferMap& map, size_t buffer_offset, +void Image::DownloadMemory(ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies) { glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API - glBindBuffer(GL_PIXEL_PACK_BUFFER, map.Handle()); + glBindBuffer(GL_PIXEL_PACK_BUFFER, map.buffer); glPixelStorei(GL_PACK_ALIGNMENT, 1); u32 current_row_length = std::numeric_limits<u32>::max(); @@ -759,7 +759,7 @@ void Image::DownloadMemory(ImageBufferMap& map, size_t buffer_offset, current_image_height = copy.buffer_image_height; glPixelStorei(GL_PACK_IMAGE_HEIGHT, current_image_height); } - CopyImageToBuffer(copy, buffer_offset); + CopyImageToBuffer(copy, map.offset); } } diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 15b7c3676..874cf54f4 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -31,23 +31,13 @@ using VideoCommon::NUM_RT; using VideoCommon::Offset2D; using VideoCommon::RenderTargets; -class ImageBufferMap { -public: - explicit ImageBufferMap(GLuint handle, u8* map, size_t size, OGLSync* sync); +struct ImageBufferMap { ~ImageBufferMap(); - GLuint Handle() const noexcept { - return handle; - } - - std::span<u8> Span() const noexcept { - return span; - } - -private: - std::span<u8> span; + std::span<u8> mapped_span; + size_t offset = 0; OGLSync* sync; - GLuint handle; + GLuint buffer; }; struct FormatProperties { @@ -69,9 +59,9 @@ public: void Finish(); - ImageBufferMap MapUploadBuffer(size_t size); + ImageBufferMap UploadStagingBuffer(size_t size); - ImageBufferMap MapDownloadBuffer(size_t size); + ImageBufferMap DownloadStagingBuffer(size_t size); void CopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies); @@ -89,7 +79,7 @@ public: Tegra::Engines::Fermi2D::Filter filter, Tegra::Engines::Fermi2D::Operation operation); - void AccelerateImageUpload(Image& image, const ImageBufferMap& map, size_t buffer_offset, + void AccelerateImageUpload(Image& image, const ImageBufferMap& map, std::span<const VideoCommon::SwizzleParameters> swizzles); void InsertUploadMemoryBarrier(); @@ -148,14 +138,12 @@ public: explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr); - void UploadMemory(const ImageBufferMap& map, size_t buffer_offset, + void UploadMemory(const ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies); - void UploadMemory(const ImageBufferMap& map, size_t buffer_offset, - std::span<const VideoCommon::BufferCopy> copies); + void UploadMemory(const ImageBufferMap& map, std::span<const VideoCommon::BufferCopy> copies); - void DownloadMemory(ImageBufferMap& map, size_t buffer_offset, - std::span<const VideoCommon::BufferImageCopy> copies); + void DownloadMemory(ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies); GLuint Handle() const noexcept { return texture.handle; diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 21159e498..9d2acd4d9 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -29,9 +29,7 @@ #include "video_core/textures/decoders.h" namespace OpenGL { - namespace { - constexpr GLint PositionLocation = 0; constexpr GLint TexCoordLocation = 1; constexpr GLint ModelViewMatrixLocation = 0; @@ -124,7 +122,6 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit break; } } - } // Anonymous namespace RendererOpenGL::RendererOpenGL(Core::TelemetrySession& telemetry_session_, @@ -132,7 +129,17 @@ RendererOpenGL::RendererOpenGL(Core::TelemetrySession& telemetry_session_, Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_, std::unique_ptr<Core::Frontend::GraphicsContext> context_) : RendererBase{emu_window_, std::move(context_)}, telemetry_session{telemetry_session_}, - emu_window{emu_window_}, cpu_memory{cpu_memory_}, gpu{gpu_}, program_manager{device} {} + emu_window{emu_window_}, cpu_memory{cpu_memory_}, gpu{gpu_}, state_tracker{gpu}, + program_manager{device}, + rasterizer(emu_window, gpu, cpu_memory, device, screen_info, program_manager, state_tracker) { + if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) { + glEnable(GL_DEBUG_OUTPUT); + glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS); + glDebugMessageCallback(DebugHandler, nullptr); + } + AddTelemetryFields(); + InitOpenGLObjects(); +} RendererOpenGL::~RendererOpenGL() = default; @@ -148,7 +155,7 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { ++m_current_frame; - rasterizer->TickFrame(); + rasterizer.TickFrame(); context->SwapBuffers(); render_window.OnFrameDisplayed(); @@ -179,7 +186,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf framebuffer_crop_rect = framebuffer.crop_rect; const VAddr framebuffer_addr{framebuffer.address + framebuffer.offset}; - if (rasterizer->AccelerateDisplay(framebuffer, framebuffer_addr, framebuffer.stride)) { + if (rasterizer.AccelerateDisplay(framebuffer, framebuffer_addr, framebuffer.stride)) { return; } @@ -267,6 +274,7 @@ void RendererOpenGL::InitOpenGLObjects() { // Enable unified vertex attributes and query vertex buffer address when the driver supports it if (device.HasVertexBufferUnifiedMemory()) { glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV); + glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV); glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY); glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, @@ -289,14 +297,6 @@ void RendererOpenGL::AddTelemetryFields() { telemetry_session.AddField(user_system, "GPU_OpenGL_Version", std::string(gl_version)); } -void RendererOpenGL::CreateRasterizer() { - if (rasterizer) { - return; - } - rasterizer = std::make_unique<RasterizerOpenGL>(emu_window, gpu, cpu_memory, device, - screen_info, program_manager, state_tracker); -} - void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, const Tegra::FramebufferConfig& framebuffer) { texture.width = framebuffer.width; @@ -407,6 +407,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { program_manager.BindHostPipeline(pipeline.handle); + state_tracker.ClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE); glEnable(GL_CULL_FACE); if (screen_info.display_srgb) { glEnable(GL_FRAMEBUFFER_SRGB); @@ -425,7 +426,6 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { glCullFace(GL_BACK); glFrontFace(GL_CW); glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); - glClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE); glViewportIndexedf(0, 0.0f, 0.0f, static_cast<GLfloat>(layout.width), static_cast<GLfloat>(layout.height)); glDepthRangeIndexed(0, 0.0, 0.0); @@ -497,25 +497,4 @@ void RendererOpenGL::RenderScreenshot() { renderer_settings.screenshot_requested = false; } -bool RendererOpenGL::Init() { - if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) { - glEnable(GL_DEBUG_OUTPUT); - glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS); - glDebugMessageCallback(DebugHandler, nullptr); - } - - AddTelemetryFields(); - - if (!GLAD_GL_VERSION_4_6) { - return false; - } - - InitOpenGLObjects(); - CreateRasterizer(); - - return true; -} - -void RendererOpenGL::ShutDown() {} - } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index 44e109794..cc19a110f 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -10,6 +10,7 @@ #include "common/math_util.h" #include "video_core/renderer_base.h" #include "video_core/renderer_opengl/gl_device.h" +#include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_state_tracker.h" @@ -63,18 +64,18 @@ public: std::unique_ptr<Core::Frontend::GraphicsContext> context_); ~RendererOpenGL() override; - bool Init() override; - void ShutDown() override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; + VideoCore::RasterizerInterface* ReadRasterizer() override { + return &rasterizer; + } + private: /// Initializes the OpenGL state and creates persistent objects. void InitOpenGLObjects(); void AddTelemetryFields(); - void CreateRasterizer(); - void ConfigureFramebufferTexture(TextureInfo& texture, const Tegra::FramebufferConfig& framebuffer); @@ -98,8 +99,10 @@ private: Core::Memory::Memory& cpu_memory; Tegra::GPU& gpu; - const Device device; - StateTracker state_tracker{gpu}; + Device device; + StateTracker state_tracker; + ProgramManager program_manager; + RasterizerOpenGL rasterizer; // OpenGL object IDs OGLSampler present_sampler; @@ -115,9 +118,6 @@ private: /// Display information for Switch screen ScreenInfo screen_info; - /// Global dummy shader pipeline - ProgramManager program_manager; - /// OpenGL framebuffer data std::vector<u8> gl_framebuffer_data; diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index eb849cbf2..1b58e8617 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -63,7 +63,7 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_) UtilShaders::~UtilShaders() = default; -void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, size_t buffer_offset, +void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, std::span<const SwizzleParameters> swizzles) { static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1}; static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0; @@ -71,13 +71,13 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, s static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; program_manager.BindHostCompute(block_linear_unswizzle_2d_program.handle); - glFlushMappedNamedBufferRange(map.Handle(), buffer_offset, image.guest_size_bytes); + glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format)); for (const SwizzleParameters& swizzle : swizzles) { const Extent3D num_tiles = swizzle.num_tiles; - const size_t input_offset = swizzle.buffer_offset + buffer_offset; + const size_t input_offset = swizzle.buffer_offset + map.offset; const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width); const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height); @@ -91,8 +91,8 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, s glUniform1ui(5, params.x_shift); glUniform1ui(6, params.block_height); glUniform1ui(7, params.block_height_mask); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.Handle(), - input_offset, image.guest_size_bytes - swizzle.buffer_offset); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset, + image.guest_size_bytes - swizzle.buffer_offset); glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), swizzle.level, GL_TRUE, 0, GL_WRITE_ONLY, store_format); glDispatchCompute(num_dispatches_x, num_dispatches_y, image.info.resources.layers); @@ -100,7 +100,7 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, s program_manager.RestoreGuestCompute(); } -void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, size_t buffer_offset, +void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, std::span<const SwizzleParameters> swizzles) { static constexpr Extent3D WORKGROUP_SIZE{16, 8, 8}; @@ -108,14 +108,14 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, s static constexpr GLuint BINDING_INPUT_BUFFER = 1; static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; - glFlushMappedNamedBufferRange(map.Handle(), buffer_offset, image.guest_size_bytes); + glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); program_manager.BindHostCompute(block_linear_unswizzle_3d_program.handle); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format)); for (const SwizzleParameters& swizzle : swizzles) { const Extent3D num_tiles = swizzle.num_tiles; - const size_t input_offset = swizzle.buffer_offset + buffer_offset; + const size_t input_offset = swizzle.buffer_offset + map.offset; const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width); const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height); @@ -132,8 +132,8 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, s glUniform1ui(7, params.block_height_mask); glUniform1ui(8, params.block_depth); glUniform1ui(9, params.block_depth_mask); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.Handle(), - input_offset, image.guest_size_bytes - swizzle.buffer_offset); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset, + image.guest_size_bytes - swizzle.buffer_offset); glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), swizzle.level, GL_TRUE, 0, GL_WRITE_ONLY, store_format); glDispatchCompute(num_dispatches_x, num_dispatches_y, num_dispatches_z); @@ -141,7 +141,7 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, s program_manager.RestoreGuestCompute(); } -void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t buffer_offset, +void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, std::span<const SwizzleParameters> swizzles) { static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1}; static constexpr GLuint BINDING_INPUT_BUFFER = 0; @@ -159,7 +159,7 @@ void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t bu "Non-power of two images are not implemented"); program_manager.BindHostCompute(pitch_unswizzle_program.handle); - glFlushMappedNamedBufferRange(map.Handle(), buffer_offset, image.guest_size_bytes); + glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); glUniform2ui(LOC_ORIGIN, 0, 0); glUniform2i(LOC_DESTINATION, 0, 0); glUniform1ui(LOC_BYTES_PER_BLOCK, bytes_per_block); @@ -167,13 +167,13 @@ void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t bu glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), 0, GL_FALSE, 0, GL_WRITE_ONLY, format); for (const SwizzleParameters& swizzle : swizzles) { const Extent3D num_tiles = swizzle.num_tiles; - const size_t input_offset = swizzle.buffer_offset + buffer_offset; + const size_t input_offset = swizzle.buffer_offset + map.offset; const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width); const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.Handle(), - input_offset, image.guest_size_bytes - swizzle.buffer_offset); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset, + image.guest_size_bytes - swizzle.buffer_offset); glDispatchCompute(num_dispatches_x, num_dispatches_y, 1); } program_manager.RestoreGuestCompute(); diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h index 359997255..7b1d16b09 100644 --- a/src/video_core/renderer_opengl/util_shaders.h +++ b/src/video_core/renderer_opengl/util_shaders.h @@ -15,21 +15,22 @@ namespace OpenGL { class Image; -class ImageBufferMap; class ProgramManager; +struct ImageBufferMap; + class UtilShaders { public: explicit UtilShaders(ProgramManager& program_manager); ~UtilShaders(); - void BlockLinearUpload2D(Image& image, const ImageBufferMap& map, size_t buffer_offset, + void BlockLinearUpload2D(Image& image, const ImageBufferMap& map, std::span<const VideoCommon::SwizzleParameters> swizzles); - void BlockLinearUpload3D(Image& image, const ImageBufferMap& map, size_t buffer_offset, + void BlockLinearUpload3D(Image& image, const ImageBufferMap& map, std::span<const VideoCommon::SwizzleParameters> swizzles); - void PitchUpload(Image& image, const ImageBufferMap& map, size_t buffer_offset, + void PitchUpload(Image& image, const ImageBufferMap& map, std::span<const VideoCommon::SwizzleParameters> swizzles); void CopyBC4(Image& dst_image, Image& src_image, |