diff options
Diffstat (limited to 'src/video_core')
29 files changed, 992 insertions, 388 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 0101e5f0e..91df062d7 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -129,6 +129,8 @@ add_library(video_core STATIC shader/shader_ir.cpp shader/shader_ir.h shader/track.cpp + shader/transform_feedback.cpp + shader/transform_feedback.h surface.cpp surface.h texture_cache/format_lookup_table.cpp diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 8752a1cfb..8a9e9992e 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -628,19 +628,26 @@ public: float depth_range_far; }; - struct alignas(32) TransformFeedbackBinding { + struct TransformFeedbackBinding { u32 buffer_enable; u32 address_high; u32 address_low; s32 buffer_size; s32 buffer_offset; + INSERT_UNION_PADDING_WORDS(3); + + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | + address_low); + } }; static_assert(sizeof(TransformFeedbackBinding) == 32); - struct alignas(16) TransformFeedbackLayout { + struct TransformFeedbackLayout { u32 stream; u32 varying_count; u32 stride; + INSERT_UNION_PADDING_WORDS(1); }; static_assert(sizeof(TransformFeedbackLayout) == 16); @@ -652,6 +659,10 @@ public: return shader_config[index].enable != 0; } + bool IsShaderConfigEnabled(Regs::ShaderProgram type) const { + return IsShaderConfigEnabled(static_cast<std::size_t>(type)); + } + union { struct { INSERT_UNION_PADDING_WORDS(0x45); diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index c9bc83cd7..eba42deb4 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -911,14 +911,9 @@ union Instruction { } fadd32i; union { - BitField<20, 8, u64> shift_position; - BitField<28, 8, u64> shift_length; - BitField<48, 1, u64> negate_b; - BitField<49, 1, u64> negate_a; - - u64 GetLeftShiftValue() const { - return 32 - (shift_position + shift_length); - } + BitField<40, 1, u64> brev; + BitField<47, 1, u64> rd_cc; + BitField<48, 1, u64> is_signed; } bfe; union { diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index ba8c9d665..64acb17df 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -39,6 +39,7 @@ enum class RenderTargetFormat : u32 { RGBA32_FLOAT = 0xC0, RGBA32_UINT = 0xC2, RGBA16_UNORM = 0xC6, + RGBA16_SNORM = 0xC7, RGBA16_UINT = 0xC9, RGBA16_FLOAT = 0xCA, RG32_FLOAT = 0xCB, diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index aea010087..073bdb491 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -174,7 +174,7 @@ private: /// End of address space, based on address space in bits. static constexpr GPUVAddr address_space_end{1ULL << address_space_width}; - Common::PageTable page_table{page_bits}; + Common::BackingPageTable page_table{page_bits}; VMAMap vma_map; VideoCore::RasterizerInterface& rasterizer; diff --git a/src/video_core/morton.cpp b/src/video_core/morton.cpp index f2c83266e..6d522c318 100644 --- a/src/video_core/morton.cpp +++ b/src/video_core/morton.cpp @@ -51,6 +51,7 @@ static constexpr ConversionArray morton_to_linear_fns = { MortonCopy<true, PixelFormat::R8UI>, MortonCopy<true, PixelFormat::RGBA16F>, MortonCopy<true, PixelFormat::RGBA16U>, + MortonCopy<true, PixelFormat::RGBA16S>, MortonCopy<true, PixelFormat::RGBA16UI>, MortonCopy<true, PixelFormat::R11FG11FB10F>, MortonCopy<true, PixelFormat::RGBA32UI>, @@ -131,6 +132,7 @@ static constexpr ConversionArray linear_to_morton_fns = { MortonCopy<false, PixelFormat::R8U>, MortonCopy<false, PixelFormat::R8UI>, MortonCopy<false, PixelFormat::RGBA16F>, + MortonCopy<false, PixelFormat::RGBA16S>, MortonCopy<false, PixelFormat::RGBA16U>, MortonCopy<false, PixelFormat::RGBA16UI>, MortonCopy<false, PixelFormat::R11FG11FB10F>, diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 8a2db8e36..1af4268a4 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -496,7 +496,6 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { SyncCullMode(); SyncPrimitiveRestart(); SyncScissorTest(); - SyncTransformFeedback(); SyncPointState(); SyncPolygonOffset(); SyncAlphaTest(); @@ -569,7 +568,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { glTextureBarrier(); } - ++num_queued_commands; + BeginTransformFeedback(primitive_mode); const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance); const GLsizei num_instances = @@ -608,6 +607,10 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { num_instances, base_instance); } } + + EndTransformFeedback(); + + ++num_queued_commands; } void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { @@ -1290,11 +1293,6 @@ void RasterizerOpenGL::SyncScissorTest() { } } -void RasterizerOpenGL::SyncTransformFeedback() { - const auto& regs = system.GPU().Maxwell3D().regs; - UNIMPLEMENTED_IF_MSG(regs.tfb_enabled != 0, "Transform feedbacks are not implemented"); -} - void RasterizerOpenGL::SyncPointState() { auto& gpu = system.GPU().Maxwell3D(); auto& flags = gpu.dirty.flags; @@ -1370,4 +1368,62 @@ void RasterizerOpenGL::SyncFramebufferSRGB() { oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb); } +void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) { + const auto& regs = system.GPU().Maxwell3D().regs; + if (regs.tfb_enabled == 0) { + return; + } + + UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || + regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || + regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry)); + + for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) { + const auto& binding = regs.tfb_bindings[index]; + if (!binding.buffer_enable) { + if (enabled_transform_feedback_buffers[index]) { + glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), 0, 0, + 0); + } + enabled_transform_feedback_buffers[index] = false; + continue; + } + enabled_transform_feedback_buffers[index] = true; + + auto& tfb_buffer = transform_feedback_buffers[index]; + tfb_buffer.Create(); + + const GLuint handle = tfb_buffer.handle; + const std::size_t size = binding.buffer_size; + glNamedBufferData(handle, static_cast<GLsizeiptr>(size), nullptr, GL_STREAM_COPY); + glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), handle, 0, + static_cast<GLsizeiptr>(size)); + } + + glBeginTransformFeedback(GL_POINTS); +} + +void RasterizerOpenGL::EndTransformFeedback() { + const auto& regs = system.GPU().Maxwell3D().regs; + if (regs.tfb_enabled == 0) { + return; + } + + glEndTransformFeedback(); + + for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) { + const auto& binding = regs.tfb_bindings[index]; + if (!binding.buffer_enable) { + continue; + } + UNIMPLEMENTED_IF(binding.buffer_offset != 0); + + const GLuint handle = transform_feedback_buffers[index].handle; + const GPUVAddr gpu_addr = binding.Address(); + const std::size_t size = binding.buffer_size; + const auto [dest_buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true); + glCopyNamedBufferSubData(handle, *dest_buffer, 0, offset, static_cast<GLsizeiptr>(size)); + } +} + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index e6424f5d2..2d3be2437 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -168,9 +168,6 @@ private: /// Syncs the scissor test state to match the guest state void SyncScissorTest(); - /// Syncs the transform feedback state to match the guest state - void SyncTransformFeedback(); - /// Syncs the point state to match the guest state void SyncPointState(); @@ -192,6 +189,12 @@ private: /// Syncs the framebuffer sRGB state to match the guest state void SyncFramebufferSRGB(); + /// Begin a transform feedback + void BeginTransformFeedback(GLenum primitive_mode); + + /// End a transform feedback + void EndTransformFeedback(); + /// Check for extension that are not strictly required but are needed for correct emulation void CheckExtensions(); @@ -229,6 +232,11 @@ private: BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER}; BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER}; + std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers> + transform_feedback_buffers; + std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers> + enabled_transform_feedback_buffers; + /// Number of commands queued to the OpenGL driver. Reseted on flush. std::size_t num_queued_commands = 0; diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 19d6f3dcb..849839fe3 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -23,6 +23,7 @@ #include "video_core/shader/ast.h" #include "video_core/shader/node.h" #include "video_core/shader/shader_ir.h" +#include "video_core/shader/transform_feedback.h" namespace OpenGL { @@ -36,6 +37,7 @@ using Tegra::Shader::IpaInterpMode; using Tegra::Shader::IpaMode; using Tegra::Shader::IpaSampleMode; using Tegra::Shader::Register; +using VideoCommon::Shader::BuildTransformFeedback; using VideoCommon::Shader::Registry; using namespace std::string_literals; @@ -49,6 +51,11 @@ class ExprDecompiler; enum class Type { Void, Bool, Bool2, Float, Int, Uint, HalfFloat }; +constexpr std::array FLOAT_TYPES{"float", "vec2", "vec3", "vec4"}; + +constexpr std::string_view INPUT_ATTRIBUTE_NAME = "in_attr"; +constexpr std::string_view OUTPUT_ATTRIBUTE_NAME = "out_attr"; + struct TextureOffset {}; struct TextureDerivates {}; using TextureArgument = std::pair<Type, Node>; @@ -390,12 +397,22 @@ std::string FlowStackTopName(MetaStackClass stack) { return stage == ShaderType::Vertex; } +struct GenericVaryingDescription { + std::string name; + u8 first_element = 0; + bool is_scalar = false; +}; + class GLSLDecompiler final { public: explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, ShaderType stage, std::string_view identifier, std::string_view suffix) : device{device}, ir{ir}, registry{registry}, stage{stage}, - identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} {} + identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} { + if (stage != ShaderType::Compute) { + transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); + } + } void Decompile() { DeclareHeader(); @@ -403,17 +420,17 @@ public: DeclareGeometry(); DeclareFragment(); DeclareCompute(); - DeclareRegisters(); - DeclareCustomVariables(); - DeclarePredicates(); - DeclareLocalMemory(); - DeclareInternalFlags(); DeclareInputAttributes(); DeclareOutputAttributes(); - DeclareConstantBuffers(); - DeclareGlobalMemory(); - DeclareSamplers(); DeclareImages(); + DeclareSamplers(); + DeclareGlobalMemory(); + DeclareConstantBuffers(); + DeclareLocalMemory(); + DeclareRegisters(); + DeclarePredicates(); + DeclareInternalFlags(); + DeclareCustomVariables(); DeclarePhysicalAttributeReader(); code.AddLine("void main() {{"); @@ -485,7 +502,7 @@ private: if (!identifier.empty()) { code.AddLine("// {}", identifier); } - code.AddLine("#version 430 core"); + code.AddLine("#version 440 core"); code.AddLine("#extension GL_ARB_separate_shader_objects : enable"); if (device.HasShaderBallot()) { code.AddLine("#extension GL_ARB_shader_ballot : require"); @@ -570,7 +587,13 @@ private: code.AddLine("out gl_PerVertex {{"); ++code.scope; - code.AddLine("vec4 gl_Position;"); + auto pos_xfb = GetTransformFeedbackDecoration(Attribute::Index::Position); + if (!pos_xfb.empty()) { + pos_xfb = fmt::format("layout ({}) ", pos_xfb); + } + const char* pos_type = + FLOAT_TYPES.at(GetNumComponents(Attribute::Index::Position).value_or(4) - 1); + code.AddLine("{}{} gl_Position;", pos_xfb, pos_type); for (const auto attribute : ir.GetOutputAttributes()) { if (attribute == Attribute::Index::ClipDistances0123 || @@ -703,7 +726,7 @@ private: void DeclareInputAttribute(Attribute::Index index, bool skip_unused) { const u32 location{GetGenericAttributeIndex(index)}; - std::string name{GetInputAttribute(index)}; + std::string name{GetGenericInputAttribute(index)}; if (stage == ShaderType::Geometry) { name = "gs_" + name + "[]"; } @@ -740,9 +763,59 @@ private: } } + std::optional<std::size_t> GetNumComponents(Attribute::Index index, u8 element = 0) const { + const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element); + const auto it = transform_feedback.find(location); + if (it == transform_feedback.end()) { + return {}; + } + return it->second.components; + } + + std::string GetTransformFeedbackDecoration(Attribute::Index index, u8 element = 0) const { + const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element); + const auto it = transform_feedback.find(location); + if (it == transform_feedback.end()) { + return {}; + } + + const VaryingTFB& tfb = it->second; + return fmt::format("xfb_buffer = {}, xfb_offset = {}, xfb_stride = {}", tfb.buffer, + tfb.offset, tfb.stride); + } + void DeclareOutputAttribute(Attribute::Index index) { - const u32 location{GetGenericAttributeIndex(index)}; - code.AddLine("layout (location = {}) out vec4 {};", location, GetOutputAttribute(index)); + static constexpr std::string_view swizzle = "xyzw"; + u8 element = 0; + while (element < 4) { + auto xfb = GetTransformFeedbackDecoration(index, element); + if (!xfb.empty()) { + xfb = fmt::format(", {}", xfb); + } + const std::size_t remainder = 4 - element; + const std::size_t num_components = GetNumComponents(index, element).value_or(remainder); + const char* const type = FLOAT_TYPES.at(num_components - 1); + + const u32 location = GetGenericAttributeIndex(index); + + GenericVaryingDescription description; + description.first_element = static_cast<u8>(element); + description.is_scalar = num_components == 1; + description.name = AppendSuffix(location, OUTPUT_ATTRIBUTE_NAME); + if (element != 0 || num_components != 4) { + const std::string_view name_swizzle = swizzle.substr(element, num_components); + description.name = fmt::format("{}_{}", description.name, name_swizzle); + } + for (std::size_t i = 0; i < num_components; ++i) { + const u8 offset = static_cast<u8>(location * 4 + element + i); + varying_description.insert({offset, description}); + } + + code.AddLine("layout (location = {}, component = {}{}) out {} {};", location, element, + xfb, type, description.name); + + element = static_cast<u8>(static_cast<std::size_t>(element) + num_components); + } } void DeclareConstantBuffers() { @@ -1095,7 +1168,7 @@ private: return {"0", Type::Int}; default: if (IsGenericAttribute(attribute)) { - return {GeometryPass(GetInputAttribute(attribute)) + GetSwizzle(element), + return {GeometryPass(GetGenericInputAttribute(attribute)) + GetSwizzle(element), Type::Float}; } break; @@ -1164,8 +1237,7 @@ private: return {{fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), Type::Float}}; default: if (IsGenericAttribute(attribute)) { - return { - {GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()), Type::Float}}; + return {{GetGenericOutputAttribute(attribute, abuf->GetElement()), Type::Float}}; } UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute)); return {}; @@ -1937,16 +2009,19 @@ private: expr += GetSampler(meta->sampler); expr += ", "; - expr += constructors.at(operation.GetOperandsCount() - 1); + expr += constructors.at(operation.GetOperandsCount() + (meta->array ? 1 : 0) - 1); expr += '('; for (std::size_t i = 0; i < count; ++i) { - expr += VisitOperand(operation, i).AsInt(); - const std::size_t next = i + 1; - if (next == count) - expr += ')'; - else if (next < count) + if (i > 0) { expr += ", "; + } + expr += VisitOperand(operation, i).AsInt(); + } + if (meta->array) { + expr += ", "; + expr += Visit(meta->array).AsInt(); } + expr += ')'; if (meta->lod && !meta->sampler.IsBuffer()) { expr += ", "; @@ -2376,27 +2451,34 @@ private: static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); std::string GetRegister(u32 index) const { - return GetDeclarationWithSuffix(index, "gpr"); + return AppendSuffix(index, "gpr"); } std::string GetCustomVariable(u32 index) const { - return GetDeclarationWithSuffix(index, "custom_var"); + return AppendSuffix(index, "custom_var"); } std::string GetPredicate(Tegra::Shader::Pred pred) const { - return GetDeclarationWithSuffix(static_cast<u32>(pred), "pred"); + return AppendSuffix(static_cast<u32>(pred), "pred"); } - std::string GetInputAttribute(Attribute::Index attribute) const { - return GetDeclarationWithSuffix(GetGenericAttributeIndex(attribute), "input_attr"); + std::string GetGenericInputAttribute(Attribute::Index attribute) const { + return AppendSuffix(GetGenericAttributeIndex(attribute), INPUT_ATTRIBUTE_NAME); } - std::string GetOutputAttribute(Attribute::Index attribute) const { - return GetDeclarationWithSuffix(GetGenericAttributeIndex(attribute), "output_attr"); + std::unordered_map<u8, GenericVaryingDescription> varying_description; + + std::string GetGenericOutputAttribute(Attribute::Index attribute, std::size_t element) const { + const u8 offset = static_cast<u8>(GetGenericAttributeIndex(attribute) * 4 + element); + const auto& description = varying_description.at(offset); + if (description.is_scalar) { + return description.name; + } + return fmt::format("{}[{}]", description.name, element - description.first_element); } std::string GetConstBuffer(u32 index) const { - return GetDeclarationWithSuffix(index, "cbuf"); + return AppendSuffix(index, "cbuf"); } std::string GetGlobalMemory(const GlobalMemoryBase& descriptor) const { @@ -2409,7 +2491,7 @@ private: } std::string GetConstBufferBlock(u32 index) const { - return GetDeclarationWithSuffix(index, "cbuf_block"); + return AppendSuffix(index, "cbuf_block"); } std::string GetLocalMemory() const { @@ -2434,14 +2516,14 @@ private: } std::string GetSampler(const Sampler& sampler) const { - return GetDeclarationWithSuffix(static_cast<u32>(sampler.GetIndex()), "sampler"); + return AppendSuffix(static_cast<u32>(sampler.GetIndex()), "sampler"); } std::string GetImage(const Image& image) const { - return GetDeclarationWithSuffix(static_cast<u32>(image.GetIndex()), "image"); + return AppendSuffix(static_cast<u32>(image.GetIndex()), "image"); } - std::string GetDeclarationWithSuffix(u32 index, std::string_view name) const { + std::string AppendSuffix(u32 index, std::string_view name) const { if (suffix.empty()) { return fmt::format("{}{}", name, index); } else { @@ -2477,6 +2559,7 @@ private: const std::string_view identifier; const std::string_view suffix; const Header header; + std::unordered_map<u8, VaryingTFB> transform_feedback; ShaderWriter code; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 2d3838a7a..f424e3000 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -53,6 +53,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE, false}, // R8UI {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT, false}, // RGBA16F {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT, false}, // RGBA16U + {GL_RGBA16_SNORM, GL_RGBA, GL_SHORT, false}, // RGBA16S {GL_RGBA16UI, GL_RGBA_INTEGER, GL_UNSIGNED_SHORT, false}, // RGBA16UI {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV, false}, // R11FG11FB10F {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT, false}, // RGBA32UI diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 12333e8c9..fca5e3ec0 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -5,8 +5,11 @@ #include <algorithm> #include <cstddef> #include <cstdlib> +#include <cstring> #include <memory> + #include <glad/glad.h> + #include "common/assert.h" #include "common/logging/log.h" #include "common/microprofile.h" @@ -25,6 +28,8 @@ namespace OpenGL { +namespace { + // If the size of this is too small, it ends up creating a soft cap on FPS as the renderer will have // to wait on available presentation frames. constexpr std::size_t SWAP_CHAIN_SIZE = 3; @@ -41,124 +46,6 @@ struct Frame { bool is_srgb{}; /// Framebuffer is sRGB or RGB }; -/** - * For smooth Vsync rendering, we want to always present the latest frame that the core generates, - * but also make sure that rendering happens at the pace that the frontend dictates. This is a - * helper class that the renderer uses to sync frames between the render thread and the presentation - * thread - */ -class FrameMailbox { -public: - std::mutex swap_chain_lock; - std::condition_variable present_cv; - std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{}; - std::queue<Frame*> free_queue; - std::deque<Frame*> present_queue; - Frame* previous_frame{}; - - FrameMailbox() { - for (auto& frame : swap_chain) { - free_queue.push(&frame); - } - } - - ~FrameMailbox() { - // lock the mutex and clear out the present and free_queues and notify any people who are - // blocked to prevent deadlock on shutdown - std::scoped_lock lock{swap_chain_lock}; - std::queue<Frame*>().swap(free_queue); - present_queue.clear(); - present_cv.notify_all(); - } - - void ReloadPresentFrame(Frame* frame, u32 height, u32 width) { - frame->present.Release(); - frame->present.Create(); - GLint previous_draw_fbo{}; - glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo); - glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle); - glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, - frame->color.handle); - if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { - LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!"); - } - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo); - frame->color_reloaded = false; - } - - void ReloadRenderFrame(Frame* frame, u32 width, u32 height) { - // Recreate the color texture attachment - frame->color.Release(); - frame->color.Create(); - const GLenum internal_format = frame->is_srgb ? GL_SRGB8 : GL_RGB8; - glNamedRenderbufferStorage(frame->color.handle, internal_format, width, height); - - // Recreate the FBO for the render target - frame->render.Release(); - frame->render.Create(); - glBindFramebuffer(GL_FRAMEBUFFER, frame->render.handle); - glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, - frame->color.handle); - if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { - LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!"); - } - - frame->width = width; - frame->height = height; - frame->color_reloaded = true; - } - - Frame* GetRenderFrame() { - std::unique_lock lock{swap_chain_lock}; - - // If theres no free frames, we will reuse the oldest render frame - if (free_queue.empty()) { - auto frame = present_queue.back(); - present_queue.pop_back(); - return frame; - } - - Frame* frame = free_queue.front(); - free_queue.pop(); - return frame; - } - - void ReleaseRenderFrame(Frame* frame) { - std::unique_lock lock{swap_chain_lock}; - present_queue.push_front(frame); - present_cv.notify_one(); - } - - Frame* TryGetPresentFrame(int timeout_ms) { - std::unique_lock lock{swap_chain_lock}; - // wait for new entries in the present_queue - present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms), - [&] { return !present_queue.empty(); }); - if (present_queue.empty()) { - // timed out waiting for a frame to draw so return the previous frame - return previous_frame; - } - - // free the previous frame and add it back to the free queue - if (previous_frame) { - free_queue.push(previous_frame); - } - - // the newest entries are pushed to the front of the queue - Frame* frame = present_queue.front(); - present_queue.pop_front(); - // remove all old entries from the present queue and move them back to the free_queue - for (auto f : present_queue) { - free_queue.push(f); - } - present_queue.clear(); - previous_frame = frame; - return frame; - } -}; - -namespace { - constexpr char VERTEX_SHADER[] = R"( #version 430 core @@ -211,6 +98,24 @@ struct ScreenRectVertex { std::array<GLfloat, 2> tex_coord; }; +/// Returns true if any debug tool is attached +bool HasDebugTool() { + const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); + if (nsight) { + return true; + } + + GLint num_extensions; + glGetIntegerv(GL_NUM_EXTENSIONS, &num_extensions); + for (GLuint index = 0; index < static_cast<GLuint>(num_extensions); ++index) { + const auto name = reinterpret_cast<const char*>(glGetStringi(GL_EXTENSIONS, index)); + if (!std::strcmp(name, "GL_EXT_debug_tool")) { + return true; + } + } + return false; +} + /** * Defines a 1:1 pixel ortographic projection matrix with (0,0) on the top-left * corner and (width, height) on the lower-bottom. @@ -294,6 +199,153 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit } // Anonymous namespace +/** + * For smooth Vsync rendering, we want to always present the latest frame that the core generates, + * but also make sure that rendering happens at the pace that the frontend dictates. This is a + * helper class that the renderer uses to sync frames between the render thread and the presentation + * thread + */ +class FrameMailbox { +public: + std::mutex swap_chain_lock; + std::condition_variable present_cv; + std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{}; + std::queue<Frame*> free_queue; + std::deque<Frame*> present_queue; + Frame* previous_frame{}; + + FrameMailbox() : has_debug_tool{HasDebugTool()} { + for (auto& frame : swap_chain) { + free_queue.push(&frame); + } + } + + ~FrameMailbox() { + // lock the mutex and clear out the present and free_queues and notify any people who are + // blocked to prevent deadlock on shutdown + std::scoped_lock lock{swap_chain_lock}; + std::queue<Frame*>().swap(free_queue); + present_queue.clear(); + present_cv.notify_all(); + } + + void ReloadPresentFrame(Frame* frame, u32 height, u32 width) { + frame->present.Release(); + frame->present.Create(); + GLint previous_draw_fbo{}; + glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo); + glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle); + glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, + frame->color.handle); + if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { + LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!"); + } + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo); + frame->color_reloaded = false; + } + + void ReloadRenderFrame(Frame* frame, u32 width, u32 height) { + // Recreate the color texture attachment + frame->color.Release(); + frame->color.Create(); + const GLenum internal_format = frame->is_srgb ? GL_SRGB8 : GL_RGB8; + glNamedRenderbufferStorage(frame->color.handle, internal_format, width, height); + + // Recreate the FBO for the render target + frame->render.Release(); + frame->render.Create(); + glBindFramebuffer(GL_FRAMEBUFFER, frame->render.handle); + glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, + frame->color.handle); + if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { + LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!"); + } + + frame->width = width; + frame->height = height; + frame->color_reloaded = true; + } + + Frame* GetRenderFrame() { + std::unique_lock lock{swap_chain_lock}; + + // If theres no free frames, we will reuse the oldest render frame + if (free_queue.empty()) { + auto frame = present_queue.back(); + present_queue.pop_back(); + return frame; + } + + Frame* frame = free_queue.front(); + free_queue.pop(); + return frame; + } + + void ReleaseRenderFrame(Frame* frame) { + std::unique_lock lock{swap_chain_lock}; + present_queue.push_front(frame); + present_cv.notify_one(); + + DebugNotifyNextFrame(); + } + + Frame* TryGetPresentFrame(int timeout_ms) { + DebugWaitForNextFrame(); + + std::unique_lock lock{swap_chain_lock}; + // wait for new entries in the present_queue + present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms), + [&] { return !present_queue.empty(); }); + if (present_queue.empty()) { + // timed out waiting for a frame to draw so return the previous frame + return previous_frame; + } + + // free the previous frame and add it back to the free queue + if (previous_frame) { + free_queue.push(previous_frame); + } + + // the newest entries are pushed to the front of the queue + Frame* frame = present_queue.front(); + present_queue.pop_front(); + // remove all old entries from the present queue and move them back to the free_queue + for (auto f : present_queue) { + free_queue.push(f); + } + present_queue.clear(); + previous_frame = frame; + return frame; + } + +private: + std::mutex debug_synch_mutex; + std::condition_variable debug_synch_condition; + std::atomic_int frame_for_debug{}; + const bool has_debug_tool; // When true, using a GPU debugger, so keep frames in lock-step + + /// Signal that a new frame is available (called from GPU thread) + void DebugNotifyNextFrame() { + if (!has_debug_tool) { + return; + } + frame_for_debug++; + std::lock_guard lock{debug_synch_mutex}; + debug_synch_condition.notify_one(); + } + + /// Wait for a new frame to be available (called from presentation thread) + void DebugWaitForNextFrame() { + if (!has_debug_tool) { + return; + } + const int last_frame = frame_for_debug; + std::unique_lock lock{debug_synch_mutex}; + debug_synch_condition.wait(lock, + [this, last_frame] { return frame_for_debug > last_frame; }); + } +}; + RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system) : VideoCore::RendererBase{emu_window}, emu_window{emu_window}, system{system}, frame_mailbox{std::make_unique<FrameMailbox>()} {} diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index df3ac707c..0e2e5e6c7 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -125,6 +125,7 @@ struct FormatTuple { {vk::Format::eR8Uint, Attachable | Storage}, // R8UI {vk::Format::eR16G16B16A16Sfloat, Attachable | Storage}, // RGBA16F {vk::Format::eR16G16B16A16Unorm, Attachable | Storage}, // RGBA16U + {vk::Format::eR16G16B16A16Snorm, Attachable | Storage}, // RGBA16S {vk::Format::eR16G16B16A16Uint, Attachable | Storage}, // RGBA16UI {vk::Format::eB10G11R11UfloatPack32, Attachable | Storage}, // R11FG11FB10F {vk::Format::eR32G32B32A32Uint, Attachable | Storage}, // RGBA32UI @@ -331,6 +332,8 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr return vk::Format::eR16G16B16Unorm; case Maxwell::VertexAttribute::Size::Size_16_16_16_16: return vk::Format::eR16G16B16A16Unorm; + case Maxwell::VertexAttribute::Size::Size_10_10_10_2: + return vk::Format::eA2B10G10R10UnormPack32; default: break; } @@ -364,6 +367,10 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr return vk::Format::eR8G8B8A8Uint; case Maxwell::VertexAttribute::Size::Size_32: return vk::Format::eR32Uint; + case Maxwell::VertexAttribute::Size::Size_32_32: + return vk::Format::eR32G32Uint; + case Maxwell::VertexAttribute::Size::Size_32_32_32: + return vk::Format::eR32G32B32Uint; case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return vk::Format::eR32G32B32A32Uint; default: diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp index 886bde3b9..28d2fbc4f 100644 --- a/src/video_core/renderer_vulkan/vk_device.cpp +++ b/src/video_core/renderer_vulkan/vk_device.cpp @@ -107,8 +107,7 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan features.occlusionQueryPrecise = true; features.fragmentStoresAndAtomics = true; features.shaderImageGatherExtended = true; - features.shaderStorageImageReadWithoutFormat = - is_shader_storage_img_read_without_format_supported; + features.shaderStorageImageReadWithoutFormat = is_formatless_image_load_supported; features.shaderStorageImageWriteWithoutFormat = true; features.textureCompressionASTC_LDR = is_optimal_astc_supported; @@ -148,6 +147,15 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan LOG_INFO(Render_Vulkan, "Device doesn't support uint8 indexes"); } + vk::PhysicalDeviceTransformFeedbackFeaturesEXT transform_feedback; + if (ext_transform_feedback) { + transform_feedback.transformFeedback = true; + transform_feedback.geometryStreams = true; + SetNext(next, transform_feedback); + } else { + LOG_INFO(Render_Vulkan, "Device doesn't support transform feedbacks"); + } + if (!ext_depth_range_unrestricted) { LOG_INFO(Render_Vulkan, "Device doesn't support depth range unrestricted"); } @@ -385,7 +393,7 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami } }; - extensions.reserve(14); + extensions.reserve(15); extensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); extensions.push_back(VK_KHR_16BIT_STORAGE_EXTENSION_NAME); extensions.push_back(VK_KHR_8BIT_STORAGE_EXTENSION_NAME); @@ -397,18 +405,22 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami [[maybe_unused]] const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); - bool khr_shader_float16_int8{}; - bool ext_subgroup_size_control{}; + bool has_khr_shader_float16_int8{}; + bool has_ext_subgroup_size_control{}; + bool has_ext_transform_feedback{}; for (const auto& extension : physical.enumerateDeviceExtensionProperties(nullptr, dldi)) { Test(extension, khr_uniform_buffer_standard_layout, VK_KHR_UNIFORM_BUFFER_STANDARD_LAYOUT_EXTENSION_NAME, true); - Test(extension, khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, false); + Test(extension, has_khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, + false); Test(extension, ext_depth_range_unrestricted, VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true); Test(extension, ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true); Test(extension, ext_shader_viewport_index_layer, VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME, true); - Test(extension, ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, + Test(extension, has_ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, + false); + Test(extension, has_ext_transform_feedback, VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME, false); if (Settings::values.renderer_debug) { Test(extension, nv_device_diagnostic_checkpoints, @@ -416,13 +428,13 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami } } - if (khr_shader_float16_int8) { + if (has_khr_shader_float16_int8) { is_float16_supported = GetFeatures<vk::PhysicalDeviceFloat16Int8FeaturesKHR>(physical, dldi).shaderFloat16; extensions.push_back(VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME); } - if (ext_subgroup_size_control) { + if (has_ext_subgroup_size_control) { const auto features = GetFeatures<vk::PhysicalDeviceSubgroupSizeControlFeaturesEXT>(physical, dldi); const auto properties = @@ -439,6 +451,20 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami is_warp_potentially_bigger = true; } + if (has_ext_transform_feedback) { + const auto features = + GetFeatures<vk::PhysicalDeviceTransformFeedbackFeaturesEXT>(physical, dldi); + const auto properties = + GetProperties<vk::PhysicalDeviceTransformFeedbackPropertiesEXT>(physical, dldi); + + if (features.transformFeedback && features.geometryStreams && + properties.maxTransformFeedbackStreams >= 4 && properties.maxTransformFeedbackBuffers && + properties.transformFeedbackQueries && properties.transformFeedbackDraw) { + extensions.push_back(VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME); + ext_transform_feedback = true; + } + } + return extensions; } @@ -467,8 +493,7 @@ void VKDevice::SetupFamilies(const vk::DispatchLoaderDynamic& dldi, vk::SurfaceK void VKDevice::SetupFeatures(const vk::DispatchLoaderDynamic& dldi) { const auto supported_features{physical.getFeatures(dldi)}; - is_shader_storage_img_read_without_format_supported = - supported_features.shaderStorageImageReadWithoutFormat; + is_formatless_image_load_supported = supported_features.shaderStorageImageReadWithoutFormat; is_optimal_astc_supported = IsOptimalAstcSupported(supported_features, dldi); } @@ -510,6 +535,7 @@ std::unordered_map<vk::Format, vk::FormatProperties> VKDevice::GetFormatProperti vk::Format::eR32G32Sfloat, vk::Format::eR32G32Uint, vk::Format::eR16G16B16A16Uint, + vk::Format::eR16G16B16A16Snorm, vk::Format::eR16G16B16A16Unorm, vk::Format::eR16G16Unorm, vk::Format::eR16G16Snorm, diff --git a/src/video_core/renderer_vulkan/vk_device.h b/src/video_core/renderer_vulkan/vk_device.h index 2c27ad730..6e656517f 100644 --- a/src/video_core/renderer_vulkan/vk_device.h +++ b/src/video_core/renderer_vulkan/vk_device.h @@ -122,11 +122,6 @@ public: return properties.limits.maxPushConstantsSize; } - /// Returns true if Shader storage Image Read Without Format supported. - bool IsShaderStorageImageReadWithoutFormatSupported() const { - return is_shader_storage_img_read_without_format_supported; - } - /// Returns true if ASTC is natively supported. bool IsOptimalAstcSupported() const { return is_optimal_astc_supported; @@ -147,6 +142,11 @@ public: return (guest_warp_stages & stage) != vk::ShaderStageFlags{}; } + /// Returns true if formatless image load is supported. + bool IsFormatlessImageLoadSupported() const { + return is_formatless_image_load_supported; + } + /// Returns true if the device supports VK_EXT_scalar_block_layout. bool IsKhrUniformBufferStandardLayoutSupported() const { return khr_uniform_buffer_standard_layout; @@ -167,6 +167,11 @@ public: return ext_shader_viewport_index_layer; } + /// Returns true if the device supports VK_EXT_transform_feedback. + bool IsExtTransformFeedbackSupported() const { + return ext_transform_feedback; + } + /// Returns true if the device supports VK_NV_device_diagnostic_checkpoints. bool IsNvDeviceDiagnosticCheckpoints() const { return nv_device_diagnostic_checkpoints; @@ -214,26 +219,26 @@ private: static std::unordered_map<vk::Format, vk::FormatProperties> GetFormatProperties( const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDevice physical); - const vk::PhysicalDevice physical; ///< Physical device. - vk::DispatchLoaderDynamic dld; ///< Device function pointers. - vk::PhysicalDeviceProperties properties; ///< Device properties. - UniqueDevice logical; ///< Logical device. - vk::Queue graphics_queue; ///< Main graphics queue. - vk::Queue present_queue; ///< Main present queue. - u32 graphics_family{}; ///< Main graphics queue family index. - u32 present_family{}; ///< Main present queue family index. - vk::DriverIdKHR driver_id{}; ///< Driver ID. - vk::ShaderStageFlags guest_warp_stages{}; ///< Stages where the guest warp size can be forced. - bool is_optimal_astc_supported{}; ///< Support for native ASTC. - bool is_float16_supported{}; ///< Support for float16 arithmetics. - bool is_warp_potentially_bigger{}; ///< Host warp size can be bigger than guest. + const vk::PhysicalDevice physical; ///< Physical device. + vk::DispatchLoaderDynamic dld; ///< Device function pointers. + vk::PhysicalDeviceProperties properties; ///< Device properties. + UniqueDevice logical; ///< Logical device. + vk::Queue graphics_queue; ///< Main graphics queue. + vk::Queue present_queue; ///< Main present queue. + u32 graphics_family{}; ///< Main graphics queue family index. + u32 present_family{}; ///< Main present queue family index. + vk::DriverIdKHR driver_id{}; ///< Driver ID. + vk::ShaderStageFlags guest_warp_stages{}; ///< Stages where the guest warp size can be forced.ed + bool is_optimal_astc_supported{}; ///< Support for native ASTC. + bool is_float16_supported{}; ///< Support for float16 arithmetics. + bool is_warp_potentially_bigger{}; ///< Host warp size can be bigger than guest. + bool is_formatless_image_load_supported{}; ///< Support for shader image read without format. bool khr_uniform_buffer_standard_layout{}; ///< Support for std430 on UBOs. bool ext_index_type_uint8{}; ///< Support for VK_EXT_index_type_uint8. bool ext_depth_range_unrestricted{}; ///< Support for VK_EXT_depth_range_unrestricted. bool ext_shader_viewport_index_layer{}; ///< Support for VK_EXT_shader_viewport_index_layer. + bool ext_transform_feedback{}; ///< Support for VK_EXT_transform_feedback. bool nv_device_diagnostic_checkpoints{}; ///< Support for VK_NV_device_diagnostic_checkpoints. - bool is_shader_storage_img_read_without_format_supported{}; ///< Support for shader storage - ///< image read without format // Telemetry parameters std::string vendor_name; ///< Device's driver name. diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index ebf85f311..91e7b7791 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -179,10 +179,11 @@ Tegra::Engines::ConstBufferEngineInterface& CachedShader::GetEngine( VKPipelineCache::VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer, const VKDevice& device, VKScheduler& scheduler, VKDescriptorPool& descriptor_pool, - VKUpdateDescriptorQueue& update_descriptor_queue) + VKUpdateDescriptorQueue& update_descriptor_queue, + VKRenderPassCache& renderpass_cache) : RasterizerCache{rasterizer}, system{system}, device{device}, scheduler{scheduler}, descriptor_pool{descriptor_pool}, update_descriptor_queue{update_descriptor_queue}, - renderpass_cache(device) {} + renderpass_cache{renderpass_cache} {} VKPipelineCache::~VKPipelineCache() = default; @@ -273,9 +274,9 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach specialization.workgroup_size = key.workgroup_size; specialization.shared_memory_size = key.shared_memory_size; - const SPIRVShader spirv_shader{ - Decompile(device, shader->GetIR(), ShaderType::Compute, specialization), - shader->GetEntries()}; + const SPIRVShader spirv_shader{Decompile(device, shader->GetIR(), ShaderType::Compute, + shader->GetRegistry(), specialization), + shader->GetEntries()}; entry = std::make_unique<VKComputePipeline>(device, scheduler, descriptor_pool, update_descriptor_queue, spirv_shader); return *entry; @@ -324,8 +325,7 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) { const auto& gpu = system.GPU().Maxwell3D(); Specialization specialization; - specialization.primitive_topology = fixed_state.input_assembly.topology; - if (specialization.primitive_topology == Maxwell::PrimitiveTopology::Points) { + if (fixed_state.input_assembly.topology == Maxwell::PrimitiveTopology::Points) { ASSERT(fixed_state.input_assembly.point_size != 0.0f); specialization.point_size = fixed_state.input_assembly.point_size; } @@ -333,9 +333,6 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) { specialization.attribute_types[i] = fixed_state.vertex_input.attributes[i].type; } specialization.ndc_minus_one_to_one = fixed_state.rasterizer.ndc_minus_one_to_one; - specialization.tessellation.primitive = fixed_state.tessellation.primitive; - specialization.tessellation.spacing = fixed_state.tessellation.spacing; - specialization.tessellation.clockwise = fixed_state.tessellation.clockwise; SPIRVProgram program; std::vector<vk::DescriptorSetLayoutBinding> bindings; @@ -356,8 +353,9 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) { const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5 const auto program_type = GetShaderType(program_enum); const auto& entries = shader->GetEntries(); - program[stage] = {Decompile(device, shader->GetIR(), program_type, specialization), - entries}; + program[stage] = { + Decompile(device, shader->GetIR(), program_type, shader->GetRegistry(), specialization), + entries}; if (program_enum == Maxwell::ShaderProgram::VertexA) { // VertexB was combined with VertexA, so we skip the VertexB iteration diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h index e292526bb..c4c112290 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h @@ -132,6 +132,10 @@ public: return shader_ir; } + const VideoCommon::Shader::Registry& GetRegistry() const { + return registry; + } + const VideoCommon::Shader::ShaderIR& GetIR() const { return shader_ir; } @@ -157,7 +161,8 @@ public: explicit VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer, const VKDevice& device, VKScheduler& scheduler, VKDescriptorPool& descriptor_pool, - VKUpdateDescriptorQueue& update_descriptor_queue); + VKUpdateDescriptorQueue& update_descriptor_queue, + VKRenderPassCache& renderpass_cache); ~VKPipelineCache(); std::array<Shader, Maxwell::MaxShaderProgram> GetShaders(); @@ -180,8 +185,7 @@ private: VKScheduler& scheduler; VKDescriptorPool& descriptor_pool; VKUpdateDescriptorQueue& update_descriptor_queue; - - VKRenderPassCache renderpass_cache; + VKRenderPassCache& renderpass_cache; std::array<Shader, Maxwell::MaxShaderProgram> last_shaders; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 2bcb17b56..755aad643 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -287,12 +287,13 @@ RasterizerVulkan::RasterizerVulkan(Core::System& system, Core::Frontend::EmuWind screen_info{screen_info}, device{device}, resource_manager{resource_manager}, memory_manager{memory_manager}, state_tracker{state_tracker}, scheduler{scheduler}, staging_pool(device, memory_manager, scheduler), descriptor_pool(device), - update_descriptor_queue(device, scheduler), + update_descriptor_queue(device, scheduler), renderpass_cache(device), quad_array_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), texture_cache(system, *this, device, resource_manager, memory_manager, scheduler, staging_pool), - pipeline_cache(system, *this, device, scheduler, descriptor_pool, update_descriptor_queue), + pipeline_cache(system, *this, device, scheduler, descriptor_pool, update_descriptor_queue, + renderpass_cache), buffer_cache(*this, system, device, memory_manager, scheduler, staging_pool), sampler_cache(device), query_cache(system, *this, device, scheduler) { scheduler.SetQueryCache(query_cache); @@ -347,6 +348,8 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { [&pipeline](auto cmdbuf, auto& dld) { cmdbuf.setCheckpointNV(&pipeline, dld); }); } + BeginTransformFeedback(); + const auto pipeline_layout = pipeline.GetLayout(); const auto descriptor_set = pipeline.CommitDescriptorSet(); scheduler.Record([pipeline_layout, descriptor_set, draw_params](auto cmdbuf, auto& dld) { @@ -356,18 +359,23 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { } draw_params.Draw(cmdbuf, dld); }); + + EndTransformFeedback(); } void RasterizerVulkan::Clear() { MICROPROFILE_SCOPE(Vulkan_Clearing); - query_cache.UpdateCounters(); - const auto& gpu = system.GPU().Maxwell3D(); if (!system.GPU().Maxwell3D().ShouldExecute()) { return; } + sampled_views.clear(); + image_views.clear(); + + query_cache.UpdateCounters(); + const auto& regs = gpu.regs; const bool use_color = regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || regs.clear_buffers.A; @@ -376,52 +384,54 @@ void RasterizerVulkan::Clear() { if (!use_color && !use_depth && !use_stencil) { return; } - // Clearing images requires to be out of a renderpass - scheduler.RequestOutsideRenderPassOperationContext(); - // TODO(Rodrigo): Implement clears rendering a quad or using beginning a renderpass. + [[maybe_unused]] const auto texceptions = UpdateAttachments(); + DEBUG_ASSERT(texceptions.none()); + SetupImageTransitions(0, color_attachments, zeta_attachment); - if (use_color) { - View color_view; - { - MICROPROFILE_SCOPE(Vulkan_RenderTargets); - color_view = texture_cache.GetColorBufferSurface(regs.clear_buffers.RT.Value(), false); - } + const vk::RenderPass renderpass = renderpass_cache.GetRenderPass(GetRenderPassParams(0)); + const auto [framebuffer, render_area] = ConfigureFramebuffers(renderpass); + scheduler.RequestRenderpass({renderpass, framebuffer, {{0, 0}, render_area}, 0, nullptr}); + + const auto& scissor = regs.scissor_test[0]; + const vk::Offset2D scissor_offset(scissor.min_x, scissor.min_y); + vk::Extent2D scissor_extent{scissor.max_x - scissor.min_x, scissor.max_y - scissor.min_y}; + scissor_extent.width = std::min(scissor_extent.width, render_area.width); + scissor_extent.height = std::min(scissor_extent.height, render_area.height); - color_view->Transition(vk::ImageLayout::eTransferDstOptimal, - vk::PipelineStageFlagBits::eTransfer, - vk::AccessFlagBits::eTransferWrite); + const u32 layer = regs.clear_buffers.layer; + const vk::ClearRect clear_rect({scissor_offset, scissor_extent}, layer, 1); + if (use_color) { const std::array clear_color = {regs.clear_color[0], regs.clear_color[1], regs.clear_color[2], regs.clear_color[3]}; - const vk::ClearColorValue clear(clear_color); - scheduler.Record([image = color_view->GetImage(), - subresource = color_view->GetImageSubresourceRange(), - clear](auto cmdbuf, auto& dld) { - cmdbuf.clearColorImage(image, vk::ImageLayout::eTransferDstOptimal, clear, subresource, - dld); + const vk::ClearValue clear_value{clear_color}; + const u32 color_attachment = regs.clear_buffers.RT; + scheduler.Record([color_attachment, clear_value, clear_rect](auto cmdbuf, auto& dld) { + const vk::ClearAttachment attachment(vk::ImageAspectFlagBits::eColor, color_attachment, + clear_value); + cmdbuf.clearAttachments(1, &attachment, 1, &clear_rect, dld); }); } - if (use_depth || use_stencil) { - View zeta_surface; - { - MICROPROFILE_SCOPE(Vulkan_RenderTargets); - zeta_surface = texture_cache.GetDepthBufferSurface(false); - } - zeta_surface->Transition(vk::ImageLayout::eTransferDstOptimal, - vk::PipelineStageFlagBits::eTransfer, - vk::AccessFlagBits::eTransferWrite); - - const vk::ClearDepthStencilValue clear(regs.clear_depth, - static_cast<u32>(regs.clear_stencil)); - scheduler.Record([image = zeta_surface->GetImage(), - subresource = zeta_surface->GetImageSubresourceRange(), - clear](auto cmdbuf, auto& dld) { - cmdbuf.clearDepthStencilImage(image, vk::ImageLayout::eTransferDstOptimal, clear, - subresource, dld); - }); + if (!use_depth && !use_stencil) { + return; + } + vk::ImageAspectFlags aspect_flags; + if (use_depth) { + aspect_flags |= vk::ImageAspectFlagBits::eDepth; } + if (use_stencil) { + aspect_flags |= vk::ImageAspectFlagBits::eStencil; + } + + scheduler.Record([clear_depth = regs.clear_depth, clear_stencil = regs.clear_stencil, + clear_rect, aspect_flags](auto cmdbuf, auto& dld) { + const vk::ClearDepthStencilValue clear_zeta(clear_depth, clear_stencil); + const vk::ClearValue clear_value{clear_zeta}; + const vk::ClearAttachment attachment(aspect_flags, 0, clear_value); + cmdbuf.clearAttachments(1, &attachment, 1, &clear_rect, dld); + }); } void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { @@ -738,6 +748,44 @@ void RasterizerVulkan::UpdateDynamicStates() { UpdateStencilFaces(regs); } +void RasterizerVulkan::BeginTransformFeedback() { + const auto& regs = system.GPU().Maxwell3D().regs; + if (regs.tfb_enabled == 0) { + return; + } + + UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || + regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || + regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry)); + + UNIMPLEMENTED_IF(regs.tfb_bindings[1].buffer_enable); + UNIMPLEMENTED_IF(regs.tfb_bindings[2].buffer_enable); + UNIMPLEMENTED_IF(regs.tfb_bindings[3].buffer_enable); + + const auto& binding = regs.tfb_bindings[0]; + UNIMPLEMENTED_IF(binding.buffer_enable == 0); + UNIMPLEMENTED_IF(binding.buffer_offset != 0); + + const GPUVAddr gpu_addr = binding.Address(); + const std::size_t size = binding.buffer_size; + const auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true); + + scheduler.Record([buffer = *buffer, offset = offset, size](auto cmdbuf, auto& dld) { + cmdbuf.bindTransformFeedbackBuffersEXT(0, {buffer}, {offset}, {size}, dld); + cmdbuf.beginTransformFeedbackEXT(0, {}, {}, dld); + }); +} + +void RasterizerVulkan::EndTransformFeedback() { + const auto& regs = system.GPU().Maxwell3D().regs; + if (regs.tfb_enabled == 0) { + return; + } + + scheduler.Record( + [](auto cmdbuf, auto& dld) { cmdbuf.endTransformFeedbackEXT(0, {}, {}, dld); }); +} + void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input, BufferBindings& buffer_bindings) { const auto& regs = system.GPU().Maxwell3D().regs; @@ -1109,7 +1157,7 @@ std::size_t RasterizerVulkan::CalculateVertexArraysSize() const { // This implementation assumes that all attributes are used in the shader. const GPUVAddr start{regs.vertex_array[index].StartAddress()}; const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()}; - DEBUG_ASSERT(end > start); + DEBUG_ASSERT(end >= start); size += (end - start + 1) * regs.vertex_array[index].enable; } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 96ea05f0a..3185868e9 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -169,6 +169,10 @@ private: void UpdateDynamicStates(); + void BeginTransformFeedback(); + + void EndTransformFeedback(); + bool WalkAttachmentOverlaps(const CachedSurfaceView& attachment); void SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input, @@ -249,6 +253,7 @@ private: VKStagingBufferPool staging_pool; VKDescriptorPool descriptor_pool; VKUpdateDescriptorQueue update_descriptor_queue; + VKRenderPassCache renderpass_cache; QuadArrayPass quad_array_pass; Uint8Pass uint8_pass; diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index cfcca5af0..51ecb5567 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -5,7 +5,9 @@ #include <functional> #include <limits> #include <map> +#include <optional> #include <type_traits> +#include <unordered_map> #include <utility> #include <fmt/format.h> @@ -24,6 +26,7 @@ #include "video_core/renderer_vulkan/vk_shader_decompiler.h" #include "video_core/shader/node.h" #include "video_core/shader/shader_ir.h" +#include "video_core/shader/transform_feedback.h" namespace Vulkan { @@ -93,6 +96,12 @@ struct VertexIndices { std::optional<u32> clip_distances; }; +struct GenericVaryingDescription { + Id id = nullptr; + u32 first_element = 0; + bool is_scalar = false; +}; + spv::Dim GetSamplerDim(const Sampler& sampler) { ASSERT(!sampler.IsBuffer()); switch (sampler.GetType()) { @@ -266,9 +275,13 @@ bool IsPrecise(Operation operand) { class SPIRVDecompiler final : public Sirit::Module { public: explicit SPIRVDecompiler(const VKDevice& device, const ShaderIR& ir, ShaderType stage, - const Specialization& specialization) + const Registry& registry, const Specialization& specialization) : Module(0x00010300), device{device}, ir{ir}, stage{stage}, header{ir.GetHeader()}, - specialization{specialization} { + registry{registry}, specialization{specialization} { + if (stage != ShaderType::Compute) { + transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); + } + AddCapability(spv::Capability::Shader); AddCapability(spv::Capability::UniformAndStorageBuffer16BitAccess); AddCapability(spv::Capability::ImageQuery); @@ -286,6 +299,15 @@ public: AddExtension("SPV_KHR_variable_pointers"); AddExtension("SPV_KHR_shader_draw_parameters"); + if (!transform_feedback.empty()) { + if (device.IsExtTransformFeedbackSupported()) { + AddCapability(spv::Capability::TransformFeedback); + } else { + LOG_ERROR(Render_Vulkan, "Shader requires transform feedbacks but these are not " + "supported on this device"); + } + } + if (ir.UsesLayer() || ir.UsesViewportIndex()) { if (ir.UsesViewportIndex()) { AddCapability(spv::Capability::MultiViewport); @@ -296,7 +318,7 @@ public: } } - if (device.IsShaderStorageImageReadWithoutFormatSupported()) { + if (device.IsFormatlessImageLoadSupported()) { AddCapability(spv::Capability::StorageImageReadWithoutFormat); } @@ -318,25 +340,29 @@ public: AddExecutionMode(main, spv::ExecutionMode::OutputVertices, header.common2.threads_per_input_primitive); break; - case ShaderType::TesselationEval: + case ShaderType::TesselationEval: { + const auto& info = registry.GetGraphicsInfo(); AddCapability(spv::Capability::Tessellation); AddEntryPoint(spv::ExecutionModel::TessellationEvaluation, main, "main", interfaces); - AddExecutionMode(main, GetExecutionMode(specialization.tessellation.primitive)); - AddExecutionMode(main, GetExecutionMode(specialization.tessellation.spacing)); - AddExecutionMode(main, specialization.tessellation.clockwise + AddExecutionMode(main, GetExecutionMode(info.tessellation_primitive)); + AddExecutionMode(main, GetExecutionMode(info.tessellation_spacing)); + AddExecutionMode(main, info.tessellation_clockwise ? spv::ExecutionMode::VertexOrderCw : spv::ExecutionMode::VertexOrderCcw); break; - case ShaderType::Geometry: + } + case ShaderType::Geometry: { + const auto& info = registry.GetGraphicsInfo(); AddCapability(spv::Capability::Geometry); AddEntryPoint(spv::ExecutionModel::Geometry, main, "main", interfaces); - AddExecutionMode(main, GetExecutionMode(specialization.primitive_topology)); + AddExecutionMode(main, GetExecutionMode(info.primitive_topology)); AddExecutionMode(main, GetExecutionMode(header.common3.output_topology)); AddExecutionMode(main, spv::ExecutionMode::OutputVertices, header.common4.max_output_vertices); // TODO(Rodrigo): Where can we get this info from? AddExecutionMode(main, spv::ExecutionMode::Invocations, 1U); break; + } case ShaderType::Fragment: AddEntryPoint(spv::ExecutionModel::Fragment, main, "main", interfaces); AddExecutionMode(main, spv::ExecutionMode::OriginUpperLeft); @@ -545,7 +571,8 @@ private: if (stage != ShaderType::Geometry) { return; } - const u32 num_input = GetNumPrimitiveTopologyVertices(specialization.primitive_topology); + const auto& info = registry.GetGraphicsInfo(); + const u32 num_input = GetNumPrimitiveTopologyVertices(info.primitive_topology); DeclareInputVertexArray(num_input); DeclareOutputVertex(); } @@ -742,12 +769,34 @@ private: } void DeclareOutputAttributes() { + if (stage == ShaderType::Compute || stage == ShaderType::Fragment) { + return; + } + + UNIMPLEMENTED_IF(registry.GetGraphicsInfo().tfb_enabled && stage != ShaderType::Vertex); for (const auto index : ir.GetOutputAttributes()) { if (!IsGenericAttribute(index)) { continue; } - const u32 location = GetGenericAttributeLocation(index); - Id type = t_float4; + DeclareOutputAttribute(index); + } + } + + void DeclareOutputAttribute(Attribute::Index index) { + static constexpr std::string_view swizzle = "xyzw"; + + const u32 location = GetGenericAttributeLocation(index); + u8 element = 0; + while (element < 4) { + const std::size_t remainder = 4 - element; + + std::size_t num_components = remainder; + const std::optional tfb = GetTransformFeedbackInfo(index, element); + if (tfb) { + num_components = tfb->components; + } + + Id type = GetTypeVectorDefinitionLut(Type::Float).at(num_components - 1); Id varying_default = v_varying_default; if (IsOutputAttributeArray()) { const u32 num = GetNumOutputVertices(); @@ -760,13 +809,45 @@ private: } type = TypePointer(spv::StorageClass::Output, type); + std::string name = fmt::format("out_attr{}", location); + if (num_components < 4 || element > 0) { + name = fmt::format("{}_{}", name, swizzle.substr(element, num_components)); + } + const Id id = OpVariable(type, spv::StorageClass::Output, varying_default); - Name(AddGlobalVariable(id), fmt::format("out_attr{}", location)); - output_attributes.emplace(index, id); + Name(AddGlobalVariable(id), name); + + GenericVaryingDescription description; + description.id = id; + description.first_element = element; + description.is_scalar = num_components == 1; + for (u32 i = 0; i < num_components; ++i) { + const u8 offset = static_cast<u8>(static_cast<u32>(index) * 4 + element + i); + output_attributes.emplace(offset, description); + } interfaces.push_back(id); Decorate(id, spv::Decoration::Location, location); + if (element > 0) { + Decorate(id, spv::Decoration::Component, static_cast<u32>(element)); + } + if (tfb && device.IsExtTransformFeedbackSupported()) { + Decorate(id, spv::Decoration::XfbBuffer, static_cast<u32>(tfb->buffer)); + Decorate(id, spv::Decoration::XfbStride, static_cast<u32>(tfb->stride)); + Decorate(id, spv::Decoration::Offset, static_cast<u32>(tfb->offset)); + } + + element = static_cast<u8>(static_cast<std::size_t>(element) + num_components); + } + } + + std::optional<VaryingTFB> GetTransformFeedbackInfo(Attribute::Index index, u8 element = 0) { + const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element); + const auto it = transform_feedback.find(location); + if (it == transform_feedback.end()) { + return {}; } + return it->second; } u32 DeclareConstantBuffers(u32 binding) { @@ -898,7 +979,7 @@ private: u32 GetNumInputVertices() const { switch (stage) { case ShaderType::Geometry: - return GetNumPrimitiveTopologyVertices(specialization.primitive_topology); + return GetNumPrimitiveTopologyVertices(registry.GetGraphicsInfo().primitive_topology); case ShaderType::TesselationControl: case ShaderType::TesselationEval: return NumInputPatches; @@ -1346,8 +1427,14 @@ private: } default: if (IsGenericAttribute(attribute)) { - const Id composite = output_attributes.at(attribute); - return {ArrayPass(t_out_float, composite, {element}), Type::Float}; + const u8 offset = static_cast<u8>(static_cast<u8>(attribute) * 4 + element); + const GenericVaryingDescription description = output_attributes.at(offset); + const Id composite = description.id; + std::vector<u32> indices; + if (!description.is_scalar) { + indices.push_back(element - description.first_element); + } + return {ArrayPass(t_out_float, composite, indices), Type::Float}; } UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute)); @@ -1793,7 +1880,7 @@ private: } Expression ImageLoad(Operation operation) { - if (!device.IsShaderStorageImageReadWithoutFormatSupported()) { + if (!device.IsFormatlessImageLoadSupported()) { return {v_float_zero, Type::Float}; } @@ -2258,11 +2345,11 @@ private: std::array<Id, 4> GetTypeVectorDefinitionLut(Type type) const { switch (type) { case Type::Float: - return {nullptr, t_float2, t_float3, t_float4}; + return {t_float, t_float2, t_float3, t_float4}; case Type::Int: - return {nullptr, t_int2, t_int3, t_int4}; + return {t_int, t_int2, t_int3, t_int4}; case Type::Uint: - return {nullptr, t_uint2, t_uint3, t_uint4}; + return {t_uint, t_uint2, t_uint3, t_uint4}; default: UNIMPLEMENTED(); return {}; @@ -2495,7 +2582,9 @@ private: const ShaderIR& ir; const ShaderType stage; const Tegra::Shader::Header header; + const Registry& registry; const Specialization& specialization; + std::unordered_map<u8, VaryingTFB> transform_feedback; const Id t_void = Name(TypeVoid(), "void"); @@ -2584,7 +2673,7 @@ private: Id shared_memory{}; std::array<Id, INTERNAL_FLAGS_COUNT> internal_flags{}; std::map<Attribute::Index, Id> input_attributes; - std::map<Attribute::Index, Id> output_attributes; + std::unordered_map<u8, GenericVaryingDescription> output_attributes; std::map<u32, Id> constant_buffers; std::map<GlobalMemoryBase, Id> global_buffers; std::map<u32, TexelBuffer> texel_buffers; @@ -2870,8 +2959,9 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) { } std::vector<u32> Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir, - ShaderType stage, const Specialization& specialization) { - return SPIRVDecompiler(device, ir, stage, specialization).Assemble(); + ShaderType stage, const VideoCommon::Shader::Registry& registry, + const Specialization& specialization) { + return SPIRVDecompiler(device, ir, stage, registry, specialization).Assemble(); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h index f5dc14d9e..ffea4709e 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h @@ -15,6 +15,7 @@ #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/shader_type.h" +#include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" namespace Vulkan { @@ -91,17 +92,9 @@ struct Specialization final { u32 shared_memory_size{}; // Graphics specific - Maxwell::PrimitiveTopology primitive_topology{}; std::optional<float> point_size{}; std::array<Maxwell::VertexAttribute::Type, Maxwell::NumVertexAttributes> attribute_types{}; bool ndc_minus_one_to_one{}; - - // Tessellation specific - struct { - Maxwell::TessellationPrimitive primitive{}; - Maxwell::TessellationSpacing spacing{}; - bool clockwise{}; - } tessellation; }; // Old gcc versions don't consider this trivially copyable. // static_assert(std::is_trivially_copyable_v<Specialization>); @@ -114,6 +107,8 @@ struct SPIRVShader { ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir); std::vector<u32> Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir, - Tegra::Engines::ShaderType stage, const Specialization& specialization); + Tegra::Engines::ShaderType stage, + const VideoCommon::Shader::Registry& registry, + const Specialization& specialization); } // namespace Vulkan diff --git a/src/video_core/shader/decode/bfe.cpp b/src/video_core/shader/decode/bfe.cpp index e02bcd097..8e3b46e8e 100644 --- a/src/video_core/shader/decode/bfe.cpp +++ b/src/video_core/shader/decode/bfe.cpp @@ -17,33 +17,60 @@ u32 ShaderIR::DecodeBfe(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - UNIMPLEMENTED_IF(instr.bfe.negate_b); - Node op_a = GetRegister(instr.gpr8); - op_a = GetOperandAbsNegInteger(op_a, false, instr.bfe.negate_a, false); - - switch (opcode->get().GetId()) { - case OpCode::Id::BFE_IMM: { - UNIMPLEMENTED_IF_MSG(instr.generates_cc, - "Condition codes generation in BFE is not implemented"); + Node op_b = [&] { + switch (opcode->get().GetId()) { + case OpCode::Id::BFE_R: + return GetRegister(instr.gpr20); + case OpCode::Id::BFE_C: + return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); + case OpCode::Id::BFE_IMM: + return Immediate(instr.alu.GetSignedImm20_20()); + default: + UNREACHABLE(); + return Immediate(0); + } + }(); - const Node inner_shift_imm = Immediate(static_cast<u32>(instr.bfe.GetLeftShiftValue())); - const Node outer_shift_imm = - Immediate(static_cast<u32>(instr.bfe.GetLeftShiftValue() + instr.bfe.shift_position)); + UNIMPLEMENTED_IF_MSG(instr.bfe.rd_cc, "Condition codes in BFE is not implemented"); - const Node inner_shift = - Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, op_a, inner_shift_imm); - const Node outer_shift = - Operation(OperationCode::ILogicalShiftRight, NO_PRECISE, inner_shift, outer_shift_imm); + const bool is_signed = instr.bfe.is_signed; - SetInternalFlagsFromInteger(bb, outer_shift, instr.generates_cc); - SetRegister(bb, instr.gpr0, outer_shift); - break; - } - default: - UNIMPLEMENTED_MSG("Unhandled BFE instruction: {}", opcode->get().GetName()); + // using reverse parallel method in + // https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel + // note for later if possible to implement faster method. + if (instr.bfe.brev) { + const auto swap = [&](u32 s, u32 mask) { + Node v1 = + SignedOperation(OperationCode::ILogicalShiftRight, is_signed, op_a, Immediate(s)); + if (mask != 0) { + v1 = SignedOperation(OperationCode::IBitwiseAnd, is_signed, std::move(v1), + Immediate(mask)); + } + Node v2 = op_a; + if (mask != 0) { + v2 = SignedOperation(OperationCode::IBitwiseAnd, is_signed, std::move(v2), + Immediate(mask)); + } + v2 = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, std::move(v2), + Immediate(s)); + return SignedOperation(OperationCode::IBitwiseOr, is_signed, std::move(v1), + std::move(v2)); + }; + op_a = swap(1, 0x55555555U); + op_a = swap(2, 0x33333333U); + op_a = swap(4, 0x0F0F0F0FU); + op_a = swap(8, 0x00FF00FFU); + op_a = swap(16, 0); } + const auto offset = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_b, + Immediate(0), Immediate(8)); + const auto bits = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_b, + Immediate(8), Immediate(8)); + auto result = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_a, offset, bits); + SetRegister(bb, instr.gpr0, std::move(result)); + return pc; } diff --git a/src/video_core/shader/node_helper.cpp b/src/video_core/shader/node_helper.cpp index b3dcd291c..76c56abb5 100644 --- a/src/video_core/shader/node_helper.cpp +++ b/src/video_core/shader/node_helper.cpp @@ -68,6 +68,8 @@ OperationCode SignedToUnsignedCode(OperationCode operation_code, bool is_signed) return OperationCode::UBitwiseXor; case OperationCode::IBitwiseNot: return OperationCode::UBitwiseNot; + case OperationCode::IBitfieldExtract: + return OperationCode::UBitfieldExtract; case OperationCode::IBitfieldInsert: return OperationCode::UBitfieldInsert; case OperationCode::IBitCount: diff --git a/src/video_core/shader/transform_feedback.cpp b/src/video_core/shader/transform_feedback.cpp new file mode 100644 index 000000000..22a933761 --- /dev/null +++ b/src/video_core/shader/transform_feedback.cpp @@ -0,0 +1,115 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <array> +#include <unordered_map> + +#include "common/assert.h" +#include "common/common_types.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/shader/registry.h" +#include "video_core/shader/transform_feedback.h" + +namespace VideoCommon::Shader { + +namespace { + +using Maxwell = Tegra::Engines::Maxwell3D::Regs; + +// TODO(Rodrigo): Change this to constexpr std::unordered_set in C++20 + +/// Attribute offsets that describe a vector +constexpr std::array VECTORS = { + 28, // gl_Position + 32, // Generic 0 + 36, // Generic 1 + 40, // Generic 2 + 44, // Generic 3 + 48, // Generic 4 + 52, // Generic 5 + 56, // Generic 6 + 60, // Generic 7 + 64, // Generic 8 + 68, // Generic 9 + 72, // Generic 10 + 76, // Generic 11 + 80, // Generic 12 + 84, // Generic 13 + 88, // Generic 14 + 92, // Generic 15 + 96, // Generic 16 + 100, // Generic 17 + 104, // Generic 18 + 108, // Generic 19 + 112, // Generic 20 + 116, // Generic 21 + 120, // Generic 22 + 124, // Generic 23 + 128, // Generic 24 + 132, // Generic 25 + 136, // Generic 26 + 140, // Generic 27 + 144, // Generic 28 + 148, // Generic 29 + 152, // Generic 30 + 156, // Generic 31 + 160, // gl_FrontColor + 164, // gl_FrontSecondaryColor + 160, // gl_BackColor + 164, // gl_BackSecondaryColor + 192, // gl_TexCoord[0] + 196, // gl_TexCoord[1] + 200, // gl_TexCoord[2] + 204, // gl_TexCoord[3] + 208, // gl_TexCoord[4] + 212, // gl_TexCoord[5] + 216, // gl_TexCoord[6] + 220, // gl_TexCoord[7] +}; +} // namespace + +std::unordered_map<u8, VaryingTFB> BuildTransformFeedback(const GraphicsInfo& info) { + + std::unordered_map<u8, VaryingTFB> tfb; + + for (std::size_t buffer = 0; buffer < Maxwell::NumTransformFeedbackBuffers; ++buffer) { + const auto& locations = info.tfb_varying_locs[buffer]; + const auto& layout = info.tfb_layouts[buffer]; + const std::size_t varying_count = layout.varying_count; + + std::size_t highest = 0; + + for (std::size_t offset = 0; offset < varying_count; ++offset) { + const std::size_t base_offset = offset; + const u8 location = locations[offset]; + + VaryingTFB varying; + varying.buffer = layout.stream; + varying.stride = layout.stride; + varying.offset = offset * sizeof(u32); + varying.components = 1; + + if (std::find(VECTORS.begin(), VECTORS.end(), location / 4 * 4) != VECTORS.end()) { + UNIMPLEMENTED_IF_MSG(location % 4 != 0, "Unaligned TFB"); + + const u8 base_index = location / 4; + while (offset + 1 < varying_count && base_index == locations[offset + 1] / 4) { + ++offset; + ++varying.components; + } + } + + [[maybe_unused]] const bool inserted = tfb.emplace(location, varying).second; + UNIMPLEMENTED_IF_MSG(!inserted, "Varying already stored"); + + highest = std::max(highest, (base_offset + varying.components) * sizeof(u32)); + } + + UNIMPLEMENTED_IF(highest != layout.stride); + } + return tfb; +} + +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/transform_feedback.h b/src/video_core/shader/transform_feedback.h new file mode 100644 index 000000000..77d05f64c --- /dev/null +++ b/src/video_core/shader/transform_feedback.h @@ -0,0 +1,23 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <unordered_map> + +#include "common/common_types.h" +#include "video_core/shader/registry.h" + +namespace VideoCommon::Shader { + +struct VaryingTFB { + std::size_t buffer; + std::size_t stride; + std::size_t offset; + std::size_t components; +}; + +std::unordered_map<u8, VaryingTFB> BuildTransformFeedback(const GraphicsInfo& info); + +} // namespace VideoCommon::Shader diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index 9707c353d..cc7181229 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -111,6 +111,8 @@ PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format) return PixelFormat::RGBA16F; case Tegra::RenderTargetFormat::RGBA16_UNORM: return PixelFormat::RGBA16U; + case Tegra::RenderTargetFormat::RGBA16_SNORM: + return PixelFormat::RGBA16S; case Tegra::RenderTargetFormat::RGBA16_UINT: return PixelFormat::RGBA16UI; case Tegra::RenderTargetFormat::RGBA32_FLOAT: diff --git a/src/video_core/surface.h b/src/video_core/surface.h index d88109e5a..ae8817465 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -25,82 +25,83 @@ enum class PixelFormat { R8UI = 7, RGBA16F = 8, RGBA16U = 9, - RGBA16UI = 10, - R11FG11FB10F = 11, - RGBA32UI = 12, - DXT1 = 13, - DXT23 = 14, - DXT45 = 15, - DXN1 = 16, // This is also known as BC4 - DXN2UNORM = 17, - DXN2SNORM = 18, - BC7U = 19, - BC6H_UF16 = 20, - BC6H_SF16 = 21, - ASTC_2D_4X4 = 22, - BGRA8 = 23, - RGBA32F = 24, - RG32F = 25, - R32F = 26, - R16F = 27, - R16U = 28, - R16S = 29, - R16UI = 30, - R16I = 31, - RG16 = 32, - RG16F = 33, - RG16UI = 34, - RG16I = 35, - RG16S = 36, - RGB32F = 37, - RGBA8_SRGB = 38, - RG8U = 39, - RG8S = 40, - RG32UI = 41, - RGBX16F = 42, - R32UI = 43, - R32I = 44, - ASTC_2D_8X8 = 45, - ASTC_2D_8X5 = 46, - ASTC_2D_5X4 = 47, - BGRA8_SRGB = 48, - DXT1_SRGB = 49, - DXT23_SRGB = 50, - DXT45_SRGB = 51, - BC7U_SRGB = 52, - R4G4B4A4U = 53, - ASTC_2D_4X4_SRGB = 54, - ASTC_2D_8X8_SRGB = 55, - ASTC_2D_8X5_SRGB = 56, - ASTC_2D_5X4_SRGB = 57, - ASTC_2D_5X5 = 58, - ASTC_2D_5X5_SRGB = 59, - ASTC_2D_10X8 = 60, - ASTC_2D_10X8_SRGB = 61, - ASTC_2D_6X6 = 62, - ASTC_2D_6X6_SRGB = 63, - ASTC_2D_10X10 = 64, - ASTC_2D_10X10_SRGB = 65, - ASTC_2D_12X12 = 66, - ASTC_2D_12X12_SRGB = 67, - ASTC_2D_8X6 = 68, - ASTC_2D_8X6_SRGB = 69, - ASTC_2D_6X5 = 70, - ASTC_2D_6X5_SRGB = 71, - E5B9G9R9F = 72, + RGBA16S = 10, + RGBA16UI = 11, + R11FG11FB10F = 12, + RGBA32UI = 13, + DXT1 = 14, + DXT23 = 15, + DXT45 = 16, + DXN1 = 17, // This is also known as BC4 + DXN2UNORM = 18, + DXN2SNORM = 19, + BC7U = 20, + BC6H_UF16 = 21, + BC6H_SF16 = 22, + ASTC_2D_4X4 = 23, + BGRA8 = 24, + RGBA32F = 25, + RG32F = 26, + R32F = 27, + R16F = 28, + R16U = 29, + R16S = 30, + R16UI = 31, + R16I = 32, + RG16 = 33, + RG16F = 34, + RG16UI = 35, + RG16I = 36, + RG16S = 37, + RGB32F = 38, + RGBA8_SRGB = 39, + RG8U = 40, + RG8S = 41, + RG32UI = 42, + RGBX16F = 43, + R32UI = 44, + R32I = 45, + ASTC_2D_8X8 = 46, + ASTC_2D_8X5 = 47, + ASTC_2D_5X4 = 48, + BGRA8_SRGB = 49, + DXT1_SRGB = 50, + DXT23_SRGB = 51, + DXT45_SRGB = 52, + BC7U_SRGB = 53, + R4G4B4A4U = 54, + ASTC_2D_4X4_SRGB = 55, + ASTC_2D_8X8_SRGB = 56, + ASTC_2D_8X5_SRGB = 57, + ASTC_2D_5X4_SRGB = 58, + ASTC_2D_5X5 = 59, + ASTC_2D_5X5_SRGB = 60, + ASTC_2D_10X8 = 61, + ASTC_2D_10X8_SRGB = 62, + ASTC_2D_6X6 = 63, + ASTC_2D_6X6_SRGB = 64, + ASTC_2D_10X10 = 65, + ASTC_2D_10X10_SRGB = 66, + ASTC_2D_12X12 = 67, + ASTC_2D_12X12_SRGB = 68, + ASTC_2D_8X6 = 69, + ASTC_2D_8X6_SRGB = 70, + ASTC_2D_6X5 = 71, + ASTC_2D_6X5_SRGB = 72, + E5B9G9R9F = 73, MaxColorFormat, // Depth formats - Z32F = 73, - Z16 = 74, + Z32F = 74, + Z16 = 75, MaxDepthFormat, // DepthStencil formats - Z24S8 = 75, - S8Z24 = 76, - Z32FS8 = 77, + Z24S8 = 76, + S8Z24 = 77, + Z32FS8 = 78, MaxDepthStencilFormat, @@ -138,6 +139,7 @@ constexpr std::array<u32, MaxPixelFormat> compression_factor_shift_table = {{ 0, // R8UI 0, // RGBA16F 0, // RGBA16U + 0, // RGBA16S 0, // RGBA16UI 0, // R11FG11FB10F 0, // RGBA32UI @@ -235,6 +237,7 @@ constexpr std::array<u32, MaxPixelFormat> block_width_table = {{ 1, // R8UI 1, // RGBA16F 1, // RGBA16U + 1, // RGBA16S 1, // RGBA16UI 1, // R11FG11FB10F 1, // RGBA32UI @@ -324,6 +327,7 @@ constexpr std::array<u32, MaxPixelFormat> block_height_table = {{ 1, // R8UI 1, // RGBA16F 1, // RGBA16U + 1, // RGBA16S 1, // RGBA16UI 1, // R11FG11FB10F 1, // RGBA32UI @@ -413,6 +417,7 @@ constexpr std::array<u32, MaxPixelFormat> bpp_table = {{ 8, // R8UI 64, // RGBA16F 64, // RGBA16U + 64, // RGBA16S 64, // RGBA16UI 32, // R11FG11FB10F 128, // RGBA32UI @@ -517,6 +522,7 @@ constexpr std::array<SurfaceCompression, MaxPixelFormat> compression_type_table SurfaceCompression::None, // R8UI SurfaceCompression::None, // RGBA16F SurfaceCompression::None, // RGBA16U + SurfaceCompression::None, // RGBA16S SurfaceCompression::None, // RGBA16UI SurfaceCompression::None, // R11FG11FB10F SurfaceCompression::None, // RGBA32UI diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp index cc3ad8417..e151c26c4 100644 --- a/src/video_core/texture_cache/format_lookup_table.cpp +++ b/src/video_core/texture_cache/format_lookup_table.cpp @@ -41,7 +41,7 @@ struct Table { ComponentType alpha_component; bool is_srgb; }; -constexpr std::array<Table, 75> DefinitionTable = {{ +constexpr std::array<Table, 76> DefinitionTable = {{ {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ABGR8U}, {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::ABGR8S}, {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::ABGR8UI}, @@ -61,6 +61,7 @@ constexpr std::array<Table, 75> DefinitionTable = {{ {TextureFormat::G8R8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::RG8U}, {TextureFormat::G8R8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::RG8S}, + {TextureFormat::R16_G16_B16_A16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::RGBA16S}, {TextureFormat::R16_G16_B16_A16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::RGBA16U}, {TextureFormat::R16_G16_B16_A16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::RGBA16F}, {TextureFormat::R16_G16_B16_A16, C, UINT, UINT, UINT, UINT, PixelFormat::RGBA16UI}, diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp index f00839313..9931c5ef7 100644 --- a/src/video_core/texture_cache/surface_params.cpp +++ b/src/video_core/texture_cache/surface_params.cpp @@ -113,8 +113,10 @@ SurfaceParams SurfaceParams::CreateForTexture(const FormatLookupTable& lookup_ta params.height = tic.Height(); params.depth = tic.Depth(); params.pitch = params.is_tiled ? 0 : tic.Pitch(); - if (params.target == SurfaceTarget::TextureCubemap || - params.target == SurfaceTarget::TextureCubeArray) { + if (params.target == SurfaceTarget::Texture2D && params.depth > 1) { + params.depth = 1; + } else if (params.target == SurfaceTarget::TextureCubemap || + params.target == SurfaceTarget::TextureCubeArray) { params.depth *= 6; } params.num_levels = tic.max_mip_level + 1; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 51373b687..6cdbe63d0 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -104,6 +104,11 @@ public: if (!cache_addr) { return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); } + + if (!IsTypeCompatible(tic.texture_type, entry)) { + return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); + } + const auto params{SurfaceParams::CreateForTexture(format_lookup_table, tic, entry)}; const auto [surface, view] = GetSurface(gpu_addr, cache_addr, params, true, false); if (guard_samplers) { @@ -914,13 +919,15 @@ private: params.width = 1; params.height = 1; params.depth = 1; + if (target == SurfaceTarget::TextureCubemap || target == SurfaceTarget::TextureCubeArray) { + params.depth = 6; + } params.pitch = 4; params.num_levels = 1; params.emulated_levels = 1; - params.pixel_format = VideoCore::Surface::PixelFormat::RGBA16F; + params.pixel_format = VideoCore::Surface::PixelFormat::R8U; params.type = VideoCore::Surface::SurfaceType::ColorTexture; auto surface = CreateSurface(0ULL, params); - invalid_memory.clear(); invalid_memory.resize(surface->GetHostSizeInBytes(), 0U); surface->UploadTexture(invalid_memory); surface->MarkAsModified(false, Tick()); @@ -1082,6 +1089,36 @@ private: return siblings_table[static_cast<std::size_t>(format)]; } + /// Returns true the shader sampler entry is compatible with the TIC texture type. + static bool IsTypeCompatible(Tegra::Texture::TextureType tic_type, + const VideoCommon::Shader::Sampler& entry) { + const auto shader_type = entry.GetType(); + switch (tic_type) { + case Tegra::Texture::TextureType::Texture1D: + case Tegra::Texture::TextureType::Texture1DArray: + return shader_type == Tegra::Shader::TextureType::Texture1D; + case Tegra::Texture::TextureType::Texture1DBuffer: + // TODO(Rodrigo): Assume as valid for now + return true; + case Tegra::Texture::TextureType::Texture2D: + case Tegra::Texture::TextureType::Texture2DNoMipmap: + return shader_type == Tegra::Shader::TextureType::Texture2D; + case Tegra::Texture::TextureType::Texture2DArray: + return shader_type == Tegra::Shader::TextureType::Texture2D || + shader_type == Tegra::Shader::TextureType::TextureCube; + case Tegra::Texture::TextureType::Texture3D: + return shader_type == Tegra::Shader::TextureType::Texture3D; + case Tegra::Texture::TextureType::TextureCubeArray: + case Tegra::Texture::TextureType::TextureCubemap: + if (shader_type == Tegra::Shader::TextureType::TextureCube) { + return true; + } + return shader_type == Tegra::Shader::TextureType::Texture2D && entry.IsArray(); + } + UNREACHABLE(); + return true; + } + struct FramebufferTargetInfo { TSurface target; TView view; |