diff options
Diffstat (limited to 'src')
19 files changed, 313 insertions, 81 deletions
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp index 241dac881..b4ee2a255 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp @@ -146,8 +146,8 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& outp } IoctlSubmitGpfifo params{}; std::memcpy(¶ms, input.data(), sizeof(IoctlSubmitGpfifo)); - LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}", - params.address, params.num_entries, params.flags.raw); + LOG_TRACE(Service_NVDRV, "called, gpfifo={:X}, num_entries={:X}, flags={:X}", params.address, + params.num_entries, params.flags.raw); ASSERT_MSG(input.size() == sizeof(IoctlSubmitGpfifo) + params.num_entries * sizeof(Tegra::CommandListHeader), @@ -179,8 +179,8 @@ u32 nvhost_gpu::KickoffPB(const std::vector<u8>& input, std::vector<u8>& output) } IoctlSubmitGpfifo params{}; std::memcpy(¶ms, input.data(), sizeof(IoctlSubmitGpfifo)); - LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}", - params.address, params.num_entries, params.flags.raw); + LOG_TRACE(Service_NVDRV, "called, gpfifo={:X}, num_entries={:X}, flags={:X}", params.address, + params.num_entries, params.flags.raw); Tegra::CommandList entries(params.num_entries); Memory::ReadBlock(params.address, entries.data(), diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index c8c92757a..fb3d1112c 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -89,6 +89,9 @@ void Maxwell3D::InitializeRegisterDefaults() { // Commercial games seem to assume this value is enabled and nouveau sets this value manually. regs.rt_separate_frag_data = 1; + + // Some games (like Super Mario Odyssey) assume that SRGB is enabled. + regs.framebuffer_srgb = 1; } #define DIRTY_REGS_POS(field_name) (offsetof(Maxwell3D::DirtyRegs, field_name)) @@ -329,6 +332,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { ProcessMacroBind(method_call.argument); break; } + case MAXWELL3D_REG_INDEX(firmware[4]): { + ProcessFirmwareCall4(); + break; + } case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]): @@ -419,6 +426,14 @@ void Maxwell3D::ProcessMacroBind(u32 data) { macro_positions[regs.macros.entry++] = data; } +void Maxwell3D::ProcessFirmwareCall4() { + LOG_WARNING(HW_GPU, "(STUBBED) called"); + + // Firmware call 4 is a blob that changes some registers depending on its parameters. + // These registers don't affect emulation and so are stubbed by setting 0xd00 to 1. + regs.reg_array[0xd00] = 1; +} + void Maxwell3D::ProcessQueryGet() { const GPUVAddr sequence_address{regs.query.QueryAddress()}; // Since the sequence address is given as a GPU VAddr, we have to convert it to an application @@ -526,7 +541,7 @@ void Maxwell3D::ProcessSyncPoint() { } void Maxwell3D::DrawArrays() { - LOG_DEBUG(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()), + LOG_TRACE(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()), regs.vertex_buffer.count); ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?"); diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index f67a5389f..e5ec90717 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -1089,7 +1089,9 @@ public: INSERT_PADDING_WORDS(14); } shader_config[MaxShaderProgram]; - INSERT_PADDING_WORDS(0x80); + INSERT_PADDING_WORDS(0x60); + + u32 firmware[0x20]; struct { u32 cb_size; @@ -1319,6 +1321,9 @@ private: /// Handles writes to the macro bind register. void ProcessMacroBind(u32 data); + /// Handles firmware blob 4 + void ProcessFirmwareCall4(); + /// Handles a write to the CLEAR_BUFFERS register. void ProcessClearBuffers(); @@ -1431,6 +1436,7 @@ ASSERT_REG_POSITION(vertex_array[0], 0x700); ASSERT_REG_POSITION(independent_blend, 0x780); ASSERT_REG_POSITION(vertex_array_limit[0], 0x7C0); ASSERT_REG_POSITION(shader_config[0], 0x800); +ASSERT_REG_POSITION(firmware, 0x8C0); ASSERT_REG_POSITION(const_buffer, 0x8E0); ASSERT_REG_POSITION(cb_bind[0], 0x904); ASSERT_REG_POSITION(tex_cb_index, 0x982); diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 052e6d24e..a6110bd86 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -566,6 +566,13 @@ enum class ImageAtomicOperation : u64 { Exch = 8, }; +enum class ShuffleOperation : u64 { + Idx = 0, // shuffleNV + Up = 1, // shuffleUpNV + Down = 2, // shuffleDownNV + Bfly = 3, // shuffleXorNV +}; + union Instruction { Instruction& operator=(const Instruction& instr) { value = instr.value; @@ -600,6 +607,15 @@ union Instruction { } vote; union { + BitField<30, 2, ShuffleOperation> operation; + BitField<48, 3, u64> pred48; + BitField<28, 1, u64> is_index_imm; + BitField<29, 1, u64> is_mask_imm; + BitField<20, 5, u64> index_imm; + BitField<34, 13, u64> mask_imm; + } shfl; + + union { BitField<8, 8, Register> gpr; BitField<20, 24, s64> offset; } gmem; @@ -1542,6 +1558,7 @@ public: BRK, DEPBAR, VOTE, + SHFL, BFE_C, BFE_R, BFE_IMM, @@ -1833,6 +1850,7 @@ private: INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"), INST("1111000011110---", Id::DEPBAR, Type::Synch, "DEPBAR"), INST("0101000011011---", Id::VOTE, Type::Warp, "VOTE"), + INST("1110111100010---", Id::SHFL, Type::Warp, "SHFL"), INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"), INST("1110111101001---", Id::LD_S, Type::Memory, "LD_S"), INST("1110111101000---", Id::LD_L, Type::Memory, "LD_L"), diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro_interpreter.cpp index 4e1cb98db..62afc0d11 100644 --- a/src/video_core/macro_interpreter.cpp +++ b/src/video_core/macro_interpreter.cpp @@ -131,9 +131,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { // An instruction with the Exit flag will not actually // cause an exit if it's executed inside a delay slot. - // TODO(Blinkhawk): Reversed to always exit. The behavior explained above requires further - // testing on the MME code. - if (opcode.is_exit) { + if (opcode.is_exit && !is_delay_slot) { // Exit has a delay slot, execute the next instruction Step(offset, true); return false; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 4e266cdad..4dd08bccb 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -489,9 +489,6 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers( // Assume that a surface will be written to if it is used as a framebuffer, even if // the shader doesn't actually write to it. texture_cache.MarkColorBufferInUse(*single_color_target); - // Workaround for and issue in nvidia drivers - // https://devtalk.nvidia.com/default/topic/776591/opengl/gl_framebuffer_srgb-functions-incorrectly/ - state.framebuffer_srgb.enabled |= color_surface->GetSurfaceParams().srgb_conversion; } fbkey.is_single_buffer = true; @@ -512,11 +509,6 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers( // Assume that a surface will be written to if it is used as a framebuffer, even // if the shader doesn't actually write to it. texture_cache.MarkColorBufferInUse(index); - // Enable sRGB only for supported formats - // Workaround for and issue in nvidia drivers - // https://devtalk.nvidia.com/default/topic/776591/opengl/gl_framebuffer_srgb-functions-incorrectly/ - state.framebuffer_srgb.enabled |= - color_surface->GetSurfaceParams().srgb_conversion; } fbkey.color_attachments[index] = @@ -906,6 +898,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, } screen_info.display_texture = surface->GetTexture(); + screen_info.display_srgb = surface->GetSurfaceParams().srgb_conversion; return true; } diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 909ccb82c..0dbc4c02f 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -214,7 +214,8 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn std::string source = "#version 430 core\n" "#extension GL_ARB_separate_shader_objects : enable\n" "#extension GL_NV_gpu_shader5 : enable\n" - "#extension GL_NV_shader_thread_group : enable\n"; + "#extension GL_NV_shader_thread_group : enable\n" + "#extension GL_NV_shader_thread_shuffle : enable\n"; if (entries.shader_viewport_layer_array) { source += "#extension GL_ARB_shader_viewport_layer_array : enable\n"; } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 137b23740..76439e7ab 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -325,6 +325,7 @@ public: DeclareRegisters(); DeclarePredicates(); DeclareLocalMemory(); + DeclareSharedMemory(); DeclareInternalFlags(); DeclareInputAttributes(); DeclareOutputAttributes(); @@ -499,6 +500,13 @@ private: code.AddNewLine(); } + void DeclareSharedMemory() { + if (stage != ProgramType::Compute) { + return; + } + code.AddLine("shared uint {}[];", GetSharedMemory()); + } + void DeclareInternalFlags() { for (u32 flag = 0; flag < static_cast<u32>(InternalFlag::Amount); flag++) { const auto flag_code = static_cast<InternalFlag>(flag); @@ -881,6 +889,12 @@ private: Type::Uint}; } + if (const auto smem = std::get_if<SmemNode>(&*node)) { + return { + fmt::format("{}[{} >> 2]", GetSharedMemory(), Visit(smem->GetAddress()).AsUint()), + Type::Uint}; + } + if (const auto internal_flag = std::get_if<InternalFlagNode>(&*node)) { return {GetInternalFlag(internal_flag->GetFlag()), Type::Bool}; } @@ -1007,10 +1021,10 @@ private: return {std::move(temporary), value.GetType()}; } - Expression GetOutputAttribute(const AbufNode* abuf) { + std::optional<Expression> GetOutputAttribute(const AbufNode* abuf) { switch (const auto attribute = abuf->GetIndex()) { case Attribute::Index::Position: - return {"gl_Position"s + GetSwizzle(abuf->GetElement()), Type::Float}; + return {{"gl_Position"s + GetSwizzle(abuf->GetElement()), Type::Float}}; case Attribute::Index::LayerViewportPointSize: switch (abuf->GetElement()) { case 0: @@ -1020,25 +1034,25 @@ private: if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) { return {}; } - return {"gl_Layer", Type::Int}; + return {{"gl_Layer", Type::Int}}; case 2: if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) { return {}; } - return {"gl_ViewportIndex", Type::Int}; + return {{"gl_ViewportIndex", Type::Int}}; case 3: UNIMPLEMENTED_MSG("Requires some state changes for gl_PointSize to work in shader"); - return {"gl_PointSize", Type::Float}; + return {{"gl_PointSize", Type::Float}}; } return {}; case Attribute::Index::ClipDistances0123: - return {fmt::format("gl_ClipDistance[{}]", abuf->GetElement()), Type::Float}; + return {{fmt::format("gl_ClipDistance[{}]", abuf->GetElement()), Type::Float}}; case Attribute::Index::ClipDistances4567: - return {fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), Type::Float}; + return {{fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), Type::Float}}; default: if (IsGenericAttribute(attribute)) { - return {GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()), - Type::Float}; + return { + {GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()), Type::Float}}; } UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute)); return {}; @@ -1278,7 +1292,11 @@ private: target = {GetRegister(gpr->GetIndex()), Type::Float}; } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) { UNIMPLEMENTED_IF(abuf->IsPhysicalBuffer()); - target = GetOutputAttribute(abuf); + auto output = GetOutputAttribute(abuf); + if (!output) { + return {}; + } + target = std::move(*output); } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) { if (stage == ProgramType::Compute) { LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders"); @@ -1286,6 +1304,11 @@ private: target = { fmt::format("{}[{} >> 2]", GetLocalMemory(), Visit(lmem->GetAddress()).AsUint()), Type::Uint}; + } else if (const auto smem = std::get_if<SmemNode>(&*dest)) { + ASSERT(stage == ProgramType::Compute); + target = { + fmt::format("{}[{} >> 2]", GetSharedMemory(), Visit(smem->GetAddress()).AsUint()), + Type::Uint}; } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) { const std::string real = Visit(gmem->GetRealAddress()).AsUint(); const std::string base = Visit(gmem->GetBaseAddress()).AsUint(); @@ -1934,8 +1957,7 @@ private: Expression BallotThread(Operation operation) { const std::string value = VisitOperand(operation, 0).AsBool(); if (!device.HasWarpIntrinsics()) { - LOG_ERROR(Render_OpenGL, - "Nvidia warp intrinsics are not available and its required by a shader"); + LOG_ERROR(Render_OpenGL, "Nvidia vote intrinsics are required by this shader"); // Stub on non-Nvidia devices by simulating all threads voting the same as the active // one. return {fmt::format("({} ? 0xFFFFFFFFU : 0U)", value), Type::Uint}; @@ -1946,8 +1968,7 @@ private: Expression Vote(Operation operation, const char* func) { const std::string value = VisitOperand(operation, 0).AsBool(); if (!device.HasWarpIntrinsics()) { - LOG_ERROR(Render_OpenGL, - "Nvidia vote intrinsics are not available and its required by a shader"); + LOG_ERROR(Render_OpenGL, "Nvidia vote intrinsics are required by this shader"); // Stub with a warp size of one. return {value, Type::Bool}; } @@ -1964,15 +1985,54 @@ private: Expression VoteEqual(Operation operation) { if (!device.HasWarpIntrinsics()) { - LOG_ERROR(Render_OpenGL, - "Nvidia vote intrinsics are not available and its required by a shader"); - // We must return true here since a stub for a theoretical warp size of 1 will always - // return an equal result for all its votes. + LOG_ERROR(Render_OpenGL, "Nvidia vote intrinsics are required by this shader"); + // We must return true here since a stub for a theoretical warp size of 1. + // This will always return an equal result across all votes. return {"true", Type::Bool}; } return Vote(operation, "allThreadsEqualNV"); } + template <const std::string_view& func> + Expression Shuffle(Operation operation) { + const std::string value = VisitOperand(operation, 0).AsFloat(); + if (!device.HasWarpIntrinsics()) { + LOG_ERROR(Render_OpenGL, "Nvidia shuffle intrinsics are required by this shader"); + // On a "single-thread" device we are either on the same thread or out of bounds. Both + // cases return the passed value. + return {value, Type::Float}; + } + + const std::string index = VisitOperand(operation, 1).AsUint(); + const std::string width = VisitOperand(operation, 2).AsUint(); + return {fmt::format("{}({}, {}, {})", func, value, index, width), Type::Float}; + } + + template <const std::string_view& func> + Expression InRangeShuffle(Operation operation) { + const std::string index = VisitOperand(operation, 0).AsUint(); + const std::string width = VisitOperand(operation, 1).AsUint(); + if (!device.HasWarpIntrinsics()) { + // On a "single-thread" device we are only in bounds when the requested index is 0. + return {fmt::format("({} == 0U)", index), Type::Bool}; + } + + const std::string in_range = code.GenerateTemporary(); + code.AddLine("bool {};", in_range); + code.AddLine("{}(0U, {}, {}, {});", func, index, width, in_range); + return {in_range, Type::Bool}; + } + + struct Func final { + Func() = delete; + ~Func() = delete; + + static constexpr std::string_view ShuffleIndexed = "shuffleNV"; + static constexpr std::string_view ShuffleUp = "shuffleUpNV"; + static constexpr std::string_view ShuffleDown = "shuffleDownNV"; + static constexpr std::string_view ShuffleButterfly = "shuffleXorNV"; + }; + static constexpr std::array operation_decompilers = { &GLSLDecompiler::Assign, @@ -2135,6 +2195,16 @@ private: &GLSLDecompiler::VoteAll, &GLSLDecompiler::VoteAny, &GLSLDecompiler::VoteEqual, + + &GLSLDecompiler::Shuffle<Func::ShuffleIndexed>, + &GLSLDecompiler::Shuffle<Func::ShuffleUp>, + &GLSLDecompiler::Shuffle<Func::ShuffleDown>, + &GLSLDecompiler::Shuffle<Func::ShuffleButterfly>, + + &GLSLDecompiler::InRangeShuffle<Func::ShuffleIndexed>, + &GLSLDecompiler::InRangeShuffle<Func::ShuffleUp>, + &GLSLDecompiler::InRangeShuffle<Func::ShuffleDown>, + &GLSLDecompiler::InRangeShuffle<Func::ShuffleButterfly>, }; static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); @@ -2175,6 +2245,10 @@ private: return "lmem_" + suffix; } + std::string GetSharedMemory() const { + return fmt::format("smem_{}", suffix); + } + std::string GetInternalFlag(InternalFlag flag) const { constexpr std::array InternalFlagNames = {"zero_flag", "sign_flag", "carry_flag", "overflow_flag"}; diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index 6eabf4fac..bf86b5a0b 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -16,7 +16,6 @@ namespace OpenGL { using Maxwell = Tegra::Engines::Maxwell3D::Regs; OpenGLState OpenGLState::cur_state; -bool OpenGLState::s_rgb_used; namespace { @@ -282,8 +281,6 @@ void OpenGLState::ApplySRgb() const { return; cur_state.framebuffer_srgb.enabled = framebuffer_srgb.enabled; if (framebuffer_srgb.enabled) { - // Track if sRGB is used - s_rgb_used = true; glEnable(GL_FRAMEBUFFER_SRGB); } else { glDisable(GL_FRAMEBUFFER_SRGB); diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h index 949b13051..c358d3b38 100644 --- a/src/video_core/renderer_opengl/gl_state.h +++ b/src/video_core/renderer_opengl/gl_state.h @@ -175,14 +175,6 @@ public: return cur_state; } - static bool GetsRGBUsed() { - return s_rgb_used; - } - - static void ClearsRGBUsed() { - s_rgb_used = false; - } - void SetDefaultViewports(); /// Apply this state as the current OpenGL state void Apply(); @@ -253,8 +245,6 @@ public: private: static OpenGLState cur_state; - // Workaround for sRGB problems caused by QT not supporting srgb output - static bool s_rgb_used; struct { bool blend_state; bool stencil_state; diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index ea77dd211..9ed738171 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -145,7 +145,7 @@ inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode, case Tegra::Texture::TextureMipmapFilter::None: return GL_LINEAR; case Tegra::Texture::TextureMipmapFilter::Nearest: - return GL_NEAREST_MIPMAP_LINEAR; + return GL_LINEAR_MIPMAP_NEAREST; case Tegra::Texture::TextureMipmapFilter::Linear: return GL_LINEAR_MIPMAP_LINEAR; } @@ -157,7 +157,7 @@ inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode, case Tegra::Texture::TextureMipmapFilter::Nearest: return GL_NEAREST_MIPMAP_NEAREST; case Tegra::Texture::TextureMipmapFilter::Linear: - return GL_LINEAR_MIPMAP_NEAREST; + return GL_NEAREST_MIPMAP_LINEAR; } } } diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 839178152..1e6ef66ab 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -264,7 +264,6 @@ void RendererOpenGL::CreateRasterizer() { if (rasterizer) { return; } - OpenGLState::ClearsRGBUsed(); rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info); } @@ -343,9 +342,7 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x, }}; state.textures[0] = screen_info.display_texture; - // Workaround brigthness problems in SMO by enabling sRGB in the final output - // if it has been used in the frame. Needed because of this bug in QT: QTBUG-50987 - state.framebuffer_srgb.enabled = OpenGLState::GetsRGBUsed(); + state.framebuffer_srgb.enabled = screen_info.display_srgb; state.AllDirty(); state.Apply(); glNamedBufferSubData(vertex_buffer.handle, 0, sizeof(vertices), vertices.data()); @@ -355,8 +352,6 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x, state.textures[0] = 0; state.AllDirty(); state.Apply(); - // Clear sRGB state for the next frame - OpenGLState::ClearsRGBUsed(); } /** @@ -406,8 +401,8 @@ void RendererOpenGL::CaptureScreenshot() { GLuint renderbuffer; glGenRenderbuffers(1, &renderbuffer); glBindRenderbuffer(GL_RENDERBUFFER, renderbuffer); - glRenderbufferStorage(GL_RENDERBUFFER, state.GetsRGBUsed() ? GL_SRGB8 : GL_RGB8, layout.width, - layout.height); + glRenderbufferStorage(GL_RENDERBUFFER, screen_info.display_srgb ? GL_SRGB8 : GL_RGB8, + layout.width, layout.height); glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, renderbuffer); DrawScreen(layout); diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index 9bd086368..cf26628ca 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -38,7 +38,8 @@ struct TextureInfo { /// Structure used for storing information about the display target for the Switch screen struct ScreenInfo { - GLuint display_texture; + GLuint display_texture{}; + bool display_srgb{}; const Common::Rectangle<float> display_texcoords{0.0f, 0.0f, 1.0f, 1.0f}; TextureInfo texture; }; diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index b9153934e..f7fbbb6e4 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -1127,6 +1127,46 @@ private: return {}; } + Id ShuffleIndexed(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id ShuffleUp(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id ShuffleDown(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id ShuffleButterfly(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id InRangeShuffleIndexed(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id InRangeShuffleUp(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id InRangeShuffleDown(Operation) { + UNIMPLEMENTED(); + return {}; + } + + Id InRangeShuffleButterfly(Operation) { + UNIMPLEMENTED(); + return {}; + } + Id DeclareBuiltIn(spv::BuiltIn builtin, spv::StorageClass storage, Id type, const std::string& name) { const Id id = OpVariable(type, storage); @@ -1431,6 +1471,16 @@ private: &SPIRVDecompiler::VoteAll, &SPIRVDecompiler::VoteAny, &SPIRVDecompiler::VoteEqual, + + &SPIRVDecompiler::ShuffleIndexed, + &SPIRVDecompiler::ShuffleUp, + &SPIRVDecompiler::ShuffleDown, + &SPIRVDecompiler::ShuffleButterfly, + + &SPIRVDecompiler::InRangeShuffleIndexed, + &SPIRVDecompiler::InRangeShuffleUp, + &SPIRVDecompiler::InRangeShuffleDown, + &SPIRVDecompiler::InRangeShuffleButterfly, }; static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp index ed108bea8..7923d4d69 100644 --- a/src/video_core/shader/decode/memory.cpp +++ b/src/video_core/shader/decode/memory.cpp @@ -35,7 +35,7 @@ u32 GetUniformTypeElementsCount(Tegra::Shader::UniformType uniform_type) { return 1; } } -} // namespace +} // Anonymous namespace u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; @@ -106,16 +106,17 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { } break; } - case OpCode::Id::LD_L: { - LOG_DEBUG(HW_GPU, "LD_L cache management mode: {}", - static_cast<u64>(instr.ld_l.unknown.Value())); - - const auto GetLmem = [&](s32 offset) { + case OpCode::Id::LD_L: + LOG_DEBUG(HW_GPU, "LD_L cache management mode: {}", static_cast<u64>(instr.ld_l.unknown)); + [[fallthrough]]; + case OpCode::Id::LD_S: { + const auto GetMemory = [&](s32 offset) { ASSERT(offset % 4 == 0); const Node immediate_offset = Immediate(static_cast<s32>(instr.smem_imm) + offset); const Node address = Operation(OperationCode::IAdd, NO_PRECISE, GetRegister(instr.gpr8), immediate_offset); - return GetLocalMemory(address); + return opcode->get().GetId() == OpCode::Id::LD_S ? GetSharedMemory(address) + : GetLocalMemory(address); }; switch (instr.ldst_sl.type.Value()) { @@ -135,14 +136,16 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { return 0; } }(); - for (u32 i = 0; i < count; ++i) - SetTemporary(bb, i, GetLmem(i * 4)); - for (u32 i = 0; i < count; ++i) + for (u32 i = 0; i < count; ++i) { + SetTemporary(bb, i, GetMemory(i * 4)); + } + for (u32 i = 0; i < count; ++i) { SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i)); + } break; } default: - UNIMPLEMENTED_MSG("LD_L Unhandled type: {}", + UNIMPLEMENTED_MSG("{} Unhandled type: {}", opcode->get().GetName(), static_cast<u32>(instr.ldst_sl.type.Value())); } break; @@ -209,27 +212,34 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { break; } - case OpCode::Id::ST_L: { + case OpCode::Id::ST_L: LOG_DEBUG(HW_GPU, "ST_L cache management mode: {}", static_cast<u64>(instr.st_l.cache_management.Value())); - - const auto GetLmemAddr = [&](s32 offset) { + [[fallthrough]]; + case OpCode::Id::ST_S: { + const auto GetAddress = [&](s32 offset) { ASSERT(offset % 4 == 0); const Node immediate = Immediate(static_cast<s32>(instr.smem_imm) + offset); return Operation(OperationCode::IAdd, NO_PRECISE, GetRegister(instr.gpr8), immediate); }; + const auto set_memory = opcode->get().GetId() == OpCode::Id::ST_L + ? &ShaderIR::SetLocalMemory + : &ShaderIR::SetSharedMemory; + switch (instr.ldst_sl.type.Value()) { case Tegra::Shader::StoreType::Bits128: - SetLocalMemory(bb, GetLmemAddr(12), GetRegister(instr.gpr0.Value() + 3)); - SetLocalMemory(bb, GetLmemAddr(8), GetRegister(instr.gpr0.Value() + 2)); + (this->*set_memory)(bb, GetAddress(12), GetRegister(instr.gpr0.Value() + 3)); + (this->*set_memory)(bb, GetAddress(8), GetRegister(instr.gpr0.Value() + 2)); + [[fallthrough]]; case Tegra::Shader::StoreType::Bits64: - SetLocalMemory(bb, GetLmemAddr(4), GetRegister(instr.gpr0.Value() + 1)); + (this->*set_memory)(bb, GetAddress(4), GetRegister(instr.gpr0.Value() + 1)); + [[fallthrough]]; case Tegra::Shader::StoreType::Bits32: - SetLocalMemory(bb, GetLmemAddr(0), GetRegister(instr.gpr0)); + (this->*set_memory)(bb, GetAddress(0), GetRegister(instr.gpr0)); break; default: - UNIMPLEMENTED_MSG("ST_L Unhandled type: {}", + UNIMPLEMENTED_MSG("{} unhandled type: {}", opcode->get().GetName(), static_cast<u32>(instr.ldst_sl.type.Value())); } break; diff --git a/src/video_core/shader/decode/warp.cpp b/src/video_core/shader/decode/warp.cpp index 04ca74f46..a8e481b3c 100644 --- a/src/video_core/shader/decode/warp.cpp +++ b/src/video_core/shader/decode/warp.cpp @@ -13,6 +13,7 @@ namespace VideoCommon::Shader { using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; using Tegra::Shader::Pred; +using Tegra::Shader::ShuffleOperation; using Tegra::Shader::VoteOperation; namespace { @@ -44,6 +45,52 @@ u32 ShaderIR::DecodeWarp(NodeBlock& bb, u32 pc) { SetPredicate(bb, instr.vote.dest_pred, vote); break; } + case OpCode::Id::SHFL: { + Node mask = instr.shfl.is_mask_imm ? Immediate(static_cast<u32>(instr.shfl.mask_imm)) + : GetRegister(instr.gpr39); + Node width = [&] { + // Convert the obscure SHFL mask back into GL_NV_shader_thread_shuffle's width. This has + // been done reversing Nvidia's math. It won't work on all cases due to SHFL having + // different parameters that don't properly map to GLSL's interface, but it should work + // for cases emitted by Nvidia's compiler. + if (instr.shfl.operation == ShuffleOperation::Up) { + return Operation( + OperationCode::ILogicalShiftRight, + Operation(OperationCode::IAdd, std::move(mask), Immediate(-0x2000)), + Immediate(8)); + } else { + return Operation(OperationCode::ILogicalShiftRight, + Operation(OperationCode::IAdd, Immediate(0x201F), + Operation(OperationCode::INegate, std::move(mask))), + Immediate(8)); + } + }(); + + const auto [operation, in_range] = [instr]() -> std::pair<OperationCode, OperationCode> { + switch (instr.shfl.operation) { + case ShuffleOperation::Idx: + return {OperationCode::ShuffleIndexed, OperationCode::InRangeShuffleIndexed}; + case ShuffleOperation::Up: + return {OperationCode::ShuffleUp, OperationCode::InRangeShuffleUp}; + case ShuffleOperation::Down: + return {OperationCode::ShuffleDown, OperationCode::InRangeShuffleDown}; + case ShuffleOperation::Bfly: + return {OperationCode::ShuffleButterfly, OperationCode::InRangeShuffleButterfly}; + } + UNREACHABLE_MSG("Invalid SHFL operation: {}", + static_cast<u64>(instr.shfl.operation.Value())); + return {}; + }(); + + // Setting the predicate before the register is intentional to avoid overwriting. + Node index = instr.shfl.is_index_imm ? Immediate(static_cast<u32>(instr.shfl.index_imm)) + : GetRegister(instr.gpr20); + SetPredicate(bb, instr.shfl.pred48, Operation(in_range, index, width)); + SetRegister( + bb, instr.gpr0, + Operation(operation, GetRegister(instr.gpr8), std::move(index), std::move(width))); + break; + } default: UNIMPLEMENTED_MSG("Unhandled warp instruction: {}", opcode->get().GetName()); break; diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index b47b201cf..abf2cb1ab 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -181,6 +181,16 @@ enum class OperationCode { VoteAny, /// (bool) -> bool VoteEqual, /// (bool) -> bool + ShuffleIndexed, /// (uint value, uint index, uint width) -> uint + ShuffleUp, /// (uint value, uint index, uint width) -> uint + ShuffleDown, /// (uint value, uint index, uint width) -> uint + ShuffleButterfly, /// (uint value, uint index, uint width) -> uint + + InRangeShuffleIndexed, /// (uint index, uint width) -> bool + InRangeShuffleUp, /// (uint index, uint width) -> bool + InRangeShuffleDown, /// (uint index, uint width) -> bool + InRangeShuffleButterfly, /// (uint index, uint width) -> bool + Amount, }; @@ -206,12 +216,13 @@ class PredicateNode; class AbufNode; class CbufNode; class LmemNode; +class SmemNode; class GmemNode; class CommentNode; using NodeData = std::variant<OperationNode, ConditionalNode, GprNode, ImmediateNode, InternalFlagNode, - PredicateNode, AbufNode, CbufNode, LmemNode, GmemNode, CommentNode>; + PredicateNode, AbufNode, CbufNode, LmemNode, SmemNode, GmemNode, CommentNode>; using Node = std::shared_ptr<NodeData>; using Node4 = std::array<Node, 4>; using NodeBlock = std::vector<Node>; @@ -583,6 +594,19 @@ private: Node address; }; +/// Shared memory node +class SmemNode final { +public: + explicit SmemNode(Node address) : address{std::move(address)} {} + + const Node& GetAddress() const { + return address; + } + +private: + Node address; +}; + /// Global memory node class GmemNode final { public: diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp index 1e5c7f660..bbbab0bca 100644 --- a/src/video_core/shader/shader_ir.cpp +++ b/src/video_core/shader/shader_ir.cpp @@ -137,6 +137,10 @@ Node ShaderIR::GetLocalMemory(Node address) { return MakeNode<LmemNode>(std::move(address)); } +Node ShaderIR::GetSharedMemory(Node address) { + return MakeNode<SmemNode>(std::move(address)); +} + Node ShaderIR::GetTemporary(u32 id) { return GetRegister(Register::ZeroIndex + 1 + id); } @@ -378,6 +382,11 @@ void ShaderIR::SetLocalMemory(NodeBlock& bb, Node address, Node value) { Operation(OperationCode::Assign, GetLocalMemory(std::move(address)), std::move(value))); } +void ShaderIR::SetSharedMemory(NodeBlock& bb, Node address, Node value) { + bb.push_back( + Operation(OperationCode::Assign, GetSharedMemory(std::move(address)), std::move(value))); +} + void ShaderIR::SetTemporary(NodeBlock& bb, u32 id, Node value) { SetRegister(bb, Register::ZeroIndex + 1 + id, std::move(value)); } diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index 62816bd56..6aed9bb84 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -208,6 +208,8 @@ private: Node GetInternalFlag(InternalFlag flag, bool negated = false); /// Generates a node representing a local memory address Node GetLocalMemory(Node address); + /// Generates a node representing a shared memory address + Node GetSharedMemory(Node address); /// Generates a temporary, internally it uses a post-RZ register Node GetTemporary(u32 id); @@ -217,8 +219,10 @@ private: void SetPredicate(NodeBlock& bb, u64 dest, Node src); /// Sets an internal flag. src value must be a bool-evaluated node void SetInternalFlag(NodeBlock& bb, InternalFlag flag, Node value); - /// Sets a local memory address. address and value must be a number-evaluated node + /// Sets a local memory address with a value. void SetLocalMemory(NodeBlock& bb, Node address, Node value); + /// Sets a shared memory address with a value. + void SetSharedMemory(NodeBlock& bb, Node address, Node value); /// Sets a temporary. Internally it uses a post-RZ register void SetTemporary(NodeBlock& bb, u32 id, Node value); |