19 files changed, 313 insertions, 81 deletions
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
index 241dac881..b4ee2a255 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
@@ -146,8 +146,8 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& outp
     }
     IoctlSubmitGpfifo params{};
     std::memcpy(&params, input.data(), sizeof(IoctlSubmitGpfifo));
-    LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}",
-                params.address, params.num_entries, params.flags.raw);
+    LOG_TRACE(Service_NVDRV, "called, gpfifo={:X}, num_entries={:X}, flags={:X}", params.address,
+              params.num_entries, params.flags.raw);
 
     ASSERT_MSG(input.size() == sizeof(IoctlSubmitGpfifo) +
                                    params.num_entries * sizeof(Tegra::CommandListHeader),
@@ -179,8 +179,8 @@ u32 nvhost_gpu::KickoffPB(const std::vector<u8>& input, std::vector<u8>& output)
     }
     IoctlSubmitGpfifo params{};
     std::memcpy(&params, input.data(), sizeof(IoctlSubmitGpfifo));
-    LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}",
-                params.address, params.num_entries, params.flags.raw);
+    LOG_TRACE(Service_NVDRV, "called, gpfifo={:X}, num_entries={:X}, flags={:X}", params.address,
+              params.num_entries, params.flags.raw);
 
     Tegra::CommandList entries(params.num_entries);
     Memory::ReadBlock(params.address, entries.data(),
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index c8c92757a..fb3d1112c 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -89,6 +89,9 @@ void Maxwell3D::InitializeRegisterDefaults() {
 
     // Commercial games seem to assume this value is enabled and nouveau sets this value manually.
     regs.rt_separate_frag_data = 1;
+
+    // Some games (like Super Mario Odyssey) assume that SRGB is enabled.
+    regs.framebuffer_srgb = 1;
 }
 
 #define DIRTY_REGS_POS(field_name) (offsetof(Maxwell3D::DirtyRegs, field_name))
@@ -329,6 +332,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
         ProcessMacroBind(method_call.argument);
         break;
     }
+    case MAXWELL3D_REG_INDEX(firmware[4]): {
+        ProcessFirmwareCall4();
+        break;
+    }
     case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]):
     case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]):
     case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]):
@@ -419,6 +426,14 @@ void Maxwell3D::ProcessMacroBind(u32 data) {
     macro_positions[regs.macros.entry++] = data;
 }
 
+void Maxwell3D::ProcessFirmwareCall4() {
+    LOG_WARNING(HW_GPU, "(STUBBED) called");
+
+    // Firmware call 4 is a blob that changes some registers depending on its parameters.
+    // These registers don't affect emulation and so are stubbed by setting 0xd00 to 1.
+    regs.reg_array[0xd00] = 1;
+}
+
 void Maxwell3D::ProcessQueryGet() {
     const GPUVAddr sequence_address{regs.query.QueryAddress()};
     // Since the sequence address is given as a GPU VAddr, we have to convert it to an application
@@ -526,7 +541,7 @@ void Maxwell3D::ProcessSyncPoint() {
 }
 
 void Maxwell3D::DrawArrays() {
-    LOG_DEBUG(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()),
+    LOG_TRACE(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()),
               regs.vertex_buffer.count);
     ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?");
 
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index f67a5389f..e5ec90717 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -1089,7 +1089,9 @@ public:
                     INSERT_PADDING_WORDS(14);
                 } shader_config[MaxShaderProgram];
 
-                INSERT_PADDING_WORDS(0x80);
+                INSERT_PADDING_WORDS(0x60);
+
+                u32 firmware[0x20];
 
                 struct {
                     u32 cb_size;
@@ -1319,6 +1321,9 @@ private:
     /// Handles writes to the macro bind register.
     void ProcessMacroBind(u32 data);
 
+    /// Handles firmware blob 4
+    void ProcessFirmwareCall4();
+
     /// Handles a write to the CLEAR_BUFFERS register.
     void ProcessClearBuffers();
 
@@ -1431,6 +1436,7 @@ ASSERT_REG_POSITION(vertex_array[0], 0x700);
 ASSERT_REG_POSITION(independent_blend, 0x780);
 ASSERT_REG_POSITION(vertex_array_limit[0], 0x7C0);
 ASSERT_REG_POSITION(shader_config[0], 0x800);
+ASSERT_REG_POSITION(firmware, 0x8C0);
 ASSERT_REG_POSITION(const_buffer, 0x8E0);
 ASSERT_REG_POSITION(cb_bind[0], 0x904);
 ASSERT_REG_POSITION(tex_cb_index, 0x982);
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index 052e6d24e..a6110bd86 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -566,6 +566,13 @@ enum class ImageAtomicOperation : u64 {
     Exch = 8,
 };
 
+enum class ShuffleOperation : u64 {
+    Idx = 0,  // shuffleNV
+    Up = 1,   // shuffleUpNV
+    Down = 2, // shuffleDownNV
+    Bfly = 3, // shuffleXorNV
+};
+
 union Instruction {
     Instruction& operator=(const Instruction& instr) {
         value = instr.value;
@@ -600,6 +607,15 @@ union Instruction {
     } vote;
 
     union {
+        BitField<30, 2, ShuffleOperation> operation;
+        BitField<48, 3, u64> pred48;
+        BitField<28, 1, u64> is_index_imm;
+        BitField<29, 1, u64> is_mask_imm;
+        BitField<20, 5, u64> index_imm;
+        BitField<34, 13, u64> mask_imm;
+    } shfl;
+
+    union {
         BitField<8, 8, Register> gpr;
         BitField<20, 24, s64> offset;
     } gmem;
@@ -1542,6 +1558,7 @@ public:
         BRK,
         DEPBAR,
         VOTE,
+        SHFL,
         BFE_C,
         BFE_R,
         BFE_IMM,
@@ -1833,6 +1850,7 @@ private:
             INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"),
             INST("1111000011110---", Id::DEPBAR, Type::Synch, "DEPBAR"),
             INST("0101000011011---", Id::VOTE, Type::Warp, "VOTE"),
+            INST("1110111100010---", Id::SHFL, Type::Warp, "SHFL"),
             INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"),
             INST("1110111101001---", Id::LD_S, Type::Memory, "LD_S"),
             INST("1110111101000---", Id::LD_L, Type::Memory, "LD_L"),
diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro_interpreter.cpp
index 4e1cb98db..62afc0d11 100644
--- a/src/video_core/macro_interpreter.cpp
+++ b/src/video_core/macro_interpreter.cpp
@@ -131,9 +131,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
 
     // An instruction with the Exit flag will not actually
     // cause an exit if it's executed inside a delay slot.
-    // TODO(Blinkhawk): Reversed to always exit. The behavior explained above requires further
-    // testing on the MME code.
-    if (opcode.is_exit) {
+    if (opcode.is_exit && !is_delay_slot) {
         // Exit has a delay slot, execute the next instruction
         Step(offset, true);
         return false;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 4e266cdad..4dd08bccb 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -489,9 +489,6 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers(
                 // Assume that a surface will be written to if it is used as a framebuffer, even if
                 // the shader doesn't actually write to it.
                 texture_cache.MarkColorBufferInUse(*single_color_target);
-                // Workaround for and issue in nvidia drivers
-                // https://devtalk.nvidia.com/default/topic/776591/opengl/gl_framebuffer_srgb-functions-incorrectly/
-                state.framebuffer_srgb.enabled |= color_surface->GetSurfaceParams().srgb_conversion;
             }
 
             fbkey.is_single_buffer = true;
@@ -512,11 +509,6 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers(
                     // Assume that a surface will be written to if it is used as a framebuffer, even
                     // if the shader doesn't actually write to it.
                     texture_cache.MarkColorBufferInUse(index);
-                    // Enable sRGB only for supported formats
-                    // Workaround for and issue in nvidia drivers
-                    // https://devtalk.nvidia.com/default/topic/776591/opengl/gl_framebuffer_srgb-functions-incorrectly/
-                    state.framebuffer_srgb.enabled |=
-                        color_surface->GetSurfaceParams().srgb_conversion;
                 }
 
                 fbkey.color_attachments[index] =
@@ -906,6 +898,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
     }
 
     screen_info.display_texture = surface->GetTexture();
+    screen_info.display_srgb = surface->GetSurfaceParams().srgb_conversion;
 
     return true;
 }
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 909ccb82c..0dbc4c02f 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -214,7 +214,8 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
     std::string source = "#version 430 core\n"
                          "#extension GL_ARB_separate_shader_objects : enable\n"
                          "#extension GL_NV_gpu_shader5 : enable\n"
-                         "#extension GL_NV_shader_thread_group : enable\n";
+                         "#extension GL_NV_shader_thread_group : enable\n"
+                         "#extension GL_NV_shader_thread_shuffle : enable\n";
     if (entries.shader_viewport_layer_array) {
         source += "#extension GL_ARB_shader_viewport_layer_array : enable\n";
     }
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 137b23740..76439e7ab 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -325,6 +325,7 @@ public:
         DeclareRegisters();
         DeclarePredicates();
         DeclareLocalMemory();
+        DeclareSharedMemory();
         DeclareInternalFlags();
         DeclareInputAttributes();
         DeclareOutputAttributes();
@@ -499,6 +500,13 @@ private:
         code.AddNewLine();
     }
 
+    void DeclareSharedMemory() {
+        if (stage != ProgramType::Compute) {
+            return;
+        }
+        code.AddLine("shared uint {}[];", GetSharedMemory());
+    }
+
     void DeclareInternalFlags() {
         for (u32 flag = 0; flag < static_cast<u32>(InternalFlag::Amount); flag++) {
             const auto flag_code = static_cast<InternalFlag>(flag);
@@ -881,6 +889,12 @@ private:
                 Type::Uint};
         }
 
+        if (const auto smem = std::get_if<SmemNode>(&*node)) {
+            return {
+                fmt::format("{}[{} >> 2]", GetSharedMemory(), Visit(smem->GetAddress()).AsUint()),
+                Type::Uint};
+        }
+
         if (const auto internal_flag = std::get_if<InternalFlagNode>(&*node)) {
             return {GetInternalFlag(internal_flag->GetFlag()), Type::Bool};
         }
@@ -1007,10 +1021,10 @@ private:
         return {std::move(temporary), value.GetType()};
     }
 
-    Expression GetOutputAttribute(const AbufNode* abuf) {
+    std::optional<Expression> GetOutputAttribute(const AbufNode* abuf) {
         switch (const auto attribute = abuf->GetIndex()) {
         case Attribute::Index::Position:
-            return {"gl_Position"s + GetSwizzle(abuf->GetElement()), Type::Float};
+            return {{"gl_Position"s + GetSwizzle(abuf->GetElement()), Type::Float}};
         case Attribute::Index::LayerViewportPointSize:
             switch (abuf->GetElement()) {
             case 0:
@@ -1020,25 +1034,25 @@ private:
                 if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) {
                     return {};
                 }
-                return {"gl_Layer", Type::Int};
+                return {{"gl_Layer", Type::Int}};
             case 2:
                 if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) {
                     return {};
                 }
-                return {"gl_ViewportIndex", Type::Int};
+                return {{"gl_ViewportIndex", Type::Int}};
             case 3:
                 UNIMPLEMENTED_MSG("Requires some state changes for gl_PointSize to work in shader");
-                return {"gl_PointSize", Type::Float};
+                return {{"gl_PointSize", Type::Float}};
             }
             return {};
         case Attribute::Index::ClipDistances0123:
-            return {fmt::format("gl_ClipDistance[{}]", abuf->GetElement()), Type::Float};
+            return {{fmt::format("gl_ClipDistance[{}]", abuf->GetElement()), Type::Float}};
         case Attribute::Index::ClipDistances4567:
-            return {fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), Type::Float};
+            return {{fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), Type::Float}};
         default:
             if (IsGenericAttribute(attribute)) {
-                return {GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()),
-                        Type::Float};
+                return {
+                    {GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()), Type::Float}};
             }
             UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute));
             return {};
@@ -1278,7 +1292,11 @@ private:
             target = {GetRegister(gpr->GetIndex()), Type::Float};
         } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) {
             UNIMPLEMENTED_IF(abuf->IsPhysicalBuffer());
-            target = GetOutputAttribute(abuf);
+            auto output = GetOutputAttribute(abuf);
+            if (!output) {
+                return {};
+            }
+            target = std::move(*output);
         } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) {
             if (stage == ProgramType::Compute) {
                 LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders");
@@ -1286,6 +1304,11 @@ private:
             target = {
                 fmt::format("{}[{} >> 2]", GetLocalMemory(), Visit(lmem->GetAddress()).AsUint()),
                 Type::Uint};
+        } else if (const auto smem = std::get_if<SmemNode>(&*dest)) {
+            ASSERT(stage == ProgramType::Compute);
+            target = {
+                fmt::format("{}[{} >> 2]", GetSharedMemory(), Visit(smem->GetAddress()).AsUint()),
+                Type::Uint};
         } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) {
             const std::string real = Visit(gmem->GetRealAddress()).AsUint();
             const std::string base = Visit(gmem->GetBaseAddress()).AsUint();
@@ -1934,8 +1957,7 @@ private:
     Expression BallotThread(Operation operation) {
         const std::string value = VisitOperand(operation, 0).AsBool();
         if (!device.HasWarpIntrinsics()) {
-            LOG_ERROR(Render_OpenGL,
-                      "Nvidia warp intrinsics are not available and its required by a shader");
+            LOG_ERROR(Render_OpenGL, "Nvidia vote intrinsics are required by this shader");
             // Stub on non-Nvidia devices by simulating all threads voting the same as the active
             // one.
             return {fmt::format("({} ? 0xFFFFFFFFU : 0U)", value), Type::Uint};
@@ -1946,8 +1968,7 @@ private:
     Expression Vote(Operation operation, const char* func) {
         const std::string value = VisitOperand(operation, 0).AsBool();
         if (!device.HasWarpIntrinsics()) {
-            LOG_ERROR(Render_OpenGL,
-                      "Nvidia vote intrinsics are not available and its required by a shader");
+            LOG_ERROR(Render_OpenGL, "Nvidia vote intrinsics are required by this shader");
             // Stub with a warp size of one.
             return {value, Type::Bool};
         }
@@ -1964,15 +1985,54 @@ private:
 
     Expression VoteEqual(Operation operation) {
         if (!device.HasWarpIntrinsics()) {
-            LOG_ERROR(Render_OpenGL,
-                      "Nvidia vote intrinsics are not available and its required by a shader");
-            // We must return true here since a stub for a theoretical warp size of 1 will always
-            // return an equal result for all its votes.
+            LOG_ERROR(Render_OpenGL, "Nvidia vote intrinsics are required by this shader");
+            // We must return true here since a stub for a theoretical warp size of 1.
+            // This will always return an equal result across all votes.
             return {"true", Type::Bool};
         }
         return Vote(operation, "allThreadsEqualNV");
     }
 
+    template <const std::string_view& func>
+    Expression Shuffle(Operation operation) {
+        const std::string value = VisitOperand(operation, 0).AsFloat();
+        if (!device.HasWarpIntrinsics()) {
+            LOG_ERROR(Render_OpenGL, "Nvidia shuffle intrinsics are required by this shader");
+            // On a "single-thread" device we are either on the same thread or out of bounds. Both
+            // cases return the passed value.
+            return {value, Type::Float};
+        }
+
+        const std::string index = VisitOperand(operation, 1).AsUint();
+        const std::string width = VisitOperand(operation, 2).AsUint();
+        return {fmt::format("{}({}, {}, {})", func, value, index, width), Type::Float};
+    }
+
+    template <const std::string_view& func>
+    Expression InRangeShuffle(Operation operation) {
+        const std::string index = VisitOperand(operation, 0).AsUint();
+        const std::string width = VisitOperand(operation, 1).AsUint();
+        if (!device.HasWarpIntrinsics()) {
+            // On a "single-thread" device we are only in bounds when the requested index is 0.
+            return {fmt::format("({} == 0U)", index), Type::Bool};
+        }
+
+        const std::string in_range = code.GenerateTemporary();
+        code.AddLine("bool {};", in_range);
+        code.AddLine("{}(0U, {}, {}, {});", func, index, width, in_range);
+        return {in_range, Type::Bool};
+    }
+
+    struct Func final {
+        Func() = delete;
+        ~Func() = delete;
+
+        static constexpr std::string_view ShuffleIndexed = "shuffleNV";
+        static constexpr std::string_view ShuffleUp = "shuffleUpNV";
+        static constexpr std::string_view ShuffleDown = "shuffleDownNV";
+        static constexpr std::string_view ShuffleButterfly = "shuffleXorNV";
+    };
+
     static constexpr std::array operation_decompilers = {
         &GLSLDecompiler::Assign,
 
@@ -2135,6 +2195,16 @@ private:
         &GLSLDecompiler::VoteAll,
         &GLSLDecompiler::VoteAny,
         &GLSLDecompiler::VoteEqual,
+
+        &GLSLDecompiler::Shuffle<Func::ShuffleIndexed>,
+        &GLSLDecompiler::Shuffle<Func::ShuffleUp>,
+        &GLSLDecompiler::Shuffle<Func::ShuffleDown>,
+        &GLSLDecompiler::Shuffle<Func::ShuffleButterfly>,
+
+        &GLSLDecompiler::InRangeShuffle<Func::ShuffleIndexed>,
+        &GLSLDecompiler::InRangeShuffle<Func::ShuffleUp>,
+        &GLSLDecompiler::InRangeShuffle<Func::ShuffleDown>,
+        &GLSLDecompiler::InRangeShuffle<Func::ShuffleButterfly>,
     };
     static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
 
@@ -2175,6 +2245,10 @@ private:
         return "lmem_" + suffix;
     }
 
+    std::string GetSharedMemory() const {
+        return fmt::format("smem_{}", suffix);
+    }
+
     std::string GetInternalFlag(InternalFlag flag) const {
         constexpr std::array InternalFlagNames = {"zero_flag", "sign_flag", "carry_flag",
                                                   "overflow_flag"};
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 6eabf4fac..bf86b5a0b 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -16,7 +16,6 @@ namespace OpenGL {
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 OpenGLState OpenGLState::cur_state;
-bool OpenGLState::s_rgb_used;
 
 namespace {
 
@@ -282,8 +281,6 @@ void OpenGLState::ApplySRgb() const {
         return;
     cur_state.framebuffer_srgb.enabled = framebuffer_srgb.enabled;
     if (framebuffer_srgb.enabled) {
-        // Track if sRGB is used
-        s_rgb_used = true;
         glEnable(GL_FRAMEBUFFER_SRGB);
     } else {
         glDisable(GL_FRAMEBUFFER_SRGB);
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 949b13051..c358d3b38 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -175,14 +175,6 @@ public:
         return cur_state;
     }
 
-    static bool GetsRGBUsed() {
-        return s_rgb_used;
-    }
-
-    static void ClearsRGBUsed() {
-        s_rgb_used = false;
-    }
-
     void SetDefaultViewports();
     /// Apply this state as the current OpenGL state
     void Apply();
@@ -253,8 +245,6 @@ public:
 private:
     static OpenGLState cur_state;
 
-    // Workaround for sRGB problems caused by QT not supporting srgb output
-    static bool s_rgb_used;
     struct {
         bool blend_state;
         bool stencil_state;
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index ea77dd211..9ed738171 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -145,7 +145,7 @@ inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode,
         case Tegra::Texture::TextureMipmapFilter::None:
             return GL_LINEAR;
         case Tegra::Texture::TextureMipmapFilter::Nearest:
-            return GL_NEAREST_MIPMAP_LINEAR;
+            return GL_LINEAR_MIPMAP_NEAREST;
         case Tegra::Texture::TextureMipmapFilter::Linear:
             return GL_LINEAR_MIPMAP_LINEAR;
         }
@@ -157,7 +157,7 @@ inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode,
         case Tegra::Texture::TextureMipmapFilter::Nearest:
             return GL_NEAREST_MIPMAP_NEAREST;
         case Tegra::Texture::TextureMipmapFilter::Linear:
-            return GL_LINEAR_MIPMAP_NEAREST;
+            return GL_NEAREST_MIPMAP_LINEAR;
         }
     }
     }
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 839178152..1e6ef66ab 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -264,7 +264,6 @@ void RendererOpenGL::CreateRasterizer() {
     if (rasterizer) {
         return;
     }
-    OpenGLState::ClearsRGBUsed();
     rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info);
 }
 
@@ -343,9 +342,7 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x,
     }};
 
     state.textures[0] = screen_info.display_texture;
-    // Workaround brigthness problems in SMO by enabling sRGB in the final output
-    // if it has been used in the frame. Needed because of this bug in QT: QTBUG-50987
-    state.framebuffer_srgb.enabled = OpenGLState::GetsRGBUsed();
+    state.framebuffer_srgb.enabled = screen_info.display_srgb;
     state.AllDirty();
     state.Apply();
     glNamedBufferSubData(vertex_buffer.handle, 0, sizeof(vertices), vertices.data());
@@ -355,8 +352,6 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x,
     state.textures[0] = 0;
     state.AllDirty();
     state.Apply();
-    // Clear sRGB state for the next frame
-    OpenGLState::ClearsRGBUsed();
 }
 
 /**
@@ -406,8 +401,8 @@ void RendererOpenGL::CaptureScreenshot() {
     GLuint renderbuffer;
     glGenRenderbuffers(1, &renderbuffer);
     glBindRenderbuffer(GL_RENDERBUFFER, renderbuffer);
-    glRenderbufferStorage(GL_RENDERBUFFER, state.GetsRGBUsed() ? GL_SRGB8 : GL_RGB8, layout.width,
-                          layout.height);
+    glRenderbufferStorage(GL_RENDERBUFFER, screen_info.display_srgb ? GL_SRGB8 : GL_RGB8,
+                          layout.width, layout.height);
     glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, renderbuffer);
 
     DrawScreen(layout);
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index 9bd086368..cf26628ca 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -38,7 +38,8 @@ struct TextureInfo {
 
 /// Structure used for storing information about the display target for the Switch screen
 struct ScreenInfo {
-    GLuint display_texture;
+    GLuint display_texture{};
+    bool display_srgb{};
     const Common::Rectangle<float> display_texcoords{0.0f, 0.0f, 1.0f, 1.0f};
     TextureInfo texture;
 };
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index b9153934e..f7fbbb6e4 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -1127,6 +1127,46 @@ private:
         return {};
     }
 
+    Id ShuffleIndexed(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id ShuffleUp(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id ShuffleDown(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id ShuffleButterfly(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id InRangeShuffleIndexed(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id InRangeShuffleUp(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id InRangeShuffleDown(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id InRangeShuffleButterfly(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
     Id DeclareBuiltIn(spv::BuiltIn builtin, spv::StorageClass storage, Id type,
                       const std::string& name) {
         const Id id = OpVariable(type, storage);
@@ -1431,6 +1471,16 @@ private:
         &SPIRVDecompiler::VoteAll,
         &SPIRVDecompiler::VoteAny,
         &SPIRVDecompiler::VoteEqual,
+
+        &SPIRVDecompiler::ShuffleIndexed,
+        &SPIRVDecompiler::ShuffleUp,
+        &SPIRVDecompiler::ShuffleDown,
+        &SPIRVDecompiler::ShuffleButterfly,
+
+        &SPIRVDecompiler::InRangeShuffleIndexed,
+        &SPIRVDecompiler::InRangeShuffleUp,
+        &SPIRVDecompiler::InRangeShuffleDown,
+        &SPIRVDecompiler::InRangeShuffleButterfly,
     };
     static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
 
diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp
index ed108bea8..7923d4d69 100644
--- a/src/video_core/shader/decode/memory.cpp
+++ b/src/video_core/shader/decode/memory.cpp
@@ -35,7 +35,7 @@ u32 GetUniformTypeElementsCount(Tegra::Shader::UniformType uniform_type) {
         return 1;
     }
 }
-} // namespace
+} // Anonymous namespace
 
 u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
@@ -106,16 +106,17 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
         }
         break;
     }
-    case OpCode::Id::LD_L: {
-        LOG_DEBUG(HW_GPU, "LD_L cache management mode: {}",
-                  static_cast<u64>(instr.ld_l.unknown.Value()));
-
-        const auto GetLmem = [&](s32 offset) {
+    case OpCode::Id::LD_L:
+        LOG_DEBUG(HW_GPU, "LD_L cache management mode: {}", static_cast<u64>(instr.ld_l.unknown));
+        [[fallthrough]];
+    case OpCode::Id::LD_S: {
+        const auto GetMemory = [&](s32 offset) {
             ASSERT(offset % 4 == 0);
             const Node immediate_offset = Immediate(static_cast<s32>(instr.smem_imm) + offset);
             const Node address = Operation(OperationCode::IAdd, NO_PRECISE, GetRegister(instr.gpr8),
                                            immediate_offset);
-            return GetLocalMemory(address);
+            return opcode->get().GetId() == OpCode::Id::LD_S ? GetSharedMemory(address)
+                                                             : GetLocalMemory(address);
         };
 
         switch (instr.ldst_sl.type.Value()) {
@@ -135,14 +136,16 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
                     return 0;
                 }
             }();
-            for (u32 i = 0; i < count; ++i)
-                SetTemporary(bb, i, GetLmem(i * 4));
-            for (u32 i = 0; i < count; ++i)
+            for (u32 i = 0; i < count; ++i) {
+                SetTemporary(bb, i, GetMemory(i * 4));
+            }
+            for (u32 i = 0; i < count; ++i) {
                 SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
+            }
             break;
         }
         default:
-            UNIMPLEMENTED_MSG("LD_L Unhandled type: {}",
+            UNIMPLEMENTED_MSG("{} Unhandled type: {}", opcode->get().GetName(),
                               static_cast<u32>(instr.ldst_sl.type.Value()));
         }
         break;
@@ -209,27 +212,34 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
 
         break;
     }
-    case OpCode::Id::ST_L: {
+    case OpCode::Id::ST_L:
         LOG_DEBUG(HW_GPU, "ST_L cache management mode: {}",
                   static_cast<u64>(instr.st_l.cache_management.Value()));
-
-        const auto GetLmemAddr = [&](s32 offset) {
+        [[fallthrough]];
+    case OpCode::Id::ST_S: {
+        const auto GetAddress = [&](s32 offset) {
             ASSERT(offset % 4 == 0);
             const Node immediate = Immediate(static_cast<s32>(instr.smem_imm) + offset);
             return Operation(OperationCode::IAdd, NO_PRECISE, GetRegister(instr.gpr8), immediate);
         };
 
+        const auto set_memory = opcode->get().GetId() == OpCode::Id::ST_L
+                                    ? &ShaderIR::SetLocalMemory
+                                    : &ShaderIR::SetSharedMemory;
+
         switch (instr.ldst_sl.type.Value()) {
         case Tegra::Shader::StoreType::Bits128:
-            SetLocalMemory(bb, GetLmemAddr(12), GetRegister(instr.gpr0.Value() + 3));
-            SetLocalMemory(bb, GetLmemAddr(8), GetRegister(instr.gpr0.Value() + 2));
+            (this->*set_memory)(bb, GetAddress(12), GetRegister(instr.gpr0.Value() + 3));
+            (this->*set_memory)(bb, GetAddress(8), GetRegister(instr.gpr0.Value() + 2));
+            [[fallthrough]];
         case Tegra::Shader::StoreType::Bits64:
-            SetLocalMemory(bb, GetLmemAddr(4), GetRegister(instr.gpr0.Value() + 1));
+            (this->*set_memory)(bb, GetAddress(4), GetRegister(instr.gpr0.Value() + 1));
+            [[fallthrough]];
         case Tegra::Shader::StoreType::Bits32:
-            SetLocalMemory(bb, GetLmemAddr(0), GetRegister(instr.gpr0));
+            (this->*set_memory)(bb, GetAddress(0), GetRegister(instr.gpr0));
             break;
         default:
-            UNIMPLEMENTED_MSG("ST_L Unhandled type: {}",
+            UNIMPLEMENTED_MSG("{} unhandled type: {}", opcode->get().GetName(),
                               static_cast<u32>(instr.ldst_sl.type.Value()));
         }
         break;
diff --git a/src/video_core/shader/decode/warp.cpp b/src/video_core/shader/decode/warp.cpp
index 04ca74f46..a8e481b3c 100644
--- a/src/video_core/shader/decode/warp.cpp
+++ b/src/video_core/shader/decode/warp.cpp
@@ -13,6 +13,7 @@ namespace VideoCommon::Shader {
 using Tegra::Shader::Instruction;
 using Tegra::Shader::OpCode;
 using Tegra::Shader::Pred;
+using Tegra::Shader::ShuffleOperation;
 using Tegra::Shader::VoteOperation;
 
 namespace {
@@ -44,6 +45,52 @@ u32 ShaderIR::DecodeWarp(NodeBlock& bb, u32 pc) {
         SetPredicate(bb, instr.vote.dest_pred, vote);
         break;
     }
+    case OpCode::Id::SHFL: {
+        Node mask = instr.shfl.is_mask_imm ? Immediate(static_cast<u32>(instr.shfl.mask_imm))
+                                           : GetRegister(instr.gpr39);
+        Node width = [&] {
+            // Convert the obscure SHFL mask back into GL_NV_shader_thread_shuffle's width. This has
+            // been done reversing Nvidia's math. It won't work on all cases due to SHFL having
+            // different parameters that don't properly map to GLSL's interface, but it should work
+            // for cases emitted by Nvidia's compiler.
+            if (instr.shfl.operation == ShuffleOperation::Up) {
+                return Operation(
+                    OperationCode::ILogicalShiftRight,
+                    Operation(OperationCode::IAdd, std::move(mask), Immediate(-0x2000)),
+                    Immediate(8));
+            } else {
+                return Operation(OperationCode::ILogicalShiftRight,
+                                 Operation(OperationCode::IAdd, Immediate(0x201F),
+                                           Operation(OperationCode::INegate, std::move(mask))),
+                                 Immediate(8));
+            }
+        }();
+
+        const auto [operation, in_range] = [instr]() -> std::pair<OperationCode, OperationCode> {
+            switch (instr.shfl.operation) {
+            case ShuffleOperation::Idx:
+                return {OperationCode::ShuffleIndexed, OperationCode::InRangeShuffleIndexed};
+            case ShuffleOperation::Up:
+                return {OperationCode::ShuffleUp, OperationCode::InRangeShuffleUp};
+            case ShuffleOperation::Down:
+                return {OperationCode::ShuffleDown, OperationCode::InRangeShuffleDown};
+            case ShuffleOperation::Bfly:
+                return {OperationCode::ShuffleButterfly, OperationCode::InRangeShuffleButterfly};
+            }
+            UNREACHABLE_MSG("Invalid SHFL operation: {}",
+                            static_cast<u64>(instr.shfl.operation.Value()));
+            return {};
+        }();
+
+        // Setting the predicate before the register is intentional to avoid overwriting.
+        Node index = instr.shfl.is_index_imm ? Immediate(static_cast<u32>(instr.shfl.index_imm))
+                                             : GetRegister(instr.gpr20);
+        SetPredicate(bb, instr.shfl.pred48, Operation(in_range, index, width));
+        SetRegister(
+            bb, instr.gpr0,
+            Operation(operation, GetRegister(instr.gpr8), std::move(index), std::move(width)));
+        break;
+    }
     default:
         UNIMPLEMENTED_MSG("Unhandled warp instruction: {}", opcode->get().GetName());
         break;
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index b47b201cf..abf2cb1ab 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -181,6 +181,16 @@ enum class OperationCode {
     VoteAny,      /// (bool) -> bool
     VoteEqual,    /// (bool) -> bool
 
+    ShuffleIndexed,   /// (uint value, uint index, uint width) -> uint
+    ShuffleUp,        /// (uint value, uint index, uint width) -> uint
+    ShuffleDown,      /// (uint value, uint index, uint width) -> uint
+    ShuffleButterfly, /// (uint value, uint index, uint width) -> uint
+
+    InRangeShuffleIndexed,   /// (uint index, uint width) -> bool
+    InRangeShuffleUp,        /// (uint index, uint width) -> bool
+    InRangeShuffleDown,      /// (uint index, uint width) -> bool
+    InRangeShuffleButterfly, /// (uint index, uint width) -> bool
+
     Amount,
 };
 
@@ -206,12 +216,13 @@ class PredicateNode;
 class AbufNode;
 class CbufNode;
 class LmemNode;
+class SmemNode;
 class GmemNode;
 class CommentNode;
 
 using NodeData =
     std::variant<OperationNode, ConditionalNode, GprNode, ImmediateNode, InternalFlagNode,
-                 PredicateNode, AbufNode, CbufNode, LmemNode, GmemNode, CommentNode>;
+                 PredicateNode, AbufNode, CbufNode, LmemNode, SmemNode, GmemNode, CommentNode>;
 using Node = std::shared_ptr<NodeData>;
 using Node4 = std::array<Node, 4>;
 using NodeBlock = std::vector<Node>;
@@ -583,6 +594,19 @@ private:
     Node address;
 };
 
+/// Shared memory node
+class SmemNode final {
+public:
+    explicit SmemNode(Node address) : address{std::move(address)} {}
+
+    const Node& GetAddress() const {
+        return address;
+    }
+
+private:
+    Node address;
+};
+
 /// Global memory node
 class GmemNode final {
 public:
diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp
index 1e5c7f660..bbbab0bca 100644
--- a/src/video_core/shader/shader_ir.cpp
+++ b/src/video_core/shader/shader_ir.cpp
@@ -137,6 +137,10 @@ Node ShaderIR::GetLocalMemory(Node address) {
     return MakeNode<LmemNode>(std::move(address));
 }
 
+Node ShaderIR::GetSharedMemory(Node address) {
+    return MakeNode<SmemNode>(std::move(address));
+}
+
 Node ShaderIR::GetTemporary(u32 id) {
     return GetRegister(Register::ZeroIndex + 1 + id);
 }
@@ -378,6 +382,11 @@ void ShaderIR::SetLocalMemory(NodeBlock& bb, Node address, Node value) {
         Operation(OperationCode::Assign, GetLocalMemory(std::move(address)), std::move(value)));
 }
 
+void ShaderIR::SetSharedMemory(NodeBlock& bb, Node address, Node value) {
+    bb.push_back(
+        Operation(OperationCode::Assign, GetSharedMemory(std::move(address)), std::move(value)));
+}
+
 void ShaderIR::SetTemporary(NodeBlock& bb, u32 id, Node value) {
     SetRegister(bb, Register::ZeroIndex + 1 + id, std::move(value));
 }
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 62816bd56..6aed9bb84 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -208,6 +208,8 @@ private:
     Node GetInternalFlag(InternalFlag flag, bool negated = false);
     /// Generates a node representing a local memory address
     Node GetLocalMemory(Node address);
+    /// Generates a node representing a shared memory address
+    Node GetSharedMemory(Node address);
     /// Generates a temporary, internally it uses a post-RZ register
     Node GetTemporary(u32 id);
 
@@ -217,8 +219,10 @@ private:
     void SetPredicate(NodeBlock& bb, u64 dest, Node src);
     /// Sets an internal flag. src value must be a bool-evaluated node
     void SetInternalFlag(NodeBlock& bb, InternalFlag flag, Node value);
-    /// Sets a local memory address. address and value must be a number-evaluated node
+    /// Sets a local memory address with a value.
     void SetLocalMemory(NodeBlock& bb, Node address, Node value);
+    /// Sets a shared memory address with a value.
+    void SetSharedMemory(NodeBlock& bb, Node address, Node value);
     /// Sets a temporary. Internally it uses a post-RZ register
     void SetTemporary(NodeBlock& bb, u32 id, Node value);