24 files changed, 209 insertions, 99 deletions
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 2977a7d81..5cf6a4cc3 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -303,6 +303,10 @@ public:
                 return (type == Type::SignedNorm) || (type == Type::UnsignedNorm);
             }
 
+            bool IsConstant() const { // Reports the attribute's `constant` field.
+                return constant;
+            }
+
             bool IsValid() const {
                 return size != Size::Invalid;
             }
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index c66c66f6c..5e9cfba22 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -1006,6 +1006,12 @@ union Instruction {
     } stg;
 
     union {
+        BitField<23, 3, AtomicOp> operation;
+        BitField<48, 1, u64> extended;
+        BitField<20, 3, GlobalAtomicType> type;
+    } red;
+
+    union {
         BitField<52, 4, AtomicOp> operation;
         BitField<49, 3, GlobalAtomicType> type;
         BitField<28, 20, s64> offset;
@@ -1787,6 +1793,7 @@ public:
         ST_S,
         ST,    // Store in generic memory
         STG,   // Store in global memory
+        RED,   // Reduction operation
         ATOM,  // Atomic operation on global memory
         ATOMS, // Atomic operation on shared memory
         AL2P,  // Transforms attribute memory into physical memory
@@ -1871,7 +1878,8 @@ public:
         ICMP_R,
         ICMP_CR,
         ICMP_IMM,
-        FCMP_R,
+        FCMP_RR,
+        FCMP_RC,
         MUFU,  // Multi-Function Operator
         RRO_C, // Range Reduction Operator
         RRO_R,
@@ -2096,6 +2104,7 @@ private:
             INST("1110111101010---", Id::ST_L, Type::Memory, "ST_L"),
             INST("101-------------", Id::ST, Type::Memory, "ST"),
             INST("1110111011011---", Id::STG, Type::Memory, "STG"),
+            INST("1110101111111---", Id::RED, Type::Memory, "RED"),
             INST("11101101--------", Id::ATOM, Type::Memory, "ATOM"),
             INST("11101100--------", Id::ATOMS, Type::Memory, "ATOMS"),
             INST("1110111110100---", Id::AL2P, Type::Memory, "AL2P"),
@@ -2179,7 +2188,8 @@ private:
             INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP2_R"),
             INST("0111111-0-------", Id::HSETP2_IMM, Type::HalfSetPredicate, "HSETP2_IMM"),
             INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"),
-            INST("010110111010----", Id::FCMP_R, Type::Arithmetic, "FCMP_R"),
+            INST("010110111010----", Id::FCMP_RR, Type::Arithmetic, "FCMP_RR"),
+            INST("010010111010----", Id::FCMP_RC, Type::Arithmetic, "FCMP_RC"),
             INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"),
             INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"),
             INST("0101110010010---", Id::RRO_R, Type::Arithmetic, "RRO_R"),
diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp
index cc434faf7..20e73a37e 100644
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@@ -12,8 +12,9 @@ namespace VideoCommon {
 
 GPUAsynch::GPUAsynch(Core::System& system, std::unique_ptr<VideoCore::RendererBase>&& renderer_,
                      std::unique_ptr<Core::Frontend::GraphicsContext>&& context)
-    : GPU(system, std::move(renderer_), true), gpu_thread{system}, gpu_context(std::move(context)),
-      cpu_context(renderer->GetRenderWindow().CreateSharedContext()) {}
+    : GPU(system, std::move(renderer_), true), gpu_thread{system},
+      cpu_context(renderer->GetRenderWindow().CreateSharedContext()),
+      gpu_context(std::move(context)) {}
 
 GPUAsynch::~GPUAsynch() = default;
 
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index c286502ba..d83dca25a 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -87,7 +87,7 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) {
 std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept {
     std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings;
 
-    static std::array<std::size_t, 5> stage_swizzle = {0, 1, 2, 3, 4};
+    static constexpr std::array<std::size_t, 5> stage_swizzle{0, 1, 2, 3, 4};
     const u32 total_ubos = GetInteger<u32>(GL_MAX_UNIFORM_BUFFER_BINDINGS);
     const u32 total_ssbos = GetInteger<u32>(GL_MAX_SHADER_STORAGE_BUFFER_BINDINGS);
     const u32 total_samplers = GetInteger<u32>(GL_MAX_COMBINED_TEXTURE_IMAGE_UNITS);
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 91abeb9d7..175374f0d 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -140,8 +140,8 @@ void RasterizerOpenGL::SetupVertexFormat() {
         const auto attrib = gpu.regs.vertex_attrib_format[index];
         const auto gl_index = static_cast<GLuint>(index);
 
-        // Ignore invalid attributes.
-        if (!attrib.IsValid()) {
+        // Disable constant attributes.
+        if (attrib.IsConstant()) {
             glDisableVertexAttribArray(gl_index);
             continue;
         }
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 6d2ff20f9..12c6dcfde 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -34,6 +34,8 @@
 namespace OpenGL {
 
 using Tegra::Engines::ShaderType;
+using VideoCommon::Shader::CompileDepth;
+using VideoCommon::Shader::CompilerSettings;
 using VideoCommon::Shader::ProgramCode;
 using VideoCommon::Shader::Registry;
 using VideoCommon::Shader::ShaderIR;
@@ -43,7 +45,7 @@ namespace {
 constexpr u32 STAGE_MAIN_OFFSET = 10;
 constexpr u32 KERNEL_MAIN_OFFSET = 0;
 
-constexpr VideoCommon::Shader::CompilerSettings COMPILER_SETTINGS{};
+constexpr CompilerSettings COMPILER_SETTINGS{CompileDepth::FullDecompile};
 
 /// Gets the address for the specified shader stage program
 GPUVAddr GetShaderAddress(Core::System& system, Maxwell::ShaderProgram program) {
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 1f1f01313..b1804e9ea 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -1821,13 +1821,15 @@ private:
     Expression HMergeH0(Operation operation) {
         const std::string dest = VisitOperand(operation, 0).AsUint();
         const std::string src = VisitOperand(operation, 1).AsUint();
-        return {fmt::format("bitfieldInsert({}, {}, 0, 16)", dest, src), Type::Uint};
+        return {fmt::format("vec2(unpackHalf2x16({}).x, unpackHalf2x16({}).y)", src, dest),
+                Type::HalfFloat};
     }
 
     Expression HMergeH1(Operation operation) {
         const std::string dest = VisitOperand(operation, 0).AsUint();
         const std::string src = VisitOperand(operation, 1).AsUint();
-        return {fmt::format("bitfieldInsert({}, {}, 16, 16)", dest, src), Type::Uint};
+        return {fmt::format("vec2(unpackHalf2x16({}).x, unpackHalf2x16({}).y)", dest, src),
+                Type::HalfFloat};
     }
 
     Expression HPack2(Operation operation) {
@@ -2117,8 +2119,14 @@ private:
             return {};
         }
         return {fmt::format("atomic{}({}, {})", opname, Visit(operation[0]).GetCode(),
-                            Visit(operation[1]).As(type)),
-                type};
+                            Visit(operation[1]).AsUint()),
+                Type::Uint};
+    }
+
+    template <const std::string_view& opname, Type type>
+    Expression Reduce(Operation operation) { // Emits Atomic<opname, type> as a statement.
+        code.AddLine("{};", Atomic<opname, type>(operation).GetCode());
+        return {}; // No expression is handed back to the caller.
     }
 
     Expression Branch(Operation operation) {
@@ -2477,6 +2485,20 @@ private:
         &GLSLDecompiler::Atomic<Func::Or, Type::Int>,
         &GLSLDecompiler::Atomic<Func::Xor, Type::Int>,
 
+        &GLSLDecompiler::Reduce<Func::Add, Type::Uint>,
+        &GLSLDecompiler::Reduce<Func::Min, Type::Uint>,
+        &GLSLDecompiler::Reduce<Func::Max, Type::Uint>,
+        &GLSLDecompiler::Reduce<Func::And, Type::Uint>,
+        &GLSLDecompiler::Reduce<Func::Or, Type::Uint>,
+        &GLSLDecompiler::Reduce<Func::Xor, Type::Uint>,
+
+        &GLSLDecompiler::Reduce<Func::Add, Type::Int>,
+        &GLSLDecompiler::Reduce<Func::Min, Type::Int>,
+        &GLSLDecompiler::Reduce<Func::Max, Type::Int>,
+        &GLSLDecompiler::Reduce<Func::And, Type::Int>,
+        &GLSLDecompiler::Reduce<Func::Or, Type::Int>,
+        &GLSLDecompiler::Reduce<Func::Xor, Type::Int>,
+
         &GLSLDecompiler::Branch,
         &GLSLDecompiler::BranchIndirect,
         &GLSLDecompiler::PushFlowStack,
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 0b4d999d7..2729d1265 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -417,7 +417,7 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
 
         switch (params.target) {
         case SurfaceTarget::Texture2DArray:
-            glFramebufferTexture(target, attachment, GetTexture(), params.base_level);
+            glFramebufferTexture(target, attachment, GetTexture(), 0);
             break;
         default:
             UNIMPLEMENTED();
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index f1a28cc21..b2a179746 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -315,8 +315,8 @@ public:
 
 RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system,
                                Core::Frontend::GraphicsContext& context)
-    : VideoCore::RendererBase{emu_window}, emu_window{emu_window}, system{system},
-      frame_mailbox{}, context{context}, has_debug_tool{HasDebugTool()} {}
+    : RendererBase{emu_window}, emu_window{emu_window}, system{system}, context{context},
+      has_debug_tool{HasDebugTool()} {}
 
 RendererOpenGL::~RendererOpenGL() = default;
 
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 143478863..8681b821f 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -360,6 +360,7 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib
         default:
             break;
         }
+        break;
     case Maxwell::VertexAttribute::Type::UnsignedInt:
         switch (size) {
         case Maxwell::VertexAttribute::Size::Size_8:
@@ -370,6 +371,14 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib
             return VK_FORMAT_R8G8B8_UINT;
         case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
             return VK_FORMAT_R8G8B8A8_UINT;
+        case Maxwell::VertexAttribute::Size::Size_16:
+            return VK_FORMAT_R16_UINT;
+        case Maxwell::VertexAttribute::Size::Size_16_16:
+            return VK_FORMAT_R16G16_UINT;
+        case Maxwell::VertexAttribute::Size::Size_16_16_16:
+            return VK_FORMAT_R16G16B16_UINT;
+        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
+            return VK_FORMAT_R16G16B16A16_UINT;
         case Maxwell::VertexAttribute::Size::Size_32:
             return VK_FORMAT_R32_UINT;
         case Maxwell::VertexAttribute::Size::Size_32_32:
@@ -381,6 +390,7 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib
         default:
             break;
         }
+        break;
     case Maxwell::VertexAttribute::Type::UnsignedScaled:
         switch (size) {
         case Maxwell::VertexAttribute::Size::Size_8:
diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp
index 21644a7e7..fbd406f2b 100644
--- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp
+++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp
@@ -535,7 +535,9 @@ void VKBlitScreen::CreateGraphicsPipeline() {
     viewport_state_ci.pNext = nullptr;
     viewport_state_ci.flags = 0;
     viewport_state_ci.viewportCount = 1;
+    viewport_state_ci.pViewports = nullptr;
     viewport_state_ci.scissorCount = 1;
+    viewport_state_ci.pScissors = nullptr;
 
     VkPipelineRasterizationStateCreateInfo rasterization_ci;
     rasterization_ci.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO;
diff --git a/src/video_core/renderer_vulkan/vk_memory_manager.h b/src/video_core/renderer_vulkan/vk_memory_manager.h
index 35ee54d30..5b6858e9b 100644
--- a/src/video_core/renderer_vulkan/vk_memory_manager.h
+++ b/src/video_core/renderer_vulkan/vk_memory_manager.h
@@ -32,7 +32,7 @@ public:
      *                     memory. When passing false, it will try to allocate device local memory.
      * @returns A memory commit.
      */
-    VKMemoryCommit Commit(const VkMemoryRequirements& reqs, bool host_visible);
+    VKMemoryCommit Commit(const VkMemoryRequirements& requirements, bool host_visible);
 
     /// Commits memory required by the buffer and binds it.
     VKMemoryCommit Commit(const vk::Buffer& buffer, bool host_visible);
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index ab281c9e2..4ca0febb8 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -62,13 +62,16 @@ constexpr auto ComputeShaderIndex = static_cast<std::size_t>(Tegra::Engines::Sha
 
 VkViewport GetViewportState(const VKDevice& device, const Maxwell& regs, std::size_t index) {
     const auto& src = regs.viewport_transform[index];
+    const float width = src.scale_x * 2.0f;
+    const float height = src.scale_y * 2.0f;
+
     VkViewport viewport;
     viewport.x = src.translate_x - src.scale_x;
     viewport.y = src.translate_y - src.scale_y;
-    viewport.width = src.scale_x * 2.0f;
-    viewport.height = src.scale_y * 2.0f;
+    viewport.width = width != 0.0f ? width : 1.0f;
+    viewport.height = height != 0.0f ? height : 1.0f;
 
-    const float reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne;
+    const float reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne ? 1.0f : 0.0f;
     viewport.minDepth = src.translate_z - src.scale_z * reduce_z;
     viewport.maxDepth = src.translate_z + src.scale_z;
     if (!device.IsExtDepthRangeUnrestrictedSupported()) {
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 62e4ca488..aaa138f52 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -1938,11 +1938,8 @@ private:
         return {};
     }
 
-    template <Id (Module::*func)(Id, Id, Id, Id, Id), Type result_type,
-              Type value_type = result_type>
+    template <Id (Module::*func)(Id, Id, Id, Id, Id)>
     Expression Atomic(Operation operation) {
-        const Id type_def = GetTypeDefinition(result_type);
-
         Id pointer;
         if (const auto smem = std::get_if<SmemNode>(&*operation[0])) {
             pointer = GetSharedMemoryPointer(*smem);
@@ -1950,15 +1947,19 @@ private:
             pointer = GetGlobalMemoryPointer(*gmem);
         } else {
             UNREACHABLE();
-            return {Constant(type_def, 0), result_type};
+            return {v_float_zero, Type::Float};
         }
-
-        const Id value = As(Visit(operation[1]), value_type);
-
         const Id scope = Constant(t_uint, static_cast<u32>(spv::Scope::Device));
-        const Id semantics = Constant(type_def, 0);
+        const Id semantics = Constant(t_uint, 0);
+        const Id value = AsUint(Visit(operation[1]));
+
+        return {(this->*func)(t_uint, pointer, scope, semantics, value), Type::Uint};
+    }
 
-        return {(this->*func)(type_def, pointer, scope, semantics, value), result_type};
+    template <Id (Module::*func)(Id, Id, Id, Id, Id)>
+    Expression Reduce(Operation operation) { // Same emission as Atomic<func>, result unused.
+        Atomic<func>(operation); // The returned Expression is intentionally discarded.
+        return {};
     }
 
     Expression Branch(Operation operation) {
@@ -2547,21 +2548,35 @@ private:
         &SPIRVDecompiler::AtomicImageXor,
         &SPIRVDecompiler::AtomicImageExchange,
 
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicExchange, Type::Uint>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicIAdd, Type::Uint>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicUMin, Type::Uint>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicUMax, Type::Uint>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicAnd, Type::Uint>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicOr, Type::Uint>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicXor, Type::Uint>,
-
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicExchange, Type::Int>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicIAdd, Type::Int>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicSMin, Type::Int>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicSMax, Type::Int>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicAnd, Type::Int>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicOr, Type::Int>,
-        &SPIRVDecompiler::Atomic<&Module::OpAtomicXor, Type::Int>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicExchange>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicIAdd>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicUMin>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicUMax>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicAnd>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicOr>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicXor>,
+
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicExchange>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicIAdd>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicSMin>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicSMax>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicAnd>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicOr>,
+        &SPIRVDecompiler::Atomic<&Module::OpAtomicXor>,
+
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicIAdd>,
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicUMin>,
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicUMax>,
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicAnd>,
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicOr>,
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicXor>,
+
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicIAdd>,
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicSMin>,
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicSMax>,
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicAnd>,
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicOr>,
+        &SPIRVDecompiler::Reduce<&Module::OpAtomicXor>,
 
         &SPIRVDecompiler::Branch,
         &SPIRVDecompiler::BranchIndirect,
diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp
index 2e2711350..6d313963a 100644
--- a/src/video_core/shader/control_flow.cpp
+++ b/src/video_core/shader/control_flow.cpp
@@ -484,17 +484,17 @@ bool TryInspectAddress(CFGRebuildState& state) {
     }
     case BlockCollision::Inside: {
         // This case is the tricky one:
-        // We need to Split the block in 2 sepparate blocks
+        // We need to split the block into 2 separate blocks
         const u32 end = state.block_info[block_index].end;
         BlockInfo& new_block = CreateBlockInfo(state, address, end);
         BlockInfo& current_block = state.block_info[block_index];
         current_block.end = address - 1;
-        new_block.branch = current_block.branch;
+        new_block.branch = std::move(current_block.branch);
         BlockBranchInfo forward_branch = MakeBranchInfo<SingleBranch>();
         const auto branch = std::get_if<SingleBranch>(forward_branch.get());
         branch->address = address;
         branch->ignore = true;
-        current_block.branch = forward_branch;
+        current_block.branch = std::move(forward_branch);
         return true;
     }
     default:
diff --git a/src/video_core/shader/decode/arithmetic.cpp b/src/video_core/shader/decode/arithmetic.cpp
index 478394682..4db329fa5 100644
--- a/src/video_core/shader/decode/arithmetic.cpp
+++ b/src/video_core/shader/decode/arithmetic.cpp
@@ -136,7 +136,8 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) {
         SetRegister(bb, instr.gpr0, value);
         break;
     }
-    case OpCode::Id::FCMP_R: {
+    case OpCode::Id::FCMP_RR:
+    case OpCode::Id::FCMP_RC: {
         UNIMPLEMENTED_IF(instr.fcmp.ftz == 0);
         Node op_c = GetRegister(instr.gpr39);
         Node comp = GetPredicateComparisonFloat(instr.fcmp.cond, std::move(op_c), Immediate(0.0f));
diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp
index b8f63922f..8112ead3e 100644
--- a/src/video_core/shader/decode/memory.cpp
+++ b/src/video_core/shader/decode/memory.cpp
@@ -3,7 +3,9 @@
 // Refer to the license.txt file included.
 
 #include <algorithm>
+#include <utility>
 #include <vector>
+
 #include <fmt/format.h>
 
 #include "common/alignment.h"
@@ -16,6 +18,7 @@
 
 namespace VideoCommon::Shader {
 
+using std::move;
 using Tegra::Shader::AtomicOp;
 using Tegra::Shader::AtomicType;
 using Tegra::Shader::Attribute;
@@ -27,29 +30,26 @@ using Tegra::Shader::StoreType;
 
 namespace {
 
-Node GetAtomOperation(AtomicOp op, bool is_signed, Node memory, Node data) {
-    const OperationCode operation_code = [op] {
-        switch (op) {
-        case AtomicOp::Add:
-            return OperationCode::AtomicIAdd;
-        case AtomicOp::Min:
-            return OperationCode::AtomicIMin;
-        case AtomicOp::Max:
-            return OperationCode::AtomicIMax;
-        case AtomicOp::And:
-            return OperationCode::AtomicIAnd;
-        case AtomicOp::Or:
-            return OperationCode::AtomicIOr;
-        case AtomicOp::Xor:
-            return OperationCode::AtomicIXor;
-        case AtomicOp::Exch:
-            return OperationCode::AtomicIExchange;
-        default:
-            UNIMPLEMENTED_MSG("op={}", static_cast<int>(op));
-            return OperationCode::AtomicIAdd;
-        }
-    }();
-    return SignedOperation(operation_code, is_signed, std::move(memory), std::move(data));
+OperationCode GetAtomOperation(AtomicOp op) { // Maps a shader AtomicOp to an IR atomic opcode.
+    switch (op) { // Only AtomicI* codes are returned; callers pair them with is_signed.
+    case AtomicOp::Add:
+        return OperationCode::AtomicIAdd;
+    case AtomicOp::Min:
+        return OperationCode::AtomicIMin;
+    case AtomicOp::Max:
+        return OperationCode::AtomicIMax;
+    case AtomicOp::And:
+        return OperationCode::AtomicIAnd;
+    case AtomicOp::Or:
+        return OperationCode::AtomicIOr;
+    case AtomicOp::Xor:
+        return OperationCode::AtomicIXor;
+    case AtomicOp::Exch:
+        return OperationCode::AtomicIExchange;
+    default: // Any other AtomicOp value is reported as unimplemented.
+        UNIMPLEMENTED_MSG("op={}", static_cast<int>(op));
+        return OperationCode::AtomicIAdd; // Placeholder returned after the report.
+    }
 }
 
 bool IsUnaligned(Tegra::Shader::UniformType uniform_type) {
@@ -90,23 +90,22 @@ u32 GetMemorySize(Tegra::Shader::UniformType uniform_type) {
 
 Node ExtractUnaligned(Node value, Node address, u32 mask, u32 size) {
     Node offset = Operation(OperationCode::UBitwiseAnd, address, Immediate(mask));
-    offset = Operation(OperationCode::ULogicalShiftLeft, std::move(offset), Immediate(3));
-    return Operation(OperationCode::UBitfieldExtract, std::move(value), std::move(offset),
-                     Immediate(size));
+    offset = Operation(OperationCode::ULogicalShiftLeft, move(offset), Immediate(3));
+    return Operation(OperationCode::UBitfieldExtract, move(value), move(offset), Immediate(size));
 }
 
 Node InsertUnaligned(Node dest, Node value, Node address, u32 mask, u32 size) {
-    Node offset = Operation(OperationCode::UBitwiseAnd, std::move(address), Immediate(mask));
-    offset = Operation(OperationCode::ULogicalShiftLeft, std::move(offset), Immediate(3));
-    return Operation(OperationCode::UBitfieldInsert, std::move(dest), std::move(value),
-                     std::move(offset), Immediate(size));
+    Node offset = Operation(OperationCode::UBitwiseAnd, move(address), Immediate(mask));
+    offset = Operation(OperationCode::ULogicalShiftLeft, move(offset), Immediate(3));
+    return Operation(OperationCode::UBitfieldInsert, move(dest), move(value), move(offset),
+                     Immediate(size));
 }
 
 Node Sign16Extend(Node value) {
     Node sign = Operation(OperationCode::UBitwiseAnd, value, Immediate(1U << 15));
-    Node is_sign = Operation(OperationCode::LogicalUEqual, std::move(sign), Immediate(1U << 15));
+    Node is_sign = Operation(OperationCode::LogicalUEqual, move(sign), Immediate(1U << 15));
     Node extend = Operation(OperationCode::Select, is_sign, Immediate(0xFFFF0000), Immediate(0));
-    return Operation(OperationCode::UBitwiseOr, std::move(value), std::move(extend));
+    return Operation(OperationCode::UBitwiseOr, move(value), move(extend));
 }
 
 } // Anonymous namespace
@@ -379,20 +378,36 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
 
             if (IsUnaligned(type)) {
                 const u32 mask = GetUnalignedMask(type);
-                value = InsertUnaligned(gmem, std::move(value), real_address, mask, size);
+                value = InsertUnaligned(gmem, move(value), real_address, mask, size);
             }
 
             bb.push_back(Operation(OperationCode::Assign, gmem, value));
         }
         break;
     }
+    case OpCode::Id::RED: { // Reduction: atomic update of global memory, no register written.
+        UNIMPLEMENTED_IF(instr.red.type != GlobalAtomicType::U32); // Only U32 is decoded.
+        UNIMPLEMENTED_IF(instr.red.operation != AtomicOp::Add);    // Only Add is decoded.
+        const auto [real_address, base_address, descriptor] =
+            TrackGlobalMemory(bb, instr, true, true);
+        if (!real_address || !base_address) {
+            // Tracking failed, skip the reduction.
+            break;
+        }
+        Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
+        Node value = GetRegister(instr.gpr0);
+        bb.push_back(Operation(OperationCode::ReduceIAdd, move(gmem), move(value)));
+        break;
+    }
     case OpCode::Id::ATOM: {
         UNIMPLEMENTED_IF_MSG(instr.atom.operation == AtomicOp::Inc ||
                                  instr.atom.operation == AtomicOp::Dec ||
                                  instr.atom.operation == AtomicOp::SafeAdd,
                              "operation={}", static_cast<int>(instr.atom.operation.Value()));
         UNIMPLEMENTED_IF_MSG(instr.atom.type == GlobalAtomicType::S64 ||
-                                 instr.atom.type == GlobalAtomicType::U64,
+                                 instr.atom.type == GlobalAtomicType::U64 ||
+                                 instr.atom.type == GlobalAtomicType::F16x2_FTZ_RN ||
+                                 instr.atom.type == GlobalAtomicType::F32_FTZ_RN,
                              "type={}", static_cast<int>(instr.atom.type.Value()));
 
         const auto [real_address, base_address, descriptor] =
@@ -403,11 +418,11 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
         }
 
         const bool is_signed =
-            instr.atoms.type == AtomicType::S32 || instr.atoms.type == AtomicType::S64;
+            instr.atom.type == GlobalAtomicType::S32 || instr.atom.type == GlobalAtomicType::S64;
         Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
-        Node value = GetAtomOperation(static_cast<AtomicOp>(instr.atom.operation), is_signed, gmem,
-                                      GetRegister(instr.gpr20));
-        SetRegister(bb, instr.gpr0, std::move(value));
+        SetRegister(bb, instr.gpr0,
+                    SignedOperation(GetAtomOperation(instr.atom.operation), is_signed, gmem,
+                                    GetRegister(instr.gpr20)));
         break;
     }
     case OpCode::Id::ATOMS: {
@@ -421,11 +436,10 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
             instr.atoms.type == AtomicType::S32 || instr.atoms.type == AtomicType::S64;
         const s32 offset = instr.atoms.GetImmediateOffset();
         Node address = GetRegister(instr.gpr8);
-        address = Operation(OperationCode::IAdd, std::move(address), Immediate(offset));
-        Node value =
-            GetAtomOperation(static_cast<AtomicOp>(instr.atoms.operation), is_signed,
-                             GetSharedMemory(std::move(address)), GetRegister(instr.gpr20));
-        SetRegister(bb, instr.gpr0, std::move(value));
+        address = Operation(OperationCode::IAdd, move(address), Immediate(offset));
+        SetRegister(bb, instr.gpr0,
+                    SignedOperation(GetAtomOperation(instr.atoms.operation), is_signed,
+                                    GetSharedMemory(move(address)), GetRegister(instr.gpr20)));
         break;
     }
     case OpCode::Id::AL2P: {
diff --git a/src/video_core/shader/decode/shift.cpp b/src/video_core/shader/decode/shift.cpp
index 3b391d3e6..d4ffa8014 100644
--- a/src/video_core/shader/decode/shift.cpp
+++ b/src/video_core/shader/decode/shift.cpp
@@ -23,7 +23,6 @@ Node IsFull(Node shift) {
 }
 
 Node Shift(OperationCode opcode, Node value, Node shift) {
-    Node is_full = Operation(OperationCode::LogicalIEqual, shift, Immediate(32));
     Node shifted = Operation(opcode, move(value), shift);
     return Operation(OperationCode::Select, IsFull(move(shift)), Immediate(0), move(shifted));
 }
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index 5fcc9da60..3eee961f5 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -178,6 +178,20 @@ enum class OperationCode {
     AtomicIOr,       /// (memory, int) -> int
     AtomicIXor,      /// (memory, int) -> int
 
+    ReduceUAdd, /// (memory, uint) -> void
+    ReduceUMin, /// (memory, uint) -> void
+    ReduceUMax, /// (memory, uint) -> void
+    ReduceUAnd, /// (memory, uint) -> void
+    ReduceUOr,  /// (memory, uint) -> void
+    ReduceUXor, /// (memory, uint) -> void
+
+    ReduceIAdd, /// (memory, int) -> void
+    ReduceIMin, /// (memory, int) -> void
+    ReduceIMax, /// (memory, int) -> void
+    ReduceIAnd, /// (memory, int) -> void
+    ReduceIOr,  /// (memory, int) -> void
+    ReduceIXor, /// (memory, int) -> void
+
     Branch,         /// (uint branch_target) -> void
     BranchIndirect, /// (uint branch_target) -> void
     PushFlowStack,  /// (uint branch_target) -> void
diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp
index 7af0e792c..715f39d0d 100644
--- a/src/video_core/texture_cache/surface_base.cpp
+++ b/src/video_core/texture_cache/surface_base.cpp
@@ -248,8 +248,14 @@ void SurfaceBaseImpl::FlushBuffer(Tegra::MemoryManager& memory_manager,
 
     // Use an extra temporal buffer
     auto& tmp_buffer = staging_cache.GetBuffer(1);
+    // Special case for 3D Texture Segments
+    const bool must_read_current_data =
+        params.block_depth > 0 && params.target == VideoCore::Surface::SurfaceTarget::Texture2D;
     tmp_buffer.resize(guest_memory_size);
     host_ptr = tmp_buffer.data();
+    if (must_read_current_data) {
+        memory_manager.ReadBlockUnsafe(gpu_addr, host_ptr, guest_memory_size);
+    }
 
     if (params.is_tiled) {
         ASSERT_MSG(params.block_width == 0, "Block width is defined as {}", params.block_width);
diff --git a/src/video_core/texture_cache/surface_base.h b/src/video_core/texture_cache/surface_base.h
index a39a8661b..c5ab21f56 100644
--- a/src/video_core/texture_cache/surface_base.h
+++ b/src/video_core/texture_cache/surface_base.h
@@ -72,9 +72,9 @@ public:
         return (cpu_addr < end) && (cpu_addr_end > start);
     }
 
-    bool IsInside(const GPUVAddr other_start, const GPUVAddr other_end) {
+    bool IsInside(const GPUVAddr other_start, const GPUVAddr other_end) const {
         const GPUVAddr gpu_addr_end = gpu_addr + guest_memory_size;
-        return (gpu_addr <= other_start && other_end <= gpu_addr_end);
+        return gpu_addr <= other_start && other_end <= gpu_addr_end;
     }
 
     // Use only when recycling a surface
diff --git a/src/video_core/texture_cache/surface_view.cpp b/src/video_core/texture_cache/surface_view.cpp
index 57a1f5803..6b5f5984b 100644
--- a/src/video_core/texture_cache/surface_view.cpp
+++ b/src/video_core/texture_cache/surface_view.cpp
@@ -20,4 +20,8 @@ bool ViewParams::operator==(const ViewParams& rhs) const {
            std::tie(rhs.base_layer, rhs.num_layers, rhs.base_level, rhs.num_levels, rhs.target);
 }
 
+bool ViewParams::operator!=(const ViewParams& rhs) const { // Negation of operator==.
+    return !operator==(rhs);
+}
+
 } // namespace VideoCommon
diff --git a/src/video_core/texture_cache/surface_view.h b/src/video_core/texture_cache/surface_view.h
index b17fd11a9..90a8bb0ae 100644
--- a/src/video_core/texture_cache/surface_view.h
+++ b/src/video_core/texture_cache/surface_view.h
@@ -21,6 +21,7 @@ struct ViewParams {
     std::size_t Hash() const;
 
     bool operator==(const ViewParams& rhs) const;
+    bool operator!=(const ViewParams& rhs) const;
 
     bool IsLayered() const {
         switch (target) {
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index cfc7fe6e9..3e8663adf 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -509,7 +509,9 @@ private:
         }
         const auto& final_params = new_surface->GetSurfaceParams();
         if (cr_params.type != final_params.type) {
-            BufferCopy(current_surface, new_surface);
+            if (Settings::values.use_accurate_gpu_emulation) {
+                BufferCopy(current_surface, new_surface);
+            }
         } else {
             std::vector<CopyParams> bricks = current_surface->BreakDown(final_params);
             for (auto& brick : bricks) {
@@ -612,10 +614,10 @@ private:
      * textures within the GPU if possible. Falls back to LLE when it isn't possible to use any of
      * the HLE methods.
      *
-     * @param overlaps          The overlapping surfaces registered in the cache.
-     * @param params            The parameters on the new surface.
-     * @param gpu_addr          The starting address of the new surface.
-     * @param cache_addr        The starting address of the new surface on physical memory.
+     * @param overlaps  The overlapping surfaces registered in the cache.
+     * @param params    The parameters on the new surface.
+     * @param gpu_addr  The starting address of the new surface.
+     * @param cpu_addr  The starting address of the new surface on physical memory.
      */
     std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(std::vector<TSurface>& overlaps,
                                                                const SurfaceParams& params,