46 files changed, 1354 insertions, 460 deletions
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index 3175579cc..0094fd715 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -22,7 +22,7 @@ void DmaPusher::DispatchCalls() {
     MICROPROFILE_SCOPE(DispatchCalls);
 
     // On entering GPU code, assume all memory may be touched by the ARM core.
-    gpu.Maxwell3D().dirty_flags.OnMemoryWrite();
+    gpu.Maxwell3D().dirty.OnMemoryWrite();
 
     dma_pushbuffer_subindex = 0;
 
@@ -31,6 +31,7 @@ void DmaPusher::DispatchCalls() {
             break;
         }
     }
+    gpu.FlushCommands();
 }
 
 bool DmaPusher::Step() {
diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp
index 7404a8163..08586d33c 100644
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -37,7 +37,7 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) {
         const bool is_last_call = method_call.IsLastCall();
         upload_state.ProcessData(method_call.argument, is_last_call);
         if (is_last_call) {
-            system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
+            system.GPU().Maxwell3D().dirty.OnMemoryWrite();
         }
         break;
     }
@@ -50,13 +50,14 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) {
 }
 
 void KeplerCompute::ProcessLaunch() {
-
     const GPUVAddr launch_desc_loc = regs.launch_desc_loc.Address();
     memory_manager.ReadBlockUnsafe(launch_desc_loc, &launch_description,
                                    LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32));
 
-    const GPUVAddr code_loc = regs.code_loc.Address() + launch_description.program_start;
-    LOG_WARNING(HW_GPU, "Compute Kernel Execute at Address 0x{:016x}, STUBBED", code_loc);
+    const GPUVAddr code_addr = regs.code_loc.Address() + launch_description.program_start;
+    LOG_TRACE(HW_GPU, "Compute invocation launched at address 0x{:016x}", code_addr);
+    // Forward the kernel's code address (code base + program_start) to the rasterizer.
+    rasterizer.DispatchCompute(code_addr);
 }
 
 } // namespace Tegra::Engines
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index 0561f676c..44279de00 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -34,7 +34,7 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) {
         const bool is_last_call = method_call.IsLastCall();
         upload_state.ProcessData(method_call.argument, is_last_call);
         if (is_last_call) {
-            system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
+            system.GPU().Maxwell3D().dirty.OnMemoryWrite();
         }
         break;
     }
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 8755b8af4..125c53360 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -22,6 +22,7 @@ Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& raste
                      MemoryManager& memory_manager)
     : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager},
       macro_interpreter{*this}, upload_state{memory_manager, regs.upload} {
+    InitDirtySettings();
     InitializeRegisterDefaults();
 }
 
@@ -69,6 +70,10 @@ void Maxwell3D::InitializeRegisterDefaults() {
     regs.stencil_back_func_mask = 0xFFFFFFFF;
     regs.stencil_back_mask = 0xFFFFFFFF;
 
+    regs.depth_test_func = Regs::ComparisonOp::Always;
+    regs.cull.front_face = Regs::Cull::FrontFace::CounterClockWise;
+    regs.cull.cull_face = Regs::Cull::CullFace::Back;
+
     // TODO(Rodrigo): Most games do not set a point size. I think this is a case of a
     // register carrying a default value. Assume it's OpenGL's default (1).
     regs.point_size = 1.0f;
@@ -86,6 +91,159 @@ void Maxwell3D::InitializeRegisterDefaults() {
     regs.rt_separate_frag_data = 1;
 }
 
+#define DIRTY_REGS_POS(field_name) (offsetof(Maxwell3D::DirtyRegs, field_name))
+
+/// Fills dirty_pointers so that each tracked register index maps to its slot in dirty.regs.
+void Maxwell3D::InitDirtySettings() {
+    const auto set_block = [this](const u32 start, const u32 range, const u8 position) {
+        const auto first = dirty_pointers.begin() + start;
+        std::fill(first, first + range, position);
+    };
+    dirty.regs.fill(true);
+
+    // Init Render Targets
+    constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32);
+    constexpr u32 rt_start_reg = MAXWELL3D_REG_INDEX(rt);
+    constexpr u32 rt_end_reg = rt_start_reg + registers_per_rt * Regs::NumRenderTargets;
+    u32 rt_dirty_reg = DIRTY_REGS_POS(render_target);
+    for (u32 rt_reg = rt_start_reg; rt_reg < rt_end_reg; rt_reg += registers_per_rt) {
+        set_block(rt_reg, registers_per_rt, rt_dirty_reg);
+        rt_dirty_reg++;
+    }
+    constexpr u32 depth_buffer_flag = DIRTY_REGS_POS(depth_buffer);
+    dirty_pointers[MAXWELL3D_REG_INDEX(zeta_enable)] = depth_buffer_flag;
+    dirty_pointers[MAXWELL3D_REG_INDEX(zeta_width)] = depth_buffer_flag;
+    dirty_pointers[MAXWELL3D_REG_INDEX(zeta_height)] = depth_buffer_flag;
+    constexpr u32 registers_in_zeta = sizeof(regs.zeta) / sizeof(u32);
+    constexpr u32 zeta_reg = MAXWELL3D_REG_INDEX(zeta);
+    set_block(zeta_reg, registers_in_zeta, depth_buffer_flag);
+
+    // Init Vertex Arrays
+    constexpr u32 vertex_array_start = MAXWELL3D_REG_INDEX(vertex_array);
+    constexpr u32 vertex_array_size = sizeof(regs.vertex_array[0]) / sizeof(u32);
+    constexpr u32 vertex_array_end = vertex_array_start + vertex_array_size * Regs::NumVertexArrays;
+    u32 va_reg = DIRTY_REGS_POS(vertex_array);
+    u32 vi_reg = DIRTY_REGS_POS(vertex_instance);
+    for (u32 vertex_reg = vertex_array_start; vertex_reg < vertex_array_end;
+         vertex_reg += vertex_array_size) {
+        set_block(vertex_reg, 3, va_reg);
+        // The divisor concerns vertex array instances
+        dirty_pointers[vertex_reg + 3] = vi_reg;
+        va_reg++;
+        vi_reg++;
+    }
+    constexpr u32 vertex_limit_start = MAXWELL3D_REG_INDEX(vertex_array_limit);
+    constexpr u32 vertex_limit_size = sizeof(regs.vertex_array_limit[0]) / sizeof(u32);
+    constexpr u32 vertex_limit_end = vertex_limit_start + vertex_limit_size * Regs::NumVertexArrays;
+    va_reg = DIRTY_REGS_POS(vertex_array);
+    for (u32 vertex_reg = vertex_limit_start; vertex_reg < vertex_limit_end;
+         vertex_reg += vertex_limit_size) {
+        set_block(vertex_reg, vertex_limit_size, va_reg);
+        va_reg++;
+    }
+    constexpr u32 vertex_instance_start = MAXWELL3D_REG_INDEX(instanced_arrays);
+    constexpr u32 vertex_instance_size =
+        sizeof(regs.instanced_arrays.is_instanced[0]) / sizeof(u32);
+    constexpr u32 vertex_instance_end =
+        vertex_instance_start + vertex_instance_size * Regs::NumVertexArrays;
+    vi_reg = DIRTY_REGS_POS(vertex_instance);
+    for (u32 vertex_reg = vertex_instance_start; vertex_reg < vertex_instance_end;
+         vertex_reg += vertex_instance_size) {
+        set_block(vertex_reg, vertex_instance_size, vi_reg);
+        vi_reg++;
+    }
+    set_block(MAXWELL3D_REG_INDEX(vertex_attrib_format), regs.vertex_attrib_format.size(),
+              DIRTY_REGS_POS(vertex_attrib_format));
+
+    // Init Shaders
+    constexpr u32 shader_registers_count =
+        sizeof(regs.shader_config[0]) * Regs::MaxShaderProgram / sizeof(u32);
+    set_block(MAXWELL3D_REG_INDEX(shader_config[0]), shader_registers_count,
+              DIRTY_REGS_POS(shaders));
+
+    // State
+
+    // Viewport
+    constexpr u32 viewport_dirty_reg = DIRTY_REGS_POS(viewport);
+    constexpr u32 viewport_start = MAXWELL3D_REG_INDEX(viewports);
+    constexpr u32 viewport_size = sizeof(regs.viewports) / sizeof(u32);
+    set_block(viewport_start, viewport_size, viewport_dirty_reg);
+    constexpr u32 view_volume_start = MAXWELL3D_REG_INDEX(view_volume_clip_control);
+    constexpr u32 view_volume_size = sizeof(regs.view_volume_clip_control) / sizeof(u32);
+    set_block(view_volume_start, view_volume_size, viewport_dirty_reg);
+
+    // Viewport transformation
+    constexpr u32 viewport_trans_start = MAXWELL3D_REG_INDEX(viewport_transform);
+    constexpr u32 viewport_trans_size = sizeof(regs.viewport_transform) / sizeof(u32);
+    set_block(viewport_trans_start, viewport_trans_size, DIRTY_REGS_POS(viewport_transform));
+
+    // Cullmode
+    constexpr u32 cull_mode_start = MAXWELL3D_REG_INDEX(cull);
+    constexpr u32 cull_mode_size = sizeof(regs.cull) / sizeof(u32);
+    set_block(cull_mode_start, cull_mode_size, DIRTY_REGS_POS(cull_mode));
+
+    // Screen y control
+    dirty_pointers[MAXWELL3D_REG_INDEX(screen_y_control)] = DIRTY_REGS_POS(screen_y_control);
+
+    // Primitive Restart
+    constexpr u32 primitive_restart_start = MAXWELL3D_REG_INDEX(primitive_restart);
+    constexpr u32 primitive_restart_size = sizeof(regs.primitive_restart) / sizeof(u32);
+    set_block(primitive_restart_start, primitive_restart_size, DIRTY_REGS_POS(primitive_restart));
+
+    // Depth Test
+    constexpr u32 depth_test_dirty_reg = DIRTY_REGS_POS(depth_test);
+    dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_enable)] = depth_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(depth_write_enabled)] = depth_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_func)] = depth_test_dirty_reg;
+
+    // Stencil Test
+    constexpr u32 stencil_test_dirty_reg = DIRTY_REGS_POS(stencil_test);
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_enable)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_func)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_ref)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_mask)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_fail)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zfail)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zpass)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_mask)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_two_side_enable)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_func)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_ref)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_mask)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_fail)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zfail)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zpass)] = stencil_test_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_mask)] = stencil_test_dirty_reg;
+
+    // Color Mask
+    constexpr u32 color_mask_dirty_reg = DIRTY_REGS_POS(color_mask);
+    dirty_pointers[MAXWELL3D_REG_INDEX(color_mask_common)] = color_mask_dirty_reg;
+    set_block(MAXWELL3D_REG_INDEX(color_mask), sizeof(regs.color_mask) / sizeof(u32),
+              color_mask_dirty_reg);
+    // Blend State
+    constexpr u32 blend_state_dirty_reg = DIRTY_REGS_POS(blend_state);
+    set_block(MAXWELL3D_REG_INDEX(blend_color), sizeof(regs.blend_color) / sizeof(u32),
+              blend_state_dirty_reg);
+    dirty_pointers[MAXWELL3D_REG_INDEX(independent_blend_enable)] = blend_state_dirty_reg;
+    set_block(MAXWELL3D_REG_INDEX(blend), sizeof(regs.blend) / sizeof(u32), blend_state_dirty_reg);
+    set_block(MAXWELL3D_REG_INDEX(independent_blend), sizeof(regs.independent_blend) / sizeof(u32),
+              blend_state_dirty_reg);
+
+    // Scissor State
+    constexpr u32 scissor_test_dirty_reg = DIRTY_REGS_POS(scissor_test);
+    set_block(MAXWELL3D_REG_INDEX(scissor_test), sizeof(regs.scissor_test) / sizeof(u32),
+              scissor_test_dirty_reg);
+
+    // Polygon Offset
+    constexpr u32 polygon_offset_dirty_reg = DIRTY_REGS_POS(polygon_offset);
+    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_fill_enable)] = polygon_offset_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_line_enable)] = polygon_offset_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_point_enable)] = polygon_offset_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_units)] = polygon_offset_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_factor)] = polygon_offset_dirty_reg;
+    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_clamp)] = polygon_offset_dirty_reg;
+}
+
 void Maxwell3D::CallMacroMethod(u32 method, std::vector<u32> parameters) {
     // Reset the current macro.
     executing_macro = 0;
@@ -108,6 +266,14 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
 
     const u32 method = method_call.method;
 
+    if (method == cb_data_state.current) {
+        regs.reg_array[method] = method_call.argument;
+        ProcessCBData(method_call.argument);
+        return;
+    } else if (cb_data_state.current != null_cb_data) {
+        FinishCBData();
+    }
+
     // It is an error to write to a register other than the current macro's ARG register before it
     // has finished execution.
     if (executing_macro != 0) {
@@ -143,49 +309,19 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
 
     if (regs.reg_array[method] != method_call.argument) {
         regs.reg_array[method] = method_call.argument;
-        // Color buffers
-        constexpr u32 first_rt_reg = MAXWELL3D_REG_INDEX(rt);
-        constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32);
-        if (method >= first_rt_reg &&
-            method < first_rt_reg + registers_per_rt * Regs::NumRenderTargets) {
-            const std::size_t rt_index = (method - first_rt_reg) / registers_per_rt;
-            dirty_flags.color_buffer.set(rt_index);
-        }
-
-        // Zeta buffer
-        constexpr u32 registers_in_zeta = sizeof(regs.zeta) / sizeof(u32);
-        if (method == MAXWELL3D_REG_INDEX(zeta_enable) ||
-            method == MAXWELL3D_REG_INDEX(zeta_width) ||
-            method == MAXWELL3D_REG_INDEX(zeta_height) ||
-            (method >= MAXWELL3D_REG_INDEX(zeta) &&
-             method < MAXWELL3D_REG_INDEX(zeta) + registers_in_zeta)) {
-            dirty_flags.zeta_buffer = true;
-        }
-
-        // Shader
-        constexpr u32 shader_registers_count =
-            sizeof(regs.shader_config[0]) * Regs::MaxShaderProgram / sizeof(u32);
-        if (method >= MAXWELL3D_REG_INDEX(shader_config[0]) &&
-            method < MAXWELL3D_REG_INDEX(shader_config[0]) + shader_registers_count) {
-            dirty_flags.shaders = true;
-        }
-
-        // Vertex format
-        if (method >= MAXWELL3D_REG_INDEX(vertex_attrib_format) &&
-            method < MAXWELL3D_REG_INDEX(vertex_attrib_format) + regs.vertex_attrib_format.size()) {
-            dirty_flags.vertex_attrib_format = true;
-        }
-
-        // Vertex buffer
-        if (method >= MAXWELL3D_REG_INDEX(vertex_array) &&
-            method < MAXWELL3D_REG_INDEX(vertex_array) + 4 * Regs::NumVertexArrays) {
-            dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array)) >> 2);
-        } else if (method >= MAXWELL3D_REG_INDEX(vertex_array_limit) &&
-                   method < MAXWELL3D_REG_INDEX(vertex_array_limit) + 2 * Regs::NumVertexArrays) {
-            dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array_limit)) >> 1);
-        } else if (method >= MAXWELL3D_REG_INDEX(instanced_arrays) &&
-                   method < MAXWELL3D_REG_INDEX(instanced_arrays) + Regs::NumVertexArrays) {
-            dirty_flags.vertex_array.set(method - MAXWELL3D_REG_INDEX(instanced_arrays));
+        const std::size_t dirty_reg = dirty_pointers[method];
+        if (dirty_reg) {
+            dirty.regs[dirty_reg] = true;
+            if (dirty_reg >= DIRTY_REGS_POS(vertex_array) &&
+                dirty_reg < DIRTY_REGS_POS(vertex_array_buffers)) {
+                dirty.vertex_array_buffers = true;
+            } else if (dirty_reg >= DIRTY_REGS_POS(vertex_instance) &&
+                       dirty_reg < DIRTY_REGS_POS(vertex_instances)) {
+                dirty.vertex_instances = true;
+            } else if (dirty_reg >= DIRTY_REGS_POS(render_target) &&
+                       dirty_reg < DIRTY_REGS_POS(render_settings)) {
+                dirty.render_settings = true;
+            }
         }
     }
 
@@ -214,7 +350,7 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
     case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]):
     case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]):
     case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): {
-        ProcessCBData(method_call.argument);
+        StartCBData(method);
         break;
     }
     case MAXWELL3D_REG_INDEX(cb_bind[0].raw_config): {
@@ -249,6 +385,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
         ProcessQueryGet();
         break;
     }
+    case MAXWELL3D_REG_INDEX(condition.mode): {
+        ProcessQueryCondition();
+        break;
+    }
     case MAXWELL3D_REG_INDEX(sync_info): {
         ProcessSyncPoint();
         break;
@@ -261,7 +401,7 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
         const bool is_last_call = method_call.IsLastCall();
         upload_state.ProcessData(method_call.argument, is_last_call);
         if (is_last_call) {
-            dirty_flags.OnMemoryWrite();
+            dirty.OnMemoryWrite();
         }
         break;
     }
@@ -302,6 +442,7 @@ void Maxwell3D::ProcessQueryGet() {
         result = regs.query.query_sequence;
         break;
     default:
+        result = 1;
         UNIMPLEMENTED_MSG("Unimplemented query select type {}",
                           static_cast<u32>(regs.query.query_get.select.Value()));
     }
@@ -333,7 +474,6 @@ void Maxwell3D::ProcessQueryGet() {
             query_result.timestamp = system.CoreTiming().GetTicks();
             memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result));
         }
-        dirty_flags.OnMemoryWrite();
         break;
     }
     default:
@@ -342,12 +482,52 @@ void Maxwell3D::ProcessQueryGet() {
     }
 }
 
+/// Handles a write to condition.mode: recomputes execute_on (exposed via ShouldExecute()).
+/// The comparison modes read a Regs::QueryCompare record from regs.condition.Address().
+void Maxwell3D::ProcessQueryCondition() {
+    const GPUVAddr condition_address{regs.condition.Address()};
+    switch (regs.condition.mode) {
+    case Regs::ConditionMode::Always:
+        execute_on = true;
+        break;
+    case Regs::ConditionMode::Never:
+        execute_on = false;
+        break;
+    case Regs::ConditionMode::ResNonZero: {
+        Regs::QueryCompare cmp;
+        memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
+        execute_on = cmp.initial_sequence != 0U && cmp.initial_mode != 0U;
+        break;
+    }
+    case Regs::ConditionMode::Equal: {
+        Regs::QueryCompare cmp;
+        memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
+        execute_on =
+            cmp.initial_sequence == cmp.current_sequence && cmp.initial_mode == cmp.current_mode;
+        break;
+    }
+    case Regs::ConditionMode::NotEqual: {
+        Regs::QueryCompare cmp;
+        memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
+        execute_on =
+            cmp.initial_sequence != cmp.current_sequence || cmp.initial_mode != cmp.current_mode;
+        break;
+    }
+    default:
+        // Unknown modes are reported and then treated like Always.
+        UNIMPLEMENTED_MSG("Unimplemented Condition Mode!");
+        execute_on = true;
+        break;
+    }
+}
+
 void Maxwell3D::ProcessSyncPoint() {
     const u32 sync_point = regs.sync_info.sync_point.Value();
     const u32 increment = regs.sync_info.increment.Value();
     const u32 cache_flush = regs.sync_info.unknown.Value();
-    LOG_DEBUG(HW_GPU, "Syncpoint set {}, increment: {}, unk: {}", sync_point, increment,
-              cache_flush);
+    if (increment) {
+        system.GPU().IncrementSyncPoint(sync_point);
+    }
 }
 
 void Maxwell3D::DrawArrays() {
@@ -405,23 +585,39 @@ void Maxwell3D::ProcessCBBind(Regs::ShaderStage stage) {
 }
 
 void Maxwell3D::ProcessCBData(u32 value) {
+    const u32 id = cb_data_state.id;
+    cb_data_state.buffer[id][cb_data_state.counter] = value; // staged until FinishCBData()
+    // Advance the const buffer position by the 4 bytes that were just staged.
+    regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4;
+    cb_data_state.counter++; // NOTE(review): no bound check against the staging array here
+}
+
+void Maxwell3D::StartCBData(u32 method) { // Begins staging writes to one cb_data[i] register
+    constexpr u32 first_cb_data = MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]);
+    cb_data_state.start_pos = regs.const_buffer.cb_pos; // FinishCBData() writes from here
+    cb_data_state.id = method - first_cb_data;          // index i of cb_data[i]
+    cb_data_state.current = method; // CallMethod routes repeats of this method to ProcessCBData
+    cb_data_state.counter = 0;
+    ProcessCBData(regs.const_buffer.cb_data[cb_data_state.id]); // stage the register's value
+}
+
+void Maxwell3D::FinishCBData() { // Flushes the words staged since StartCBData() in one write
     // Write the input value to the current const buffer at the current position.
     const GPUVAddr buffer_address = regs.const_buffer.BufferAddress();
     ASSERT(buffer_address != 0);
 
     // Don't allow writing past the end of the buffer.
-    ASSERT(regs.const_buffer.cb_pos + sizeof(u32) <= regs.const_buffer.cb_size);
-
-    const GPUVAddr address{buffer_address + regs.const_buffer.cb_pos};
+    ASSERT(regs.const_buffer.cb_pos <= regs.const_buffer.cb_size); // cb_pos is already advanced
 
-    u8* ptr{memory_manager.GetPointer(address)};
-    rasterizer.InvalidateRegion(ToCacheAddr(ptr), sizeof(u32));
-    memory_manager.Write<u32>(address, value);
+    const GPUVAddr address{buffer_address + cb_data_state.start_pos}; // where the run began
+    const std::size_t size = regs.const_buffer.cb_pos - cb_data_state.start_pos; // staged bytes
 
-    dirty_flags.OnMemoryWrite();
+    const u32 id = cb_data_state.id;
+    memory_manager.WriteBlock(address, cb_data_state.buffer[id].data(), size);
+    dirty.OnMemoryWrite();
 
-    // Increment the current buffer position.
-    regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4;
+    cb_data_state.id = null_cb_data;      // no staging buffer selected any more
+    cb_data_state.current = null_cb_data; // CallMethod stops treating writes as part of a run
 }
 
 Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 8d15c8a48..1ee982b76 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -90,6 +90,20 @@ public:
 
         enum class QuerySelect : u32 {
             Zero = 0,
+            TimeElapsed = 2,
+            TransformFeedbackPrimitivesGenerated = 11,
+            PrimitivesGenerated = 18,
+            SamplesPassed = 21,
+            TransformFeedbackUnknown = 26,
+        };
+
+        struct QueryCompare { // Read by ProcessQueryCondition from regs.condition.Address()
+            u32 initial_sequence;
+            u32 initial_mode;
+            u32 unknown1; // not inspected by ProcessQueryCondition
+            u32 unknown2; // not inspected by ProcessQueryCondition
+            u32 current_sequence;
+            u32 current_mode;
         };
 
         enum class QuerySyncCondition : u32 {
@@ -97,6 +111,14 @@ public:
             GreaterThan = 1,
         };
 
+        enum class ConditionMode : u32 {
+            Never = 0,      // ProcessQueryCondition clears execute_on
+            Always = 1,     // ProcessQueryCondition sets execute_on
+            ResNonZero = 2, // execute when initial_sequence and initial_mode are both non-zero
+            Equal = 3,      // execute when the initial and current sequence/mode pairs match
+            NotEqual = 4,   // execute when either the sequence or the mode differs
+        };
+
         enum class ShaderProgram : u32 {
             VertexA = 0,
             VertexB = 1,
@@ -815,7 +837,18 @@ public:
                     BitField<4, 1, u32> alpha_to_one;
                 } multisample_control;
 
-                INSERT_PADDING_WORDS(0x7);
+                INSERT_PADDING_WORDS(0x4);
+
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+                    ConditionMode mode; // CallMethod runs ProcessQueryCondition on writes here
+                    /// GPU address built from address_high (bits 32 and up) and address_low.
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } condition;
 
                 struct {
                     u32 tsc_address_high;
@@ -1124,23 +1157,77 @@ public:
 
     State state{};
 
-    struct DirtyFlags {
-        std::bitset<8> color_buffer{0xFF};
-        std::bitset<32> vertex_array{0xFFFFFFFF};
+    struct DirtyRegs { // Dirty flags; each flag's byte offset is its slot in regs
+        static constexpr std::size_t NUM_REGS = 256; // dirty_pointers stores slots as u8
+        union {
+            struct {
+                bool null_dirty; // slot 0: CallMethod skips registers mapped to 0
+
+                // Vertex Attributes
+                bool vertex_attrib_format;
+
+                // Vertex Arrays
+                std::array<bool, 32> vertex_array;
+
+                bool vertex_array_buffers; // set by CallMethod whenever a vertex_array slot is set
+
+                // Vertex Instances
+                std::array<bool, 32> vertex_instance;
 
-        bool vertex_attrib_format = true;
-        bool zeta_buffer = true;
-        bool shaders = true;
+                bool vertex_instances; // set by CallMethod whenever a vertex_instance slot is set
+
+                // Render Targets
+                std::array<bool, 8> render_target;
+                bool depth_buffer;
+
+                bool render_settings; // set by CallMethod with any render_target/depth_buffer slot
+
+                // Shaders
+                bool shaders;
+
+                // Rasterizer State
+                bool viewport;
+                bool clip_coefficient;
+                bool cull_mode;
+                bool primitive_restart;
+                bool depth_test;
+                bool stencil_test;
+                bool blend_state;
+                bool scissor_test;
+                bool transform_feedback;
+                bool color_mask;
+                bool polygon_offset;
+
+                // Complementary
+                bool viewport_transform;
+                bool screen_y_control;
+
+                bool memory_general; // set by OnMemoryWrite()
+            };
+            std::array<bool, NUM_REGS> regs; // flat view of the flags, indexed by slot
+        };
+
+        void ResetVertexArrays() { // marks every vertex array and the aggregate flag dirty
+            vertex_array.fill(true);
+            vertex_array_buffers = true;
+        }
+
+        void ResetRenderTargets() { // marks depth buffer, all render targets and aggregate dirty
+            depth_buffer = true;
+            render_target.fill(true);
+            render_settings = true;
+        }
 
         void OnMemoryWrite() {
-            zeta_buffer = true;
             shaders = true;
-            color_buffer.set();
-            vertex_array.set();
+            memory_general = true;
+            ResetRenderTargets();
+            ResetVertexArrays();
         }
-    };
 
-    DirtyFlags dirty_flags;
+    } dirty{};
+
+    std::array<u8, Regs::NUM_REGS> dirty_pointers{}; // register index -> dirty.regs slot, 0 = none
 
     /// Reads a register value located at the input method address
     u32 GetRegisterValue(u32 method) const;
@@ -1169,6 +1256,10 @@ public:
         return macro_memory;
     }
 
+    bool ShouldExecute() const {
+        return execute_on; // starts true; rewritten by ProcessQueryCondition()
+    }
+
 private:
     void InitializeRegisterDefaults();
 
@@ -1192,14 +1283,27 @@ private:
     /// Interpreter for the macro codes uploaded to the GPU.
     MacroInterpreter macro_interpreter;
 
+    static constexpr u32 null_cb_data = 0xFFFFFFFF; // sentinel: no cb_data run in progress
+    struct {
+        std::array<std::array<u32, 0x4000>, 16> buffer; // staging words, one array per cb_data[i]
+        u32 current{null_cb_data}; // method index of the cb_data register being staged
+        u32 id{null_cb_data};      // index into buffer (method - index of cb_data[0])
+        u32 start_pos{};           // cb_pos captured by StartCBData()
+        u32 counter{};             // words staged so far in buffer[id]
+    } cb_data_state;
+
     Upload::State upload_state;
 
+    bool execute_on{true};
+
     /// Retrieves information about a specific TIC entry from the TIC buffer.
     Texture::TICEntry GetTICEntry(u32 tic_index) const;
 
     /// Retrieves information about a specific TSC entry from the TSC buffer.
     Texture::TSCEntry GetTSCEntry(u32 tsc_index) const;
 
+    void InitDirtySettings();
+
     /**
      * Call a macro on this engine.
      * @param method Method to call
@@ -1219,11 +1323,16 @@ private:
     /// Handles a write to the QUERY_GET register.
     void ProcessQueryGet();
 
+    // Handles Conditional Rendering
+    void ProcessQueryCondition();
+
     /// Handles writes to syncing register.
     void ProcessSyncPoint();
 
     /// Handles a write to the CB_DATA[i] register.
+    void StartCBData(u32 method);
     void ProcessCBData(u32 value);
+    void FinishCBData();
 
     /// Handles a write to the CB_BIND register.
     void ProcessCBBind(Regs::ShaderStage stage);
@@ -1290,6 +1399,7 @@ ASSERT_REG_POSITION(clip_distance_enabled, 0x544);
 ASSERT_REG_POSITION(point_size, 0x546);
 ASSERT_REG_POSITION(zeta_enable, 0x54E);
 ASSERT_REG_POSITION(multisample_control, 0x54F);
+ASSERT_REG_POSITION(condition, 0x554);
 ASSERT_REG_POSITION(tsc, 0x557);
 ASSERT_REG_POSITION(polygon_offset_factor, 0x55b);
 ASSERT_REG_POSITION(tic, 0x55D);
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index afb9578d0..a28c04473 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -38,7 +38,7 @@ void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) {
 }
 
 void MaxwellDMA::HandleCopy() {
-    LOG_WARNING(HW_GPU, "Requested a DMA copy");
+    LOG_TRACE(HW_GPU, "Requested a DMA copy");
 
     const GPUVAddr source = regs.src_address.Address();
     const GPUVAddr dest = regs.dst_address.Address();
@@ -58,7 +58,7 @@ void MaxwellDMA::HandleCopy() {
     }
 
     // All copies here update the main memory, so mark all rasterizer states as invalid.
-    system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
+    system.GPU().Maxwell3D().dirty.OnMemoryWrite();
 
     if (regs.exec.is_dst_linear && regs.exec.is_src_linear) {
         // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index 79d469b88..aaa1acea9 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -560,6 +560,11 @@ union Instruction {
     BitField<48, 16, u64> opcode;
 
     union {
+        BitField<8, 5, ConditionCode> cc;
+        BitField<13, 1, u64> trigger;
+    } nop;
+
+    union {
         BitField<8, 8, Register> gpr;
         BitField<20, 24, s64> offset;
     } gmem;
@@ -931,8 +936,6 @@ union Instruction {
     } csetp;
 
     union {
-        BitField<35, 4, PredCondition> cond;
-        BitField<49, 1, u64> h_and;
         BitField<6, 1, u64> ftz;
         BitField<45, 2, PredOperation> op;
         BitField<3, 3, u64> pred3;
@@ -940,9 +943,21 @@ union Instruction {
         BitField<43, 1, u64> negate_a;
         BitField<44, 1, u64> abs_a;
         BitField<47, 2, HalfType> type_a;
-        BitField<31, 1, u64> negate_b;
-        BitField<30, 1, u64> abs_b;
-        BitField<28, 2, HalfType> type_b;
+        union {
+            BitField<35, 4, PredCondition> cond;
+            BitField<49, 1, u64> h_and;
+            BitField<31, 1, u64> negate_b;
+            BitField<30, 1, u64> abs_b;
+            BitField<28, 2, HalfType> type_b;
+        } reg;
+        union {
+            BitField<56, 1, u64> negate_b;
+            BitField<54, 1, u64> abs_b;
+        } cbuf;
+        union {
+            BitField<49, 4, PredCondition> cond;
+            BitField<53, 1, u64> h_and;
+        } cbuf_and_imm;
         BitField<42, 1, u64> neg_pred;
         BitField<39, 3, u64> pred39;
     } hsetp2;
@@ -1008,8 +1023,6 @@ union Instruction {
         } f2i;
 
         union {
-            BitField<8, 2, Register::Size> src_size;
-            BitField<10, 2, Register::Size> dst_size;
             BitField<39, 4, u64> rounding;
             // H0, H1 extract for F16 missing
             BitField<41, 1, u64> selector; // Guessed as some games set it, TODO: reverse this value
@@ -1506,6 +1519,7 @@ public:
         TMML,   // Texture Mip Map Level
         SUST,   // Surface Store
         EXIT,
+        NOP,
         IPA,
         OUT_R, // Emit vertex/primitive
         ISBERD,
@@ -1548,7 +1562,9 @@ public:
         HFMA2_RC,
         HFMA2_RR,
         HFMA2_IMM_R,
+        HSETP2_C,
         HSETP2_R,
+        HSETP2_IMM,
         HSET2_R,
         POPC_C,
         POPC_R,
@@ -1783,6 +1799,7 @@ private:
             INST("110111110110----", Id::TMML_B, Type::Texture, "TMML_B"),
             INST("1101111101011---", Id::TMML, Type::Texture, "TMML"),
             INST("11101011001-----", Id::SUST, Type::Image, "SUST"),
+            INST("0101000010110---", Id::NOP, Type::Trivial, "NOP"),
             INST("11100000--------", Id::IPA, Type::Trivial, "IPA"),
             INST("1111101111100---", Id::OUT_R, Type::Trivial, "OUT_R"),
             INST("1110111111010---", Id::ISBERD, Type::Trivial, "ISBERD"),
@@ -1831,7 +1848,9 @@ private:
             INST("01100---1-------", Id::HFMA2_RC, Type::Hfma2, "HFMA2_RC"),
             INST("0101110100000---", Id::HFMA2_RR, Type::Hfma2, "HFMA2_RR"),
             INST("01110---0-------", Id::HFMA2_IMM_R, Type::Hfma2, "HFMA2_R_IMM"),
-            INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP_R"),
+            INST("0111111-1-------", Id::HSETP2_C, Type::HalfSetPredicate, "HSETP2_C"),
+            INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP2_R"),
+            INST("0111111-0-------", Id::HSETP2_IMM, Type::HalfSetPredicate, "HSETP2_IMM"),
             INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"),
             INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"),
             INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"),
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 1b4975498..c409af194 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -29,7 +29,8 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) {
     UNREACHABLE();
 }
 
-GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{renderer} {
+GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async)
+    : system{system}, renderer{renderer}, is_async{is_async} {
     auto& rasterizer{renderer.Rasterizer()};
     memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer);
     dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
@@ -50,6 +51,14 @@ const Engines::Maxwell3D& GPU::Maxwell3D() const {
     return *maxwell_3d;
 }
 
+Engines::KeplerCompute& GPU::KeplerCompute() {
+    return *kepler_compute;
+}
+
+const Engines::KeplerCompute& GPU::KeplerCompute() const {
+    return *kepler_compute;
+}
+
 MemoryManager& GPU::MemoryManager() {
     return *memory_manager;
 }
@@ -66,6 +75,55 @@ const DmaPusher& GPU::DmaPusher() const {
     return *dma_pusher;
 }
 
+void GPU::IncrementSyncPoint(const u32 syncpoint_id) {
+    // The counter itself is atomic; only the pending-threshold list needs sync_mutex.
+    syncpoints[syncpoint_id]++;
+    std::lock_guard lock{sync_mutex};
+    auto& pending = syncpt_interrupts[syncpoint_id];
+    // Fire and drop every registered threshold that the counter has now reached.
+    const u32 reached = syncpoints[syncpoint_id].load();
+    for (auto it = pending.begin(); it != pending.end();) {
+        if (reached < *it) {
+            ++it;
+            continue;
+        }
+        TriggerCpuInterrupt(syncpoint_id, *it);
+        it = pending.erase(it);
+    }
+}
+
+u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const {
+    return syncpoints[syncpoint_id].load();
+}
+
+void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
+    // NOTE(review): sync_mutex is not taken here, unlike in IncrementSyncPoint and
+    // CancelSyncptInterrupt; presumably callers hold it through LockSync() - confirm.
+    auto& pending = syncpt_interrupts[syncpoint_id];
+    if (std::find(pending.begin(), pending.end(), value) != pending.end()) {
+        return; // Each threshold is stored at most once per syncpoint.
+    }
+    pending.emplace_back(value);
+}
+
+bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
+    std::lock_guard lock{sync_mutex};
+    auto& pending = syncpt_interrupts[syncpoint_id];
+    // Look for a registered threshold equal to `value`.
+    const auto match = std::find(pending.begin(), pending.end(), value);
+    if (match != pending.end()) {
+        // Drop the first matching entry and report that something was cancelled.
+        pending.erase(match);
+        return true;
+    }
+    // Nothing was registered for this value.
+    return false;
+}
+
+void GPU::FlushCommands() {
+    renderer.Rasterizer().FlushCommands();
+}
+
 u32 RenderTargetBytesPerPixel(RenderTargetFormat format) {
     ASSERT(format != RenderTargetFormat::NONE);
 
@@ -143,12 +201,12 @@ enum class BufferMethods {
     NotifyIntr = 0x8,
     WrcacheFlush = 0x9,
     Unk28 = 0xA,
-    Unk2c = 0xB,
+    UnkCacheFlush = 0xB,
     RefCnt = 0x14,
     SemaphoreAcquire = 0x1A,
     SemaphoreRelease = 0x1B,
-    Unk70 = 0x1C,
-    Unk74 = 0x1D,
+    FenceValue = 0x1C,
+    FenceAction = 0x1D,
     Unk78 = 0x1E,
     Unk7c = 0x1F,
     Yield = 0x20,
@@ -194,6 +252,10 @@ void GPU::CallPullerMethod(const MethodCall& method_call) {
     case BufferMethods::SemaphoreAddressLow:
     case BufferMethods::SemaphoreSequence:
     case BufferMethods::RefCnt:
+    case BufferMethods::UnkCacheFlush:
+    case BufferMethods::WrcacheFlush:
+    case BufferMethods::FenceValue:
+    case BufferMethods::FenceAction:
         break;
     case BufferMethods::SemaphoreTrigger: {
         ProcessSemaphoreTriggerMethod();
@@ -204,21 +266,11 @@ void GPU::CallPullerMethod(const MethodCall& method_call) {
         LOG_ERROR(HW_GPU, "Special puller engine method NotifyIntr not implemented");
         break;
     }
-    case BufferMethods::WrcacheFlush: {
-        // TODO(Kmather73): Research and implement this method.
-        LOG_ERROR(HW_GPU, "Special puller engine method WrcacheFlush not implemented");
-        break;
-    }
     case BufferMethods::Unk28: {
         // TODO(Kmather73): Research and implement this method.
         LOG_ERROR(HW_GPU, "Special puller engine method Unk28 not implemented");
         break;
     }
-    case BufferMethods::Unk2c: {
-        // TODO(Kmather73): Research and implement this method.
-        LOG_ERROR(HW_GPU, "Special puller engine method Unk2c not implemented");
-        break;
-    }
     case BufferMethods::SemaphoreAcquire: {
         ProcessSemaphoreAcquire();
         break;
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index fe6628923..11857ff99 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -5,8 +5,12 @@
 #pragma once
 
 #include <array>
+#include <atomic>
+#include <list>
 #include <memory>
+#include <mutex>
 #include "common/common_types.h"
+#include "core/hle/service/nvdrv/nvdata.h"
 #include "core/hle/service/nvflinger/buffer_queue.h"
 #include "video_core/dma_pusher.h"
 
@@ -127,7 +131,7 @@ class MemoryManager;
 
 class GPU {
 public:
-    explicit GPU(Core::System& system, VideoCore::RendererBase& renderer);
+    explicit GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async);
 
     virtual ~GPU();
 
@@ -149,12 +153,20 @@ public:
     /// Calls a GPU method.
     void CallMethod(const MethodCall& method_call);
 
+    void FlushCommands();
+
     /// Returns a reference to the Maxwell3D GPU engine.
     Engines::Maxwell3D& Maxwell3D();
 
     /// Returns a const reference to the Maxwell3D GPU engine.
     const Engines::Maxwell3D& Maxwell3D() const;
 
+    /// Returns a reference to the KeplerCompute GPU engine.
+    Engines::KeplerCompute& KeplerCompute();
+
+    /// Returns a const reference to the KeplerCompute GPU engine.
+    const Engines::KeplerCompute& KeplerCompute() const;
+
     /// Returns a reference to the GPU memory manager.
     Tegra::MemoryManager& MemoryManager();
 
@@ -164,6 +176,22 @@ public:
     /// Returns a reference to the GPU DMA pusher.
     Tegra::DmaPusher& DmaPusher();
 
+    void IncrementSyncPoint(u32 syncpoint_id);
+
+    u32 GetSyncpointValue(u32 syncpoint_id) const;
+
+    void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value);
+
+    bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value);
+
+    std::unique_lock<std::mutex> LockSync() {
+        return std::unique_lock{sync_mutex};
+    }
+
+    bool IsAsync() const {
+        return is_async;
+    }
+
     /// Returns a const reference to the GPU DMA pusher.
     const Tegra::DmaPusher& DmaPusher() const;
 
@@ -194,7 +222,12 @@ public:
 
                 u32 semaphore_acquire;
                 u32 semaphore_release;
-                INSERT_PADDING_WORDS(0xE4);
+                u32 fence_value;
+                union {
+                    BitField<4, 4, u32> operation;
+                    BitField<8, 8, u32> id;
+                } fence_action;
+                INSERT_PADDING_WORDS(0xE2);
 
                 // Puller state
                 u32 acquire_mode;
@@ -228,6 +261,9 @@ public:
     /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
     virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;
 
+protected:
+    virtual void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const = 0;
+
 private:
     void ProcessBindMethod(const MethodCall& method_call);
     void ProcessSemaphoreTriggerMethod();
@@ -246,6 +282,7 @@ private:
 protected:
     std::unique_ptr<Tegra::DmaPusher> dma_pusher;
     VideoCore::RendererBase& renderer;
+    Core::System& system;
 
 private:
     std::unique_ptr<Tegra::MemoryManager> memory_manager;
@@ -262,6 +299,14 @@ private:
     std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
     /// Inline memory engine
     std::unique_ptr<Engines::KeplerMemory> kepler_memory;
+
+    std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{};
+
+    std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;
+
+    std::mutex sync_mutex;
+
+    const bool is_async;
 };
 
 #define ASSERT_REG_POSITION(field_name, position)                                                  \
@@ -274,6 +319,8 @@ ASSERT_REG_POSITION(semaphore_trigger, 0x7);
 ASSERT_REG_POSITION(reference_count, 0x14);
 ASSERT_REG_POSITION(semaphore_acquire, 0x1A);
 ASSERT_REG_POSITION(semaphore_release, 0x1B);
+ASSERT_REG_POSITION(fence_value, 0x1C);
+ASSERT_REG_POSITION(fence_action, 0x1D);
 
 ASSERT_REG_POSITION(acquire_mode, 0x100);
 ASSERT_REG_POSITION(acquire_source, 0x101);
diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp
index d4e2553a9..ea67be831 100644
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@@ -2,6 +2,8 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include "core/core.h"
+#include "core/hardware_interrupt_manager.h"
 #include "video_core/gpu_asynch.h"
 #include "video_core/gpu_thread.h"
 #include "video_core/renderer_base.h"
@@ -9,7 +11,7 @@
 namespace VideoCommon {
 
 GPUAsynch::GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer)
-    : GPU(system, renderer), gpu_thread{system} {}
+    : GPU(system, renderer, true), gpu_thread{system} {}
 
 GPUAsynch::~GPUAsynch() = default;
 
@@ -38,4 +40,9 @@ void GPUAsynch::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
     gpu_thread.FlushAndInvalidateRegion(addr, size);
 }
 
+void GPUAsynch::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
+    // Hand the syncpoint id and the threshold value over to the system's interrupt manager.
+    system.InterruptManager().GPUInterruptSyncpt(syncpoint_id, value);
+}
+
 } // namespace VideoCommon
diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h
index 30be74cba..36377d677 100644
--- a/src/video_core/gpu_asynch.h
+++ b/src/video_core/gpu_asynch.h
@@ -27,6 +27,9 @@ public:
     void InvalidateRegion(CacheAddr addr, u64 size) override;
     void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
 
+protected:
+    void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const override;
+
 private:
     GPUThread::ThreadManager gpu_thread;
 };
diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp
index 45e43b1dc..d4ead9c47 100644
--- a/src/video_core/gpu_synch.cpp
+++ b/src/video_core/gpu_synch.cpp
@@ -8,7 +8,7 @@
 namespace VideoCommon {
 
 GPUSynch::GPUSynch(Core::System& system, VideoCore::RendererBase& renderer)
-    : GPU(system, renderer) {}
+    : GPU(system, renderer, false) {}
 
 GPUSynch::~GPUSynch() = default;
 
diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h
index 3031fcf72..07bcc47f1 100644
--- a/src/video_core/gpu_synch.h
+++ b/src/video_core/gpu_synch.h
@@ -25,6 +25,10 @@ public:
     void FlushRegion(CacheAddr addr, u64 size) override;
     void InvalidateRegion(CacheAddr addr, u64 size) override;
     void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
+
+protected:
+    void TriggerCpuInterrupt([[maybe_unused]] u32 syncpoint_id,
+                             [[maybe_unused]] u32 value) const override {}
 };
 
 } // namespace VideoCommon
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 3f0939ec9..b441e92b0 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -21,7 +21,8 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
     MicroProfileOnThreadCreate("GpuThread");
 
     // Wait for first GPU command before acquiring the window context
-    state.WaitForCommands();
+    while (state.queue.Empty())
+        ;
 
     // If emulation was stopped during disk shader loading, abort before trying to acquire context
     if (!state.is_running) {
@@ -32,7 +33,6 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
 
     CommandDataContainer next;
     while (state.is_running) {
-        state.WaitForCommands();
         while (!state.queue.Empty()) {
             state.queue.Pop(next);
             if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) {
@@ -49,8 +49,7 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
             } else {
                 UNREACHABLE();
             }
-            state.signaled_fence = next.fence;
-            state.TrySynchronize();
+            state.signaled_fence.store(next.fence);
         }
     }
 }
@@ -89,12 +88,7 @@ void ThreadManager::FlushRegion(CacheAddr addr, u64 size) {
 }
 
 void ThreadManager::InvalidateRegion(CacheAddr addr, u64 size) {
-    if (state.queue.Empty()) {
-        // It's quicker to invalidate a single region on the CPU if the queue is already empty
-        system.Renderer().Rasterizer().InvalidateRegion(addr, size);
-    } else {
-        PushCommand(InvalidateRegionCommand(addr, size));
-    }
+    system.Renderer().Rasterizer().InvalidateRegion(addr, size);
 }
 
 void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
@@ -105,22 +99,13 @@ void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
 u64 ThreadManager::PushCommand(CommandData&& command_data) {
     const u64 fence{++state.last_fence};
     state.queue.Push(CommandDataContainer(std::move(command_data), fence));
-    state.SignalCommands();
     return fence;
 }
 
 MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
 void SynchState::WaitForSynchronization(u64 fence) {
-    if (signaled_fence >= fence) {
-        return;
-    }
-
-    // Wait for the GPU to be idle (all commands to be executed)
-    {
-        MICROPROFILE_SCOPE(GPU_wait);
-        std::unique_lock lock{synchronization_mutex};
-        synchronization_condition.wait(lock, [this, fence] { return signaled_fence >= fence; });
-    }
+    while (signaled_fence.load() < fence)
+        ;
 }
 
 } // namespace VideoCommon::GPUThread
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index 05a168a72..1d9d0c39e 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -88,41 +88,9 @@ struct CommandDataContainer {
 /// Struct used to synchronize the GPU thread
 struct SynchState final {
     std::atomic_bool is_running{true};
-    std::atomic_int queued_frame_count{};
-    std::mutex synchronization_mutex;
-    std::mutex commands_mutex;
-    std::condition_variable commands_condition;
-    std::condition_variable synchronization_condition;
-
-    /// Returns true if the gap in GPU commands is small enough that we can consider the CPU and GPU
-    /// synchronized. This is entirely empirical.
-    bool IsSynchronized() const {
-        constexpr std::size_t max_queue_gap{5};
-        return queue.Size() <= max_queue_gap;
-    }
-
-    void TrySynchronize() {
-        if (IsSynchronized()) {
-            std::lock_guard lock{synchronization_mutex};
-            synchronization_condition.notify_one();
-        }
-    }
 
     void WaitForSynchronization(u64 fence);
 
-    void SignalCommands() {
-        if (queue.Empty()) {
-            return;
-        }
-
-        commands_condition.notify_one();
-    }
-
-    void WaitForCommands() {
-        std::unique_lock lock{commands_mutex};
-        commands_condition.wait(lock, [this] { return !queue.Empty(); });
-    }
-
     using CommandQueue = Common::SPSCQueue<CommandDataContainer>;
     CommandQueue queue;
     u64 last_fence{};
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index 2b7367568..6e44d51cf 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -34,6 +34,9 @@ public:
     /// Clear the current framebuffer
     virtual void Clear() = 0;
 
+    /// Dispatches a compute shader invocation
+    virtual void DispatchCompute(GPUVAddr code_addr) = 0;
+
     /// Notify rasterizer that all caches should be flushed to Switch memory
     virtual void FlushAll() = 0;
 
@@ -47,6 +50,9 @@ public:
     /// and invalidated
     virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;
 
+    /// Notify the rasterizer to send all written commands to the host GPU.
+    virtual void FlushCommands() = 0;
+
     /// Notify rasterizer that a frame is about to finish
     virtual void TickFrame() = 0;
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 0bb5c068c..80cfda7e4 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -4,6 +4,7 @@
 
 #include <algorithm>
 #include <array>
+#include <bitset>
 #include <memory>
 #include <string>
 #include <string_view>
@@ -19,6 +20,7 @@
 #include "core/core.h"
 #include "core/hle/kernel/process.h"
 #include "core/settings.h"
+#include "video_core/engines/kepler_compute.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/memory_manager.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
@@ -105,6 +107,7 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWind
     shader_program_manager = std::make_unique<GLShader::ProgramManager>();
     state.draw.shader_program = 0;
     state.Apply();
+    clear_framebuffer.Create();
 
     LOG_DEBUG(Render_OpenGL, "Sync fixed function OpenGL state here");
     CheckExtensions();
@@ -124,10 +127,10 @@ GLuint RasterizerOpenGL::SetupVertexFormat() {
     auto& gpu = system.GPU().Maxwell3D();
     const auto& regs = gpu.regs;
 
-    if (!gpu.dirty_flags.vertex_attrib_format) {
+    if (!gpu.dirty.vertex_attrib_format) {
         return state.draw.vertex_array;
     }
-    gpu.dirty_flags.vertex_attrib_format = false;
+    gpu.dirty.vertex_attrib_format = false;
 
     MICROPROFILE_SCOPE(OpenGL_VAO);
 
@@ -181,7 +184,7 @@ GLuint RasterizerOpenGL::SetupVertexFormat() {
     }
 
     // Rebinding the VAO invalidates the vertex buffer bindings.
-    gpu.dirty_flags.vertex_array.set();
+    gpu.dirty.ResetVertexArrays();
 
     state.draw.vertex_array = vao_entry.handle;
     return vao_entry.handle;
@@ -189,17 +192,20 @@ GLuint RasterizerOpenGL::SetupVertexFormat() {
 
 void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {
     auto& gpu = system.GPU().Maxwell3D();
-    const auto& regs = gpu.regs;
-
-    if (gpu.dirty_flags.vertex_array.none())
+    if (!gpu.dirty.vertex_array_buffers)
         return;
+    gpu.dirty.vertex_array_buffers = false;
+
+    const auto& regs = gpu.regs;
 
     MICROPROFILE_SCOPE(OpenGL_VB);
 
     // Upload all guest vertex arrays sequentially to our buffer
     for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
-        if (!gpu.dirty_flags.vertex_array[index])
+        if (!gpu.dirty.vertex_array[index])
             continue;
+        gpu.dirty.vertex_array[index] = false;
+        gpu.dirty.vertex_instance[index] = false;
 
         const auto& vertex_array = regs.vertex_array[index];
         if (!vertex_array.IsEnabled())
@@ -224,8 +230,32 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {
             glVertexArrayBindingDivisor(vao, index, 0);
         }
     }
+}
+
+void RasterizerOpenGL::SetupVertexInstances(GLuint vao) {
+    auto& gpu = system.GPU().Maxwell3D();
+
+    if (!gpu.dirty.vertex_instances)
+        return;
+    gpu.dirty.vertex_instances = false;
+
+    const auto& regs = gpu.regs;
+    // Update the binding divisor of every vertex array whose instancing state is dirty
+    for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
+        if (!gpu.dirty.vertex_instance[index])
+            continue;
+
+        gpu.dirty.vertex_instance[index] = false;
 
-    gpu.dirty_flags.vertex_array.reset();
+        if (regs.instanced_arrays.IsInstancingEnabled(index) &&
+            regs.vertex_array[index].divisor != 0) {
+            // Enable vertex buffer instancing with the specified divisor.
+            glVertexArrayBindingDivisor(vao, index, regs.vertex_array[index].divisor);
+        } else {
+            // Disable the vertex buffer instancing.
+            glVertexArrayBindingDivisor(vao, index, 0);
+        }
+    }
 }
 
 GLintptr RasterizerOpenGL::SetupIndexBuffer() {
@@ -298,9 +328,9 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
 
         Shader shader{shader_cache.GetStageProgram(program)};
 
-        const auto stage_enum{static_cast<Maxwell::ShaderStage>(stage)};
+        const auto stage_enum = static_cast<Maxwell::ShaderStage>(stage);
         SetupDrawConstBuffers(stage_enum, shader);
-        SetupGlobalRegions(stage_enum, shader);
+        SetupDrawGlobalMemory(stage_enum, shader);
         const auto texture_buffer_usage{SetupTextures(stage_enum, shader, base_bindings)};
 
         const ProgramVariant variant{base_bindings, primitive_mode, texture_buffer_usage};
@@ -341,7 +371,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
 
     SyncClipEnabled(clip_distances);
 
-    gpu.dirty_flags.shaders = false;
+    gpu.dirty.shaders = false;
 }
 
 std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
@@ -424,13 +454,13 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers(
 
     const FramebufferConfigState fb_config_state{using_color_fb, using_depth_fb, preserve_contents,
                                                  single_color_target};
-    if (fb_config_state == current_framebuffer_config_state &&
-        gpu.dirty_flags.color_buffer.none() && !gpu.dirty_flags.zeta_buffer) {
+    if (fb_config_state == current_framebuffer_config_state && !gpu.dirty.render_settings) {
         // Only skip if the previous ConfigureFramebuffers call was from the same kind (multiple or
         // single color targets). This is done because the guest registers may not change but the
         // host framebuffer may contain different attachments
         return current_depth_stencil_usage;
     }
+    gpu.dirty.render_settings = false;
     current_framebuffer_config_state = fb_config_state;
 
     texture_cache.GuardRenderTargets(true);
@@ -519,13 +549,71 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers(
     return current_depth_stencil_usage = {static_cast<bool>(depth_surface), fbkey.stencil_enable};
 }
 
+void RasterizerOpenGL::ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb,
+                                                 bool using_depth_fb, bool using_stencil_fb) {
+    // Binds clear_framebuffer as the draw framebuffer of current_state and attaches the surfaces
+    // a clear needs: the regs.clear_buffers.RT color target and, on request, the depth buffer.
+    const auto& regs = system.GPU().Maxwell3D().regs;
+
+    texture_cache.GuardRenderTargets(true);
+    View color_surface{};
+    if (using_color_fb) {
+        color_surface = texture_cache.GetColorBufferSurface(regs.clear_buffers.RT, false);
+    }
+    View depth_surface{};
+    if (using_depth_fb || using_stencil_fb) {
+        depth_surface = texture_cache.GetDepthBufferSurface(false);
+    }
+    texture_cache.GuardRenderTargets(false);
+
+    current_state.draw.draw_framebuffer = clear_framebuffer.handle;
+    current_state.ApplyFramebufferState();
+
+    if (color_surface) {
+        color_surface->Attach(GL_COLOR_ATTACHMENT0, GL_DRAW_FRAMEBUFFER);
+    } else {
+        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
+    }
+
+    if (depth_surface) {
+        switch (depth_surface->GetSurfaceParams().type) {
+        case VideoCore::Surface::SurfaceType::Depth:
+            depth_surface->Attach(GL_DEPTH_ATTACHMENT, GL_DRAW_FRAMEBUFFER);
+            glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
+            break;
+        case VideoCore::Surface::SurfaceType::DepthStencil:
+            // A combined surface has to back both the depth and the stencil attachment point.
+            depth_surface->Attach(GL_DEPTH_STENCIL_ATTACHMENT, GL_DRAW_FRAMEBUFFER);
+            break;
+        default:
+            UNIMPLEMENTED();
+        }
+    } else {
+        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
+                               0);
+    }
+}
+
 void RasterizerOpenGL::Clear() {
-    const auto& regs = system.GPU().Maxwell3D().regs;
+    const auto& maxwell3d = system.GPU().Maxwell3D();
+
+    if (!maxwell3d.ShouldExecute()) {
+        return;
+    }
+
+    const auto& regs = maxwell3d.regs;
     bool use_color{};
     bool use_depth{};
     bool use_stencil{};
 
-    OpenGLState clear_state;
+    OpenGLState prev_state{OpenGLState::GetCurState()};
+    SCOPE_EXIT({
+        prev_state.AllDirty();
+        prev_state.Apply();
+    });
+
+    OpenGLState clear_state{OpenGLState::GetCurState()};
+    clear_state.SetDefaultViewports();
     if (regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B ||
         regs.clear_buffers.A) {
         use_color = true;
@@ -545,6 +633,7 @@ void RasterizerOpenGL::Clear() {
         // true.
         clear_state.depth.test_enabled = true;
         clear_state.depth.test_func = GL_ALWAYS;
+        clear_state.depth.write_mask = GL_TRUE;
     }
     if (regs.clear_buffers.S) {
         ASSERT_MSG(regs.zeta_enable != 0, "Tried to clear stencil but buffer is not enabled!");
@@ -581,8 +670,9 @@ void RasterizerOpenGL::Clear() {
         return;
     }
 
-    const auto [clear_depth, clear_stencil] = ConfigureFramebuffers(
-        clear_state, use_color, use_depth || use_stencil, false, regs.clear_buffers.RT.Value());
+    ConfigureClearFramebuffer(clear_state, use_color, use_depth, use_stencil);
+
+    SyncViewport(clear_state);
     if (regs.clear_flags.scissor) {
         SyncScissorTest(clear_state);
     }
@@ -591,21 +681,18 @@ void RasterizerOpenGL::Clear() {
         clear_state.EmulateViewportWithScissor();
     }
 
-    clear_state.ApplyColorMask();
-    clear_state.ApplyDepth();
-    clear_state.ApplyStencilTest();
-    clear_state.ApplyViewport();
-    clear_state.ApplyFramebufferState();
+    clear_state.AllDirty();
+    clear_state.Apply();
 
     if (use_color) {
-        glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color);
+        glClearBufferfv(GL_COLOR, 0, regs.clear_color);
     }
 
-    if (clear_depth && clear_stencil) {
+    if (use_depth && use_stencil) {
         glClearBufferfi(GL_DEPTH_STENCIL, 0, regs.clear_depth, regs.clear_stencil);
-    } else if (clear_depth) {
+    } else if (use_depth) {
         glClearBufferfv(GL_DEPTH, 0, &regs.clear_depth);
-    } else if (clear_stencil) {
+    } else if (use_stencil) {
         glClearBufferiv(GL_STENCIL, 0, &regs.clear_stencil);
     }
 }
@@ -616,6 +703,11 @@ void RasterizerOpenGL::DrawArrays() {
 
     MICROPROFILE_SCOPE(OpenGL_Drawing);
     auto& gpu = system.GPU().Maxwell3D();
+
+    if (!gpu.ShouldExecute()) {
+        return;
+    }
+
     const auto& regs = gpu.regs;
 
     SyncColorMask();
@@ -661,6 +753,7 @@ void RasterizerOpenGL::DrawArrays() {
 
     // Upload vertex and index data.
     SetupVertexBuffer(vao);
+    SetupVertexInstances(vao);
     const GLintptr index_buffer_offset = SetupIndexBuffer();
 
     // Setup draw parameters. It will automatically choose what glDraw* method to use.
@@ -687,7 +780,7 @@ void RasterizerOpenGL::DrawArrays() {
 
     if (invalidate) {
         // As all cached buffers are invalidated, we need to recheck their state.
-        gpu.dirty_flags.vertex_array.set();
+        gpu.dirty.ResetVertexArrays();
     }
 
     shader_program_manager->ApplyTo(state);
@@ -700,6 +793,46 @@ void RasterizerOpenGL::DrawArrays() {
     params.DispatchDraw();
 
     accelerate_draw = AccelDraw::Disabled;
+    gpu.dirty.memory_general = false;
+}
+
+void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
+    if (!GLAD_GL_ARB_compute_variable_group_size) {
+        LOG_ERROR(Render_OpenGL, "Compute is currently not supported on this device due to the "
+                                 "lack of GL_ARB_compute_variable_group_size");
+        return;
+    }
+
+    auto kernel = shader_cache.GetComputeKernel(code_addr);
+    const auto [program, next_bindings] = kernel->GetProgramHandle({});
+    state.draw.shader_program = program;
+    state.draw.program_pipeline = 0;
+
+    const std::size_t buffer_size =
+        Tegra::Engines::KeplerCompute::NumConstBuffers *
+        (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
+    buffer_cache.Map(buffer_size);
+
+    bind_ubo_pushbuffer.Setup(0);
+    bind_ssbo_pushbuffer.Setup(0);
+
+    SetupComputeConstBuffers(kernel);
+    SetupComputeGlobalMemory(kernel);
+
+    // TODO(Rodrigo): Bind images and samplers
+
+    buffer_cache.Unmap();
+
+    bind_ubo_pushbuffer.Bind();
+    bind_ssbo_pushbuffer.Bind();
+
+    state.ApplyShaderProgram();
+    state.ApplyProgramPipeline();
+
+    const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
+    glDispatchComputeGroupSizeARB(launch_desc.grid_dim_x, launch_desc.grid_dim_y,
+                                  launch_desc.grid_dim_z, launch_desc.block_dim_x,
+                                  launch_desc.block_dim_y, launch_desc.block_dim_z);
 }
 
 void RasterizerOpenGL::FlushAll() {}
@@ -730,6 +863,10 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
     InvalidateRegion(addr, size);
 }
 
+void RasterizerOpenGL::FlushCommands() {
+    glFlush();
+}
+
 void RasterizerOpenGL::TickFrame() {
     buffer_cache.TickFrame();
 }
@@ -775,12 +912,25 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
 void RasterizerOpenGL::SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
                                              const Shader& shader) {
     MICROPROFILE_SCOPE(OpenGL_UBO);
-    const auto stage_index = static_cast<std::size_t>(stage);
-    const auto& shader_stage = system.GPU().Maxwell3D().state.shader_stages[stage_index];
-
-    // Upload only the enabled buffers from the 16 constbuffers of each shader stage
+    const auto& stages = system.GPU().Maxwell3D().state.shader_stages;
+    const auto& shader_stage = stages[static_cast<std::size_t>(stage)];
     for (const auto& entry : shader->GetShaderEntries().const_buffers) {
-        SetupConstBuffer(shader_stage.const_buffers[entry.GetIndex()], entry);
+        const auto& buffer = shader_stage.const_buffers[entry.GetIndex()];
+        SetupConstBuffer(buffer, entry);
+    }
+}
+
+void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
+    MICROPROFILE_SCOPE(OpenGL_UBO);
+    const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
+    for (const auto& entry : kernel->GetShaderEntries().const_buffers) {
+        const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
+        const std::bitset<8> mask = launch_desc.memory_config.const_buffer_enable_mask.Value();
+        Tegra::Engines::ConstBufferInfo buffer;
+        buffer.address = config.Address();
+        buffer.size = config.size;
+        buffer.enabled = mask[entry.GetIndex()];
+        SetupConstBuffer(buffer, entry);
     }
 }
 
@@ -801,24 +951,39 @@ void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& b
     bind_ubo_pushbuffer.Push(cbuf, offset, size);
 }
 
-void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
-                                          const Shader& shader) {
+void RasterizerOpenGL::SetupDrawGlobalMemory(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
+                                             const Shader& shader) {
     auto& gpu{system.GPU()};
     auto& memory_manager{gpu.MemoryManager()};
     const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]};
-    const auto alignment{device.GetShaderStorageBufferAlignment()};
-
     for (const auto& entry : shader->GetShaderEntries().global_memory_entries) {
         const auto addr{cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset()};
-        const auto actual_addr{memory_manager.Read<u64>(addr)};
+        const auto gpu_addr{memory_manager.Read<u64>(addr)};
         const auto size{memory_manager.Read<u32>(addr + 8)};
+        SetupGlobalMemory(entry, gpu_addr, size);
+    }
+}
 
-        const auto [ssbo, buffer_offset] =
-            buffer_cache.UploadMemory(actual_addr, size, alignment, true, entry.IsWritten());
-        bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size));
+void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) {
+    auto& gpu{system.GPU()};
+    auto& memory_manager{gpu.MemoryManager()};
+    const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config};
+    for (const auto& entry : kernel->GetShaderEntries().global_memory_entries) {
+        const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()};
+        const auto gpu_addr{memory_manager.Read<u64>(addr)};
+        const auto size{memory_manager.Read<u32>(addr + 8)};
+        SetupGlobalMemory(entry, gpu_addr, size);
     }
 }
 
+void RasterizerOpenGL::SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entry,
+                                         GPUVAddr gpu_addr, std::size_t size) {
+    const auto alignment{device.GetShaderStorageBufferAlignment()};
+    const auto [ssbo, buffer_offset] =
+        buffer_cache.UploadMemory(gpu_addr, size, alignment, true, entry.IsWritten());
+    bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size));
+}
+
 TextureBufferUsage RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, const Shader& shader,
                                                    BaseBindings base_bindings) {
     MICROPROFILE_SCOPE(OpenGL_Texture);
@@ -907,10 +1072,11 @@ void RasterizerOpenGL::SyncClipCoef() {
 }
 
 void RasterizerOpenGL::SyncCullMode() {
-    const auto& regs = system.GPU().Maxwell3D().regs;
+    auto& maxwell3d = system.GPU().Maxwell3D();
 
-    state.cull.enabled = regs.cull.enabled != 0;
+    const auto& regs = maxwell3d.regs;
 
+    state.cull.enabled = regs.cull.enabled != 0;
     if (state.cull.enabled) {
         state.cull.front_face = MaxwellToGL::FrontFace(regs.cull.front_face);
         state.cull.mode = MaxwellToGL::CullFace(regs.cull.cull_face);
@@ -943,16 +1109,23 @@ void RasterizerOpenGL::SyncDepthTestState() {
     state.depth.test_enabled = regs.depth_test_enable != 0;
     state.depth.write_mask = regs.depth_write_enabled ? GL_TRUE : GL_FALSE;
 
-    if (!state.depth.test_enabled)
+    if (!state.depth.test_enabled) {
         return;
+    }
 
     state.depth.test_func = MaxwellToGL::ComparisonOp(regs.depth_test_func);
 }
 
 void RasterizerOpenGL::SyncStencilTestState() {
-    const auto& regs = system.GPU().Maxwell3D().regs;
-    state.stencil.test_enabled = regs.stencil_enable != 0;
+    auto& maxwell3d = system.GPU().Maxwell3D();
+    if (!maxwell3d.dirty.stencil_test) {
+        return;
+    }
+    const auto& regs = maxwell3d.regs;
 
+    state.MarkDirtyStencilState(); // Done up front so the early return below is covered too
+    maxwell3d.dirty.stencil_test = false;
+    state.stencil.test_enabled = regs.stencil_enable != 0;
     if (!regs.stencil_enable) {
         return;
     }
@@ -981,10 +1154,15 @@ void RasterizerOpenGL::SyncStencilTestState() {
         state.stencil.back.action_depth_fail = GL_KEEP;
         state.stencil.back.action_depth_pass = GL_KEEP;
     }
 }
 
 void RasterizerOpenGL::SyncColorMask() {
-    const auto& regs = system.GPU().Maxwell3D().regs;
+    auto& maxwell3d = system.GPU().Maxwell3D();
+    if (!maxwell3d.dirty.color_mask) {
+        return;
+    }
+    const auto& regs = maxwell3d.regs;
+
     const std::size_t count =
         regs.independent_blend_enable ? Tegra::Engines::Maxwell3D::Regs::NumRenderTargets : 1;
     for (std::size_t i = 0; i < count; i++) {
@@ -995,6 +1173,9 @@ void RasterizerOpenGL::SyncColorMask() {
         dest.blue_enabled = (source.B == 0) ? GL_FALSE : GL_TRUE;
         dest.alpha_enabled = (source.A == 0) ? GL_FALSE : GL_TRUE;
     }
+
+    state.MarkDirtyColorMask();
+    maxwell3d.dirty.color_mask = false;
 }
 
 void RasterizerOpenGL::SyncMultiSampleState() {
@@ -1009,7 +1190,11 @@ void RasterizerOpenGL::SyncFragmentColorClampState() {
 }
 
 void RasterizerOpenGL::SyncBlendState() {
-    const auto& regs = system.GPU().Maxwell3D().regs;
+    auto& maxwell3d = system.GPU().Maxwell3D();
+    if (!maxwell3d.dirty.blend_state) {
+        return;
+    }
+    const auto& regs = maxwell3d.regs;
 
     state.blend_color.red = regs.blend_color.r;
     state.blend_color.green = regs.blend_color.g;
@@ -1032,6 +1217,8 @@ void RasterizerOpenGL::SyncBlendState() {
         for (std::size_t i = 1; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) {
             state.blend[i].enabled = false;
         }
+        maxwell3d.dirty.blend_state = false;
+        state.MarkDirtyBlendState();
         return;
     }
 
@@ -1048,6 +1235,9 @@ void RasterizerOpenGL::SyncBlendState() {
         blend.src_a_func = MaxwellToGL::BlendFunc(src.factor_source_a);
         blend.dst_a_func = MaxwellToGL::BlendFunc(src.factor_dest_a);
     }
+
+    state.MarkDirtyBlendState();
+    maxwell3d.dirty.blend_state = false;
 }
 
 void RasterizerOpenGL::SyncLogicOpState() {
@@ -1099,13 +1289,21 @@ void RasterizerOpenGL::SyncPointState() {
 }
 
 void RasterizerOpenGL::SyncPolygonOffset() {
-    const auto& regs = system.GPU().Maxwell3D().regs;
+    auto& maxwell3d = system.GPU().Maxwell3D();
+    if (!maxwell3d.dirty.polygon_offset) {
+        return;
+    }
+    const auto& regs = maxwell3d.regs;
+
     state.polygon_offset.fill_enable = regs.polygon_offset_fill_enable != 0;
     state.polygon_offset.line_enable = regs.polygon_offset_line_enable != 0;
     state.polygon_offset.point_enable = regs.polygon_offset_point_enable != 0;
     state.polygon_offset.units = regs.polygon_offset_units;
     state.polygon_offset.factor = regs.polygon_offset_factor;
     state.polygon_offset.clamp = regs.polygon_offset_clamp;
+
+    state.MarkDirtyPolygonOffset();
+    maxwell3d.dirty.polygon_offset = false;
 }
 
 void RasterizerOpenGL::SyncAlphaTest() {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 40b571d58..9d20a4fbf 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -58,10 +58,12 @@ public:
 
     void DrawArrays() override;
     void Clear() override;
+    void DispatchCompute(GPUVAddr code_addr) override;
     void FlushAll() override;
     void FlushRegion(CacheAddr addr, u64 size) override;
     void InvalidateRegion(CacheAddr addr, u64 size) override;
     void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
+    void FlushCommands() override;
     void TickFrame() override;
     bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
                                const Tegra::Engines::Fermi2D::Regs::Surface& dst,
@@ -108,17 +110,30 @@ private:
         OpenGLState& current_state, bool using_color_fb = true, bool using_depth_fb = true,
         bool preserve_contents = true, std::optional<std::size_t> single_color_target = {});
 
+    void ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb,
+                                   bool using_depth_fb, bool using_stencil_fb);
+
     /// Configures the current constbuffers to use for the draw command.
     void SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
                                const Shader& shader);
 
+    /// Configures the current constbuffers to use for the kernel invocation.
+    void SetupComputeConstBuffers(const Shader& kernel);
+
     /// Configures a constant buffer.
     void SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& buffer,
                           const GLShader::ConstBufferEntry& entry);
 
     /// Configures the current global memory entries to use for the draw command.
-    void SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
-                            const Shader& shader);
+    void SetupDrawGlobalMemory(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
+                               const Shader& shader);
+
+    /// Configures the current global memory entries to use for the kernel invocation.
+    void SetupComputeGlobalMemory(const Shader& kernel);
+
+    /// Configures a global memory buffer.
+    void SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
+                           std::size_t size);
 
     /// Configures the current textures to use for the draw command. Returns shaders texture buffer
     /// usage.
@@ -216,6 +231,7 @@ private:
     GLuint SetupVertexFormat();
 
     void SetupVertexBuffer(GLuint vao);
+    void SetupVertexInstances(GLuint vao);
 
     GLintptr SetupIndexBuffer();
 
@@ -226,6 +242,8 @@ private:
     enum class AccelDraw { Disabled, Arrays, Indexed };
     AccelDraw accelerate_draw = AccelDraw::Disabled;
 
+    OGLFramebuffer clear_framebuffer;
+
     using CachedPageMap = boost::icl::interval_map<u64, int>;
     CachedPageMap cached_pages;
 };
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 32dd9eae7..1c90facc3 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -23,13 +23,13 @@ namespace OpenGL {
 
 using VideoCommon::Shader::ProgramCode;
 
-// One UBO is always reserved for emulation values
-constexpr u32 RESERVED_UBOS = 1;
+// One UBO is always reserved for emulation values on staged shaders
+constexpr u32 STAGE_RESERVED_UBOS = 1;
 
 struct UnspecializedShader {
     std::string code;
     GLShader::ShaderEntries entries;
-    Maxwell::ShaderProgram program_type;
+    ProgramType program_type;
 };
 
 namespace {
@@ -55,15 +55,17 @@ ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr g
 }
 
 /// Gets the shader type from a Maxwell program type
-constexpr GLenum GetShaderType(Maxwell::ShaderProgram program_type) {
+constexpr GLenum GetShaderType(ProgramType program_type) {
     switch (program_type) {
-    case Maxwell::ShaderProgram::VertexA:
-    case Maxwell::ShaderProgram::VertexB:
+    case ProgramType::VertexA:
+    case ProgramType::VertexB:
         return GL_VERTEX_SHADER;
-    case Maxwell::ShaderProgram::Geometry:
+    case ProgramType::Geometry:
         return GL_GEOMETRY_SHADER;
-    case Maxwell::ShaderProgram::Fragment:
+    case ProgramType::Fragment:
         return GL_FRAGMENT_SHADER;
+    case ProgramType::Compute:
+        return GL_COMPUTE_SHADER;
     default:
         return GL_NONE;
     }
@@ -100,6 +102,25 @@ constexpr std::tuple<const char*, const char*, u32> GetPrimitiveDescription(GLen
     }
 }
 
+ProgramType GetProgramType(Maxwell::ShaderProgram program) {
+    switch (program) {
+    case Maxwell::ShaderProgram::VertexA:
+        return ProgramType::VertexA;
+    case Maxwell::ShaderProgram::VertexB:
+        return ProgramType::VertexB;
+    case Maxwell::ShaderProgram::TesselationControl:
+        return ProgramType::TessellationControl;
+    case Maxwell::ShaderProgram::TesselationEval:
+        return ProgramType::TessellationEval;
+    case Maxwell::ShaderProgram::Geometry:
+        return ProgramType::Geometry;
+    case Maxwell::ShaderProgram::Fragment:
+        return ProgramType::Fragment;
+    }
+    UNREACHABLE();
+    return {};
+}
+
 /// Calculates the size of a program stream
 std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) {
     constexpr std::size_t start_offset = 10;
@@ -128,13 +149,13 @@ std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) {
 }
 
 /// Hashes one (or two) program streams
-u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode& code,
+u64 GetUniqueIdentifier(ProgramType program_type, const ProgramCode& code,
                         const ProgramCode& code_b, std::size_t size_a = 0, std::size_t size_b = 0) {
     if (size_a == 0) {
         size_a = CalculateProgramSize(code);
     }
     u64 unique_identifier = Common::CityHash64(reinterpret_cast<const char*>(code.data()), size_a);
-    if (program_type != Maxwell::ShaderProgram::VertexA) {
+    if (program_type != ProgramType::VertexA) {
         return unique_identifier;
     }
     // VertexA programs include two programs
@@ -152,12 +173,12 @@ u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode&
 }
 
 /// Creates an unspecialized program from code streams
-GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgram program_type,
+GLShader::ProgramResult CreateProgram(const Device& device, ProgramType program_type,
                                       ProgramCode program_code, ProgramCode program_code_b) {
     GLShader::ShaderSetup setup(program_code);
     setup.program.size_a = CalculateProgramSize(program_code);
     setup.program.size_b = 0;
-    if (program_type == Maxwell::ShaderProgram::VertexA) {
+    if (program_type == ProgramType::VertexA) {
         // VertexB is always enabled, so when VertexA is enabled, we have two vertex shaders.
         // Conventional HW does not support this, so we combine VertexA and VertexB into one
         // stage here.
@@ -168,22 +189,23 @@ GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgr
         program_type, program_code, program_code_b, setup.program.size_a, setup.program.size_b);
 
     switch (program_type) {
-    case Maxwell::ShaderProgram::VertexA:
-    case Maxwell::ShaderProgram::VertexB:
+    case ProgramType::VertexA:
+    case ProgramType::VertexB:
         return GLShader::GenerateVertexShader(device, setup);
-    case Maxwell::ShaderProgram::Geometry:
+    case ProgramType::Geometry:
         return GLShader::GenerateGeometryShader(device, setup);
-    case Maxwell::ShaderProgram::Fragment:
+    case ProgramType::Fragment:
         return GLShader::GenerateFragmentShader(device, setup);
+    case ProgramType::Compute:
+        return GLShader::GenerateComputeShader(device, setup);
     default:
-        LOG_CRITICAL(HW_GPU, "Unimplemented program_type={}", static_cast<u32>(program_type));
-        UNREACHABLE();
+        UNIMPLEMENTED_MSG("Unimplemented program_type={}", static_cast<u32>(program_type));
         return {};
     }
 }
 
 CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEntries& entries,
-                               Maxwell::ShaderProgram program_type, const ProgramVariant& variant,
+                               ProgramType program_type, const ProgramVariant& variant,
                                bool hint_retrievable = false) {
     auto base_bindings{variant.base_bindings};
     const auto primitive_mode{variant.primitive_mode};
@@ -194,7 +216,14 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
     if (entries.shader_viewport_layer_array) {
         source += "#extension GL_ARB_shader_viewport_layer_array : enable\n";
     }
-    source += fmt::format("\n#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++);
+    if (program_type == ProgramType::Compute) {
+        source += "#extension GL_ARB_compute_variable_group_size : require\n";
+    }
+    source += '\n';
+
+    if (program_type != ProgramType::Compute) {
+        source += fmt::format("#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++);
+    }
 
     for (const auto& cbuf : entries.const_buffers) {
         source +=
@@ -221,13 +250,16 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
         source += fmt::format("#define SAMPLER_{}_IS_BUFFER", i);
     }
 
-    if (program_type == Maxwell::ShaderProgram::Geometry) {
+    if (program_type == ProgramType::Geometry) {
         const auto [glsl_topology, debug_name, max_vertices] =
             GetPrimitiveDescription(primitive_mode);
 
         source += "layout (" + std::string(glsl_topology) + ") in;\n";
         source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n';
     }
+    if (program_type == ProgramType::Compute) {
+        source += "layout (local_size_variable) in;\n";
+    }
 
     source += code;
 
@@ -255,7 +287,7 @@ std::set<GLenum> GetSupportedFormats() {
 
 } // Anonymous namespace
 
-CachedShader::CachedShader(const ShaderParameters& params, Maxwell::ShaderProgram program_type,
+CachedShader::CachedShader(const ShaderParameters& params, ProgramType program_type,
                            GLShader::ProgramResult result)
     : RasterizerCacheObject{params.host_ptr}, host_ptr{params.host_ptr}, cpu_addr{params.cpu_addr},
       unique_identifier{params.unique_identifier}, program_type{program_type},
@@ -268,29 +300,50 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
                                            ProgramCode&& program_code_b) {
     const auto code_size{CalculateProgramSize(program_code)};
     const auto code_size_b{CalculateProgramSize(program_code_b)};
-    auto result{CreateProgram(params.device, program_type, program_code, program_code_b)};
+    auto result{
+        CreateProgram(params.device, GetProgramType(program_type), program_code, program_code_b)};
     if (result.first.empty()) {
         // TODO(Rodrigo): Unimplemented shader stages hit here, avoid using these for now
         return {};
     }
 
     params.disk_cache.SaveRaw(ShaderDiskCacheRaw(
-        params.unique_identifier, program_type, static_cast<u32>(code_size / sizeof(u64)),
-        static_cast<u32>(code_size_b / sizeof(u64)), std::move(program_code),
-        std::move(program_code_b)));
+        params.unique_identifier, GetProgramType(program_type),
+        static_cast<u32>(code_size / sizeof(u64)), static_cast<u32>(code_size_b / sizeof(u64)),
+        std::move(program_code), std::move(program_code_b)));
 
-    return std::shared_ptr<CachedShader>(new CachedShader(params, program_type, std::move(result)));
+    return std::shared_ptr<CachedShader>(
+        new CachedShader(params, GetProgramType(program_type), std::move(result)));
 }
 
 Shader CachedShader::CreateStageFromCache(const ShaderParameters& params,
                                           Maxwell::ShaderProgram program_type,
                                           GLShader::ProgramResult result) {
-    return std::shared_ptr<CachedShader>(new CachedShader(params, program_type, std::move(result)));
+    return std::shared_ptr<CachedShader>(
+        new CachedShader(params, GetProgramType(program_type), std::move(result)));
+}
+
+Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode&& code) {
+    auto result{CreateProgram(params.device, ProgramType::Compute, code, {})};
+
+    const auto code_size{CalculateProgramSize(code)};
+    params.disk_cache.SaveRaw(ShaderDiskCacheRaw(params.unique_identifier, ProgramType::Compute,
+                                                 static_cast<u32>(code_size / sizeof(u64)), 0,
+                                                 std::move(code), {}));
+
+    return std::shared_ptr<CachedShader>(
+        new CachedShader(params, ProgramType::Compute, std::move(result)));
+}
+
+Shader CachedShader::CreateKernelFromCache(const ShaderParameters& params,
+                                           GLShader::ProgramResult result) {
+    return std::shared_ptr<CachedShader>(
+        new CachedShader(params, ProgramType::Compute, std::move(result)));
 }
 
 std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(const ProgramVariant& variant) {
     GLuint handle{};
-    if (program_type == Maxwell::ShaderProgram::Geometry) {
+    if (program_type == ProgramType::Geometry) {
         handle = GetGeometryShader(variant);
     } else {
         const auto [entry, is_cache_miss] = programs.try_emplace(variant);
@@ -308,8 +361,11 @@ std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(const ProgramVar
         handle = program->handle;
     }
 
-    auto base_bindings{variant.base_bindings};
-    base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size()) + RESERVED_UBOS;
+    auto base_bindings = variant.base_bindings;
+    base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size());
+    if (program_type != ProgramType::Compute) {
+        base_bindings.cbuf += STAGE_RESERVED_UBOS;
+    }
     base_bindings.gmem += static_cast<u32>(entries.global_memory_entries.size());
     base_bindings.sampler += static_cast<u32>(entries.samplers.size());
 
@@ -572,7 +628,7 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia
 }
 
 Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
-    if (!system.GPU().Maxwell3D().dirty_flags.shaders) {
+    if (!system.GPU().Maxwell3D().dirty.shaders) {
         return last_shaders[static_cast<std::size_t>(program)];
     }
 
@@ -589,13 +645,15 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
     // No shader found - create a new one
     ProgramCode program_code{GetShaderCode(memory_manager, program_addr, host_ptr)};
     ProgramCode program_code_b;
-    if (program == Maxwell::ShaderProgram::VertexA) {
+    const bool is_program_a{program == Maxwell::ShaderProgram::VertexA};
+    if (is_program_a) {
         const GPUVAddr program_addr_b{GetShaderAddress(system, Maxwell::ShaderProgram::VertexB)};
         program_code_b = GetShaderCode(memory_manager, program_addr_b,
                                        memory_manager.GetPointer(program_addr_b));
     }
 
-    const auto unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b);
+    const auto unique_identifier =
+        GetUniqueIdentifier(GetProgramType(program), program_code, program_code_b);
     const auto cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)};
     const ShaderParameters params{disk_cache, precompiled_programs, device, cpu_addr,
                                   host_ptr,   unique_identifier};
@@ -612,4 +670,30 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
     return last_shaders[static_cast<std::size_t>(program)] = shader;
 }
 
+Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
+    auto& memory_manager{system.GPU().MemoryManager()};
+    const auto host_ptr{memory_manager.GetPointer(code_addr)};
+    auto kernel = TryGet(host_ptr);
+    if (kernel) {
+        return kernel;
+    }
+
+    // No kernel found - create a new one
+    auto code{GetShaderCode(memory_manager, code_addr, host_ptr)};
+    const auto unique_identifier{GetUniqueIdentifier(ProgramType::Compute, code, {})};
+    const auto cpu_addr{*memory_manager.GpuToCpuAddress(code_addr)};
+    const ShaderParameters params{disk_cache, precompiled_programs, device, cpu_addr,
+                                  host_ptr,   unique_identifier};
+
+    const auto found = precompiled_shaders.find(unique_identifier);
+    if (found == precompiled_shaders.end()) {
+        kernel = CachedShader::CreateKernelFromMemory(params, std::move(code));
+    } else {
+        kernel = CachedShader::CreateKernelFromCache(params, found->second);
+    }
+
+    Register(kernel);
+    return kernel;
+}
+
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index bbb53cdf4..a3106a0ff 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -61,6 +61,11 @@ public:
                                        Maxwell::ShaderProgram program_type,
                                        GLShader::ProgramResult result);
 
+    static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode&& code);
+
+    static Shader CreateKernelFromCache(const ShaderParameters& params,
+                                        GLShader::ProgramResult result);
+
     VAddr GetCpuAddr() const override {
         return cpu_addr;
     }
@@ -78,7 +83,7 @@ public:
     std::tuple<GLuint, BaseBindings> GetProgramHandle(const ProgramVariant& variant);
 
 private:
-    explicit CachedShader(const ShaderParameters& params, Maxwell::ShaderProgram program_type,
+    explicit CachedShader(const ShaderParameters& params, ProgramType program_type,
                           GLShader::ProgramResult result);
 
     // Geometry programs. These are needed because GLSL needs an input topology but it's not
@@ -104,7 +109,7 @@ private:
     u8* host_ptr{};
     VAddr cpu_addr{};
     u64 unique_identifier{};
-    Maxwell::ShaderProgram program_type{};
+    ProgramType program_type{};
     ShaderDiskCacheOpenGL& disk_cache;
     const PrecompiledPrograms& precompiled_programs;
 
@@ -132,6 +137,9 @@ public:
     /// Gets the current specified shader stage program
     Shader GetStageProgram(Maxwell::ShaderProgram program);
 
+    /// Gets the compute kernel located at the passed address
+    Shader GetComputeKernel(GPUVAddr code_addr);
+
 protected:
     // We do not have to flush this cache as things in it are never modified by us.
     void FlushObjectInner(const Shader& object) override {}
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 119073776..d8f722c26 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -37,7 +37,6 @@ using namespace std::string_literals;
 using namespace VideoCommon::Shader;
 
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
-using ShaderStage = Tegra::Engines::Maxwell3D::Regs::ShaderStage;
 using Operation = const OperationNode&;
 
 enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat };
@@ -162,9 +161,13 @@ std::string FlowStackTopName(MetaStackClass stack) {
     return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
 }
 
+constexpr bool IsVertexShader(ProgramType stage) {
+    return stage == ProgramType::VertexA || stage == ProgramType::VertexB;
+}
+
 class GLSLDecompiler final {
 public:
-    explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage,
+    explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ProgramType stage,
                             std::string suffix)
         : device{device}, ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {}
 
@@ -248,25 +251,21 @@ public:
         }
         entries.clip_distances = ir.GetClipDistances();
         entries.shader_viewport_layer_array =
-            stage == ShaderStage::Vertex && (ir.UsesLayer() || ir.UsesViewportIndex());
+            IsVertexShader(stage) && (ir.UsesLayer() || ir.UsesViewportIndex());
         entries.shader_length = ir.GetLength();
         return entries;
     }
 
 private:
-    using OperationDecompilerFn = std::string (GLSLDecompiler::*)(Operation);
-    using OperationDecompilersArray =
-        std::array<OperationDecompilerFn, static_cast<std::size_t>(OperationCode::Amount)>;
-
     void DeclareVertex() {
-        if (stage != ShaderStage::Vertex)
+        if (!IsVertexShader(stage))
             return;
 
         DeclareVertexRedeclarations();
     }
 
     void DeclareGeometry() {
-        if (stage != ShaderStage::Geometry) {
+        if (stage != ProgramType::Geometry) {
             return;
         }
 
@@ -297,14 +296,14 @@ private:
                 break;
             }
         }
-        if (stage != ShaderStage::Vertex || device.HasVertexViewportLayer()) {
+        if (!IsVertexShader(stage) || device.HasVertexViewportLayer()) {
             if (ir.UsesLayer()) {
                 code.AddLine("int gl_Layer;");
             }
             if (ir.UsesViewportIndex()) {
                 code.AddLine("int gl_ViewportIndex;");
             }
-        } else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && stage == ShaderStage::Vertex &&
+        } else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && IsVertexShader(stage) &&
                    !device.HasVertexViewportLayer()) {
             LOG_ERROR(
                 Render_OpenGL,
@@ -341,11 +340,16 @@ private:
     }
 
     void DeclareLocalMemory() {
-        if (const u64 local_memory_size = header.GetLocalMemorySize(); local_memory_size > 0) {
-            const auto element_count = Common::AlignUp(local_memory_size, 4) / 4;
-            code.AddLine("float {}[{}];", GetLocalMemory(), element_count);
-            code.AddNewLine();
+        // TODO(Rodrigo): Unstub kernel local memory size and pass it from a register at
+        // specialization time.
+        const u64 local_memory_size =
+            stage == ProgramType::Compute ? 0x400 : header.GetLocalMemorySize();
+        if (local_memory_size == 0) {
+            return;
         }
+        const auto element_count = Common::AlignUp(local_memory_size, 4) / 4;
+        code.AddLine("float {}[{}];", GetLocalMemory(), element_count);
+        code.AddNewLine();
     }
 
     void DeclareInternalFlags() {
@@ -399,12 +403,12 @@ private:
         const u32 location{GetGenericAttributeIndex(index)};
 
         std::string name{GetInputAttribute(index)};
-        if (stage == ShaderStage::Geometry) {
+        if (stage == ProgramType::Geometry) {
             name = "gs_" + name + "[]";
         }
 
         std::string suffix;
-        if (stage == ShaderStage::Fragment) {
+        if (stage == ProgramType::Fragment) {
             const auto input_mode{header.ps.GetAttributeUse(location)};
             if (skip_unused && input_mode == AttributeUse::Unused) {
                 return;
@@ -416,7 +420,7 @@ private:
     }
 
     void DeclareOutputAttributes() {
-        if (ir.HasPhysicalAttributes() && stage != ShaderStage::Fragment) {
+        if (ir.HasPhysicalAttributes() && stage != ProgramType::Fragment) {
             for (u32 i = 0; i < GetNumPhysicalVaryings(); ++i) {
                 DeclareOutputAttribute(ToGenericAttribute(i));
             }
@@ -538,7 +542,7 @@ private:
                 constexpr u32 element_stride{4};
                 const u32 address{generic_base + index * generic_stride + element * element_stride};
 
-                const bool declared{stage != ShaderStage::Fragment ||
+                const bool declared{stage != ProgramType::Fragment ||
                                     header.ps.GetAttributeUse(index) != AttributeUse::Unused};
                 const std::string value{declared ? ReadAttribute(attribute, element) : "0"};
                 code.AddLine("case 0x{:x}: return {};", address, value);
@@ -642,7 +646,7 @@ private:
         }
 
         if (const auto abuf = std::get_if<AbufNode>(&*node)) {
-            UNIMPLEMENTED_IF_MSG(abuf->IsPhysicalBuffer() && stage == ShaderStage::Geometry,
+            UNIMPLEMENTED_IF_MSG(abuf->IsPhysicalBuffer() && stage == ProgramType::Geometry,
                                  "Physical attributes in geometry shaders are not implemented");
             if (abuf->IsPhysicalBuffer()) {
                 return fmt::format("readPhysicalAttribute(ftou({}))",
@@ -697,6 +701,9 @@ private:
         }
 
         if (const auto lmem = std::get_if<LmemNode>(&*node)) {
+            if (stage == ProgramType::Compute) {
+                LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders");
+            }
             return fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress()));
         }
 
@@ -726,7 +733,7 @@ private:
 
     std::string ReadAttribute(Attribute::Index attribute, u32 element, const Node& buffer = {}) {
         const auto GeometryPass = [&](std::string_view name) {
-            if (stage == ShaderStage::Geometry && buffer) {
+            if (stage == ProgramType::Geometry && buffer) {
                 // TODO(Rodrigo): Guard geometry inputs against out of bound reads. Some games
                 // set an 0x80000000 index for those and the shader fails to build. Find out why
                 // this happens and what's its intent.
@@ -738,10 +745,10 @@ private:
         switch (attribute) {
         case Attribute::Index::Position:
             switch (stage) {
-            case ShaderStage::Geometry:
+            case ProgramType::Geometry:
                 return fmt::format("gl_in[ftou({})].gl_Position{}", Visit(buffer),
                                    GetSwizzle(element));
-            case ShaderStage::Fragment:
+            case ProgramType::Fragment:
                 return element == 3 ? "1.0f" : ("gl_FragCoord"s + GetSwizzle(element));
             default:
                 UNREACHABLE();
@@ -762,7 +769,7 @@ private:
             // TODO(Subv): Find out what the values are for the first two elements when inside a
             // vertex shader, and what's the value of the fourth element when inside a Tess Eval
             // shader.
-            ASSERT(stage == ShaderStage::Vertex);
+            ASSERT(IsVertexShader(stage));
             switch (element) {
             case 2:
                 // Config pack's first value is instance_id.
@@ -774,7 +781,7 @@ private:
             return "0";
         case Attribute::Index::FrontFacing:
             // TODO(Subv): Find out what the values are for the other elements.
-            ASSERT(stage == ShaderStage::Fragment);
+            ASSERT(stage == ProgramType::Fragment);
             switch (element) {
             case 3:
                 return "itof(gl_FrontFacing ? -1 : 0)";
@@ -796,7 +803,7 @@ private:
             return value;
         }
         // There's a bug in NVidia's proprietary drivers that makes precise fail on fragment shaders
-        const std::string precise = stage != ShaderStage::Fragment ? "precise " : "";
+        const std::string precise = stage != ProgramType::Fragment ? "precise " : "";
 
         const std::string temporary = code.GenerateTemporary();
         code.AddLine("{}float {} = {};", precise, temporary, value);
@@ -831,12 +838,12 @@ private:
                 UNIMPLEMENTED();
                 return {};
             case 1:
-                if (stage == ShaderStage::Vertex && !device.HasVertexViewportLayer()) {
+                if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) {
                     return {};
                 }
                 return std::make_pair("gl_Layer", true);
             case 2:
-                if (stage == ShaderStage::Vertex && !device.HasVertexViewportLayer()) {
+                if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) {
                     return {};
                 }
                 return std::make_pair("gl_ViewportIndex", true);
@@ -1073,6 +1080,9 @@ private:
             target = result->first;
             is_integer = result->second;
         } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) {
+            if (stage == ProgramType::Compute) {
+                LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders");
+            }
             target = fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress()));
         } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) {
             const std::string real = Visit(gmem->GetRealAddress());
@@ -1126,6 +1136,16 @@ private:
                                Type::Float);
     }
 
+    std::string FCastHalf0(Operation operation) { // Emits "(<operand 0>)[0]"
+        const std::string op_a = VisitOperand(operation, 0, Type::HalfFloat);
+        return fmt::format("({})[0]", op_a); // Index 0 of the operand visited as HalfFloat
+    }
+
+    std::string FCastHalf1(Operation operation) { // Emits "(<operand 0>)[1]"
+        const std::string op_a = VisitOperand(operation, 0, Type::HalfFloat);
+        return fmt::format("({})[1]", op_a); // Index 1 of the operand visited as HalfFloat
+    }
+
     template <Type type>
     std::string Min(Operation operation) {
         return GenerateBinaryCall(operation, "min", type, type, type);
@@ -1282,6 +1302,11 @@ private:
         return ApplyPrecise(operation, BitwiseCastResult(clamped, Type::HalfFloat));
     }
 
+    std::string HCastFloat(Operation operation) { // Emits fromHalf2(vec2(<operand 0>, 0.0f))
+        const std::string op_a = VisitOperand(operation, 0, Type::Float);
+        return fmt::format("fromHalf2(vec2({}, 0.0f))", op_a); // Second component is 0.0f
+    }
+
     std::string HUnpack(Operation operation) {
         const std::string operand{VisitOperand(operation, 0, Type::HalfFloat)};
         const auto value = [&]() -> std::string {
@@ -1400,14 +1425,10 @@ private:
         return fmt::format("{}[{}]", pair, VisitOperand(operation, 1, Type::Uint));
     }
 
-    std::string LogicalAll2(Operation operation) {
+    std::string LogicalAnd2(Operation operation) { // GLSL all() applied to a Bool2 operand
         return GenerateUnary(operation, "all", Type::Bool, Type::Bool2);
     }
 
-    std::string LogicalAny2(Operation operation) {
-        return GenerateUnary(operation, "any", Type::Bool, Type::Bool2);
-    }
-
     template <bool with_nan>
     std::string GenerateHalfComparison(Operation operation, const std::string& compare_op) {
         const std::string comparison{GenerateBinaryCall(operation, compare_op, Type::Bool2,
@@ -1630,7 +1651,7 @@ private:
     }
 
     std::string Exit(Operation operation) {
-        if (stage != ShaderStage::Fragment) {
+        if (stage != ProgramType::Fragment) {
             code.AddLine("return;");
             return {};
         }
@@ -1681,7 +1702,7 @@ private:
     }
 
     std::string EmitVertex(Operation operation) {
-        ASSERT_MSG(stage == ShaderStage::Geometry,
+        ASSERT_MSG(stage == ProgramType::Geometry,
                    "EmitVertex is expected to be used in a geometry shader.");
 
         // If a geometry shader is attached, it will always flip (it's the last stage before
@@ -1692,7 +1713,7 @@ private:
     }
 
     std::string EndPrimitive(Operation operation) {
-        ASSERT_MSG(stage == ShaderStage::Geometry,
+        ASSERT_MSG(stage == ProgramType::Geometry,
                    "EndPrimitive is expected to be used in a geometry shader.");
 
         code.AddLine("EndPrimitive();");
@@ -1714,7 +1735,7 @@ private:
         return "utof(gl_WorkGroupID"s + GetSwizzle(element) + ')';
     }
 
-    static constexpr OperationDecompilersArray operation_decompilers = {
+    static constexpr std::array operation_decompilers = {
         &GLSLDecompiler::Assign,
 
         &GLSLDecompiler::Select,
@@ -1726,6 +1747,8 @@ private:
         &GLSLDecompiler::Negate<Type::Float>,
         &GLSLDecompiler::Absolute<Type::Float>,
         &GLSLDecompiler::FClamp,
+        &GLSLDecompiler::FCastHalf0,
+        &GLSLDecompiler::FCastHalf1,
         &GLSLDecompiler::Min<Type::Float>,
         &GLSLDecompiler::Max<Type::Float>,
         &GLSLDecompiler::FCos,
@@ -1786,6 +1809,7 @@ private:
         &GLSLDecompiler::Absolute<Type::HalfFloat>,
         &GLSLDecompiler::HNegate,
         &GLSLDecompiler::HClamp,
+        &GLSLDecompiler::HCastFloat,
         &GLSLDecompiler::HUnpack,
         &GLSLDecompiler::HMergeF32,
         &GLSLDecompiler::HMergeH0,
@@ -1798,8 +1822,7 @@ private:
         &GLSLDecompiler::LogicalXor,
         &GLSLDecompiler::LogicalNegate,
         &GLSLDecompiler::LogicalPick2,
-        &GLSLDecompiler::LogicalAll2,
-        &GLSLDecompiler::LogicalAny2,
+        &GLSLDecompiler::LogicalAnd2,
 
         &GLSLDecompiler::LogicalLessThan<Type::Float>,
         &GLSLDecompiler::LogicalEqual<Type::Float>,
@@ -1863,6 +1886,7 @@ private:
         &GLSLDecompiler::WorkGroupId<1>,
         &GLSLDecompiler::WorkGroupId<2>,
     };
+    static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
 
     std::string GetRegister(u32 index) const {
         return GetDeclarationWithSuffix(index, "gpr");
@@ -1927,7 +1951,7 @@ private:
     }
 
     u32 GetNumPhysicalInputAttributes() const {
-        return stage == ShaderStage::Vertex ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings();
+        return IsVertexShader(stage) ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings();
     }
 
     u32 GetNumPhysicalAttributes() const {
@@ -1940,7 +1964,7 @@ private:
 
     const Device& device;
     const ShaderIR& ir;
-    const ShaderStage stage;
+    const ProgramType stage;
     const std::string suffix;
     const Header header;
 
@@ -1971,7 +1995,7 @@ std::string GetCommonDeclarations() {
         MAX_CONSTBUFFER_ELEMENTS);
 }
 
-ProgramResult Decompile(const Device& device, const ShaderIR& ir, Maxwell::ShaderStage stage,
+ProgramResult Decompile(const Device& device, const ShaderIR& ir, ProgramType stage,
                         const std::string& suffix) {
     GLSLDecompiler decompiler(device, ir, stage, suffix);
     decompiler.Decompile();
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h
index 02586736d..2ea02f5bf 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -12,14 +12,26 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/shader/shader_ir.h"
 
-namespace OpenGL {
-class Device;
-}
-
 namespace VideoCommon::Shader {
 class ShaderIR;
 }
 
+namespace OpenGL {
+
+class Device;
+
+enum class ProgramType : u32 { // Program kind passed to GLShader::Decompile as its stage
+    VertexA = 0, // First program of a dual vertex setup (see GenerateVertexShader)
+    VertexB = 1, // Sole vertex program, or the second one of a dual setup
+    TessellationControl = 2,
+    TessellationEval = 3,
+    Geometry = 4,
+    Fragment = 5,
+    Compute = 6 // Passed by GenerateComputeShader
+};
+
+} // namespace OpenGL
+
 namespace OpenGL::GLShader {
 
 struct ShaderEntries;
@@ -85,6 +97,6 @@ struct ShaderEntries {
 std::string GetCommonDeclarations();
 
 ProgramResult Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
-                        Maxwell::ShaderStage stage, const std::string& suffix);
+                        ProgramType stage, const std::string& suffix);
 
 } // namespace OpenGL::GLShader
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
index 7893d1e26..969fe9ced 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -51,7 +51,7 @@ ShaderCacheVersionHash GetShaderCacheVersionHash() {
 
 } // namespace
 
-ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, Maxwell::ShaderProgram program_type,
+ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, ProgramType program_type,
                                        u32 program_code_size, u32 program_code_size_b,
                                        ProgramCode program_code, ProgramCode program_code_b)
     : unique_identifier{unique_identifier}, program_type{program_type},
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
index 4f296dda6..cc8bbd61e 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
@@ -18,7 +18,6 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "core/file_sys/vfs_vector.h"
-#include "video_core/engines/maxwell_3d.h"
 #include "video_core/renderer_opengl/gl_shader_gen.h"
 
 namespace Core {
@@ -34,14 +33,11 @@ namespace OpenGL {
 struct ShaderDiskCacheUsage;
 struct ShaderDiskCacheDump;
 
-using ShaderDumpsMap = std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>;
-
 using ProgramCode = std::vector<u64>;
-using Maxwell = Tegra::Engines::Maxwell3D::Regs;
-
+using ShaderDumpsMap = std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>;
 using TextureBufferUsage = std::bitset<64>;
 
-/// Allocated bindings used by an OpenGL shader program.
+/// Allocated bindings used by an OpenGL shader program
 struct BaseBindings {
     u32 cbuf{};
     u32 gmem{};
@@ -126,7 +122,7 @@ namespace OpenGL {
 /// Describes a shader how it's used by the guest GPU
 class ShaderDiskCacheRaw {
 public:
-    explicit ShaderDiskCacheRaw(u64 unique_identifier, Maxwell::ShaderProgram program_type,
+    explicit ShaderDiskCacheRaw(u64 unique_identifier, ProgramType program_type,
                                 u32 program_code_size, u32 program_code_size_b,
                                 ProgramCode program_code, ProgramCode program_code_b);
     ShaderDiskCacheRaw();
@@ -141,30 +137,13 @@ public:
     }
 
     bool HasProgramA() const {
-        return program_type == Maxwell::ShaderProgram::VertexA;
+        return program_type == ProgramType::VertexA;
     }
 
-    Maxwell::ShaderProgram GetProgramType() const {
+    ProgramType GetProgramType() const {
         return program_type;
     }
 
-    Maxwell::ShaderStage GetProgramStage() const {
-        switch (program_type) {
-        case Maxwell::ShaderProgram::VertexA:
-        case Maxwell::ShaderProgram::VertexB:
-            return Maxwell::ShaderStage::Vertex;
-        case Maxwell::ShaderProgram::TesselationControl:
-            return Maxwell::ShaderStage::TesselationControl;
-        case Maxwell::ShaderProgram::TesselationEval:
-            return Maxwell::ShaderStage::TesselationEval;
-        case Maxwell::ShaderProgram::Geometry:
-            return Maxwell::ShaderStage::Geometry;
-        case Maxwell::ShaderProgram::Fragment:
-            return Maxwell::ShaderStage::Fragment;
-        }
-        UNREACHABLE();
-    }
-
     const ProgramCode& GetProgramCode() const {
         return program_code;
     }
@@ -175,7 +154,7 @@ public:
 
 private:
     u64 unique_identifier{};
-    Maxwell::ShaderProgram program_type{};
+    ProgramType program_type{};
     u32 program_code_size{};
     u32 program_code_size_b{};
 
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index f9ee8429e..3a8d9e1da 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -14,7 +14,8 @@ using Tegra::Engines::Maxwell3D;
 using VideoCommon::Shader::ProgramCode;
 using VideoCommon::Shader::ShaderIR;
 
-static constexpr u32 PROGRAM_OFFSET{10};
+static constexpr u32 PROGRAM_OFFSET = 10;
+static constexpr u32 COMPUTE_OFFSET = 0;
 
 ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup) {
     const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
@@ -29,17 +30,15 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config {
 };
 
 )";
-    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
-    ProgramResult program =
-        Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex");
 
+    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
+    const auto stage = setup.IsDualProgram() ? ProgramType::VertexA : ProgramType::VertexB;
+    ProgramResult program = Decompile(device, program_ir, stage, "vertex");
     out += program.first;
 
     if (setup.IsDualProgram()) {
         const ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET, setup.program.size_b);
-        ProgramResult program_b =
-            Decompile(device, program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b");
-
+        ProgramResult program_b = Decompile(device, program_ir_b, ProgramType::VertexB, "vertex_b");
         out += program_b.first;
     }
 
@@ -80,9 +79,9 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config {
 };
 
 )";
+
     const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
-    ProgramResult program =
-        Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry");
+    ProgramResult program = Decompile(device, program_ir, ProgramType::Geometry, "geometry");
     out += program.first;
 
     out += R"(
@@ -116,9 +115,7 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform fs_config {
 
 )";
     const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
-    ProgramResult program =
-        Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment");
-
+    ProgramResult program = Decompile(device, program_ir, ProgramType::Fragment, "fragment");
     out += program.first;
 
     out += R"(
@@ -130,4 +127,22 @@ void main() {
     return {std::move(out), std::move(program.second)};
 }
 
+ProgramResult GenerateComputeShader(const Device& device, const ShaderSetup& setup) {
+    const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
+    // Source layout: unique-id banner, common declarations, decompiled program, main().
+    std::string out = "// Shader Unique Id: CS" + id + "\n\n";
+    out += GetCommonDeclarations();
+    // ShaderIR is given COMPUTE_OFFSET here; the graphics generators pass PROGRAM_OFFSET.
+    const ShaderIR program_ir(setup.program.code, COMPUTE_OFFSET, setup.program.size_a);
+    ProgramResult program = Decompile(device, program_ir, ProgramType::Compute, "compute");
+    out += program.first;
+
+    out += R"(
+void main() {
+    execute_compute();
+}
+)";
+    return {std::move(out), std::move(program.second)};
+}
+
 } // namespace OpenGL::GLShader
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h
index 7cbc590f8..3833e88ab 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -54,4 +54,7 @@ ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& se
 /// Generates the GLSL fragment shader program source code for the given FS program
 ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup);
 
+/// Generates the GLSL compute shader program source code for the given CS program
+ProgramResult GenerateComputeShader(const Device& device, const ShaderSetup& setup);
+
 } // namespace OpenGL::GLShader
diff --git a/src/video_core/renderer_opengl/gl_shader_util.cpp b/src/video_core/renderer_opengl/gl_shader_util.cpp
index 5f3fe067e..9e74eda0d 100644
--- a/src/video_core/renderer_opengl/gl_shader_util.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_util.cpp
@@ -10,21 +10,25 @@
 
 namespace OpenGL::GLShader {
 
-GLuint LoadShader(const char* source, GLenum type) {
-    const char* debug_type;
+namespace {
+const char* GetStageDebugName(GLenum type) { // Stage name used in LoadShader's log output
     switch (type) {
     case GL_VERTEX_SHADER:
-        debug_type = "vertex";
-        break;
+        return "vertex";
     case GL_GEOMETRY_SHADER:
-        debug_type = "geometry";
-        break;
+        return "geometry";
     case GL_FRAGMENT_SHADER:
-        debug_type = "fragment";
-        break;
-    default:
-        UNREACHABLE();
+        return "fragment";
+    case GL_COMPUTE_SHADER:
+        return "compute";
     }
+    UNIMPLEMENTED(); // Reached only when type matched none of the cases above
+    return "unknown";
+}
+} // Anonymous namespace
+
+GLuint LoadShader(const char* source, GLenum type) {
+    const char* debug_type = GetStageDebugName(type);
     const GLuint shader_id = glCreateShader(type);
     glShaderSource(shader_id, 1, &source, nullptr);
     LOG_DEBUG(Render_OpenGL, "Compiling {} shader...", debug_type);
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 0eae98afe..f4777d0b0 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -165,6 +165,25 @@ OpenGLState::OpenGLState() {
     alpha_test.ref = 0.0f;
 }
 
+void OpenGLState::SetDefaultViewports() { // Resets viewport/depth-clamp fields; no GL calls
+    for (auto& item : viewports) {
+        item.x = 0; // Zero-sized rectangle at the origin
+        item.y = 0;
+        item.width = 0;
+        item.height = 0;
+        item.depth_range_near = 0.0f; // Depth range [0, 1]
+        item.depth_range_far = 1.0f;
+        item.scissor.enabled = false; // Scissor disabled, its rectangle zeroed as well
+        item.scissor.x = 0;
+        item.scissor.y = 0;
+        item.scissor.width = 0;
+        item.scissor.height = 0;
+    }
+    // Depth clamping is switched off for both planes.
+    depth_clamp.far_plane = false;
+    depth_clamp.near_plane = false;
+}
+
 void OpenGLState::ApplyDefaultState() {
     glEnable(GL_BLEND);
     glDisable(GL_FRAMEBUFFER_SRGB);
@@ -526,7 +545,7 @@ void OpenGLState::ApplySamplers() const {
     }
 }
 
-void OpenGLState::Apply() const {
+void OpenGLState::Apply() {
     MICROPROFILE_SCOPE(OpenGL_State);
     ApplyFramebufferState();
     ApplyVertexArrayState();
@@ -536,19 +555,31 @@ void OpenGLState::Apply() const {
     ApplyPointSize();
     ApplyFragmentColorClamp();
     ApplyMultisample();
+    if (dirty.color_mask) {
+        ApplyColorMask();
+        dirty.color_mask = false;
+    }
     ApplyDepthClamp();
-    ApplyColorMask();
     ApplyViewport();
-    ApplyStencilTest();
+    if (dirty.stencil_state) {
+        ApplyStencilTest();
+        dirty.stencil_state = false;
+    }
     ApplySRgb();
     ApplyCulling();
     ApplyDepth();
     ApplyPrimitiveRestart();
-    ApplyBlending();
+    if (dirty.blend_state) {
+        ApplyBlending();
+        dirty.blend_state = false;
+    }
     ApplyLogicOp();
     ApplyTextures();
     ApplySamplers();
-    ApplyPolygonOffset();
+    if (dirty.polygon_offset) {
+        ApplyPolygonOffset();
+        dirty.polygon_offset = false;
+    }
     ApplyAlphaTest();
 }
 
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index b0140495d..fdf9a8a12 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -195,8 +195,9 @@ public:
         s_rgb_used = false;
     }
 
+    void SetDefaultViewports(); ///< Resets all viewports, their scissors and depth clamp
     /// Apply this state as the current OpenGL state
-    void Apply() const;
+    void Apply();
 
     void ApplyFramebufferState() const;
     void ApplyVertexArrayState() const;
@@ -237,11 +238,41 @@ public:
     /// Viewport does not affects glClearBuffer so emulate viewport using scissor test
     void EmulateViewportWithScissor();
 
+    void MarkDirtyBlendState() { // Next Apply() on this object will call ApplyBlending()
+        dirty.blend_state = true;
+    }
+
+    void MarkDirtyStencilState() { // Next Apply() on this object will call ApplyStencilTest()
+        dirty.stencil_state = true;
+    }
+
+    void MarkDirtyPolygonOffset() { // Next Apply() on this object will call ApplyPolygonOffset()
+        dirty.polygon_offset = true;
+    }
+
+    void MarkDirtyColorMask() { // Next Apply() on this object will call ApplyColorMask()
+        dirty.color_mask = true;
+    }
+
+    void AllDirty() { // Marks blend, stencil, polygon offset and color mask dirty
+        dirty.blend_state = true;
+        dirty.stencil_state = true;
+        dirty.polygon_offset = true;
+        dirty.color_mask = true; // dirty.viewport_state is not touched here
+    }
+
 private:
     static OpenGLState cur_state;
 
     // Workaround for sRGB problems caused by QT not supporting srgb output
     static bool s_rgb_used;
+    struct { // Per-group dirty flags; all start false because of dirty{}
+        bool blend_state;    // Apply() calls ApplyBlending() only while set, then clears it
+        bool stencil_state;  // Apply() calls ApplyStencilTest() only while set, then clears it
+        bool viewport_state; // NOTE(review): no visible code sets or checks this flag
+        bool polygon_offset; // Apply() calls ApplyPolygonOffset() only while set, then clears it
+        bool color_mask;     // Apply() calls ApplyColorMask() only while set, then clears it
+    } dirty{};
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index b1f6bc7c2..408332f90 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -137,7 +137,6 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format
 const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType component_type) {
     ASSERT(static_cast<std::size_t>(pixel_format) < tex_format_tuples.size());
     const auto& format{tex_format_tuples[static_cast<std::size_t>(pixel_format)]};
-    ASSERT(component_type == format.component_type);
     return format;
 }
 
@@ -485,11 +484,15 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view,
     const auto& dst_params{dst_view->GetSurfaceParams()};
 
     OpenGLState prev_state{OpenGLState::GetCurState()};
-    SCOPE_EXIT({ prev_state.Apply(); });
+    SCOPE_EXIT({
+        prev_state.AllDirty();
+        prev_state.Apply();
+    });
 
     OpenGLState state;
     state.draw.read_framebuffer = src_framebuffer.handle;
     state.draw.draw_framebuffer = dst_framebuffer.handle;
+    state.AllDirty();
     state.Apply();
 
     u32 buffers{};
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 9ecdddb0d..a05cef3b9 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -108,6 +108,7 @@ void RendererOpenGL::SwapBuffers(
 
     // Maintain the rasterizer's state as a priority
     OpenGLState prev_state = OpenGLState::GetCurState();
+    state.AllDirty();
     state.Apply();
 
     if (framebuffer) {
@@ -140,6 +141,7 @@ void RendererOpenGL::SwapBuffers(
     system.GetPerfStats().BeginSystemFrame();
 
     // Restore the rasterizer state
+    prev_state.AllDirty();
     prev_state.Apply();
 }
 
@@ -206,6 +208,7 @@ void RendererOpenGL::InitOpenGLObjects() {
     // Link shaders and get variable locations
     shader.CreateFromSource(vertex_shader, nullptr, fragment_shader);
     state.draw.shader_program = shader.handle;
+    state.AllDirty();
     state.Apply();
     uniform_modelview_matrix = glGetUniformLocation(shader.handle, "modelview_matrix");
     uniform_color_texture = glGetUniformLocation(shader.handle, "color_texture");
@@ -338,12 +341,14 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x,
     // Workaround brigthness problems in SMO by enabling sRGB in the final output
     // if it has been used in the frame. Needed because of this bug in QT: QTBUG-50987
     state.framebuffer_srgb.enabled = OpenGLState::GetsRGBUsed();
+    state.AllDirty();
     state.Apply();
     glNamedBufferSubData(vertex_buffer.handle, 0, sizeof(vertices), vertices.data());
     glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
     // Restore default state
     state.framebuffer_srgb.enabled = false;
     state.texture_units[0].texture = 0;
+    state.AllDirty();
     state.Apply();
     // Clear sRGB state for the next frame
     OpenGLState::ClearsRGBUsed();
@@ -388,6 +393,7 @@ void RendererOpenGL::CaptureScreenshot() {
     GLuint old_read_fb = state.draw.read_framebuffer;
     GLuint old_draw_fb = state.draw.draw_framebuffer;
     state.draw.read_framebuffer = state.draw.draw_framebuffer = screenshot_framebuffer.handle;
+    state.AllDirty();
     state.Apply();
 
     Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout};
@@ -407,6 +413,7 @@ void RendererOpenGL::CaptureScreenshot() {
     screenshot_framebuffer.Release();
     state.draw.read_framebuffer = old_read_fb;
     state.draw.draw_framebuffer = old_draw_fb;
+    state.AllDirty();
     state.Apply();
     glDeleteRenderbuffers(1, &renderbuffer);
 
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 9b2d8e987..24a591797 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -205,10 +205,6 @@ public:
     }
 
 private:
-    using OperationDecompilerFn = Id (SPIRVDecompiler::*)(Operation);
-    using OperationDecompilersArray =
-        std::array<OperationDecompilerFn, static_cast<std::size_t>(OperationCode::Amount)>;
-
     static constexpr auto INTERNAL_FLAGS_COUNT = static_cast<std::size_t>(InternalFlag::Amount);
 
     void AllocateBindings() {
@@ -739,6 +735,16 @@ private:
         return {};
     }
 
+    Id FCastHalf0(Operation operation) { // Stub: flags UNIMPLEMENTED, returns a default Id
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id FCastHalf1(Operation operation) { // Stub: flags UNIMPLEMENTED, returns a default Id
+        UNIMPLEMENTED();
+        return {};
+    }
+
     Id HNegate(Operation operation) {
         UNIMPLEMENTED();
         return {};
@@ -749,6 +755,11 @@ private:
         return {};
     }
 
+    Id HCastFloat(Operation operation) { // Stub: flags UNIMPLEMENTED, returns a default Id
+        UNIMPLEMENTED();
+        return {};
+    }
+
     Id HUnpack(Operation operation) {
         UNIMPLEMENTED();
         return {};
@@ -804,12 +815,7 @@ private:
         return {};
     }
 
-    Id LogicalAll2(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
-
-    Id LogicalAny2(Operation operation) {
+    Id LogicalAnd2(Operation operation) { // Stub: flags UNIMPLEMENTED, returns a default Id
         UNIMPLEMENTED();
         return {};
     }
@@ -1206,7 +1212,7 @@ private:
         return {};
     }
 
-    static constexpr OperationDecompilersArray operation_decompilers = {
+    static constexpr std::array operation_decompilers = {
         &SPIRVDecompiler::Assign,
 
         &SPIRVDecompiler::Ternary<&Module::OpSelect, Type::Float, Type::Bool, Type::Float,
@@ -1219,6 +1225,8 @@ private:
         &SPIRVDecompiler::Unary<&Module::OpFNegate, Type::Float>,
         &SPIRVDecompiler::Unary<&Module::OpFAbs, Type::Float>,
         &SPIRVDecompiler::Ternary<&Module::OpFClamp, Type::Float>,
+        &SPIRVDecompiler::FCastHalf0,
+        &SPIRVDecompiler::FCastHalf1,
         &SPIRVDecompiler::Binary<&Module::OpFMin, Type::Float>,
         &SPIRVDecompiler::Binary<&Module::OpFMax, Type::Float>,
         &SPIRVDecompiler::Unary<&Module::OpCos, Type::Float>,
@@ -1279,6 +1287,7 @@ private:
         &SPIRVDecompiler::Unary<&Module::OpFAbs, Type::HalfFloat>,
         &SPIRVDecompiler::HNegate,
         &SPIRVDecompiler::HClamp,
+        &SPIRVDecompiler::HCastFloat,
         &SPIRVDecompiler::HUnpack,
         &SPIRVDecompiler::HMergeF32,
         &SPIRVDecompiler::HMergeH0,
@@ -1291,8 +1300,7 @@ private:
         &SPIRVDecompiler::Binary<&Module::OpLogicalNotEqual, Type::Bool>,
         &SPIRVDecompiler::Unary<&Module::OpLogicalNot, Type::Bool>,
         &SPIRVDecompiler::LogicalPick2,
-        &SPIRVDecompiler::LogicalAll2,
-        &SPIRVDecompiler::LogicalAny2,
+        &SPIRVDecompiler::LogicalAnd2,
 
         &SPIRVDecompiler::Binary<&Module::OpFOrdLessThan, Type::Bool, Type::Float>,
         &SPIRVDecompiler::Binary<&Module::OpFOrdEqual, Type::Bool, Type::Float>,
@@ -1357,6 +1365,7 @@ private:
         &SPIRVDecompiler::WorkGroupId<1>,
         &SPIRVDecompiler::WorkGroupId<2>,
     };
+    static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
 
     const VKDevice& device;
     const ShaderIR& ir;
diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp
index fdcc970ff..ec3a76690 100644
--- a/src/video_core/shader/control_flow.cpp
+++ b/src/video_core/shader/control_flow.cpp
@@ -15,7 +15,7 @@
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
-
+namespace {
 using Tegra::Shader::Instruction;
 using Tegra::Shader::OpCode;
 
@@ -29,8 +29,7 @@ struct Query {
 
 struct BlockStack {
     BlockStack() = default;
-    BlockStack(const BlockStack& b) = default;
-    BlockStack(const Query& q) : ssy_stack{q.ssy_stack}, pbk_stack{q.pbk_stack} {}
+    explicit BlockStack(const Query& q) : ssy_stack{q.ssy_stack}, pbk_stack{q.pbk_stack} {}
     std::stack<u32> ssy_stack{};
     std::stack<u32> pbk_stack{};
 };
@@ -58,7 +57,7 @@ struct BlockInfo {
 struct CFGRebuildState {
     explicit CFGRebuildState(const ProgramCode& program_code, const std::size_t program_size,
                              const u32 start)
-        : program_code{program_code}, program_size{program_size}, start{start} {}
+        : start{start}, program_code{program_code}, program_size{program_size} {}
 
     u32 start{};
     std::vector<BlockInfo> block_info{};
@@ -85,7 +84,7 @@ std::pair<BlockCollision, u32> TryGetBlock(CFGRebuildState& state, u32 address)
             return {BlockCollision::Inside, index};
         }
     }
-    return {BlockCollision::None, -1};
+    return {BlockCollision::None, 0xFFFFFFFF};
 }
 
 struct ParseInfo {
@@ -365,27 +364,29 @@ bool TryQuery(CFGRebuildState& state) {
         const auto gather_end = labels.upper_bound(block.end);
         while (gather_start != gather_end) {
             cc.push(gather_start->second);
-            gather_start++;
+            ++gather_start;
         }
     };
     if (state.queries.empty()) {
         return false;
     }
+
     Query& q = state.queries.front();
     const u32 block_index = state.registered[q.address];
     BlockInfo& block = state.block_info[block_index];
-    // If the block is visted, check if the stacks match, else gather the ssy/pbk
+    // If the block is visited, check if the stacks match, else gather the ssy/pbk
     // labels into the current stack and look if the branch at the end of the block
     // consumes a label. Schedule new queries accordingly
     if (block.visited) {
         BlockStack& stack = state.stacks[q.address];
-        const bool all_okay = (stack.ssy_stack.size() == 0 || q.ssy_stack == stack.ssy_stack) &&
-                              (stack.pbk_stack.size() == 0 || q.pbk_stack == stack.pbk_stack);
+        const bool all_okay = (stack.ssy_stack.empty() || q.ssy_stack == stack.ssy_stack) &&
+                              (stack.pbk_stack.empty() || q.pbk_stack == stack.pbk_stack);
         state.queries.pop_front();
         return all_okay;
     }
     block.visited = true;
-    state.stacks[q.address] = BlockStack{q};
+    state.stacks.insert_or_assign(q.address, BlockStack{q});
+
     Query q2(q);
     state.queries.pop_front();
     gather_labels(q2.ssy_stack, state.ssy_labels, block);
@@ -394,6 +395,7 @@ bool TryQuery(CFGRebuildState& state) {
         q2.address = block.end + 1;
         state.queries.push_back(q2);
     }
+
     Query conditional_query{q2};
     if (block.branch.is_sync) {
         if (block.branch.address == unassigned_branch) {
@@ -408,13 +410,15 @@ bool TryQuery(CFGRebuildState& state) {
         conditional_query.pbk_stack.pop();
     }
     conditional_query.address = block.branch.address;
-    state.queries.push_back(conditional_query);
+    state.queries.push_back(std::move(conditional_query));
     return true;
 }
+} // Anonymous namespace
 
-std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 program_size,
-                                              u32 start_address) {
+std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code,
+                                              std::size_t program_size, u32 start_address) {
     CFGRebuildState state{program_code, program_size, start_address};
+
     // Inspect Code and generate blocks
     state.labels.clear();
     state.labels.emplace(start_address);
@@ -424,10 +428,9 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u
             return {};
         }
     }
+
     // Decompile Stacks
-    Query start_query{};
-    start_query.address = state.start;
-    state.queries.push_back(start_query);
+    state.queries.push_back(Query{state.start, {}, {}});
     bool decompiled = true;
     while (!state.queries.empty()) {
         if (!TryQuery(state)) {
@@ -435,14 +438,15 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u
             break;
         }
     }
+
     // Sort and organize results
     std::sort(state.block_info.begin(), state.block_info.end(),
-              [](const BlockInfo& a, const BlockInfo& b) -> bool { return a.start < b.start; });
+              [](const BlockInfo& a, const BlockInfo& b) { return a.start < b.start; });
     ShaderCharacteristics result_out{};
     result_out.decompilable = decompiled;
     result_out.start = start_address;
     result_out.end = start_address;
-    for (auto& block : state.block_info) {
+    for (const auto& block : state.block_info) {
         ShaderBlock new_block{};
         new_block.start = block.start;
         new_block.end = block.end;
@@ -457,8 +461,9 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u
     }
     if (result_out.decompilable) {
         result_out.labels = std::move(state.labels);
-        return {result_out};
+        return {std::move(result_out)};
     }
+
     // If it's not decompilable, merge the unlabelled blocks together
     auto back = result_out.blocks.begin();
     auto next = std::next(back);
@@ -469,8 +474,8 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u
             continue;
         }
         back = next;
-        next++;
+        ++next;
     }
-    return {result_out};
+    return {std::move(result_out)};
 }
 } // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/control_flow.h b/src/video_core/shader/control_flow.h
index 5e8ea3271..b0a5e4f8c 100644
--- a/src/video_core/shader/control_flow.h
+++ b/src/video_core/shader/control_flow.h
@@ -4,7 +4,6 @@
 
 #pragma once
 
-#include <cstring>
 #include <list>
 #include <optional>
 #include <unordered_set>
@@ -26,27 +25,44 @@ struct Condition {
     bool IsUnconditional() const {
         return predicate == Pred::UnusedIndex && cc == ConditionCode::T;
     }
+
     bool operator==(const Condition& other) const {
         return std::tie(predicate, cc) == std::tie(other.predicate, other.cc);
     }
+
+    bool operator!=(const Condition& other) const {
+        return !operator==(other);
+    }
 };
 
 struct ShaderBlock {
-    u32 start{};
-    u32 end{};
-    bool ignore_branch{};
     struct Branch {
         Condition cond{};
         bool kills{};
         s32 address{};
+
         bool operator==(const Branch& b) const {
             return std::tie(cond, kills, address) == std::tie(b.cond, b.kills, b.address);
         }
-    } branch{};
+
+        bool operator!=(const Branch& b) const {
+            return !operator==(b);
+        }
+    };
+
+    u32 start{};
+    u32 end{};
+    bool ignore_branch{};
+    Branch branch{};
+
     bool operator==(const ShaderBlock& sb) const {
         return std::tie(start, end, ignore_branch, branch) ==
                std::tie(sb.start, sb.end, sb.ignore_branch, sb.branch);
     }
+
+    bool operator!=(const ShaderBlock& sb) const {
+        return !operator==(sb);
+    }
 };
 
 struct ShaderCharacteristics {
@@ -57,7 +73,7 @@ struct ShaderCharacteristics {
     std::unordered_set<u32> labels{};
 };
 
-std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 program_size,
-                                              u32 start_address);
+std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code,
+                                              std::size_t program_size, u32 start_address);
 
 } // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp
index afffd157f..b547d8323 100644
--- a/src/video_core/shader/decode.cpp
+++ b/src/video_core/shader/decode.cpp
@@ -47,14 +47,14 @@ void ShaderIR::Decode() {
         if (shader_info.decompilable) {
             disable_flow_stack = true;
             const auto insert_block = [this](NodeBlock& nodes, u32 label) {
-                if (label == exit_branch) {
+                if (label == static_cast<u32>(exit_branch)) {
                     return;
                 }
                 basic_blocks.insert({label, nodes});
             };
             const auto& blocks = shader_info.blocks;
             NodeBlock current_block;
-            u32 current_label = exit_branch;
+            u32 current_label = static_cast<u32>(exit_branch);
             for (auto& block : blocks) {
                 if (shader_info.labels.count(block.start) != 0) {
                     insert_block(current_block, current_label);
diff --git a/src/video_core/shader/decode/arithmetic.cpp b/src/video_core/shader/decode/arithmetic.cpp
index 87d8fecaa..1473c282a 100644
--- a/src/video_core/shader/decode/arithmetic.cpp
+++ b/src/video_core/shader/decode/arithmetic.cpp
@@ -42,11 +42,14 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) {
     case OpCode::Id::FMUL_R:
     case OpCode::Id::FMUL_IMM: {
         // FMUL does not have 'abs' bits and only the second operand has a 'neg' bit.
-        UNIMPLEMENTED_IF_MSG(instr.fmul.tab5cb8_2 != 0, "FMUL tab5cb8_2({}) is not implemented",
-                             instr.fmul.tab5cb8_2.Value());
-        UNIMPLEMENTED_IF_MSG(
-            instr.fmul.tab5c68_0 != 1, "FMUL tab5cb8_0({}) is not implemented",
-            instr.fmul.tab5c68_0.Value()); // SMO typical sends 1 here which seems to be the default
+        if (instr.fmul.tab5cb8_2 != 0) {
+            LOG_WARNING(HW_GPU, "FMUL tab5cb8_2({}) is not implemented",
+                        instr.fmul.tab5cb8_2.Value());
+        }
+        if (instr.fmul.tab5c68_0 != 1) {
+            LOG_WARNING(HW_GPU, "FMUL tab5cb8_0({}) is not implemented",
+                        instr.fmul.tab5c68_0.Value());
+        }
 
         op_b = GetOperandAbsNegFloat(op_b, false, instr.fmul.negate_b);
 
diff --git a/src/video_core/shader/decode/arithmetic_half_immediate.cpp b/src/video_core/shader/decode/arithmetic_half_immediate.cpp
index 7bcf38f23..6466fc011 100644
--- a/src/video_core/shader/decode/arithmetic_half_immediate.cpp
+++ b/src/video_core/shader/decode/arithmetic_half_immediate.cpp
@@ -23,7 +23,9 @@ u32 ShaderIR::DecodeArithmeticHalfImmediate(NodeBlock& bb, u32 pc) {
             LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
         }
     } else {
-        UNIMPLEMENTED_IF(instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None);
+        if (instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None) {
+            LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
+        }
     }
 
     Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half_imm.type_a);
diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp
index 4221f0c58..8973fbefa 100644
--- a/src/video_core/shader/decode/conversion.cpp
+++ b/src/video_core/shader/decode/conversion.cpp
@@ -57,7 +57,7 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
     case OpCode::Id::I2F_R:
     case OpCode::Id::I2F_C:
     case OpCode::Id::I2F_IMM: {
-        UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word);
+        UNIMPLEMENTED_IF(instr.conversion.dst_size == Register::Size::Long);
         UNIMPLEMENTED_IF(instr.conversion.selector);
         UNIMPLEMENTED_IF_MSG(instr.generates_cc,
                              "Condition codes generation in I2F is not implemented");
@@ -82,14 +82,19 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
         value = GetOperandAbsNegFloat(value, false, instr.conversion.negate_a);
 
         SetInternalFlagsFromFloat(bb, value, instr.generates_cc);
+
+        if (instr.conversion.dst_size == Register::Size::Short) {
+            value = Operation(OperationCode::HCastFloat, PRECISE, value);
+        }
+
         SetRegister(bb, instr.gpr0, value);
         break;
     }
     case OpCode::Id::F2F_R:
     case OpCode::Id::F2F_C:
     case OpCode::Id::F2F_IMM: {
-        UNIMPLEMENTED_IF(instr.conversion.f2f.dst_size != Register::Size::Word);
-        UNIMPLEMENTED_IF(instr.conversion.f2f.src_size != Register::Size::Word);
+        UNIMPLEMENTED_IF(instr.conversion.dst_size == Register::Size::Long);
+        UNIMPLEMENTED_IF(instr.conversion.src_size == Register::Size::Long);
         UNIMPLEMENTED_IF_MSG(instr.generates_cc,
                              "Condition codes generation in F2F is not implemented");
 
@@ -107,6 +112,11 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
             }
         }();
 
+        if (instr.conversion.src_size == Register::Size::Short) {
+            // TODO: figure out where extract is set in the encoding
+            value = Operation(OperationCode::FCastHalf0, PRECISE, value);
+        }
+
         value = GetOperandAbsNegFloat(value, instr.conversion.abs_a, instr.conversion.negate_a);
 
         value = [&]() {
@@ -124,19 +134,24 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
             default:
                 UNIMPLEMENTED_MSG("Unimplemented F2F rounding mode {}",
                                   static_cast<u32>(instr.conversion.f2f.rounding.Value()));
-                return Immediate(0);
+                return value;
             }
         }();
         value = GetSaturatedFloat(value, instr.alu.saturate_d);
 
         SetInternalFlagsFromFloat(bb, value, instr.generates_cc);
+
+        if (instr.conversion.dst_size == Register::Size::Short) {
+            value = Operation(OperationCode::HCastFloat, PRECISE, value);
+        }
+
         SetRegister(bb, instr.gpr0, value);
         break;
     }
     case OpCode::Id::F2I_R:
     case OpCode::Id::F2I_C:
     case OpCode::Id::F2I_IMM: {
-        UNIMPLEMENTED_IF(instr.conversion.src_size != Register::Size::Word);
+        UNIMPLEMENTED_IF(instr.conversion.src_size == Register::Size::Long);
         UNIMPLEMENTED_IF_MSG(instr.generates_cc,
                              "Condition codes generation in F2I is not implemented");
         Node value = [&]() {
@@ -153,6 +168,11 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
             }
         }();
 
+        if (instr.conversion.src_size == Register::Size::Short) {
+            // TODO: figure out where extract is set in the encoding
+            value = Operation(OperationCode::FCastHalf0, PRECISE, value);
+        }
+
         value = GetOperandAbsNegFloat(value, instr.conversion.abs_a, instr.conversion.negate_a);
 
         value = [&]() {
diff --git a/src/video_core/shader/decode/ffma.cpp b/src/video_core/shader/decode/ffma.cpp
index 29be25ca3..ca2f39e8d 100644
--- a/src/video_core/shader/decode/ffma.cpp
+++ b/src/video_core/shader/decode/ffma.cpp
@@ -18,10 +18,12 @@ u32 ShaderIR::DecodeFfma(NodeBlock& bb, u32 pc) {
     const auto opcode = OpCode::Decode(instr);
 
     UNIMPLEMENTED_IF_MSG(instr.ffma.cc != 0, "FFMA cc not implemented");
-    UNIMPLEMENTED_IF_MSG(instr.ffma.tab5980_0 != 1, "FFMA tab5980_0({}) not implemented",
-                         instr.ffma.tab5980_0.Value()); // Seems to be 1 by default based on SMO
-    UNIMPLEMENTED_IF_MSG(instr.ffma.tab5980_1 != 0, "FFMA tab5980_1({}) not implemented",
-                         instr.ffma.tab5980_1.Value());
+    if (instr.ffma.tab5980_0 != 1) {
+        LOG_WARNING(HW_GPU, "FFMA tab5980_0({}) not implemented", instr.ffma.tab5980_0.Value());
+    }
+    if (instr.ffma.tab5980_1 != 0) {
+        LOG_WARNING(HW_GPU, "FFMA tab5980_1({}) not implemented", instr.ffma.tab5980_1.Value());
+    }
 
     const Node op_a = GetRegister(instr.gpr8);
 
diff --git a/src/video_core/shader/decode/half_set_predicate.cpp b/src/video_core/shader/decode/half_set_predicate.cpp
index d59d15bd8..afea33e5f 100644
--- a/src/video_core/shader/decode/half_set_predicate.cpp
+++ b/src/video_core/shader/decode/half_set_predicate.cpp
@@ -18,43 +18,56 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
     const auto opcode = OpCode::Decode(instr);
 
-    UNIMPLEMENTED_IF(instr.hsetp2.ftz != 0);
+    DEBUG_ASSERT(instr.hsetp2.ftz == 0);
 
     Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hsetp2.type_a);
     op_a = GetOperandAbsNegHalf(op_a, instr.hsetp2.abs_a, instr.hsetp2.negate_a);
 
-    Node op_b = [&]() {
-        switch (opcode->get().GetId()) {
-        case OpCode::Id::HSETP2_R:
-            return GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.abs_a,
-                                        instr.hsetp2.negate_b);
-        default:
-            UNREACHABLE();
-            return Immediate(0);
-        }
-    }();
-    op_b = UnpackHalfFloat(op_b, instr.hsetp2.type_b);
-
-    // We can't use the constant predicate as destination.
-    ASSERT(instr.hsetp2.pred3 != static_cast<u64>(Pred::UnusedIndex));
-
-    const Node second_pred = GetPredicate(instr.hsetp2.pred39, instr.hsetp2.neg_pred != 0);
+    Tegra::Shader::PredCondition cond{};
+    bool h_and{};
+    Node op_b{};
+    switch (opcode->get().GetId()) {
+    case OpCode::Id::HSETP2_C:
+        cond = instr.hsetp2.cbuf_and_imm.cond;
+        h_and = instr.hsetp2.cbuf_and_imm.h_and;
+        op_b = GetOperandAbsNegHalf(GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()),
+                                    instr.hsetp2.cbuf.abs_b, instr.hsetp2.cbuf.negate_b);
+        break;
+    case OpCode::Id::HSETP2_IMM:
+        cond = instr.hsetp2.cbuf_and_imm.cond;
+        h_and = instr.hsetp2.cbuf_and_imm.h_and;
+        op_b = UnpackHalfImmediate(instr, true);
+        break;
+    case OpCode::Id::HSETP2_R:
+        cond = instr.hsetp2.reg.cond;
+        h_and = instr.hsetp2.reg.h_and;
+        op_b =
+            UnpackHalfFloat(GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.reg.abs_b,
+                                                 instr.hsetp2.reg.negate_b),
+                            instr.hsetp2.reg.type_b);
+        break;
+    default:
+        UNREACHABLE();
+        op_b = Immediate(0);
+    }
 
     const OperationCode combiner = GetPredicateCombiner(instr.hsetp2.op);
-    const OperationCode pair_combiner =
-        instr.hsetp2.h_and ? OperationCode::LogicalAll2 : OperationCode::LogicalAny2;
-
-    const Node comparison = GetPredicateComparisonHalf(instr.hsetp2.cond, op_a, op_b);
-    const Node first_pred = Operation(pair_combiner, comparison);
+    const Node combined_pred = GetPredicate(instr.hsetp2.pred3, instr.hsetp2.neg_pred);
 
-    // Set the primary predicate to the result of Predicate OP SecondPredicate
-    const Node value = Operation(combiner, first_pred, second_pred);
-    SetPredicate(bb, instr.hsetp2.pred3, value);
+    const auto Write = [&](u64 dest, Node src) {
+        SetPredicate(bb, dest, Operation(combiner, std::move(src), combined_pred));
+    };
 
-    if (instr.hsetp2.pred0 != static_cast<u64>(Pred::UnusedIndex)) {
-        // Set the secondary predicate to the result of !Predicate OP SecondPredicate, if enabled
-        const Node negated_pred = Operation(OperationCode::LogicalNegate, first_pred);
-        SetPredicate(bb, instr.hsetp2.pred0, Operation(combiner, negated_pred, second_pred));
+    const Node comparison = GetPredicateComparisonHalf(cond, op_a, op_b);
+    const u64 first = instr.hsetp2.pred0;
+    const u64 second = instr.hsetp2.pred39;
+    if (h_and) {
+        const Node joined = Operation(OperationCode::LogicalAnd2, comparison);
+        Write(first, joined);
+        Write(second, Operation(OperationCode::LogicalNegate, joined));
+    } else {
+        Write(first, Operation(OperationCode::LogicalPick2, comparison, Immediate(0u)));
+        Write(second, Operation(OperationCode::LogicalPick2, comparison, Immediate(1u)));
     }
 
     return pc;
diff --git a/src/video_core/shader/decode/hfma2.cpp b/src/video_core/shader/decode/hfma2.cpp
index c3bcf1ae9..5b44cb79c 100644
--- a/src/video_core/shader/decode/hfma2.cpp
+++ b/src/video_core/shader/decode/hfma2.cpp
@@ -22,9 +22,9 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) {
     const auto opcode = OpCode::Decode(instr);
 
     if (opcode->get().GetId() == OpCode::Id::HFMA2_RR) {
-        UNIMPLEMENTED_IF(instr.hfma2.rr.precision != HalfPrecision::None);
+        DEBUG_ASSERT(instr.hfma2.rr.precision == HalfPrecision::None);
     } else {
-        UNIMPLEMENTED_IF(instr.hfma2.precision != HalfPrecision::None);
+        DEBUG_ASSERT(instr.hfma2.precision == HalfPrecision::None);
     }
 
     constexpr auto identity = HalfType::H0_H1;
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp
index c0f64d7a0..ac0e764d6 100644
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -22,6 +22,12 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
     const auto opcode = OpCode::Decode(instr);
 
     switch (opcode->get().GetId()) {
+    case OpCode::Id::NOP: {
+        UNIMPLEMENTED_IF(instr.nop.cc != Tegra::Shader::ConditionCode::T);
+        UNIMPLEMENTED_IF(instr.nop.trigger != 0);
+        // With the previous preconditions, this instruction is a no-operation.
+        break;
+    }
     case OpCode::Id::EXIT: {
         const Tegra::Shader::ConditionCode cc = instr.flow_condition_code;
         UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "EXIT condition code used: {}",
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index 7427ed896..5f0852364 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -30,6 +30,8 @@ enum class OperationCode {
     FNegate,       /// (MetaArithmetic, float a) -> float
     FAbsolute,     /// (MetaArithmetic, float a) -> float
     FClamp,        /// (MetaArithmetic, float value, float min, float max) -> float
+    FCastHalf0,    /// (MetaArithmetic, f16vec2 a) -> float
+    FCastHalf1,    /// (MetaArithmetic, f16vec2 a) -> float
     FMin,          /// (MetaArithmetic, float a, float b) -> float
     FMax,          /// (MetaArithmetic, float a, float b) -> float
     FCos,          /// (MetaArithmetic, float a) -> float
@@ -83,17 +85,18 @@ enum class OperationCode {
     UBitfieldExtract, /// (MetaArithmetic, uint value, int offset, int offset) -> uint
     UBitCount,        /// (MetaArithmetic, uint) -> uint
 
-    HAdd,      /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
-    HMul,      /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
-    HFma,      /// (MetaArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2
-    HAbsolute, /// (f16vec2 a) -> f16vec2
-    HNegate,   /// (f16vec2 a, bool first, bool second) -> f16vec2
-    HClamp,    /// (f16vec2 src, float min, float max) -> f16vec2
-    HUnpack,   /// (Tegra::Shader::HalfType, T value) -> f16vec2
-    HMergeF32, /// (f16vec2 src) -> float
-    HMergeH0,  /// (f16vec2 dest, f16vec2 src) -> f16vec2
-    HMergeH1,  /// (f16vec2 dest, f16vec2 src) -> f16vec2
-    HPack2,    /// (float a, float b) -> f16vec2
+    HAdd,       /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
+    HMul,       /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
+    HFma,       /// (MetaArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2
+    HAbsolute,  /// (f16vec2 a) -> f16vec2
+    HNegate,    /// (f16vec2 a, bool first, bool second) -> f16vec2
+    HClamp,     /// (f16vec2 src, float min, float max) -> f16vec2
+    HCastFloat, /// (MetaArithmetic, float a) -> f16vec2
+    HUnpack,    /// (Tegra::Shader::HalfType, T value) -> f16vec2
+    HMergeF32,  /// (f16vec2 src) -> float
+    HMergeH0,   /// (f16vec2 dest, f16vec2 src) -> f16vec2
+    HMergeH1,   /// (f16vec2 dest, f16vec2 src) -> f16vec2
+    HPack2,     /// (float a, float b) -> f16vec2
 
     LogicalAssign, /// (bool& dst, bool src) -> void
     LogicalAnd,    /// (bool a, bool b) -> bool
@@ -101,8 +104,7 @@ enum class OperationCode {
     LogicalXor,    /// (bool a, bool b) -> bool
     LogicalNegate, /// (bool a) -> bool
     LogicalPick2,  /// (bool2 pair, uint index) -> bool
-    LogicalAll2,   /// (bool2 a) -> bool
-    LogicalAny2,   /// (bool2 a) -> bool
+    LogicalAnd2,   /// (bool2 a) -> bool
 
     LogicalFLessThan,     /// (float a, float b) -> bool
     LogicalFEqual,        /// (float a, float b) -> bool
diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp
index a53e02253..55f5949e4 100644
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -59,8 +59,8 @@ std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& co
         return TrackCbuf(source, code, new_cursor);
     }
     if (const auto operation = std::get_if<OperationNode>(&*tracked)) {
-        for (std::size_t i = 0; i < operation->GetOperandsCount(); ++i) {
-            if (auto found = TrackCbuf((*operation)[i], code, cursor); std::get<0>(found)) {
+        for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) {
+            if (auto found = TrackCbuf((*operation)[i - 1], code, cursor); std::get<0>(found)) {
                 // Cbuf found in operand.
                 return found;
             }
diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp
index 6af9044ca..683c49207 100644
--- a/src/video_core/texture_cache/surface_base.cpp
+++ b/src/video_core/texture_cache/surface_base.cpp
@@ -24,9 +24,8 @@ StagingCache::StagingCache() = default;
 StagingCache::~StagingCache() = default;
 
 SurfaceBaseImpl::SurfaceBaseImpl(GPUVAddr gpu_addr, const SurfaceParams& params)
-    : params{params}, mipmap_sizes(params.num_levels),
-      mipmap_offsets(params.num_levels), gpu_addr{gpu_addr}, host_memory_size{
-                                                                 params.GetHostSizeInBytes()} {
+    : params{params}, host_memory_size{params.GetHostSizeInBytes()}, gpu_addr{gpu_addr},
+      mipmap_sizes(params.num_levels), mipmap_offsets(params.num_levels) {
     std::size_t offset = 0;
     for (u32 level = 0; level < params.num_levels; ++level) {
         const std::size_t mipmap_size{params.GetGuestMipmapSize(level)};
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 7f9623c62..a3a3770a7 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -116,10 +116,10 @@ public:
         std::lock_guard lock{mutex};
         auto& maxwell3d = system.GPU().Maxwell3D();
 
-        if (!maxwell3d.dirty_flags.zeta_buffer) {
+        if (!maxwell3d.dirty.depth_buffer) {
             return depth_buffer.view;
         }
-        maxwell3d.dirty_flags.zeta_buffer = false;
+        maxwell3d.dirty.depth_buffer = false;
 
         const auto& regs{maxwell3d.regs};
         const auto gpu_addr{regs.zeta.Address()};
@@ -145,10 +145,10 @@ public:
         std::lock_guard lock{mutex};
         ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets);
         auto& maxwell3d = system.GPU().Maxwell3D();
-        if (!maxwell3d.dirty_flags.color_buffer[index]) {
+        if (!maxwell3d.dirty.render_target[index]) {
             return render_targets[index].view;
         }
-        maxwell3d.dirty_flags.color_buffer.reset(index);
+        maxwell3d.dirty.render_target[index] = false;
 
         const auto& regs{maxwell3d.regs};
         if (index >= regs.rt_control.count || regs.rt[index].Address() == 0 ||
@@ -274,10 +274,11 @@ protected:
         auto& maxwell3d = system.GPU().Maxwell3D();
         const u32 index = surface->GetRenderTarget();
         if (index == DEPTH_RT) {
-            maxwell3d.dirty_flags.zeta_buffer = true;
+            maxwell3d.dirty.depth_buffer = true;
         } else {
-            maxwell3d.dirty_flags.color_buffer.set(index, true);
+            maxwell3d.dirty.render_target[index] = true;
         }
+        maxwell3d.dirty.render_settings = true;
     }
 
     void Register(TSurface surface) {