27 files changed, 1566 insertions, 336 deletions
diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp
new file mode 100644
index 000000000..ec3a76690
--- /dev/null
+++ b/src/video_core/shader/control_flow.cpp
@@ -0,0 +1,481 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <list>
+#include <map>
+#include <stack>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/shader/control_flow.h"
+#include "video_core/shader/shader_ir.h"
+
+namespace VideoCommon::Shader {
+namespace {
+using Tegra::Shader::Instruction;
+using Tegra::Shader::OpCode;
+
+constexpr s32 unassigned_branch = -2; // SYNC/BRK placeholder; TryQuery fills in the real target
+
+struct Query { // One pending work item of the stack analysis done by TryQuery
+    u32 address{};               // Looked up in CFGRebuildState::registered to find the block
+    std::stack<u32> ssy_stack{}; // SSY targets gathered so far along this path
+    std::stack<u32> pbk_stack{}; // PBK targets gathered so far along this path
+};
+
+struct BlockStack { // Snapshot of the stacks a block was first visited with (see TryQuery)
+    BlockStack() = default;
+    explicit BlockStack(const Query& q) : ssy_stack{q.ssy_stack}, pbk_stack{q.pbk_stack} {}
+    std::stack<u32> ssy_stack{}; // Copy of Query::ssy_stack
+    std::stack<u32> pbk_stack{}; // Copy of Query::pbk_stack
+};
+
+struct BlockBranchInfo { // How a block terminates, as filled in by ParseCode
+    Condition condition{};    // Predicate and condition code read from the flow instruction
+    s32 address{exit_branch}; // Target offset, exit_branch, or unassigned_branch until resolved
+    bool kill{};              // Set for KIL
+    bool is_sync{};           // Set for SYNC; TryQuery takes the target from the SSY stack
+    bool is_brk{};            // Set for BRK; TryQuery takes the target from the PBK stack
+    bool ignore{};            // Set when the block only falls through into the block at `address`
+};
+
+struct BlockInfo { // A run of offsets [start, end] together with its terminating branch
+    u32 start{};    // First offset covered by the block
+    u32 end{};      // Last offset covered by the block (inclusive)
+    bool visited{}; // Set by TryQuery once the block's entry stacks have been recorded
+    BlockBranchInfo branch{};
+
+    bool IsInside(const u32 address) const { // Inclusive on both ends
+        return start <= address && address <= end;
+    }
+};
+
+struct CFGRebuildState { // Working state shared by the block scan and the stack analysis
+    explicit CFGRebuildState(const ProgramCode& program_code, const std::size_t program_size,
+                             const u32 start)
+        : start{start}, program_code{program_code}, program_size{program_size} {}
+
+    u32 start{};                                  // Entry offset; base for IsSchedInstruction
+    std::vector<BlockInfo> block_info{};          // Blocks discovered so far
+    std::list<u32> inspect_queries{};             // Addresses waiting for TryInspectAddress
+    std::list<Query> queries{};                   // Work items waiting for TryQuery
+    std::unordered_map<u32, u32> registered{};    // Block start -> index into block_info
+    std::unordered_set<u32> labels{};             // Entry point plus every BRA/SSY/PBK target
+    std::map<u32, u32> ssy_labels{};              // Offset of an SSY -> its target
+    std::map<u32, u32> pbk_labels{};              // Offset of a PBK -> its target
+    std::unordered_map<u32, BlockStack> stacks{}; // Block start -> stacks seen on first visit
+    const ProgramCode& program_code;              // Held by reference; must outlive this state
+    const std::size_t program_size;               // Divided by sizeof(Instruction) in ParseCode
+};
+
+enum class BlockCollision : u32 { None, Found, Inside }; // Found: block start; Inside: mid-block
+
+std::pair<BlockCollision, u32> TryGetBlock(CFGRebuildState& state, u32 address) {
+    u32 position = 0; // Index of `candidate` inside state.block_info
+    for (const BlockInfo& candidate : state.block_info) {
+        if (candidate.start == address) { // Checked first: a block start always wins
+            return {BlockCollision::Found, position};
+        } else if (candidate.IsInside(address)) {
+            return {BlockCollision::Inside, position};
+        }
+        ++position;
+    }
+    return {BlockCollision::None, 0xFFFFFFFF}; // No block starts at or contains `address`
+}
+
+struct ParseInfo { // Payload returned by ParseCode next to the ParseResult
+    BlockBranchInfo branch_info{}; // How the parsed block terminates
+    u32 end_address{};             // Last offset belonging to the parsed block
+};
+
+BlockInfo& CreateBlockInfo(CFGRebuildState& state, u32 start, u32 end) {
+    BlockInfo& created = state.block_info.emplace_back(); // May reallocate block_info
+    created.start = start;
+    created.end = end;
+    const u32 created_index = static_cast<u32>(state.block_info.size()) - 1;
+    state.registered.emplace(start, created_index); // Keeps an existing entry for `start`
+    return created;
+}
+
+Pred GetPredicate(u32 index, bool negated) { // Negated predicates map to index + 8
+    return static_cast<Pred>(negated ? index + 8 : index);
+}
+
+/**
+ * Tells whether the slot at `offset` holds a 'sched' instruction.
+ * Counted from `main_offset`, every fourth slot is one; it precedes a run of 3 instructions.
+ */
+constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) {
+    constexpr u32 SchedPeriod = 4;
+    const u32 relative_offset = offset - main_offset;
+
+    return relative_offset % SchedPeriod == 0;
+}
+
+enum class ParseResult : u32 { // Outcome of ParseCode
+    ControlCaught, // An EXIT/BRA/SYNC/BRK/KIL ended the block
+    BlockEnd,      // The scan hit the program limit or the start of a registered block
+    AbnormalFlow,  // BRX or a constant-buffer BRA was found
+};
+
+std::pair<ParseResult, ParseInfo> ParseCode(CFGRebuildState& state, u32 address) {
+    u32 offset = static_cast<u32>(address); // Scan cursor, advanced one slot at a time
+    const u32 end_address = static_cast<u32>(state.program_size / sizeof(Instruction));
+    ParseInfo parse_info{};
+
+    const auto insert_label = [](CFGRebuildState& state, u32 address) {
+        const auto pair = state.labels.emplace(address);
+        if (pair.second) { // Queue a label for inspection only the first time it is seen
+            state.inspect_queries.push_back(address);
+        }
+    };
+
+    while (true) {
+        if (offset >= end_address) {
+            // ASSERT_OR_EXECUTE can't be used, as it ignores the break
+            ASSERT_MSG(false, "Shader passed the current limit!");
+            parse_info.branch_info.address = exit_branch;
+            parse_info.branch_info.ignore = false;
+            break;
+        }
+        if (state.registered.count(offset) != 0) { // Ran into the start of an existing block
+            parse_info.branch_info.address = offset;
+            parse_info.branch_info.ignore = true; // This block just falls through into it
+            break;
+        }
+        if (IsSchedInstruction(offset, state.start)) { // Sched slots are skipped
+            offset++;
+            continue;
+        }
+        const Instruction instr = {state.program_code[offset]};
+        const auto opcode = OpCode::Decode(instr);
+        if (!opcode || opcode->get().GetType() != OpCode::Type::Flow) {
+            offset++; // Undecodable and non-flow instructions never end a block
+            continue;
+        }
+
+        switch (opcode->get().GetId()) {
+        case OpCode::Id::EXIT: {
+            const auto pred_index = static_cast<u32>(instr.pred.pred_index);
+            parse_info.branch_info.condition.predicate =
+                GetPredicate(pred_index, instr.negate_pred != 0);
+            if (parse_info.branch_info.condition.predicate == Pred::NeverExecute) {
+                offset++; // NeverExecute predicate: scan past the instruction
+                continue;
+            }
+            const ConditionCode cc = instr.flow_condition_code;
+            parse_info.branch_info.condition.cc = cc;
+            if (cc == ConditionCode::F) {
+                offset++; // Condition code F: scan past the instruction as well
+                continue;
+            }
+            parse_info.branch_info.address = exit_branch;
+            parse_info.branch_info.kill = false;
+            parse_info.branch_info.is_sync = false;
+            parse_info.branch_info.is_brk = false;
+            parse_info.branch_info.ignore = false;
+            parse_info.end_address = offset;
+
+            return {ParseResult::ControlCaught, parse_info};
+        }
+        case OpCode::Id::BRA: {
+            if (instr.bra.constant_buffer != 0) { // Reported as abnormal flow
+                return {ParseResult::AbnormalFlow, parse_info};
+            }
+            const auto pred_index = static_cast<u32>(instr.pred.pred_index);
+            parse_info.branch_info.condition.predicate =
+                GetPredicate(pred_index, instr.negate_pred != 0);
+            if (parse_info.branch_info.condition.predicate == Pred::NeverExecute) {
+                offset++;
+                continue;
+            }
+            const ConditionCode cc = instr.flow_condition_code;
+            parse_info.branch_info.condition.cc = cc;
+            if (cc == ConditionCode::F) {
+                offset++;
+                continue;
+            }
+            const u32 branch_offset = offset + instr.bra.GetBranchTarget();
+            if (branch_offset == 0) { // NOTE(review): target 0 is treated as exit; confirm why
+                parse_info.branch_info.address = exit_branch;
+            } else {
+                parse_info.branch_info.address = branch_offset;
+            }
+            insert_label(state, branch_offset); // Inserted even when mapped to exit_branch
+            parse_info.branch_info.kill = false;
+            parse_info.branch_info.is_sync = false;
+            parse_info.branch_info.is_brk = false;
+            parse_info.branch_info.ignore = false;
+            parse_info.end_address = offset;
+
+            return {ParseResult::ControlCaught, parse_info};
+        }
+        case OpCode::Id::SYNC: {
+            const auto pred_index = static_cast<u32>(instr.pred.pred_index);
+            parse_info.branch_info.condition.predicate =
+                GetPredicate(pred_index, instr.negate_pred != 0);
+            if (parse_info.branch_info.condition.predicate == Pred::NeverExecute) {
+                offset++;
+                continue;
+            }
+            const ConditionCode cc = instr.flow_condition_code;
+            parse_info.branch_info.condition.cc = cc;
+            if (cc == ConditionCode::F) {
+                offset++;
+                continue;
+            }
+            parse_info.branch_info.address = unassigned_branch; // Resolved later by TryQuery
+            parse_info.branch_info.kill = false;
+            parse_info.branch_info.is_sync = true;
+            parse_info.branch_info.is_brk = false;
+            parse_info.branch_info.ignore = false;
+            parse_info.end_address = offset;
+
+            return {ParseResult::ControlCaught, parse_info};
+        }
+        case OpCode::Id::BRK: {
+            const auto pred_index = static_cast<u32>(instr.pred.pred_index);
+            parse_info.branch_info.condition.predicate =
+                GetPredicate(pred_index, instr.negate_pred != 0);
+            if (parse_info.branch_info.condition.predicate == Pred::NeverExecute) {
+                offset++;
+                continue;
+            }
+            const ConditionCode cc = instr.flow_condition_code;
+            parse_info.branch_info.condition.cc = cc;
+            if (cc == ConditionCode::F) {
+                offset++;
+                continue;
+            }
+            parse_info.branch_info.address = unassigned_branch; // Resolved later by TryQuery
+            parse_info.branch_info.kill = false;
+            parse_info.branch_info.is_sync = false;
+            parse_info.branch_info.is_brk = true;
+            parse_info.branch_info.ignore = false;
+            parse_info.end_address = offset;
+
+            return {ParseResult::ControlCaught, parse_info};
+        }
+        case OpCode::Id::KIL: {
+            const auto pred_index = static_cast<u32>(instr.pred.pred_index);
+            parse_info.branch_info.condition.predicate =
+                GetPredicate(pred_index, instr.negate_pred != 0);
+            if (parse_info.branch_info.condition.predicate == Pred::NeverExecute) {
+                offset++;
+                continue;
+            }
+            const ConditionCode cc = instr.flow_condition_code;
+            parse_info.branch_info.condition.cc = cc;
+            if (cc == ConditionCode::F) {
+                offset++;
+                continue;
+            }
+            parse_info.branch_info.address = exit_branch;
+            parse_info.branch_info.kill = true;
+            parse_info.branch_info.is_sync = false;
+            parse_info.branch_info.is_brk = false;
+            parse_info.branch_info.ignore = false;
+            parse_info.end_address = offset;
+
+            return {ParseResult::ControlCaught, parse_info};
+        }
+        case OpCode::Id::SSY: {
+            const u32 target = offset + instr.bra.GetBranchTarget();
+            insert_label(state, target);
+            state.ssy_labels.emplace(offset, target);
+            break; // SSY does not end the block; the scan continues
+        }
+        case OpCode::Id::PBK: {
+            const u32 target = offset + instr.bra.GetBranchTarget();
+            insert_label(state, target);
+            state.pbk_labels.emplace(offset, target);
+            break; // PBK does not end the block; the scan continues
+        }
+        case OpCode::Id::BRX: {
+            return {ParseResult::AbnormalFlow, parse_info};
+        }
+        default:
+            break;
+        }
+
+        offset++;
+    }
+    parse_info.branch_info.kill = false; // Only reached through the two loop-level breaks above
+    parse_info.branch_info.is_sync = false;
+    parse_info.branch_info.is_brk = false;
+    parse_info.end_address = offset - 1; // Last offset before the one that stopped the scan
+    return {ParseResult::BlockEnd, parse_info};
+}
+
+bool TryInspectAddress(CFGRebuildState& state) { // Handles one queued address; false aborts
+    if (state.inspect_queries.empty()) {
+        return false;
+    }
+
+    const u32 address = state.inspect_queries.front();
+    state.inspect_queries.pop_front();
+    const auto [result, block_index] = TryGetBlock(state, address);
+    switch (result) {
+    case BlockCollision::Found: { // A block already starts here: nothing to do
+        return true;
+    }
+    case BlockCollision::Inside: {
+        // This case is the tricky one:
+        // We need to split the block into 2 separate blocks
+        const u32 end = state.block_info[block_index].end;
+        BlockInfo& new_block = CreateBlockInfo(state, address, end);
+        BlockInfo& current_block = state.block_info[block_index]; // Fetched after the insertion
+        current_block.end = address - 1;
+        new_block.branch = current_block.branch; // The tail inherits the original terminator
+        BlockBranchInfo forward_branch{};
+        forward_branch.address = address;
+        forward_branch.ignore = true; // The head now only falls through into the tail
+        current_block.branch = forward_branch;
+        return true;
+    }
+    default:
+        break;
+    }
+    const auto [parse_result, parse_info] = ParseCode(state, address);
+    if (parse_result == ParseResult::AbnormalFlow) {
+        // On AbnormalFlow we return false, which ends the CFG reconstruction
+        return false;
+    }
+
+    BlockInfo& block_info = CreateBlockInfo(state, address, parse_info.end_address);
+    block_info.branch = parse_info.branch_info;
+    if (parse_info.branch_info.condition.IsUnconditional()) {
+        return true; // No fallthrough path to inspect
+    }
+
+    const u32 fallthrough_address = parse_info.end_address + 1;
+    state.inspect_queries.push_front(fallthrough_address); // Inspected before other queries
+    return true;
+}
+
+bool TryQuery(CFGRebuildState& state) { // Runs one stack query; false means failure
+    const auto gather = [](std::stack<u32>& cc, std::map<u32, u32>& labels, BlockInfo& block) {
+        const auto gather_end = labels.upper_bound(block.end);
+        for (auto it = labels.lower_bound(block.start); it != gather_end; ++it) {
+            cc.push(it->second); // Target of an SSY/PBK located inside the block
+        }
+    };
+    if (state.queries.empty()) {
+        return false;
+    }
+
+    Query& q = state.queries.front();
+    const auto registered_it = state.registered.find(q.address); // operator[] would insert
+    if (registered_it == state.registered.end()) { // e.g. exit_branch: no block to analyze
+        state.queries.pop_front();
+        return true;
+    }
+    BlockInfo& block = state.block_info[registered_it->second];
+    // Revisits only compare stacks; first visits gather SSY/PBK labels and schedule successors.
+    if (block.visited) {
+        BlockStack& stack = state.stacks[q.address];
+        const bool all_okay = (stack.ssy_stack.empty() || q.ssy_stack == stack.ssy_stack) &&
+                              (stack.pbk_stack.empty() || q.pbk_stack == stack.pbk_stack);
+        state.queries.pop_front();
+        return all_okay;
+    }
+    block.visited = true;
+    state.stacks.insert_or_assign(q.address, BlockStack{q});
+    Query q2(q); // Copy first: `q` dangles once the front of the list is popped
+    state.queries.pop_front();
+    gather(q2.ssy_stack, state.ssy_labels, block);
+    gather(q2.pbk_stack, state.pbk_labels, block);
+    if (!block.branch.condition.IsUnconditional()) {
+        q2.address = block.end + 1;
+        state.queries.push_back(q2); // The fallthrough path keeps the full stacks
+    }
+    if ((block.branch.is_sync && q2.ssy_stack.empty()) ||
+        (block.branch.is_brk && q2.pbk_stack.empty())) {
+        return false; // Nothing to pop: top()/pop() on an empty stack is undefined behavior
+    }
+    if (block.branch.is_sync) {
+        if (block.branch.address == unassigned_branch) {
+            block.branch.address = q2.ssy_stack.top();
+        }
+        q2.ssy_stack.pop();
+    }
+    if (block.branch.is_brk) {
+        if (block.branch.address == unassigned_branch) {
+            block.branch.address = q2.pbk_stack.top();
+        }
+        q2.pbk_stack.pop();
+    }
+    q2.address = block.branch.address; // The taken path continues at the branch target
+    state.queries.push_back(std::move(q2));
+    return true;
+}
+} // Anonymous namespace
+
+std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code,
+                                              std::size_t program_size, u32 start_address) {
+    CFGRebuildState state{program_code, program_size, start_address};
+
+    // Inspect Code and generate blocks
+    state.labels.clear();
+    state.labels.emplace(start_address); // The entry point is always a label
+    state.inspect_queries.push_back(state.start);
+    while (!state.inspect_queries.empty()) {
+        if (!TryInspectAddress(state)) {
+            return {}; // Abnormal flow was found: no result at all
+        }
+    }
+
+    // Decompile Stacks
+    state.queries.push_back(Query{state.start, {}, {}});
+    bool decompiled = true;
+    while (!state.queries.empty()) {
+        if (!TryQuery(state)) {
+            decompiled = false; // Blocks are still returned, flagged as not decompilable
+            break;
+        }
+    }
+
+    // Sort and organize results
+    std::sort(state.block_info.begin(), state.block_info.end(),
+              [](const BlockInfo& a, const BlockInfo& b) { return a.start < b.start; });
+    ShaderCharacteristics result_out{};
+    result_out.decompilable = decompiled;
+    result_out.start = start_address;
+    result_out.end = start_address;
+    for (const auto& block : state.block_info) {
+        ShaderBlock new_block{};
+        new_block.start = block.start;
+        new_block.end = block.end;
+        new_block.ignore_branch = block.branch.ignore;
+        if (!new_block.ignore_branch) { // Fallthrough-only blocks keep a default Branch
+            new_block.branch.cond = block.branch.condition;
+            new_block.branch.kills = block.branch.kill;
+            new_block.branch.address = block.branch.address;
+        }
+        result_out.end = std::max(result_out.end, block.end);
+        result_out.blocks.push_back(new_block);
+    }
+    if (result_out.decompilable) {
+        result_out.labels = std::move(state.labels); // Labels are only exported on this path
+        return {std::move(result_out)};
+    }
+
+    // If it's not decompilable, merge the unlabelled blocks together
+    auto back = result_out.blocks.begin();
+    auto next = std::next(back);
+    while (next != result_out.blocks.end()) {
+        if (state.labels.count(next->start) == 0 && next->start == back->end + 1) {
+            back->end = next->end; // Contiguous and not a jump target: absorb into `back`
+            next = result_out.blocks.erase(next);
+            continue;
+        }
+        back = next;
+        ++next;
+    }
+    return {std::move(result_out)};
+}
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/control_flow.h b/src/video_core/shader/control_flow.h
new file mode 100644
index 000000000..b0a5e4f8c
--- /dev/null
+++ b/src/video_core/shader/control_flow.h
@@ -0,0 +1,79 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <list>
+#include <optional>
+#include <unordered_set>
+
+#include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/shader_ir.h"
+
+namespace VideoCommon::Shader {
+
+using Tegra::Shader::ConditionCode;
+using Tegra::Shader::Pred;
+
+constexpr s32 exit_branch = -1; // Branch "address" meaning the shader exits instead of jumping
+
+struct Condition { // Guard of a branch: a predicate combined with a condition code
+    Pred predicate{Pred::UnusedIndex};  // Values above 7 are the negated form of (value - 8)
+    ConditionCode cc{ConditionCode::T};
+
+    bool IsUnconditional() const { // True for a default-constructed Condition
+        return predicate == Pred::UnusedIndex && cc == ConditionCode::T;
+    }
+
+    bool operator==(const Condition& other) const {
+        return std::tie(predicate, cc) == std::tie(other.predicate, other.cc);
+    }
+
+    bool operator!=(const Condition& other) const {
+        return !operator==(other);
+    }
+};
+
+struct ShaderBlock { // One block of the control flow graph returned by ScanFlow
+    struct Branch { // Terminator of the block; only filled in when ignore_branch is false
+        Condition cond{};
+        bool kills{};  // Copied from the scanner's `kill` flag, which is set for KIL
+        s32 address{}; // Target offset; a negative value is emitted as Exit/Discard by the decoder
+
+        bool operator==(const Branch& b) const {
+            return std::tie(cond, kills, address) == std::tie(b.cond, b.kills, b.address);
+        }
+
+        bool operator!=(const Branch& b) const {
+            return !operator==(b);
+        }
+    };
+
+    u32 start{};          // First offset of the block
+    u32 end{};            // Last offset of the block (inclusive)
+    bool ignore_branch{}; // Block only falls through; `branch` stays default-initialized
+    Branch branch{};
+
+    bool operator==(const ShaderBlock& sb) const {
+        return std::tie(start, end, ignore_branch, branch) ==
+               std::tie(sb.start, sb.end, sb.ignore_branch, sb.branch);
+    }
+
+    bool operator!=(const ShaderBlock& sb) const {
+        return !operator==(sb);
+    }
+};
+
+struct ShaderCharacteristics { // Result of ScanFlow
+    std::list<ShaderBlock> blocks{};  // Sorted by ascending start offset
+    bool decompilable{};              // False when the SSY/PBK stack analysis failed
+    u32 start{};                      // The start_address given to ScanFlow
+    u32 end{};                        // Largest `end` among the scanned blocks
+    std::unordered_set<u32> labels{}; // Only filled in when decompilable is true
+};
+
+std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code,
+                                              std::size_t program_size, u32 start_address);
+
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp
index a0554c97e..47a9fd961 100644
--- a/src/video_core/shader/decode.cpp
+++ b/src/video_core/shader/decode.cpp
@@ -11,6 +11,7 @@
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/engines/shader_header.h"
+#include "video_core/shader/control_flow.h"
 #include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
@@ -21,20 +22,6 @@ using Tegra::Shader::OpCode;
 
 namespace {
 
-/// Merges exit method of two parallel branches.
-constexpr ExitMethod ParallelExit(ExitMethod a, ExitMethod b) {
-    if (a == ExitMethod::Undetermined) {
-        return b;
-    }
-    if (b == ExitMethod::Undetermined) {
-        return a;
-    }
-    if (a == b) {
-        return a;
-    }
-    return ExitMethod::Conditional;
-}
-
 /**
  * Returns whether the instruction at the specified offset is a 'sched' instruction.
  * Sched instructions always appear before a sequence of 3 instructions.
@@ -51,85 +38,104 @@ constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) {
 void ShaderIR::Decode() {
     std::memcpy(&header, program_code.data(), sizeof(Tegra::Shader::Header));
 
-    std::set<u32> labels;
-    const ExitMethod exit_method = Scan(main_offset, MAX_PROGRAM_LENGTH, labels);
-    if (exit_method != ExitMethod::AlwaysEnd) {
-        UNREACHABLE_MSG("Program does not always end");
-    }
-
-    if (labels.empty()) {
-        basic_blocks.insert({main_offset, DecodeRange(main_offset, MAX_PROGRAM_LENGTH)});
+    disable_flow_stack = false;
+    const auto info = ScanFlow(program_code, program_size, main_offset);
+    if (info) {
+        const auto& shader_info = *info;
+        coverage_begin = shader_info.start;
+        coverage_end = shader_info.end;
+        if (shader_info.decompilable) {
+            disable_flow_stack = true;
+            const auto insert_block = [this](NodeBlock& nodes, u32 label) {
+                if (label == static_cast<u32>(exit_branch)) {
+                    return;
+                }
+                basic_blocks.insert({label, nodes});
+            };
+            const auto& blocks = shader_info.blocks;
+            NodeBlock current_block;
+            u32 current_label = static_cast<u32>(exit_branch);
+            for (auto& block : blocks) {
+                if (shader_info.labels.count(block.start) != 0) {
+                    insert_block(current_block, current_label);
+                    current_block.clear();
+                    current_label = block.start;
+                }
+                if (!block.ignore_branch) {
+                    DecodeRangeInner(current_block, block.start, block.end);
+                    InsertControlFlow(current_block, block);
+                } else {
+                    DecodeRangeInner(current_block, block.start, block.end + 1);
+                }
+            }
+            insert_block(current_block, current_label);
+            return;
+        }
+        LOG_WARNING(HW_GPU, "Flow Stack Removing Failed! Falling back to old method");
+        // We can't decompile it, so fall back to the standard method
+        for (const auto& block : shader_info.blocks) {
+            basic_blocks.insert({block.start, DecodeRange(block.start, block.end + 1)});
+        }
         return;
     }
+    LOG_WARNING(HW_GPU, "Flow Analysis Failed! Falling back to brute force compiling");
+
+    // Now we need to deal with an undecompilable shader. We need to brute force
+    // a shader that captures every position.
+    coverage_begin = main_offset;
+    const u32 shader_end = static_cast<u32>(program_size / sizeof(u64));
+    coverage_end = shader_end;
+    for (u32 label = main_offset; label < shader_end; label++) {
+        basic_blocks.insert({label, DecodeRange(label, label + 1)});
+    }
+}
 
-    labels.insert(main_offset);
-
-    for (const u32 label : labels) {
-        const auto next_it = labels.lower_bound(label + 1);
-        const u32 next_label = next_it == labels.end() ? MAX_PROGRAM_LENGTH : *next_it;
+NodeBlock ShaderIR::DecodeRange(u32 begin, u32 end) {
+    NodeBlock basic_block;
+    DecodeRangeInner(basic_block, begin, end);
+    return basic_block;
+}
 
-        basic_blocks.insert({label, DecodeRange(label, next_label)});
+void ShaderIR::DecodeRangeInner(NodeBlock& bb, u32 begin, u32 end) {
+    for (u32 pc = begin; pc < (begin > end ? MAX_PROGRAM_LENGTH : end);) {
+        pc = DecodeInstr(bb, pc);
     }
 }
 
-ExitMethod ShaderIR::Scan(u32 begin, u32 end, std::set<u32>& labels) {
-    const auto [iter, inserted] =
-        exit_method_map.emplace(std::make_pair(begin, end), ExitMethod::Undetermined);
-    ExitMethod& exit_method = iter->second;
-    if (!inserted)
-        return exit_method;
-
-    for (u32 offset = begin; offset != end && offset != MAX_PROGRAM_LENGTH; ++offset) {
-        coverage_begin = std::min(coverage_begin, offset);
-        coverage_end = std::max(coverage_end, offset + 1);
-
-        const Instruction instr = {program_code[offset]};
-        const auto opcode = OpCode::Decode(instr);
-        if (!opcode)
-            continue;
-        switch (opcode->get().GetId()) {
-        case OpCode::Id::EXIT: {
-            // The EXIT instruction can be predicated, which means that the shader can conditionally
-            // end on this instruction. We have to consider the case where the condition is not met
-            // and check the exit method of that other basic block.
-            using Tegra::Shader::Pred;
-            if (instr.pred.pred_index == static_cast<u64>(Pred::UnusedIndex)) {
-                return exit_method = ExitMethod::AlwaysEnd;
-            } else {
-                const ExitMethod not_met = Scan(offset + 1, end, labels);
-                return exit_method = ParallelExit(ExitMethod::AlwaysEnd, not_met);
-            }
+void ShaderIR::InsertControlFlow(NodeBlock& bb, const ShaderBlock& block) {
+    const auto apply_conditions = [&](const Condition& cond, Node n) -> Node {
+        Node result = n;
+        if (cond.cc != ConditionCode::T) {
+            result = Conditional(GetConditionCode(cond.cc), {result});
         }
-        case OpCode::Id::BRA: {
-            const u32 target = offset + instr.bra.GetBranchTarget();
-            labels.insert(target);
-            const ExitMethod no_jmp = Scan(offset + 1, end, labels);
-            const ExitMethod jmp = Scan(target, end, labels);
-            return exit_method = ParallelExit(no_jmp, jmp);
-        }
-        case OpCode::Id::SSY:
-        case OpCode::Id::PBK: {
-            // The SSY and PBK use a similar encoding as the BRA instruction.
-            UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
-                                 "Constant buffer branching is not supported");
-            const u32 target = offset + instr.bra.GetBranchTarget();
-            labels.insert(target);
-            // Continue scanning for an exit method.
-            break;
+        if (cond.predicate != Pred::UnusedIndex) {
+            u32 pred = static_cast<u32>(cond.predicate);
+            const bool is_neg = pred > 7;
+            if (is_neg) {
+                pred -= 8;
+            }
+            result = Conditional(GetPredicate(pred, is_neg), {result});
         }
-        default:
-            break;
+        return result;
+    };
+    if (block.branch.address < 0) {
+        if (block.branch.kills) {
+            Node n = Operation(OperationCode::Discard);
+            n = apply_conditions(block.branch.cond, n);
+            bb.push_back(n);
+            global_code.push_back(n);
+            return;
         }
+        Node n = Operation(OperationCode::Exit);
+        n = apply_conditions(block.branch.cond, n);
+        bb.push_back(n);
+        global_code.push_back(n);
+        return;
     }
-    return exit_method = ExitMethod::AlwaysReturn;
-}
-
-NodeBlock ShaderIR::DecodeRange(u32 begin, u32 end) {
-    NodeBlock basic_block;
-    for (u32 pc = begin; pc < (begin > end ? MAX_PROGRAM_LENGTH : end);) {
-        pc = DecodeInstr(basic_block, pc);
-    }
-    return basic_block;
+    Node n = Operation(OperationCode::Branch, Immediate(block.branch.address));
+    n = apply_conditions(block.branch.cond, n);
+    bb.push_back(n);
+    global_code.push_back(n);
 }
 
 u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) {
@@ -140,15 +146,18 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) {
 
     const Instruction instr = {program_code[pc]};
     const auto opcode = OpCode::Decode(instr);
+    const u32 nv_address = ConvertAddressToNvidiaSpace(pc);
 
     // Decoding failure
     if (!opcode) {
         UNIMPLEMENTED_MSG("Unhandled instruction: {0:x}", instr.value);
+        bb.push_back(Comment(fmt::format("{:05x} Unimplemented Shader instruction (0x{:016x})",
+                                         nv_address, instr.value)));
         return pc + 1;
     }
 
-    bb.push_back(
-        Comment(fmt::format("{}: {} (0x{:016x})", pc, opcode->get().GetName(), instr.value)));
+    bb.push_back(Comment(
+        fmt::format("{:05x} {} (0x{:016x})", nv_address, opcode->get().GetName(), instr.value)));
 
     using Tegra::Shader::Pred;
     UNIMPLEMENTED_IF_MSG(instr.pred.full_pred == Pred::NeverExecute,
@@ -167,8 +176,10 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) {
         {OpCode::Type::Ffma, &ShaderIR::DecodeFfma},
         {OpCode::Type::Hfma2, &ShaderIR::DecodeHfma2},
         {OpCode::Type::Conversion, &ShaderIR::DecodeConversion},
+        {OpCode::Type::Warp, &ShaderIR::DecodeWarp},
         {OpCode::Type::Memory, &ShaderIR::DecodeMemory},
         {OpCode::Type::Texture, &ShaderIR::DecodeTexture},
+        {OpCode::Type::Image, &ShaderIR::DecodeImage},
         {OpCode::Type::FloatSetPredicate, &ShaderIR::DecodeFloatSetPredicate},
         {OpCode::Type::IntegerSetPredicate, &ShaderIR::DecodeIntegerSetPredicate},
         {OpCode::Type::HalfSetPredicate, &ShaderIR::DecodeHalfSetPredicate},
diff --git a/src/video_core/shader/decode/arithmetic.cpp b/src/video_core/shader/decode/arithmetic.cpp
index 87d8fecaa..1473c282a 100644
--- a/src/video_core/shader/decode/arithmetic.cpp
+++ b/src/video_core/shader/decode/arithmetic.cpp
@@ -42,11 +42,14 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) {
     case OpCode::Id::FMUL_R:
     case OpCode::Id::FMUL_IMM: {
         // FMUL does not have 'abs' bits and only the second operand has a 'neg' bit.
-        UNIMPLEMENTED_IF_MSG(instr.fmul.tab5cb8_2 != 0, "FMUL tab5cb8_2({}) is not implemented",
-                             instr.fmul.tab5cb8_2.Value());
-        UNIMPLEMENTED_IF_MSG(
-            instr.fmul.tab5c68_0 != 1, "FMUL tab5cb8_0({}) is not implemented",
-            instr.fmul.tab5c68_0.Value()); // SMO typical sends 1 here which seems to be the default
+        if (instr.fmul.tab5cb8_2 != 0) {
+            LOG_WARNING(HW_GPU, "FMUL tab5cb8_2({}) is not implemented",
+                        instr.fmul.tab5cb8_2.Value());
+        }
+        if (instr.fmul.tab5c68_0 != 1) {
+            LOG_WARNING(HW_GPU, "FMUL tab5cb8_0({}) is not implemented",
+                        instr.fmul.tab5c68_0.Value());
+        }
 
         op_b = GetOperandAbsNegFloat(op_b, false, instr.fmul.negate_b);
 
diff --git a/src/video_core/shader/decode/arithmetic_half_immediate.cpp b/src/video_core/shader/decode/arithmetic_half_immediate.cpp
index 7bcf38f23..6466fc011 100644
--- a/src/video_core/shader/decode/arithmetic_half_immediate.cpp
+++ b/src/video_core/shader/decode/arithmetic_half_immediate.cpp
@@ -23,7 +23,9 @@ u32 ShaderIR::DecodeArithmeticHalfImmediate(NodeBlock& bb, u32 pc) {
             LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
         }
     } else {
-        UNIMPLEMENTED_IF(instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None);
+        if (instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None) {
+            LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
+        }
     }
 
     Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half_imm.type_a);
diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp
index 4221f0c58..32facd6ba 100644
--- a/src/video_core/shader/decode/conversion.cpp
+++ b/src/video_core/shader/decode/conversion.cpp
@@ -14,6 +14,12 @@ using Tegra::Shader::Instruction;
 using Tegra::Shader::OpCode;
 using Tegra::Shader::Register;
 
+namespace {
+constexpr OperationCode GetFloatSelector(u64 selector) { // Cast op for Short-sized sources
+    return selector != 0 ? OperationCode::FCastHalf1 : OperationCode::FCastHalf0;
+}
+} // Anonymous namespace
+
 u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
     const auto opcode = OpCode::Decode(instr);
@@ -22,7 +28,7 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
     case OpCode::Id::I2I_R:
     case OpCode::Id::I2I_C:
     case OpCode::Id::I2I_IMM: {
-        UNIMPLEMENTED_IF(instr.conversion.selector);
+        UNIMPLEMENTED_IF(instr.conversion.int_src.selector != 0);
         UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word);
         UNIMPLEMENTED_IF(instr.alu.saturate_d);
 
@@ -57,8 +63,8 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
     case OpCode::Id::I2F_R:
     case OpCode::Id::I2F_C:
     case OpCode::Id::I2F_IMM: {
-        UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word);
-        UNIMPLEMENTED_IF(instr.conversion.selector);
+        UNIMPLEMENTED_IF(instr.conversion.int_src.selector != 0);
+        UNIMPLEMENTED_IF(instr.conversion.dst_size == Register::Size::Long);
         UNIMPLEMENTED_IF_MSG(instr.generates_cc,
                              "Condition codes generation in I2F is not implemented");
 
@@ -82,14 +88,19 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
         value = GetOperandAbsNegFloat(value, false, instr.conversion.negate_a);
 
         SetInternalFlagsFromFloat(bb, value, instr.generates_cc);
+
+        if (instr.conversion.dst_size == Register::Size::Short) {
+            value = Operation(OperationCode::HCastFloat, PRECISE, value);
+        }
+
         SetRegister(bb, instr.gpr0, value);
         break;
     }
     case OpCode::Id::F2F_R:
     case OpCode::Id::F2F_C:
     case OpCode::Id::F2F_IMM: {
-        UNIMPLEMENTED_IF(instr.conversion.f2f.dst_size != Register::Size::Word);
-        UNIMPLEMENTED_IF(instr.conversion.f2f.src_size != Register::Size::Word);
+        UNIMPLEMENTED_IF(instr.conversion.dst_size == Register::Size::Long);
+        UNIMPLEMENTED_IF(instr.conversion.src_size == Register::Size::Long);
         UNIMPLEMENTED_IF_MSG(instr.generates_cc,
                              "Condition codes generation in F2F is not implemented");
 
@@ -107,6 +118,13 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
             }
         }();
 
+        if (instr.conversion.src_size == Register::Size::Short) {
+            value = Operation(GetFloatSelector(instr.conversion.float_src.selector), NO_PRECISE,
+                              std::move(value));
+        } else {
+            ASSERT(instr.conversion.float_src.selector == 0);
+        }
+
         value = GetOperandAbsNegFloat(value, instr.conversion.abs_a, instr.conversion.negate_a);
 
         value = [&]() {
@@ -124,19 +142,24 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
             default:
                 UNIMPLEMENTED_MSG("Unimplemented F2F rounding mode {}",
                                   static_cast<u32>(instr.conversion.f2f.rounding.Value()));
-                return Immediate(0);
+                return value;
             }
         }();
         value = GetSaturatedFloat(value, instr.alu.saturate_d);
 
         SetInternalFlagsFromFloat(bb, value, instr.generates_cc);
+
+        if (instr.conversion.dst_size == Register::Size::Short) {
+            value = Operation(OperationCode::HCastFloat, PRECISE, value);
+        }
+
         SetRegister(bb, instr.gpr0, value);
         break;
     }
     case OpCode::Id::F2I_R:
     case OpCode::Id::F2I_C:
     case OpCode::Id::F2I_IMM: {
-        UNIMPLEMENTED_IF(instr.conversion.src_size != Register::Size::Word);
+        UNIMPLEMENTED_IF(instr.conversion.src_size == Register::Size::Long);
         UNIMPLEMENTED_IF_MSG(instr.generates_cc,
                              "Condition codes generation in F2I is not implemented");
         Node value = [&]() {
@@ -153,6 +176,13 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
             }
         }();
 
+        if (instr.conversion.src_size == Register::Size::Short) {
+            value = Operation(GetFloatSelector(instr.conversion.float_src.selector), NO_PRECISE,
+                              std::move(value));
+        } else {
+            ASSERT(instr.conversion.float_src.selector == 0);
+        }
+
         value = GetOperandAbsNegFloat(value, instr.conversion.abs_a, instr.conversion.negate_a);
 
         value = [&]() {
diff --git a/src/video_core/shader/decode/decode_integer_set.cpp b/src/video_core/shader/decode/decode_integer_set.cpp
deleted file mode 100644
index e69de29bb..000000000
--- a/src/video_core/shader/decode/decode_integer_set.cpp
+++ /dev/null
diff --git a/src/video_core/shader/decode/ffma.cpp b/src/video_core/shader/decode/ffma.cpp
index 29be25ca3..ca2f39e8d 100644
--- a/src/video_core/shader/decode/ffma.cpp
+++ b/src/video_core/shader/decode/ffma.cpp
@@ -18,10 +18,12 @@ u32 ShaderIR::DecodeFfma(NodeBlock& bb, u32 pc) {
     const auto opcode = OpCode::Decode(instr);
 
     UNIMPLEMENTED_IF_MSG(instr.ffma.cc != 0, "FFMA cc not implemented");
-    UNIMPLEMENTED_IF_MSG(instr.ffma.tab5980_0 != 1, "FFMA tab5980_0({}) not implemented",
-                         instr.ffma.tab5980_0.Value()); // Seems to be 1 by default based on SMO
-    UNIMPLEMENTED_IF_MSG(instr.ffma.tab5980_1 != 0, "FFMA tab5980_1({}) not implemented",
-                         instr.ffma.tab5980_1.Value());
+    if (instr.ffma.tab5980_0 != 1) {
+        LOG_WARNING(HW_GPU, "FFMA tab5980_0({}) not implemented", instr.ffma.tab5980_0.Value());
+    }
+    if (instr.ffma.tab5980_1 != 0) {
+        LOG_WARNING(HW_GPU, "FFMA tab5980_1({}) not implemented", instr.ffma.tab5980_1.Value());
+    }
 
     const Node op_a = GetRegister(instr.gpr8);
 
diff --git a/src/video_core/shader/decode/float_set.cpp b/src/video_core/shader/decode/float_set.cpp
index f5013e44a..5614e8a0d 100644
--- a/src/video_core/shader/decode/float_set.cpp
+++ b/src/video_core/shader/decode/float_set.cpp
@@ -15,7 +15,6 @@ using Tegra::Shader::OpCode;
 
 u32 ShaderIR::DecodeFloatSet(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
-    const auto opcode = OpCode::Decode(instr);
 
     const Node op_a = GetOperandAbsNegFloat(GetRegister(instr.gpr8), instr.fset.abs_a != 0,
                                             instr.fset.neg_a != 0);
diff --git a/src/video_core/shader/decode/float_set_predicate.cpp b/src/video_core/shader/decode/float_set_predicate.cpp
index 2323052b0..200c2c983 100644
--- a/src/video_core/shader/decode/float_set_predicate.cpp
+++ b/src/video_core/shader/decode/float_set_predicate.cpp
@@ -16,10 +16,9 @@ using Tegra::Shader::Pred;
 
 u32 ShaderIR::DecodeFloatSetPredicate(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
-    const auto opcode = OpCode::Decode(instr);
 
-    const Node op_a = GetOperandAbsNegFloat(GetRegister(instr.gpr8), instr.fsetp.abs_a != 0,
-                                            instr.fsetp.neg_a != 0);
+    Node op_a = GetOperandAbsNegFloat(GetRegister(instr.gpr8), instr.fsetp.abs_a != 0,
+                                      instr.fsetp.neg_a != 0);
     Node op_b = [&]() {
         if (instr.is_b_imm) {
             return GetImmediate19(instr);
@@ -29,12 +28,13 @@ u32 ShaderIR::DecodeFloatSetPredicate(NodeBlock& bb, u32 pc) {
             return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
         }
     }();
-    op_b = GetOperandAbsNegFloat(op_b, instr.fsetp.abs_b, false);
+    op_b = GetOperandAbsNegFloat(std::move(op_b), instr.fsetp.abs_b, instr.fsetp.neg_b);
 
     // We can't use the constant predicate as destination.
     ASSERT(instr.fsetp.pred3 != static_cast<u64>(Pred::UnusedIndex));
 
-    const Node predicate = GetPredicateComparisonFloat(instr.fsetp.cond, op_a, op_b);
+    const Node predicate =
+        GetPredicateComparisonFloat(instr.fsetp.cond, std::move(op_a), std::move(op_b));
     const Node second_pred = GetPredicate(instr.fsetp.pred39, instr.fsetp.neg_pred != 0);
 
     const OperationCode combiner = GetPredicateCombiner(instr.fsetp.op);
diff --git a/src/video_core/shader/decode/half_set_predicate.cpp b/src/video_core/shader/decode/half_set_predicate.cpp
index d59d15bd8..840694527 100644
--- a/src/video_core/shader/decode/half_set_predicate.cpp
+++ b/src/video_core/shader/decode/half_set_predicate.cpp
@@ -18,43 +18,55 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
     const auto opcode = OpCode::Decode(instr);
 
-    UNIMPLEMENTED_IF(instr.hsetp2.ftz != 0);
+    DEBUG_ASSERT(instr.hsetp2.ftz == 0);
 
     Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hsetp2.type_a);
     op_a = GetOperandAbsNegHalf(op_a, instr.hsetp2.abs_a, instr.hsetp2.negate_a);
 
-    Node op_b = [&]() {
-        switch (opcode->get().GetId()) {
-        case OpCode::Id::HSETP2_R:
-            return GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.abs_a,
-                                        instr.hsetp2.negate_b);
-        default:
-            UNREACHABLE();
-            return Immediate(0);
-        }
-    }();
-    op_b = UnpackHalfFloat(op_b, instr.hsetp2.type_b);
-
-    // We can't use the constant predicate as destination.
-    ASSERT(instr.hsetp2.pred3 != static_cast<u64>(Pred::UnusedIndex));
-
-    const Node second_pred = GetPredicate(instr.hsetp2.pred39, instr.hsetp2.neg_pred != 0);
+    Tegra::Shader::PredCondition cond{};
+    bool h_and{};
+    Node op_b{};
+    switch (opcode->get().GetId()) {
+    case OpCode::Id::HSETP2_C:
+        cond = instr.hsetp2.cbuf_and_imm.cond;
+        h_and = instr.hsetp2.cbuf_and_imm.h_and;
+        op_b = GetOperandAbsNegHalf(GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()),
+                                    instr.hsetp2.cbuf.abs_b, instr.hsetp2.cbuf.negate_b);
+        break;
+    case OpCode::Id::HSETP2_IMM:
+        cond = instr.hsetp2.cbuf_and_imm.cond;
+        h_and = instr.hsetp2.cbuf_and_imm.h_and;
+        op_b = UnpackHalfImmediate(instr, true);
+        break;
+    case OpCode::Id::HSETP2_R:
+        cond = instr.hsetp2.reg.cond;
+        h_and = instr.hsetp2.reg.h_and;
+        op_b =
+            GetOperandAbsNegHalf(UnpackHalfFloat(GetRegister(instr.gpr20), instr.hsetp2.reg.type_b),
+                                 instr.hsetp2.reg.abs_b, instr.hsetp2.reg.negate_b);
+        break;
+    default:
+        UNREACHABLE();
+        op_b = Immediate(0);
+    }
 
     const OperationCode combiner = GetPredicateCombiner(instr.hsetp2.op);
-    const OperationCode pair_combiner =
-        instr.hsetp2.h_and ? OperationCode::LogicalAll2 : OperationCode::LogicalAny2;
-
-    const Node comparison = GetPredicateComparisonHalf(instr.hsetp2.cond, op_a, op_b);
-    const Node first_pred = Operation(pair_combiner, comparison);
+    const Node combined_pred = GetPredicate(instr.hsetp2.pred39, instr.hsetp2.neg_pred);
 
-    // Set the primary predicate to the result of Predicate OP SecondPredicate
-    const Node value = Operation(combiner, first_pred, second_pred);
-    SetPredicate(bb, instr.hsetp2.pred3, value);
+    const auto Write = [&](u64 dest, Node src) {
+        SetPredicate(bb, dest, Operation(combiner, std::move(src), combined_pred));
+    };
 
-    if (instr.hsetp2.pred0 != static_cast<u64>(Pred::UnusedIndex)) {
-        // Set the secondary predicate to the result of !Predicate OP SecondPredicate, if enabled
-        const Node negated_pred = Operation(OperationCode::LogicalNegate, first_pred);
-        SetPredicate(bb, instr.hsetp2.pred0, Operation(combiner, negated_pred, second_pred));
+    const Node comparison = GetPredicateComparisonHalf(cond, op_a, op_b);
+    const u64 first = instr.hsetp2.pred3;
+    const u64 second = instr.hsetp2.pred0;
+    if (h_and) {
+        Node joined = Operation(OperationCode::LogicalAnd2, comparison);
+        Write(first, joined);
+        Write(second, Operation(OperationCode::LogicalNegate, std::move(joined)));
+    } else {
+        Write(first, Operation(OperationCode::LogicalPick2, comparison, Immediate(0U)));
+        Write(second, Operation(OperationCode::LogicalPick2, comparison, Immediate(1U)));
     }
 
     return pc;
diff --git a/src/video_core/shader/decode/hfma2.cpp b/src/video_core/shader/decode/hfma2.cpp
index c3bcf1ae9..5b44cb79c 100644
--- a/src/video_core/shader/decode/hfma2.cpp
+++ b/src/video_core/shader/decode/hfma2.cpp
@@ -22,9 +22,9 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) {
     const auto opcode = OpCode::Decode(instr);
 
     if (opcode->get().GetId() == OpCode::Id::HFMA2_RR) {
-        UNIMPLEMENTED_IF(instr.hfma2.rr.precision != HalfPrecision::None);
+        DEBUG_ASSERT(instr.hfma2.rr.precision == HalfPrecision::None);
     } else {
-        UNIMPLEMENTED_IF(instr.hfma2.precision != HalfPrecision::None);
+        DEBUG_ASSERT(instr.hfma2.precision == HalfPrecision::None);
     }
 
     constexpr auto identity = HalfType::H0_H1;
diff --git a/src/video_core/shader/decode/image.cpp b/src/video_core/shader/decode/image.cpp
new file mode 100644
index 000000000..d54fb88c9
--- /dev/null
+++ b/src/video_core/shader/decode/image.cpp
@@ -0,0 +1,164 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <vector>
+#include <fmt/format.h>
+
+#include "common/assert.h"
+#include "common/bit_field.h"
+#include "common/common_types.h"
+#include "common/logging/log.h"
+#include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
+#include "video_core/shader/shader_ir.h"
+
+namespace VideoCommon::Shader {
+
+using Tegra::Shader::Instruction;
+using Tegra::Shader::OpCode;
+
+namespace {
+std::size_t GetImageTypeNumCoordinates(Tegra::Shader::ImageType image_type) {
+    switch (image_type) { // Maps an image type to its number of coordinate components
+    case Tegra::Shader::ImageType::Texture1D:
+    case Tegra::Shader::ImageType::TextureBuffer:
+        return 1;
+    case Tegra::Shader::ImageType::Texture1DArray:
+    case Tegra::Shader::ImageType::Texture2D:
+        return 2;
+    case Tegra::Shader::ImageType::Texture2DArray:
+    case Tegra::Shader::ImageType::Texture3D:
+        return 3;
+    }
+    UNREACHABLE(); // Reached only when image_type matches none of the cases above
+    return 1; // Fallback value so that every path returns something
+}
+} // Anonymous namespace
+
+u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) { // Lowers the SUST/SUATOM at pc into bb
+    const Instruction instr = {program_code[pc]};
+    const auto opcode = OpCode::Decode(instr);
+
+    switch (opcode->get().GetId()) {
+    case OpCode::Id::SUST: {
+        UNIMPLEMENTED_IF(instr.sust.mode != Tegra::Shader::SurfaceDataMode::P);
+        UNIMPLEMENTED_IF(instr.sust.out_of_bounds_store != Tegra::Shader::OutOfBoundsStore::Ignore);
+        UNIMPLEMENTED_IF(instr.sust.component_mask_selector != 0xf); // Ensure we have an RGBA store
+
+        std::vector<Node> values; // Four consecutive registers starting at gpr0
+        constexpr std::size_t hardcoded_size{4};
+        for (std::size_t i = 0; i < hardcoded_size; ++i) {
+            values.push_back(GetRegister(instr.gpr0.Value() + i));
+        }
+
+        std::vector<Node> coords; // Coordinate registers, consecutive from gpr8
+        const std::size_t num_coords{GetImageTypeNumCoordinates(instr.sust.image_type)};
+        for (std::size_t i = 0; i < num_coords; ++i) {
+            coords.push_back(GetRegister(instr.gpr8.Value() + i));
+        }
+
+        const auto type{instr.sust.image_type};
+        auto& image{instr.sust.is_immediate ? GetImage(instr.image, type)
+                                            : GetBindlessImage(instr.gpr39, type)};
+        image.MarkWrite(); // Record that this image is stored to
+
+        MetaImage meta{image, values};
+        bb.push_back(Operation(OperationCode::ImageStore, meta, std::move(coords)));
+        break;
+    }
+    case OpCode::Id::SUATOM: {
+        UNIMPLEMENTED_IF(instr.suatom_d.is_ba != 0);
+
+        Node value = GetRegister(instr.gpr0); // Operand; gpr0 also receives the result
+
+        std::vector<Node> coords; // Count comes from SUATOM's own image_type field
+        const std::size_t num_coords{GetImageTypeNumCoordinates(instr.suatom_d.image_type)};
+        for (std::size_t i = 0; i < num_coords; ++i) {
+            coords.push_back(GetRegister(instr.gpr8.Value() + i));
+        }
+
+        const OperationCode operation_code = [instr] {
+            switch (instr.suatom_d.operation) {
+            case Tegra::Shader::ImageAtomicOperation::Add:
+                return OperationCode::AtomicImageAdd;
+            case Tegra::Shader::ImageAtomicOperation::Min:
+                return OperationCode::AtomicImageMin;
+            case Tegra::Shader::ImageAtomicOperation::Max:
+                return OperationCode::AtomicImageMax;
+            case Tegra::Shader::ImageAtomicOperation::And:
+                return OperationCode::AtomicImageAnd;
+            case Tegra::Shader::ImageAtomicOperation::Or:
+                return OperationCode::AtomicImageOr;
+            case Tegra::Shader::ImageAtomicOperation::Xor:
+                return OperationCode::AtomicImageXor;
+            case Tegra::Shader::ImageAtomicOperation::Exch:
+                return OperationCode::AtomicImageExchange;
+            default:
+                UNIMPLEMENTED_MSG("Unimplemented operation={}",
+                                  static_cast<u32>(instr.suatom_d.operation.Value()));
+                return OperationCode::AtomicImageAdd; // Placeholder after reporting the gap
+            }
+        }();
+
+        const auto& image{GetImage(instr.image, instr.suatom_d.image_type, instr.suatom_d.size)};
+        MetaImage meta{image, {std::move(value)}};
+        SetRegister(bb, instr.gpr0, Operation(operation_code, meta, std::move(coords)));
+        break;
+    }
+    default:
+        UNIMPLEMENTED_MSG("Unhandled image instruction: {}", opcode->get().GetName());
+    }
+
+    return pc; // pc is handed back unchanged
+}
+
+Image& ShaderIR::GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType type,
+                          std::optional<Tegra::Shader::ImageAtomicSize> size) {
+    const auto key{static_cast<std::size_t>(image.index.Value())}; // Immediate image index
+    if (const auto existing = TryUseExistingImage(key, type, size)) {
+        return *existing;
+    }
+
+    const std::size_t slot{used_images.size()}; // Number of images registered so far
+    return used_images.emplace(key, Image{key, slot, type, size}).first->second;
+}
+
+Image& ShaderIR::GetBindlessImage(Tegra::Shader::Register reg, Tegra::Shader::ImageType type,
+                                  std::optional<Tegra::Shader::ImageAtomicSize> size) {
+    const Node handle{GetRegister(reg)}; // Register traced back to a constant buffer entry
+    const auto [base_image, buffer, offset]{ // NOTE(review): base_image is unchecked - confirm
+        TrackCbuf(handle, global_code, static_cast<s64>(global_code.size()))};
+    const u64 key{(static_cast<u64>(buffer) << 32) | static_cast<u64>(offset)};
+
+    if (const auto existing = TryUseExistingImage(key, type, size)) {
+        return *existing;
+    }
+
+    const std::size_t slot{used_images.size()}; // Number of images registered so far
+    const auto entry = used_images.emplace(key, Image{buffer, offset, slot, type, size}).first;
+    return entry->second;
+}
+
+Image* ShaderIR::TryUseExistingImage(u64 offset, Tegra::Shader::ImageType type,
+                                     std::optional<Tegra::Shader::ImageAtomicSize> size) {
+    const auto entry = used_images.find(offset);
+    if (entry == used_images.end()) {
+        return nullptr; // Nothing registered under this key yet
+    }
+    auto& known = entry->second;
+    ASSERT(known.GetType() == type);
+
+    if (!size) { // Caller gave no size, so there is nothing to reconcile
+        return &known;
+    }
+    if (known.IsSizeKnown()) { // A recorded size has to match the requested one
+        ASSERT(known.GetSize() == size);
+    } else { // No size recorded yet for this image: remember the requested one
+        known.SetSize(*size);
+    }
+    return &known;
+}
+
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/decode/integer_set.cpp b/src/video_core/shader/decode/integer_set.cpp
index 46e3d5905..59809bcd8 100644
--- a/src/video_core/shader/decode/integer_set.cpp
+++ b/src/video_core/shader/decode/integer_set.cpp
@@ -14,7 +14,6 @@ using Tegra::Shader::OpCode;
 
 u32 ShaderIR::DecodeIntegerSet(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
-    const auto opcode = OpCode::Decode(instr);
 
     const Node op_a = GetRegister(instr.gpr8);
     const Node op_b = [&]() {
diff --git a/src/video_core/shader/decode/integer_set_predicate.cpp b/src/video_core/shader/decode/integer_set_predicate.cpp
index dd20775d7..25e48fef8 100644
--- a/src/video_core/shader/decode/integer_set_predicate.cpp
+++ b/src/video_core/shader/decode/integer_set_predicate.cpp
@@ -16,7 +16,6 @@ using Tegra::Shader::Pred;
 
 u32 ShaderIR::DecodeIntegerSetPredicate(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
-    const auto opcode = OpCode::Decode(instr);
 
     const Node op_a = GetRegister(instr.gpr8);
 
diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp
index 80fc0ccfc..7923d4d69 100644
--- a/src/video_core/shader/decode/memory.cpp
+++ b/src/video_core/shader/decode/memory.cpp
@@ -35,7 +35,7 @@ u32 GetUniformTypeElementsCount(Tegra::Shader::UniformType uniform_type) {
         return 1;
     }
 }
-} // namespace
+} // Anonymous namespace
 
 u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
@@ -95,10 +95,10 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
             const Node op_b =
                 GetConstBufferIndirect(instr.cbuf36.index, instr.cbuf36.GetOffset() + 4, index);
 
-            SetTemporal(bb, 0, op_a);
-            SetTemporal(bb, 1, op_b);
-            SetRegister(bb, instr.gpr0, GetTemporal(0));
-            SetRegister(bb, instr.gpr0.Value() + 1, GetTemporal(1));
+            SetTemporary(bb, 0, op_a);
+            SetTemporary(bb, 1, op_b);
+            SetRegister(bb, instr.gpr0, GetTemporary(0));
+            SetRegister(bb, instr.gpr0.Value() + 1, GetTemporary(1));
             break;
         }
         default:
@@ -106,16 +106,17 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
         }
         break;
     }
-    case OpCode::Id::LD_L: {
-        LOG_DEBUG(HW_GPU, "LD_L cache management mode: {}",
-                  static_cast<u64>(instr.ld_l.unknown.Value()));
-
-        const auto GetLmem = [&](s32 offset) {
+    case OpCode::Id::LD_L:
+        LOG_DEBUG(HW_GPU, "LD_L cache management mode: {}", static_cast<u64>(instr.ld_l.unknown));
+        [[fallthrough]];
+    case OpCode::Id::LD_S: {
+        const auto GetMemory = [&](s32 offset) {
             ASSERT(offset % 4 == 0);
             const Node immediate_offset = Immediate(static_cast<s32>(instr.smem_imm) + offset);
             const Node address = Operation(OperationCode::IAdd, NO_PRECISE, GetRegister(instr.gpr8),
                                            immediate_offset);
-            return GetLocalMemory(address);
+            return opcode->get().GetId() == OpCode::Id::LD_S ? GetSharedMemory(address)
+                                                             : GetLocalMemory(address);
         };
 
         switch (instr.ldst_sl.type.Value()) {
@@ -135,14 +136,16 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
                     return 0;
                 }
             }();
-            for (u32 i = 0; i < count; ++i)
-                SetTemporal(bb, i, GetLmem(i * 4));
-            for (u32 i = 0; i < count; ++i)
-                SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i));
+            for (u32 i = 0; i < count; ++i) {
+                SetTemporary(bb, i, GetMemory(i * 4));
+            }
+            for (u32 i = 0; i < count; ++i) {
+                SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
+            }
             break;
         }
         default:
-            UNIMPLEMENTED_MSG("LD_L Unhandled type: {}",
+            UNIMPLEMENTED_MSG("{} Unhandled type: {}", opcode->get().GetName(),
                               static_cast<u32>(instr.ldst_sl.type.Value()));
         }
         break;
@@ -172,10 +175,10 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
                 Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset);
             const Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
 
-            SetTemporal(bb, i, gmem);
+            SetTemporary(bb, i, gmem);
         }
         for (u32 i = 0; i < count; ++i) {
-            SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i));
+            SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
         }
         break;
     }
@@ -209,27 +212,34 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
 
         break;
     }
-    case OpCode::Id::ST_L: {
+    case OpCode::Id::ST_L:
         LOG_DEBUG(HW_GPU, "ST_L cache management mode: {}",
                   static_cast<u64>(instr.st_l.cache_management.Value()));
-
-        const auto GetLmemAddr = [&](s32 offset) {
+        [[fallthrough]];
+    case OpCode::Id::ST_S: {
+        const auto GetAddress = [&](s32 offset) {
             ASSERT(offset % 4 == 0);
             const Node immediate = Immediate(static_cast<s32>(instr.smem_imm) + offset);
             return Operation(OperationCode::IAdd, NO_PRECISE, GetRegister(instr.gpr8), immediate);
         };
 
+        const auto set_memory = opcode->get().GetId() == OpCode::Id::ST_L
+                                    ? &ShaderIR::SetLocalMemory
+                                    : &ShaderIR::SetSharedMemory;
+
         switch (instr.ldst_sl.type.Value()) {
         case Tegra::Shader::StoreType::Bits128:
-            SetLocalMemory(bb, GetLmemAddr(12), GetRegister(instr.gpr0.Value() + 3));
-            SetLocalMemory(bb, GetLmemAddr(8), GetRegister(instr.gpr0.Value() + 2));
+            (this->*set_memory)(bb, GetAddress(12), GetRegister(instr.gpr0.Value() + 3));
+            (this->*set_memory)(bb, GetAddress(8), GetRegister(instr.gpr0.Value() + 2));
+            [[fallthrough]];
         case Tegra::Shader::StoreType::Bits64:
-            SetLocalMemory(bb, GetLmemAddr(4), GetRegister(instr.gpr0.Value() + 1));
+            (this->*set_memory)(bb, GetAddress(4), GetRegister(instr.gpr0.Value() + 1));
+            [[fallthrough]];
         case Tegra::Shader::StoreType::Bits32:
-            SetLocalMemory(bb, GetLmemAddr(0), GetRegister(instr.gpr0));
+            (this->*set_memory)(bb, GetAddress(0), GetRegister(instr.gpr0));
             break;
         default:
-            UNIMPLEMENTED_MSG("ST_L Unhandled type: {}",
+            UNIMPLEMENTED_MSG("{} unhandled type: {}", opcode->get().GetName(),
                               static_cast<u32>(instr.ldst_sl.type.Value()));
         }
         break;
@@ -253,11 +263,11 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
             TrackAndGetGlobalMemory(bb, instr, true);
 
         // Encode in temporary registers like this: real_base_address, {registers_to_be_written...}
-        SetTemporal(bb, 0, real_address_base);
+        SetTemporary(bb, 0, real_address_base);
 
         const u32 count = GetUniformTypeElementsCount(type);
         for (u32 i = 0; i < count; ++i) {
-            SetTemporal(bb, i + 1, GetRegister(instr.gpr0.Value() + i));
+            SetTemporary(bb, i + 1, GetRegister(instr.gpr0.Value() + i));
         }
         for (u32 i = 0; i < count; ++i) {
             const Node it_offset = Immediate(i * 4);
@@ -265,7 +275,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
                 Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset);
             const Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
 
-            bb.push_back(Operation(OperationCode::Assign, gmem, GetTemporal(i + 1)));
+            bb.push_back(Operation(OperationCode::Assign, gmem, GetTemporary(i + 1)));
         }
         break;
     }
@@ -297,18 +307,13 @@ std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackAndGetGlobalMemory(NodeB
     const auto addr_register{GetRegister(instr.gmem.gpr)};
     const auto immediate_offset{static_cast<u32>(instr.gmem.offset)};
 
-    const Node base_address{
-        TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size()))};
-    const auto cbuf = std::get_if<CbufNode>(&*base_address);
-    ASSERT(cbuf != nullptr);
-    const auto cbuf_offset_imm = std::get_if<ImmediateNode>(&*cbuf->GetOffset());
-    ASSERT(cbuf_offset_imm != nullptr);
-    const auto cbuf_offset = cbuf_offset_imm->GetValue();
+    const auto [base_address, index, offset] =
+        TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size()));
+    ASSERT(base_address != nullptr);
 
-    bb.push_back(
-        Comment(fmt::format("Base address is c[0x{:x}][0x{:x}]", cbuf->GetIndex(), cbuf_offset)));
+    bb.push_back(Comment(fmt::format("Base address is c[0x{:x}][0x{:x}]", index, offset)));
 
-    const GlobalMemoryBase descriptor{cbuf->GetIndex(), cbuf_offset};
+    const GlobalMemoryBase descriptor{index, offset};
     const auto& [entry, is_new] = used_global_memory.try_emplace(descriptor);
     auto& usage = entry->second;
     if (is_write) {
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp
index 6fc07f213..d46e0f823 100644
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -22,6 +22,12 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
     const auto opcode = OpCode::Decode(instr);
 
     switch (opcode->get().GetId()) {
+    case OpCode::Id::NOP: {
+        UNIMPLEMENTED_IF(instr.nop.cc != Tegra::Shader::ConditionCode::T);
+        UNIMPLEMENTED_IF(instr.nop.trigger != 0);
+        // With the previous preconditions, this instruction is a no-operation.
+        break;
+    }
     case OpCode::Id::EXIT: {
         const Tegra::Shader::ConditionCode cc = instr.flow_condition_code;
         UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "EXIT condition code used: {}",
@@ -68,6 +74,13 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
             case SystemVariable::InvocationInfo:
                 LOG_WARNING(HW_GPU, "MOV_SYS instruction with InvocationInfo is incomplete");
                 return Immediate(0u);
+            case SystemVariable::Tid: {
+                Node value = Immediate(0);
+                value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdX), 0, 9);
+                value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdY), 16, 9);
+                value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdZ), 26, 5);
+                return value;
+            }
             case SystemVariable::TidX:
                 return Operation(OperationCode::LocalInvocationIdX);
             case SystemVariable::TidY:
@@ -91,11 +104,46 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
         break;
     }
     case OpCode::Id::BRA: {
-        UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
-                             "BRA with constant buffers are not implemented");
+        Node branch;
+        if (instr.bra.constant_buffer == 0) {
+            const u32 target = pc + instr.bra.GetBranchTarget();
+            branch = Operation(OperationCode::Branch, Immediate(target));
+        } else {
+            const u32 target = pc + 1;
+            const Node op_a = GetConstBuffer(instr.cbuf36.index, instr.cbuf36.GetOffset());
+            const Node convert = SignedOperation(OperationCode::IArithmeticShiftRight, true,
+                                                 PRECISE, op_a, Immediate(3));
+            const Node operand =
+                Operation(OperationCode::IAdd, PRECISE, convert, Immediate(target));
+            branch = Operation(OperationCode::BranchIndirect, operand);
+        }
 
-        const u32 target = pc + instr.bra.GetBranchTarget();
-        const Node branch = Operation(OperationCode::Branch, Immediate(target));
+        const Tegra::Shader::ConditionCode cc = instr.flow_condition_code;
+        if (cc != Tegra::Shader::ConditionCode::T) {
+            bb.push_back(Conditional(GetConditionCode(cc), {branch}));
+        } else {
+            bb.push_back(branch);
+        }
+        break;
+    }
+    case OpCode::Id::BRX: {
+        Node operand;
+        if (instr.brx.constant_buffer != 0) {
+            const s32 target = pc + 1;
+            const Node index = GetRegister(instr.gpr8);
+            const Node op_a =
+                GetConstBufferIndirect(instr.cbuf36.index, instr.cbuf36.GetOffset() + 0, index);
+            const Node convert = SignedOperation(OperationCode::IArithmeticShiftRight, true,
+                                                 PRECISE, op_a, Immediate(3));
+            operand = Operation(OperationCode::IAdd, PRECISE, convert, Immediate(target));
+        } else {
+            const s32 target = pc + instr.brx.GetBranchExtend();
+            const Node op_a = GetRegister(instr.gpr8);
+            const Node convert = SignedOperation(OperationCode::IArithmeticShiftRight, true,
+                                                 PRECISE, op_a, Immediate(3));
+            operand = Operation(OperationCode::IAdd, PRECISE, convert, Immediate(target));
+        }
+        const Node branch = Operation(OperationCode::BranchIndirect, operand);
 
         const Tegra::Shader::ConditionCode cc = instr.flow_condition_code;
         if (cc != Tegra::Shader::ConditionCode::T) {
@@ -109,22 +157,28 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
         UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
                              "Constant buffer flow is not supported");
 
-        // The SSY opcode tells the GPU where to re-converge divergent execution paths, it sets the
-        // target of the jump that the SYNC instruction will make. The SSY opcode has a similar
-        // structure to the BRA opcode.
+        if (disable_flow_stack) {
+            break;
+        }
+
+        // The SSY opcode tells the GPU where to re-converge divergent execution paths with SYNC.
         const u32 target = pc + instr.bra.GetBranchTarget();
-        bb.push_back(Operation(OperationCode::PushFlowStack, Immediate(target)));
+        bb.push_back(
+            Operation(OperationCode::PushFlowStack, MetaStackClass::Ssy, Immediate(target)));
         break;
     }
     case OpCode::Id::PBK: {
         UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
                              "Constant buffer PBK is not supported");
 
-        // PBK pushes to a stack the address where BRK will jump to. This shares stack with SSY but
-        // using SYNC on a PBK address will kill the shader execution. We don't emulate this because
-        // it's very unlikely a driver will emit such invalid shader.
+        if (disable_flow_stack) {
+            break;
+        }
+
+        // PBK pushes to a stack the address where BRK will jump to.
         const u32 target = pc + instr.bra.GetBranchTarget();
-        bb.push_back(Operation(OperationCode::PushFlowStack, Immediate(target)));
+        bb.push_back(
+            Operation(OperationCode::PushFlowStack, MetaStackClass::Pbk, Immediate(target)));
         break;
     }
     case OpCode::Id::SYNC: {
@@ -132,17 +186,24 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
         UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "SYNC condition code used: {}",
                              static_cast<u32>(cc));
 
+        if (disable_flow_stack) {
+            break;
+        }
+
         // The SYNC opcode jumps to the address previously set by the SSY opcode
-        bb.push_back(Operation(OperationCode::PopFlowStack));
+        bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Ssy));
         break;
     }
     case OpCode::Id::BRK: {
         const Tegra::Shader::ConditionCode cc = instr.flow_condition_code;
         UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "BRK condition code used: {}",
                              static_cast<u32>(cc));
+        if (disable_flow_stack) {
+            break;
+        }
 
         // The BRK opcode jumps to the address previously set by the PBK opcode
-        bb.push_back(Operation(OperationCode::PopFlowStack));
+        bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Pbk));
         break;
     }
     case OpCode::Id::IPA: {
diff --git a/src/video_core/shader/decode/predicate_set_register.cpp b/src/video_core/shader/decode/predicate_set_register.cpp
index febbfeb50..84dbc50fe 100644
--- a/src/video_core/shader/decode/predicate_set_register.cpp
+++ b/src/video_core/shader/decode/predicate_set_register.cpp
@@ -15,7 +15,6 @@ using Tegra::Shader::OpCode;
 
 u32 ShaderIR::DecodePredicateSetRegister(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
-    const auto opcode = OpCode::Decode(instr);
 
     UNIMPLEMENTED_IF_MSG(instr.generates_cc,
                          "Condition codes generation in PSET is not implemented");
diff --git a/src/video_core/shader/decode/shift.cpp b/src/video_core/shader/decode/shift.cpp
index 2ac16eeb0..f6ee68a54 100644
--- a/src/video_core/shader/decode/shift.cpp
+++ b/src/video_core/shader/decode/shift.cpp
@@ -17,8 +17,8 @@ u32 ShaderIR::DecodeShift(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
     const auto opcode = OpCode::Decode(instr);
 
-    const Node op_a = GetRegister(instr.gpr8);
-    const Node op_b = [&]() {
+    Node op_a = GetRegister(instr.gpr8);
+    Node op_b = [&]() {
         if (instr.is_b_imm) {
             return Immediate(instr.alu.GetSignedImm20_20());
         } else if (instr.is_b_gpr) {
@@ -32,16 +32,23 @@ u32 ShaderIR::DecodeShift(NodeBlock& bb, u32 pc) {
     case OpCode::Id::SHR_C:
     case OpCode::Id::SHR_R:
     case OpCode::Id::SHR_IMM: {
-        const Node value = SignedOperation(OperationCode::IArithmeticShiftRight,
-                                           instr.shift.is_signed, PRECISE, op_a, op_b);
+        if (instr.shr.wrap) {
+            op_b = Operation(OperationCode::UBitwiseAnd, std::move(op_b), Immediate(0x1f));
+        } else {
+            op_b = Operation(OperationCode::IMax, std::move(op_b), Immediate(0));
+            op_b = Operation(OperationCode::IMin, std::move(op_b), Immediate(31));
+        }
+
+        Node value = SignedOperation(OperationCode::IArithmeticShiftRight, instr.shift.is_signed,
+                                     std::move(op_a), std::move(op_b));
         SetInternalFlagsFromInteger(bb, value, instr.generates_cc);
-        SetRegister(bb, instr.gpr0, value);
+        SetRegister(bb, instr.gpr0, std::move(value));
         break;
     }
     case OpCode::Id::SHL_C:
     case OpCode::Id::SHL_R:
     case OpCode::Id::SHL_IMM: {
-        const Node value = Operation(OperationCode::ILogicalShiftLeft, PRECISE, op_a, op_b);
+        const Node value = Operation(OperationCode::ILogicalShiftLeft, op_a, op_b);
         SetInternalFlagsFromInteger(bb, value, instr.generates_cc);
         SetRegister(bb, instr.gpr0, value);
         break;
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index 4a356dbd4..0b934a069 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -181,10 +181,10 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
                 const Node value =
                     Operation(OperationCode::TextureQueryDimensions, meta,
                               GetRegister(instr.gpr8.Value() + (is_bindless ? 1 : 0)));
-                SetTemporal(bb, indexer++, value);
+                SetTemporary(bb, indexer++, value);
             }
             for (u32 i = 0; i < indexer; ++i) {
-                SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i));
+                SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
             }
             break;
         }
@@ -238,13 +238,25 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
             auto params = coords;
             MetaTexture meta{sampler, {}, {}, {}, {}, {}, {}, element};
             const Node value = Operation(OperationCode::TextureQueryLod, meta, std::move(params));
-            SetTemporal(bb, indexer++, value);
+            SetTemporary(bb, indexer++, value);
         }
         for (u32 i = 0; i < indexer; ++i) {
-            SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i));
+            SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
         }
         break;
     }
+    case OpCode::Id::TLD: {
+        UNIMPLEMENTED_IF_MSG(instr.tld.aoffi, "AOFFI is not implemented");
+        UNIMPLEMENTED_IF_MSG(instr.tld.ms, "MS is not implemented");
+        UNIMPLEMENTED_IF_MSG(instr.tld.cl, "CL is not implemented");
+
+        if (instr.tld.nodep_flag) {
+            LOG_WARNING(HW_GPU, "TLD.NODEP implementation is incomplete");
+        }
+
+        WriteTexInstructionFloat(bb, instr, GetTldCode(instr));
+        break;
+    }
     case OpCode::Id::TLDS: {
         const Tegra::Shader::TextureType texture_type{instr.tlds.GetTextureType()};
         const bool is_array{instr.tlds.IsArrayTexture()};
@@ -257,7 +269,13 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
             LOG_WARNING(HW_GPU, "TLDS.NODEP implementation is incomplete");
         }
 
-        WriteTexsInstructionFloat(bb, instr, GetTldsCode(instr, texture_type, is_array));
+        const Node4 components = GetTldsCode(instr, texture_type, is_array);
+
+        if (instr.tlds.fp32_flag) {
+            WriteTexsInstructionFloat(bb, instr, components);
+        } else {
+            WriteTexsInstructionHalfFloat(bb, instr, components);
+        }
         break;
     }
     default:
@@ -290,13 +308,9 @@ const Sampler& ShaderIR::GetSampler(const Tegra::Shader::Sampler& sampler, Textu
 const Sampler& ShaderIR::GetBindlessSampler(const Tegra::Shader::Register& reg, TextureType type,
                                             bool is_array, bool is_shadow) {
     const Node sampler_register = GetRegister(reg);
-    const Node base_sampler =
+    const auto [base_sampler, cbuf_index, cbuf_offset] =
         TrackCbuf(sampler_register, global_code, static_cast<s64>(global_code.size()));
-    const auto cbuf = std::get_if<CbufNode>(&*base_sampler);
-    const auto cbuf_offset_imm = std::get_if<ImmediateNode>(&*cbuf->GetOffset());
-    ASSERT(cbuf_offset_imm != nullptr);
-    const auto cbuf_offset = cbuf_offset_imm->GetValue();
-    const auto cbuf_index = cbuf->GetIndex();
+    ASSERT(base_sampler != nullptr);
     const auto cbuf_key = (static_cast<u64>(cbuf_index) << 32) | static_cast<u64>(cbuf_offset);
 
     // If this sampler has already been used, return the existing mapping.
@@ -322,11 +336,11 @@ void ShaderIR::WriteTexInstructionFloat(NodeBlock& bb, Instruction instr, const
             // Skip disabled components
             continue;
         }
-        SetTemporal(bb, dest_elem++, components[elem]);
+        SetTemporary(bb, dest_elem++, components[elem]);
     }
     // After writing values in temporals, move them to the real registers
     for (u32 i = 0; i < dest_elem; ++i) {
-        SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i));
+        SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
     }
 }
 
@@ -339,17 +353,17 @@ void ShaderIR::WriteTexsInstructionFloat(NodeBlock& bb, Instruction instr,
     for (u32 component = 0; component < 4; ++component) {
         if (!instr.texs.IsComponentEnabled(component))
             continue;
-        SetTemporal(bb, dest_elem++, components[component]);
+        SetTemporary(bb, dest_elem++, components[component]);
     }
 
     for (u32 i = 0; i < dest_elem; ++i) {
         if (i < 2) {
             // Write the first two swizzle components to gpr0 and gpr0+1
-            SetRegister(bb, instr.gpr0.Value() + i % 2, GetTemporal(i));
+            SetRegister(bb, instr.gpr0.Value() + i % 2, GetTemporary(i));
         } else {
             ASSERT(instr.texs.HasTwoDestinations());
             // Write the rest of the swizzle components to gpr28 and gpr28+1
-            SetRegister(bb, instr.gpr28.Value() + i % 2, GetTemporal(i));
+            SetRegister(bb, instr.gpr28.Value() + i % 2, GetTemporary(i));
         }
     }
 }
@@ -377,11 +391,11 @@ void ShaderIR::WriteTexsInstructionHalfFloat(NodeBlock& bb, Instruction instr,
         return;
     }
 
-    SetTemporal(bb, 0, first_value);
-    SetTemporal(bb, 1, Operation(OperationCode::HPack2, values[2], values[3]));
+    SetTemporary(bb, 0, first_value);
+    SetTemporary(bb, 1, Operation(OperationCode::HPack2, values[2], values[3]));
 
-    SetRegister(bb, instr.gpr0, GetTemporal(0));
-    SetRegister(bb, instr.gpr28, GetTemporal(1));
+    SetRegister(bb, instr.gpr0, GetTemporary(0));
+    SetRegister(bb, instr.gpr28, GetTemporary(1));
 }
 
 Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type,
@@ -575,6 +589,39 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de
     return values;
 }
 
+Node4 ShaderIR::GetTldCode(Tegra::Shader::Instruction instr) { // Builds TexelFetch nodes for TLD
+    const auto texture_type{instr.tld.texture_type};
+    const bool is_array{instr.tld.is_array};
+    const bool lod_enabled{instr.tld.GetTextureProcessMode() == TextureProcessMode::LL};
+    const std::size_t coord_count{GetCoordCount(texture_type)};
+
+    u64 gpr8_cursor{instr.gpr8.Value()}; // Walks registers from gpr8: array index, then coords
+    const Node array_register{is_array ? GetRegister(gpr8_cursor++) : nullptr};
+
+    std::vector<Node> coords;
+    coords.reserve(coord_count);
+    for (std::size_t i = 0; i < coord_count; ++i) {
+        coords.push_back(GetRegister(gpr8_cursor++));
+    }
+
+    u64 gpr20_cursor{instr.gpr20.Value()}; // Walks registers from gpr20: LOD when enabled
+    // TODO: const Node bindless_register{is_bindless ? GetRegister(gpr20_cursor++) : nullptr};
+    const Node lod{lod_enabled ? GetRegister(gpr20_cursor++) : Immediate(0u)};
+    // TODO: const Node aoffi_register{is_aoffi ? GetRegister(gpr20_cursor++) : nullptr};
+    // TODO: const Node multisample{is_multisample ? GetRegister(gpr20_cursor++) : nullptr};
+
+    const auto& sampler = GetSampler(instr.sampler, texture_type, is_array, false);
+
+    Node4 values; // One TexelFetch operation per element index
+    for (u32 element = 0; element < values.size(); ++element) {
+        auto coords_copy = coords; // Each operation is handed its own copy of the coordinates
+        MetaTexture meta{sampler, array_register, {}, {}, {}, lod, {}, element};
+        values[element] = Operation(OperationCode::TexelFetch, meta, std::move(coords_copy));
+    }
+
+    return values;
+}
+
 Node4 ShaderIR::GetTldsCode(Instruction instr, TextureType texture_type, bool is_array) {
     const std::size_t type_coord_count = GetCoordCount(texture_type);
     const bool lod_enabled = instr.tlds.GetTextureProcessMode() == TextureProcessMode::LL;
diff --git a/src/video_core/shader/decode/warp.cpp b/src/video_core/shader/decode/warp.cpp
new file mode 100644
index 000000000..a8e481b3c
--- /dev/null
+++ b/src/video_core/shader/decode/warp.cpp
@@ -0,0 +1,102 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
+#include "video_core/shader/shader_ir.h"
+
+namespace VideoCommon::Shader {
+
+using Tegra::Shader::Instruction;
+using Tegra::Shader::OpCode;
+using Tegra::Shader::Pred;
+using Tegra::Shader::ShuffleOperation;
+using Tegra::Shader::VoteOperation;
+
+namespace {
+OperationCode GetOperationCode(VoteOperation vote_op) { // Maps a VOTE mode to its IR operation
+    switch (vote_op) {
+    case VoteOperation::All:
+        return OperationCode::VoteAll;
+    case VoteOperation::Any:
+        return OperationCode::VoteAny;
+    case VoteOperation::Eq:
+        return OperationCode::VoteEqual;
+    default:
+        UNREACHABLE_MSG("Invalid vote operation={}", static_cast<u64>(vote_op));
+        return OperationCode::VoteAll; // Fallback value after reporting the invalid mode
+    }
+}
+} // Anonymous namespace
+
+u32 ShaderIR::DecodeWarp(NodeBlock& bb, u32 pc) { // Decodes the warp instructions VOTE and SHFL
+    const Instruction instr = {program_code[pc]};
+    const auto opcode = OpCode::Decode(instr);
+
+    switch (opcode->get().GetId()) {
+    case OpCode::Id::VOTE: {
+        const Node value = GetPredicate(instr.vote.value, instr.vote.negate_value != 0);
+        const Node active = Operation(OperationCode::BallotThread, value);
+        const Node vote = Operation(GetOperationCode(instr.vote.operation), value);
+        SetRegister(bb, instr.gpr0, active); // gpr0 receives the ballot result
+        SetPredicate(bb, instr.vote.dest_pred, vote); // dest_pred receives the vote result
+        break;
+    }
+    case OpCode::Id::SHFL: {
+        Node mask = instr.shfl.is_mask_imm ? Immediate(static_cast<u32>(instr.shfl.mask_imm))
+                                           : GetRegister(instr.gpr39);
+        Node width = [&] {
+            // Convert the obscure SHFL mask back into GL_NV_shader_thread_shuffle's width. This was
+            // derived by reversing Nvidia's math. It won't work in all cases because SHFL has
+            // parameters that don't map cleanly onto GLSL's interface, but it should work for the
+            // cases emitted by Nvidia's compiler.
+            if (instr.shfl.operation == ShuffleOperation::Up) { // width = (mask - 0x2000) >> 8
+                return Operation(
+                    OperationCode::ILogicalShiftRight,
+                    Operation(OperationCode::IAdd, std::move(mask), Immediate(-0x2000)),
+                    Immediate(8));
+            } else { // width = (0x201F - mask) >> 8
+                return Operation(OperationCode::ILogicalShiftRight,
+                                 Operation(OperationCode::IAdd, Immediate(0x201F),
+                                           Operation(OperationCode::INegate, std::move(mask))),
+                                 Immediate(8));
+            }
+        }();
+
+        const auto [operation, in_range] = [instr]() -> std::pair<OperationCode, OperationCode> {
+            switch (instr.shfl.operation) {
+            case ShuffleOperation::Idx:
+                return {OperationCode::ShuffleIndexed, OperationCode::InRangeShuffleIndexed};
+            case ShuffleOperation::Up:
+                return {OperationCode::ShuffleUp, OperationCode::InRangeShuffleUp};
+            case ShuffleOperation::Down:
+                return {OperationCode::ShuffleDown, OperationCode::InRangeShuffleDown};
+            case ShuffleOperation::Bfly:
+                return {OperationCode::ShuffleButterfly, OperationCode::InRangeShuffleButterfly};
+            }
+            UNREACHABLE_MSG("Invalid SHFL operation: {}",
+                            static_cast<u64>(instr.shfl.operation.Value()));
+            return {};
+        }();
+
+        // Setting the predicate before the register is intentional to avoid overwriting.
+        Node index = instr.shfl.is_index_imm ? Immediate(static_cast<u32>(instr.shfl.index_imm))
+                                             : GetRegister(instr.gpr20);
+        SetPredicate(bb, instr.shfl.pred48, Operation(in_range, index, width));
+        SetRegister(
+            bb, instr.gpr0,
+            Operation(operation, GetRegister(instr.gpr8), std::move(index), std::move(width)));
+        break;
+    }
+    default:
+        UNIMPLEMENTED_MSG("Unhandled warp instruction: {}", opcode->get().GetName());
+        break;
+    }
+
+    return pc; // pc is returned unmodified
+}
+
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/decode/xmad.cpp b/src/video_core/shader/decode/xmad.cpp
index 93dee77d1..206961909 100644
--- a/src/video_core/shader/decode/xmad.cpp
+++ b/src/video_core/shader/decode/xmad.cpp
@@ -73,8 +73,8 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) {
     if (is_psl) {
         product = Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, product, Immediate(16));
     }
-    SetTemporal(bb, 0, product);
-    product = GetTemporal(0);
+    SetTemporary(bb, 0, product);
+    product = GetTemporary(0);
 
     const Node original_c = op_c;
     const Tegra::Shader::XmadMode set_mode = mode; // Workaround to clang compile error
@@ -98,13 +98,13 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) {
         }
     }();
 
-    SetTemporal(bb, 1, op_c);
-    op_c = GetTemporal(1);
+    SetTemporary(bb, 1, op_c);
+    op_c = GetTemporary(1);
 
     // TODO(Rodrigo): Use an appropiate sign for this operation
     Node sum = Operation(OperationCode::IAdd, product, op_c);
-    SetTemporal(bb, 2, sum);
-    sum = GetTemporal(2);
+    SetTemporary(bb, 2, sum);
+    sum = GetTemporary(2);
     if (is_merge) {
         const Node a = BitfieldExtract(sum, 0, 16);
         const Node b =
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index c002f90f9..abf2cb1ab 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -7,6 +7,7 @@
 #include <array>
 #include <cstddef>
 #include <memory>
+#include <optional>
 #include <string>
 #include <tuple>
 #include <utility>
@@ -30,6 +31,8 @@ enum class OperationCode {
     FNegate,       /// (MetaArithmetic, float a) -> float
     FAbsolute,     /// (MetaArithmetic, float a) -> float
     FClamp,        /// (MetaArithmetic, float value, float min, float max) -> float
+    FCastHalf0,    /// (MetaArithmetic, f16vec2 a) -> float
+    FCastHalf1,    /// (MetaArithmetic, f16vec2 a) -> float
     FMin,          /// (MetaArithmetic, float a, float b) -> float
     FMax,          /// (MetaArithmetic, float a, float b) -> float
     FCos,          /// (MetaArithmetic, float a) -> float
@@ -83,17 +86,18 @@ enum class OperationCode {
     UBitfieldExtract, /// (MetaArithmetic, uint value, int offset, int offset) -> uint
     UBitCount,        /// (MetaArithmetic, uint) -> uint
 
-    HAdd,      /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
-    HMul,      /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
-    HFma,      /// (MetaArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2
-    HAbsolute, /// (f16vec2 a) -> f16vec2
-    HNegate,   /// (f16vec2 a, bool first, bool second) -> f16vec2
-    HClamp,    /// (f16vec2 src, float min, float max) -> f16vec2
-    HUnpack,   /// (Tegra::Shader::HalfType, T value) -> f16vec2
-    HMergeF32, /// (f16vec2 src) -> float
-    HMergeH0,  /// (f16vec2 dest, f16vec2 src) -> f16vec2
-    HMergeH1,  /// (f16vec2 dest, f16vec2 src) -> f16vec2
-    HPack2,    /// (float a, float b) -> f16vec2
+    HAdd,       /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
+    HMul,       /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
+    HFma,       /// (MetaArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2
+    HAbsolute,  /// (f16vec2 a) -> f16vec2
+    HNegate,    /// (f16vec2 a, bool first, bool second) -> f16vec2
+    HClamp,     /// (f16vec2 src, float min, float max) -> f16vec2
+    HCastFloat, /// (MetaArithmetic, float a) -> f16vec2
+    HUnpack,    /// (Tegra::Shader::HalfType, T value) -> f16vec2
+    HMergeF32,  /// (f16vec2 src) -> float
+    HMergeH0,   /// (f16vec2 dest, f16vec2 src) -> f16vec2
+    HMergeH1,   /// (f16vec2 dest, f16vec2 src) -> f16vec2
+    HPack2,     /// (float a, float b) -> f16vec2
 
     LogicalAssign, /// (bool& dst, bool src) -> void
     LogicalAnd,    /// (bool a, bool b) -> bool
@@ -101,8 +105,7 @@ enum class OperationCode {
     LogicalXor,    /// (bool a, bool b) -> bool
     LogicalNegate, /// (bool a) -> bool
     LogicalPick2,  /// (bool2 pair, uint index) -> bool
-    LogicalAll2,   /// (bool2 a) -> bool
-    LogicalAny2,   /// (bool2 a) -> bool
+    LogicalAnd2,   /// (bool2 a) -> bool
 
     LogicalFLessThan,     /// (float a, float b) -> bool
     LogicalFEqual,        /// (float a, float b) -> bool
@@ -146,11 +149,21 @@ enum class OperationCode {
     TextureQueryLod,        /// (MetaTexture, float[N] coords) -> float4
     TexelFetch,             /// (MetaTexture, int[N], int) -> float4
 
-    Branch,        /// (uint branch_target) -> void
-    PushFlowStack, /// (uint branch_target) -> void
-    PopFlowStack,  /// () -> void
-    Exit,          /// () -> void
-    Discard,       /// () -> void
+    ImageStore,          /// (MetaImage, int[N] values) -> void
+    AtomicImageAdd,      /// (MetaImage, int[N] coords) -> void
+    AtomicImageMin,      /// (MetaImage, int[N] coords) -> void
+    AtomicImageMax,      /// (MetaImage, int[N] coords) -> void
+    AtomicImageAnd,      /// (MetaImage, int[N] coords) -> void
+    AtomicImageOr,       /// (MetaImage, int[N] coords) -> void
+    AtomicImageXor,      /// (MetaImage, int[N] coords) -> void
+    AtomicImageExchange, /// (MetaImage, int[N] coords) -> void
+
+    Branch,         /// (uint branch_target) -> void
+    BranchIndirect, /// (uint branch_target) -> void
+    PushFlowStack,  /// (uint branch_target) -> void
+    PopFlowStack,   /// () -> void
+    Exit,           /// () -> void
+    Discard,        /// () -> void
 
     EmitVertex,   /// () -> void
     EndPrimitive, /// () -> void
@@ -163,6 +176,21 @@ enum class OperationCode {
     WorkGroupIdY,       /// () -> uint
     WorkGroupIdZ,       /// () -> uint
 
+    BallotThread, /// (bool) -> uint
+    VoteAll,      /// (bool) -> bool
+    VoteAny,      /// (bool) -> bool
+    VoteEqual,    /// (bool) -> bool
+
+    ShuffleIndexed,   /// (uint value, uint index, uint width) -> uint
+    ShuffleUp,        /// (uint value, uint index, uint width) -> uint
+    ShuffleDown,      /// (uint value, uint index, uint width) -> uint
+    ShuffleButterfly, /// (uint value, uint index, uint width) -> uint
+
+    InRangeShuffleIndexed,   /// (uint index, uint width) -> bool
+    InRangeShuffleUp,        /// (uint index, uint width) -> bool
+    InRangeShuffleDown,      /// (uint index, uint width) -> bool
+    InRangeShuffleButterfly, /// (uint index, uint width) -> bool
+
     Amount,
 };
 
@@ -174,6 +202,11 @@ enum class InternalFlag {
     Amount = 4,
 };
 
+enum class MetaStackClass { // Tags PushFlowStack/PopFlowStack operations
+    Ssy, ///< Used for SSY pushes and SYNC pops
+    Pbk, ///< Used for PBK pushes and BRK pops
+};
+
 class OperationNode;
 class ConditionalNode;
 class GprNode;
@@ -183,12 +216,13 @@ class PredicateNode;
 class AbufNode;
 class CbufNode;
 class LmemNode;
+class SmemNode;
 class GmemNode;
 class CommentNode;
 
 using NodeData =
     std::variant<OperationNode, ConditionalNode, GprNode, ImmediateNode, InternalFlagNode,
-                 PredicateNode, AbufNode, CbufNode, LmemNode, GmemNode, CommentNode>;
+                 PredicateNode, AbufNode, CbufNode, LmemNode, SmemNode, GmemNode, CommentNode>;
 using Node = std::shared_ptr<NodeData>;
 using Node4 = std::array<Node, 4>;
 using NodeBlock = std::vector<Node>;
@@ -258,6 +292,87 @@ private:
     bool is_bindless{}; ///< Whether this sampler belongs to a bindless texture or not.
 };
 
+class Image final { // Image descriptor: offset, index, type and usage flags
+public:
+    constexpr explicit Image(std::size_t offset, std::size_t index, Tegra::Shader::ImageType type,
+                             std::optional<Tegra::Shader::ImageAtomicSize> size)
+        : offset{offset}, index{index}, type{type}, is_bindless{false}, size{size} {}
+
+    constexpr explicit Image(u32 cbuf_index, u32 cbuf_offset, std::size_t index,
+                             Tegra::Shader::ImageType type,
+                             std::optional<Tegra::Shader::ImageAtomicSize> size)
+        : offset{(static_cast<u64>(cbuf_index) << 32) | cbuf_offset}, index{index}, type{type},
+          is_bindless{true}, size{size} {}
+
+    constexpr explicit Image(std::size_t offset, std::size_t index, Tegra::Shader::ImageType type,
+                             bool is_bindless, bool is_written, bool is_read,
+                             std::optional<Tegra::Shader::ImageAtomicSize> size)
+        : offset{offset}, index{index}, type{type}, is_bindless{is_bindless},
+          is_written{is_written}, is_read{is_read}, size{size} {}
+
+    void MarkWrite() {
+        is_written = true;
+    }
+
+    void MarkRead() {
+        is_read = true;
+    }
+
+    void SetSize(Tegra::Shader::ImageAtomicSize size_) {
+        size = size_;
+    }
+
+    constexpr std::size_t GetOffset() const {
+        return offset;
+    }
+
+    constexpr std::size_t GetIndex() const {
+        return index;
+    }
+
+    constexpr Tegra::Shader::ImageType GetType() const {
+        return type;
+    }
+
+    constexpr bool IsBindless() const {
+        return is_bindless;
+    }
+
+    constexpr bool IsWritten() const {
+        return is_written;
+    }
+
+    constexpr bool IsRead() const {
+        return is_read;
+    }
+
+    constexpr std::pair<u32, u32> GetBindlessCBuf() const { // Returns {cbuf_index, cbuf_offset}
+        return {static_cast<u32>(offset >> 32), static_cast<u32>(offset)};
+    }
+
+    constexpr bool IsSizeKnown() const {
+        return size.has_value();
+    }
+
+    constexpr Tegra::Shader::ImageAtomicSize GetSize() const { // Check IsSizeKnown() first
+        return size.value();
+    }
+
+    constexpr bool operator<(const Image& rhs) const { // Ignores is_written and is_read
+        return std::tie(offset, index, type, size, is_bindless) <
+               std::tie(rhs.offset, rhs.index, rhs.type, rhs.size, rhs.is_bindless);
+    }
+
+private:
+    u64 offset{}; ///< Plain offset, or (cbuf_index << 32) | cbuf_offset for bindless images
+    std::size_t index{};
+    Tegra::Shader::ImageType type{};
+    bool is_bindless{}; ///< Set to true by the cbuf index/offset constructor
+    bool is_written{};  ///< Set by MarkWrite() or the full constructor
+    bool is_read{};     ///< Set by MarkRead() or the full constructor
+    std::optional<Tegra::Shader::ImageAtomicSize> size{};
+};
+
 struct GlobalMemoryBase {
     u32 cbuf_index{};
     u32 cbuf_offset{};
@@ -284,8 +399,14 @@ struct MetaTexture {
     u32 element{};
 };
 
+struct MetaImage { // Metadata carried by the image operations (ImageStore, AtomicImage*)
+    const Image& image; ///< Non-owning reference to the image descriptor
+    std::vector<Node> values;
+};
+
 /// Parameters that modify an operation but are not part of any particular operand
-using Meta = std::variant<MetaArithmetic, MetaTexture, Tegra::Shader::HalfType>;
+using Meta =
+    std::variant<MetaArithmetic, MetaTexture, MetaImage, MetaStackClass, Tegra::Shader::HalfType>;
 
 /// Holds any kind of operation that can be done in the IR
 class OperationNode final {
@@ -473,6 +594,19 @@ private:
     Node address;
 };
 
+/// Shared memory node; wraps the node that describes the accessed address
+class SmemNode final {
+public:
+    explicit SmemNode(Node address) : address{std::move(address)} {}
+
+    const Node& GetAddress() const {
+        return address;
+    }
+
+private:
+    Node address;
+};
+
 /// Global memory node
 class GmemNode final {
 public:
diff --git a/src/video_core/shader/node_helper.cpp b/src/video_core/shader/node_helper.cpp
index 6fccbbba3..b3dcd291c 100644
--- a/src/video_core/shader/node_helper.cpp
+++ b/src/video_core/shader/node_helper.cpp
@@ -12,7 +12,7 @@
 namespace VideoCommon::Shader {
 
 Node Conditional(Node condition, std::vector<Node> code) {
-    return MakeNode<ConditionalNode>(condition, std::move(code));
+    return MakeNode<ConditionalNode>(std::move(condition), std::move(code));
 }
 
 Node Comment(std::string text) {
diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp
index 11b545cca..bbbab0bca 100644
--- a/src/video_core/shader/shader_ir.cpp
+++ b/src/video_core/shader/shader_ir.cpp
@@ -22,8 +22,8 @@ using Tegra::Shader::PredCondition;
 using Tegra::Shader::PredOperation;
 using Tegra::Shader::Register;
 
-ShaderIR::ShaderIR(const ProgramCode& program_code, u32 main_offset)
-    : program_code{program_code}, main_offset{main_offset} {
+ShaderIR::ShaderIR(const ProgramCode& program_code, u32 main_offset, const std::size_t size)
+    : program_code{program_code}, main_offset{main_offset}, program_size{size} {
     Decode();
 }
 
@@ -61,8 +61,17 @@ Node ShaderIR::GetConstBufferIndirect(u64 index_, u64 offset_, Node node) {
     const auto [entry, is_new] = used_cbufs.try_emplace(index);
     entry->second.MarkAsUsedIndirect();
 
-    const Node final_offset = Operation(OperationCode::UAdd, NO_PRECISE, node, Immediate(offset));
-    return MakeNode<CbufNode>(index, final_offset);
+    Node final_offset = [&] {
+        // Attempt to inline constant buffer without a variable offset. This is done to allow
+        // tracking LDC calls.
+        if (const auto gpr = std::get_if<GprNode>(&*node)) {
+            if (gpr->GetIndex() == Register::ZeroIndex) {
+                return Immediate(offset);
+            }
+        }
+        return Operation(OperationCode::UAdd, NO_PRECISE, std::move(node), Immediate(offset));
+    }();
+    return MakeNode<CbufNode>(index, std::move(final_offset));
 }
 
 Node ShaderIR::GetPredicate(u64 pred_, bool negated) {
@@ -80,7 +89,7 @@ Node ShaderIR::GetPredicate(bool immediate) {
 
 Node ShaderIR::GetInputAttribute(Attribute::Index index, u64 element, Node buffer) {
     used_input_attributes.emplace(index);
-    return MakeNode<AbufNode>(index, static_cast<u32>(element), buffer);
+    return MakeNode<AbufNode>(index, static_cast<u32>(element), std::move(buffer));
 }
 
 Node ShaderIR::GetPhysicalInputAttribute(Tegra::Shader::Register physical_address, Node buffer) {
@@ -89,6 +98,22 @@ Node ShaderIR::GetPhysicalInputAttribute(Tegra::Shader::Register physical_addres
 }
 
 Node ShaderIR::GetOutputAttribute(Attribute::Index index, u64 element, Node buffer) {
+    if (index == Attribute::Index::LayerViewportPointSize) {
+        switch (element) {
+        case 0:
+            UNIMPLEMENTED();
+            break;
+        case 1:
+            uses_layer = true;
+            break;
+        case 2:
+            uses_viewport_index = true;
+            break;
+        case 3:
+            uses_point_size = true;
+            break;
+        }
+    }
     if (index == Attribute::Index::ClipDistances0123 ||
         index == Attribute::Index::ClipDistances4567) {
         const auto clip_index =
@@ -97,7 +122,7 @@ Node ShaderIR::GetOutputAttribute(Attribute::Index index, u64 element, Node buff
     }
     used_output_attributes.insert(index);
 
-    return MakeNode<AbufNode>(index, static_cast<u32>(element), buffer);
+    return MakeNode<AbufNode>(index, static_cast<u32>(element), std::move(buffer));
 }
 
 Node ShaderIR::GetInternalFlag(InternalFlag flag, bool negated) {
@@ -109,19 +134,23 @@ Node ShaderIR::GetInternalFlag(InternalFlag flag, bool negated) {
 }
 
 Node ShaderIR::GetLocalMemory(Node address) {
-    return MakeNode<LmemNode>(address);
+    return MakeNode<LmemNode>(std::move(address));
+}
+
+Node ShaderIR::GetSharedMemory(Node address) {
+    return MakeNode<SmemNode>(std::move(address));
 }
 
-Node ShaderIR::GetTemporal(u32 id) {
+Node ShaderIR::GetTemporary(u32 id) {
     return GetRegister(Register::ZeroIndex + 1 + id);
 }
 
 Node ShaderIR::GetOperandAbsNegFloat(Node value, bool absolute, bool negate) {
     if (absolute) {
-        value = Operation(OperationCode::FAbsolute, NO_PRECISE, value);
+        value = Operation(OperationCode::FAbsolute, NO_PRECISE, std::move(value));
     }
     if (negate) {
-        value = Operation(OperationCode::FNegate, NO_PRECISE, value);
+        value = Operation(OperationCode::FNegate, NO_PRECISE, std::move(value));
     }
     return value;
 }
@@ -130,24 +159,26 @@ Node ShaderIR::GetSaturatedFloat(Node value, bool saturate) {
     if (!saturate) {
         return value;
     }
-    const Node positive_zero = Immediate(std::copysignf(0, 1));
-    const Node positive_one = Immediate(1.0f);
-    return Operation(OperationCode::FClamp, NO_PRECISE, value, positive_zero, positive_one);
+
+    Node positive_zero = Immediate(std::copysignf(0, 1));
+    Node positive_one = Immediate(1.0f);
+    return Operation(OperationCode::FClamp, NO_PRECISE, std::move(value), std::move(positive_zero),
+                     std::move(positive_one));
 }
 
-Node ShaderIR::ConvertIntegerSize(Node value, Tegra::Shader::Register::Size size, bool is_signed) {
+Node ShaderIR::ConvertIntegerSize(Node value, Register::Size size, bool is_signed) {
     switch (size) {
     case Register::Size::Byte:
-        value = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, NO_PRECISE, value,
-                                Immediate(24));
-        value = SignedOperation(OperationCode::IArithmeticShiftRight, is_signed, NO_PRECISE, value,
-                                Immediate(24));
+        value = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, NO_PRECISE,
+                                std::move(value), Immediate(24));
+        value = SignedOperation(OperationCode::IArithmeticShiftRight, is_signed, NO_PRECISE,
+                                std::move(value), Immediate(24));
         return value;
     case Register::Size::Short:
-        value = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, NO_PRECISE, value,
-                                Immediate(16));
-        value = SignedOperation(OperationCode::IArithmeticShiftRight, is_signed, NO_PRECISE, value,
-                                Immediate(16));
+        value = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, NO_PRECISE,
+                                std::move(value), Immediate(16));
+        value = SignedOperation(OperationCode::IArithmeticShiftRight, is_signed, NO_PRECISE,
+                                std::move(value), Immediate(16));
     case Register::Size::Word:
         // Default - do nothing
         return value;
@@ -163,27 +194,29 @@ Node ShaderIR::GetOperandAbsNegInteger(Node value, bool absolute, bool negate, b
         return value;
     }
     if (absolute) {
-        value = Operation(OperationCode::IAbsolute, NO_PRECISE, value);
+        value = Operation(OperationCode::IAbsolute, NO_PRECISE, std::move(value));
     }
     if (negate) {
-        value = Operation(OperationCode::INegate, NO_PRECISE, value);
+        value = Operation(OperationCode::INegate, NO_PRECISE, std::move(value));
     }
     return value;
 }
 
 Node ShaderIR::UnpackHalfImmediate(Instruction instr, bool has_negation) {
-    const Node value = Immediate(instr.half_imm.PackImmediates());
+    Node value = Immediate(instr.half_imm.PackImmediates());
     if (!has_negation) {
         return value;
     }
-    const Node first_negate = GetPredicate(instr.half_imm.first_negate != 0);
-    const Node second_negate = GetPredicate(instr.half_imm.second_negate != 0);
 
-    return Operation(OperationCode::HNegate, NO_PRECISE, value, first_negate, second_negate);
+    Node first_negate = GetPredicate(instr.half_imm.first_negate != 0);
+    Node second_negate = GetPredicate(instr.half_imm.second_negate != 0);
+
+    return Operation(OperationCode::HNegate, NO_PRECISE, std::move(value), std::move(first_negate),
+                     std::move(second_negate));
 }
 
 Node ShaderIR::UnpackHalfFloat(Node value, Tegra::Shader::HalfType type) {
-    return Operation(OperationCode::HUnpack, type, value);
+    return Operation(OperationCode::HUnpack, type, std::move(value));
 }
 
 Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) {
@@ -191,11 +224,11 @@ Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) {
     case Tegra::Shader::HalfMerge::H0_H1:
         return src;
     case Tegra::Shader::HalfMerge::F32:
-        return Operation(OperationCode::HMergeF32, src);
+        return Operation(OperationCode::HMergeF32, std::move(src));
     case Tegra::Shader::HalfMerge::Mrg_H0:
-        return Operation(OperationCode::HMergeH0, dest, src);
+        return Operation(OperationCode::HMergeH0, std::move(dest), std::move(src));
     case Tegra::Shader::HalfMerge::Mrg_H1:
-        return Operation(OperationCode::HMergeH1, dest, src);
+        return Operation(OperationCode::HMergeH1, std::move(dest), std::move(src));
     }
     UNREACHABLE();
     return src;
@@ -203,10 +236,10 @@ Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) {
 
 Node ShaderIR::GetOperandAbsNegHalf(Node value, bool absolute, bool negate) {
     if (absolute) {
-        value = Operation(OperationCode::HAbsolute, NO_PRECISE, value);
+        value = Operation(OperationCode::HAbsolute, NO_PRECISE, std::move(value));
     }
     if (negate) {
-        value = Operation(OperationCode::HNegate, NO_PRECISE, value, GetPredicate(true),
+        value = Operation(OperationCode::HNegate, NO_PRECISE, std::move(value), GetPredicate(true),
                           GetPredicate(true));
     }
     return value;
@@ -216,9 +249,11 @@ Node ShaderIR::GetSaturatedHalfFloat(Node value, bool saturate) {
     if (!saturate) {
         return value;
     }
-    const Node positive_zero = Immediate(std::copysignf(0, 1));
-    const Node positive_one = Immediate(1.0f);
-    return Operation(OperationCode::HClamp, NO_PRECISE, value, positive_zero, positive_one);
+
+    Node positive_zero = Immediate(std::copysignf(0, 1));
+    Node positive_one = Immediate(1.0f);
+    return Operation(OperationCode::HClamp, NO_PRECISE, std::move(value), std::move(positive_zero),
+                     std::move(positive_one));
 }
 
 Node ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, Node op_b) {
@@ -246,7 +281,6 @@ Node ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, N
         condition == PredCondition::LessEqualWithNan ||
         condition == PredCondition::GreaterThanWithNan ||
         condition == PredCondition::GreaterEqualWithNan) {
-
         predicate = Operation(OperationCode::LogicalOr, predicate,
                               Operation(OperationCode::LogicalFIsNan, op_a));
         predicate = Operation(OperationCode::LogicalOr, predicate,
@@ -275,7 +309,8 @@ Node ShaderIR::GetPredicateComparisonInteger(PredCondition condition, bool is_si
     UNIMPLEMENTED_IF_MSG(comparison == PredicateComparisonTable.end(),
                          "Unknown predicate comparison operation");
 
-    Node predicate = SignedOperation(comparison->second, is_signed, NO_PRECISE, op_a, op_b);
+    Node predicate = SignedOperation(comparison->second, is_signed, NO_PRECISE, std::move(op_a),
+                                     std::move(op_b));
 
     UNIMPLEMENTED_IF_MSG(condition == PredCondition::LessThanWithNan ||
                              condition == PredCondition::NotEqualWithNan ||
@@ -305,9 +340,7 @@ Node ShaderIR::GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition
     UNIMPLEMENTED_IF_MSG(comparison == PredicateComparisonTable.end(),
                          "Unknown predicate comparison operation");
 
-    const Node predicate = Operation(comparison->second, NO_PRECISE, op_a, op_b);
-
-    return predicate;
+    return Operation(comparison->second, NO_PRECISE, std::move(op_a), std::move(op_b));
 }
 
 OperationCode ShaderIR::GetPredicateCombiner(PredOperation operation) {
@@ -333,31 +366,37 @@ Node ShaderIR::GetConditionCode(Tegra::Shader::ConditionCode cc) {
 }
 
 void ShaderIR::SetRegister(NodeBlock& bb, Register dest, Node src) {
-    bb.push_back(Operation(OperationCode::Assign, GetRegister(dest), src));
+    bb.push_back(Operation(OperationCode::Assign, GetRegister(dest), std::move(src)));
 }
 
 void ShaderIR::SetPredicate(NodeBlock& bb, u64 dest, Node src) {
-    bb.push_back(Operation(OperationCode::LogicalAssign, GetPredicate(dest), src));
+    bb.push_back(Operation(OperationCode::LogicalAssign, GetPredicate(dest), std::move(src)));
 }
 
 void ShaderIR::SetInternalFlag(NodeBlock& bb, InternalFlag flag, Node value) {
-    bb.push_back(Operation(OperationCode::LogicalAssign, GetInternalFlag(flag), value));
+    bb.push_back(Operation(OperationCode::LogicalAssign, GetInternalFlag(flag), std::move(value)));
 }
 
 void ShaderIR::SetLocalMemory(NodeBlock& bb, Node address, Node value) {
-    bb.push_back(Operation(OperationCode::Assign, GetLocalMemory(address), value));
+    bb.push_back(
+        Operation(OperationCode::Assign, GetLocalMemory(std::move(address)), std::move(value)));
+}
+
+void ShaderIR::SetSharedMemory(NodeBlock& bb, Node address, Node value) {
+    bb.push_back(
+        Operation(OperationCode::Assign, GetSharedMemory(std::move(address)), std::move(value)));
 }
 
-void ShaderIR::SetTemporal(NodeBlock& bb, u32 id, Node value) {
-    SetRegister(bb, Register::ZeroIndex + 1 + id, value);
+void ShaderIR::SetTemporary(NodeBlock& bb, u32 id, Node value) {
+    SetRegister(bb, Register::ZeroIndex + 1 + id, std::move(value));
 }
 
 void ShaderIR::SetInternalFlagsFromFloat(NodeBlock& bb, Node value, bool sets_cc) {
     if (!sets_cc) {
         return;
     }
-    const Node zerop = Operation(OperationCode::LogicalFEqual, value, Immediate(0.0f));
-    SetInternalFlag(bb, InternalFlag::Zero, zerop);
+    Node zerop = Operation(OperationCode::LogicalFEqual, std::move(value), Immediate(0.0f));
+    SetInternalFlag(bb, InternalFlag::Zero, std::move(zerop));
     LOG_WARNING(HW_GPU, "Condition codes implementation is incomplete");
 }
 
@@ -365,13 +404,18 @@ void ShaderIR::SetInternalFlagsFromInteger(NodeBlock& bb, Node value, bool sets_
     if (!sets_cc) {
         return;
     }
-    const Node zerop = Operation(OperationCode::LogicalIEqual, value, Immediate(0));
-    SetInternalFlag(bb, InternalFlag::Zero, zerop);
+    Node zerop = Operation(OperationCode::LogicalIEqual, std::move(value), Immediate(0));
+    SetInternalFlag(bb, InternalFlag::Zero, std::move(zerop));
     LOG_WARNING(HW_GPU, "Condition codes implementation is incomplete");
 }
 
 Node ShaderIR::BitfieldExtract(Node value, u32 offset, u32 bits) {
-    return Operation(OperationCode::UBitfieldExtract, NO_PRECISE, value, Immediate(offset),
-                     Immediate(bits));
+    return Operation(OperationCode::UBitfieldExtract, NO_PRECISE, std::move(value),
+                     Immediate(offset), Immediate(bits));
+}
+
+Node ShaderIR::BitfieldInsert(Node base, Node insert, u32 offset, u32 bits) {
+    return Operation(OperationCode::UBitfieldInsert, NO_PRECISE, std::move(base),
+                     std::move(insert), Immediate(offset), Immediate(bits));
 }
 
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index edcf2288e..6aed9bb84 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -5,13 +5,10 @@
 #pragma once
 
 #include <array>
-#include <cstring>
 #include <map>
 #include <optional>
 #include <set>
-#include <string>
 #include <tuple>
-#include <variant>
 #include <vector>
 
 #include "common/common_types.h"
@@ -22,18 +19,12 @@
 
 namespace VideoCommon::Shader {
 
+struct ShaderBlock;
+
 using ProgramCode = std::vector<u64>;
 
 constexpr u32 MAX_PROGRAM_LENGTH = 0x1000;
 
-/// Describes the behaviour of code path of a given entry point and a return point.
-enum class ExitMethod {
-    Undetermined, ///< Internal value. Only occur when analyzing JMP loop.
-    AlwaysReturn, ///< All code paths reach the return point.
-    Conditional,  ///< Code path reaches the return point or an END instruction conditionally.
-    AlwaysEnd,    ///< All code paths reach a END instruction.
-};
-
 class ConstBuffer {
 public:
     explicit ConstBuffer(u32 max_offset, bool is_indirect)
@@ -73,7 +64,7 @@ struct GlobalMemoryUsage {
 
 class ShaderIR final {
 public:
-    explicit ShaderIR(const ProgramCode& program_code, u32 main_offset);
+    explicit ShaderIR(const ProgramCode& program_code, u32 main_offset, std::size_t size);
     ~ShaderIR();
 
     const std::map<u32, NodeBlock>& GetBasicBlocks() const {
@@ -104,6 +95,10 @@ public:
         return used_samplers;
     }
 
+    const std::map<u64, Image>& GetImages() const {
+        return used_images;
+    }
+
     const std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances>& GetClipDistances()
         const {
         return used_clip_distances;
@@ -117,6 +112,18 @@ public:
         return static_cast<std::size_t>(coverage_end * sizeof(u64));
     }
 
+    bool UsesLayer() const {
+        return uses_layer;
+    }
+
+    bool UsesViewportIndex() const {
+        return uses_viewport_index;
+    }
+
+    bool UsesPointSize() const {
+        return uses_point_size;
+    }
+
     bool HasPhysicalAttributes() const {
         return uses_physical_attributes;
     }
@@ -125,12 +132,20 @@ public:
         return header;
     }
 
+    bool IsFlowStackDisabled() const {
+        return disable_flow_stack;
+    }
+
+    u32 ConvertAddressToNvidiaSpace(const u32 address) const {
+        return (address - main_offset) * sizeof(Tegra::Shader::Instruction);
+    }
+
 private:
     void Decode();
 
-    ExitMethod Scan(u32 begin, u32 end, std::set<u32>& labels);
-
     NodeBlock DecodeRange(u32 begin, u32 end);
+    void DecodeRangeInner(NodeBlock& bb, u32 begin, u32 end);
+    void InsertControlFlow(NodeBlock& bb, const ShaderBlock& block);
 
     /**
      * Decodes a single instruction from Tegra to IR.
@@ -152,8 +167,10 @@ private:
     u32 DecodeFfma(NodeBlock& bb, u32 pc);
     u32 DecodeHfma2(NodeBlock& bb, u32 pc);
     u32 DecodeConversion(NodeBlock& bb, u32 pc);
+    u32 DecodeWarp(NodeBlock& bb, u32 pc);
     u32 DecodeMemory(NodeBlock& bb, u32 pc);
     u32 DecodeTexture(NodeBlock& bb, u32 pc);
+    u32 DecodeImage(NodeBlock& bb, u32 pc);
     u32 DecodeFloatSetPredicate(NodeBlock& bb, u32 pc);
     u32 DecodeIntegerSetPredicate(NodeBlock& bb, u32 pc);
     u32 DecodeHalfSetPredicate(NodeBlock& bb, u32 pc);
@@ -191,8 +208,10 @@ private:
     Node GetInternalFlag(InternalFlag flag, bool negated = false);
     /// Generates a node representing a local memory address
     Node GetLocalMemory(Node address);
-    /// Generates a temporal, internally it uses a post-RZ register
-    Node GetTemporal(u32 id);
+    /// Generates a node representing a shared memory address
+    Node GetSharedMemory(Node address);
+    /// Generates a temporary, internally it uses a post-RZ register
+    Node GetTemporary(u32 id);
 
     /// Sets a register. src value must be a number-evaluated node.
     void SetRegister(NodeBlock& bb, Tegra::Shader::Register dest, Node src);
@@ -200,10 +219,12 @@ private:
     void SetPredicate(NodeBlock& bb, u64 dest, Node src);
     /// Sets an internal flag. src value must be a bool-evaluated node
     void SetInternalFlag(NodeBlock& bb, InternalFlag flag, Node value);
-    /// Sets a local memory address. address and value must be a number-evaluated node
+    /// Sets a local memory address with a value.
     void SetLocalMemory(NodeBlock& bb, Node address, Node value);
-    /// Sets a temporal. Internally it uses a post-RZ register
-    void SetTemporal(NodeBlock& bb, u32 id, Node value);
+    /// Sets a shared memory address with a value.
+    void SetSharedMemory(NodeBlock& bb, Node address, Node value);
+    /// Sets a temporary. Internally it uses a post-RZ register
+    void SetTemporary(NodeBlock& bb, u32 id, Node value);
 
     /// Sets internal flags from a float
     void SetInternalFlagsFromFloat(NodeBlock& bb, Node value, bool sets_cc = true);
@@ -254,9 +275,24 @@ private:
                                       Tegra::Shader::TextureType type, bool is_array,
                                       bool is_shadow);
 
+    /// Accesses an image.
+    Image& GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType type,
+                    std::optional<Tegra::Shader::ImageAtomicSize> size = {});
+
+    /// Accesses a bindless image.
+    Image& GetBindlessImage(Tegra::Shader::Register reg, Tegra::Shader::ImageType type,
+                            std::optional<Tegra::Shader::ImageAtomicSize> size = {});
+
+    /// Tries to access an existing image, updating its state as needed
+    Image* TryUseExistingImage(u64 offset, Tegra::Shader::ImageType type,
+                               std::optional<Tegra::Shader::ImageAtomicSize> size);
+
     /// Extracts a sequence of bits from a node
     Node BitfieldExtract(Node value, u32 offset, u32 bits);
 
+    /// Inserts a sequence of bits from a node
+    Node BitfieldInsert(Node base, Node insert, u32 offset, u32 bits);
+
     void WriteTexInstructionFloat(NodeBlock& bb, Tegra::Shader::Instruction instr,
                                   const Node4& components);
 
@@ -277,6 +313,8 @@ private:
     Node4 GetTld4Code(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type,
                       bool depth_compare, bool is_array, bool is_aoffi);
 
+    Node4 GetTldCode(Tegra::Shader::Instruction instr);
+
     Node4 GetTldsCode(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type,
                       bool is_array);
 
@@ -301,7 +339,7 @@ private:
     void WriteLop3Instruction(NodeBlock& bb, Tegra::Shader::Register dest, Node op_a, Node op_b,
                               Node op_c, Node imm_lut, bool sets_cc);
 
-    Node TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const;
+    std::tuple<Node, u32, u32> TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const;
 
     std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const;
 
@@ -313,10 +351,11 @@ private:
 
     const ProgramCode& program_code;
     const u32 main_offset;
+    const std::size_t program_size;
+    bool disable_flow_stack{};
 
     u32 coverage_begin{};
     u32 coverage_end{};
-    std::map<std::pair<u32, u32>, ExitMethod> exit_method_map;
 
     std::map<u32, NodeBlock> basic_blocks;
     NodeBlock global_code;
@@ -327,8 +366,12 @@ private:
     std::set<Tegra::Shader::Attribute::Index> used_output_attributes;
     std::map<u32, ConstBuffer> used_cbufs;
     std::set<Sampler> used_samplers;
+    std::map<u64, Image> used_images;
     std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances> used_clip_distances{};
     std::map<GlobalMemoryBase, GlobalMemoryUsage> used_global_memory;
+    bool uses_layer{};
+    bool uses_viewport_index{};
+    bool uses_point_size{};
     bool uses_physical_attributes{}; // Shader uses AL2P or physical attribute read/writes
 
     Tegra::Shader::Header header;
diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp
index fc957d980..55f5949e4 100644
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -15,56 +15,63 @@ namespace {
 std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor,
                                    OperationCode operation_code) {
     for (; cursor >= 0; --cursor) {
-        const Node node = code.at(cursor);
+        Node node = code.at(cursor);
+
         if (const auto operation = std::get_if<OperationNode>(&*node)) {
             if (operation->GetCode() == operation_code) {
-                return {node, cursor};
+                return {std::move(node), cursor};
             }
         }
+
         if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
             const auto& conditional_code = conditional->GetCode();
-            const auto [found, internal_cursor] = FindOperation(
+            auto [found, internal_cursor] = FindOperation(
                 conditional_code, static_cast<s64>(conditional_code.size() - 1), operation_code);
             if (found) {
-                return {found, cursor};
+                return {std::move(found), cursor};
             }
         }
     }
     return {};
 }
-} // namespace
+} // Anonymous namespace
 
-Node ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const {
+std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code,
+                                               s64 cursor) const {
     if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) {
-        // Cbuf found, but it has to be immediate
-        return std::holds_alternative<ImmediateNode>(*cbuf->GetOffset()) ? tracked : nullptr;
+        // Constant buffer found, test if it's an immediate
+        const auto offset = cbuf->GetOffset();
+        if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
+            return {tracked, cbuf->GetIndex(), immediate->GetValue()};
+        }
+        return {};
     }
     if (const auto gpr = std::get_if<GprNode>(&*tracked)) {
         if (gpr->GetIndex() == Tegra::Shader::Register::ZeroIndex) {
-            return nullptr;
+            return {};
         }
         // Reduce the cursor in one to avoid infinite loops when the instruction sets the same
         // register that it uses as operand
         const auto [source, new_cursor] = TrackRegister(gpr, code, cursor - 1);
         if (!source) {
-            return nullptr;
+            return {};
         }
         return TrackCbuf(source, code, new_cursor);
     }
     if (const auto operation = std::get_if<OperationNode>(&*tracked)) {
-        for (std::size_t i = 0; i < operation->GetOperandsCount(); ++i) {
-            if (const auto found = TrackCbuf((*operation)[i], code, cursor)) {
-                // Cbuf found in operand
+        for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) {
+            if (auto found = TrackCbuf((*operation)[i - 1], code, cursor); std::get<0>(found)) {
+                // Cbuf found in operand.
                 return found;
             }
         }
-        return nullptr;
+        return {};
     }
     if (const auto conditional = std::get_if<ConditionalNode>(&*tracked)) {
         const auto& conditional_code = conditional->GetCode();
         return TrackCbuf(tracked, conditional_code, static_cast<s64>(conditional_code.size()));
     }
-    return nullptr;
+    return {};
 }
 
 std::optional<u32> ShaderIR::TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const {